# 🎭 Theaters & Cinemas in Berlin — Integration of a new data layer on theaters into the database.


This issue outlines the integration of a new data layer on Theaters in Berlin into the database.

This work is part of EPIC 2: Data Foundation & Frontend Context, which focuses on building the data layers for the MVP.

✅ CSV saved to: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/theaters_berlin_db_ready.csv


✅ GeoJSON saved to: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/theaters_berlin_db_ready.geojson


📦 CSV size: 77.8 KB
📦 GeoJSON size: 226.9 KB

🧾 Saved dataset summary:
Rows: 282
Columns: 26
CRS: epsg:4326

In [None]:
"""
CREATE TABLE theaters (
    theater_id         VARCHAR(64) PRIMARY KEY,         -- Unique stable ID (e.g., hash or UUID)
    name               VARCHAR(255) NOT NULL,           -- Official name of the theatre or cinema
    name_key           VARCHAR(255),                    -- Normalized lowercase name (slug, key)
    place_type         VARCHAR(50),                     -- Type: "cinema", "theatre", etc.
    operator           VARCHAR(255),                    -- Organization or company operating it
    opening_hours      VARCHAR(255),                    -- OSM-style hours string
    wheelchair         VARCHAR(50),                     -- Accessibility info: "yes", "no", "limited"
    screen             INTEGER,                         -- Number of screens (for cinemas)
    website            VARCHAR(255),                    -- Official website URL
    phone              VARCHAR(100),                    -- Contact phone number
    email              VARCHAR(255),                    -- Contact email address
    addr_full          VARCHAR(255),                    -- Full formatted address
    addr_street        VARCHAR(255),                    -- Street name
    addr_housenumber   VARCHAR(50),                     -- House or building number
    addr_postcode      VARCHAR(20),                     -- Postal code
    addr_city          VARCHAR(100),                    -- City (usually "Berlin")
    addr_country       VARCHAR(100),                    -- Country (usually "Germany")
    theatre_tags       TEXT,                            -- Raw tags or classification info from OSM/Wikidata
    theatre_category   VARCHAR(100),                    -- Derived label: e.g., "performing arts", "independent cinema"
    district_id        VARCHAR(10),                     -- LOR district code
    district           VARCHAR(100),                    -- LOR district name
    neighborhood_id    VARCHAR(10),                     -- LOR neighborhood (Ortsteil) code
    longitude          DECIMAL(9,6),                    -- WGS84 coordinate (lon)
    latitude           DECIMAL(9,6),                    -- WGS84 coordinate (lat)
    last_updated       TIMESTAMP DEFAULT CURRENT_TIMESTAMP  -- Timestamp when data last updated
    CONSTRAINT district_id_fk FOREIGN KEY (district_id)
        REFERENCES berlin_data.districts(district_id)
        ON DELETE RESTRICT
        ON UPDATE CASCADE
);
"""

Make sure the Python environment has psycopg2, SQLAlchemy, and pandas installed:

In [None]:
#pip install psycopg2-binary sqlalchemy pandas


In [24]:
import os
from getpass import getpass

import psycopg2
import pandas as pd

# Open the PostgreSQL connection through the local tunnel.
# The password is never stored in the notebook: it is taken from the
# PGPASSWORD environment variable, or prompted for interactively when that
# variable is unset. The user name can be overridden through PGUSER.
# NOTE: the password that used to be hard-coded in this cell should be rotated.
conn = psycopg2.connect(
    host="127.0.0.1",
    port=5433,                     # tunnel port
    user=os.environ.get("PGUSER", "marianna_gokova"),
    password=os.environ.get("PGPASSWORD") or getpass("Database password: "),
    dbname="layereddb",            # <-- database name
    sslmode="require",             # use "verify-full" if RDS enforces cert validation
)

# test the connection
with conn.cursor() as cur:
    cur.execute("SELECT current_database(), current_user;")
    print(cur.fetchall())


[('layereddb', 'marianna_gokova')]


In [25]:
# Enumerate every schema in the connected database, ordered by name,
# and show the result as the cell output.
query = """
SELECT schema_name
FROM information_schema.schemata
ORDER BY schema_name;
"""
schemas = pd.read_sql(sql=query, con=conn)
schemas

  schemas = pd.read_sql(query, conn)


Unnamed: 0,schema_name
0,berlin_labels
1,berlin_recommender
2,berlin_source_data
3,dashboard_data
4,information_schema
5,pg_catalog
6,public


In [26]:
#See tables per schema
import pandas as pd

def show_tables(schema):
    """Return the tables of one schema as a DataFrame.

    Args:
        schema: Name of the schema to look up in information_schema.tables.

    Returns:
        A DataFrame with the columns ``table_schema`` and ``table_name``,
        ordered by table name.
    """
    # The schema name is passed as a bound parameter instead of being
    # interpolated into the SQL text, so quotes or other special characters
    # in it cannot alter the statement.
    tables_query = """
        SELECT table_schema, table_name
        FROM information_schema.tables
        WHERE table_schema = %(schema)s
        ORDER BY table_name;
    """
    return pd.read_sql(tables_query, conn, params={"schema": schema})

show_tables('berlin_labels')



  return pd.read_sql(f"""


Unnamed: 0,table_schema,table_name
0,berlin_labels,district_labels
1,berlin_labels,neighborhood_labels


In [27]:
# Look up the column names and data types of berlin_source_data.districts
# before referencing it from the new table.

import pandas as pd
pd.read_sql(
    sql="""
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_schema='berlin_source_data' AND table_name='districts'
ORDER BY ordinal_position;
""",
    con=conn,
)

  pd.read_sql("""


Unnamed: 0,column_name,data_type
0,district_id,character varying
1,district,character varying
2,geometry,USER-DEFINED


In [28]:
# List every table currently present in the berlin_source_data schema.
show_tables('berlin_source_data')

  return pd.read_sql(f"""


Unnamed: 0,table_schema,table_name
0,berlin_source_data,banks
1,berlin_source_data,bus_tram_stops
2,berlin_source_data,crime_statistics
3,berlin_source_data,dental_offices
4,berlin_source_data,district_level_aggregated
5,berlin_source_data,districts
6,berlin_source_data,districts_pop_stat
7,berlin_source_data,gyms
8,berlin_source_data,hospitals
9,berlin_source_data,hospitals_refactored


In [29]:
# Create berlin_source_data.theaters if it does not exist yet.
# The outer block is psycopg2's connection context manager (transaction
# scope); the inner block manages the cursor used for the statement.
with conn:
    with conn.cursor() as cur:
        cur.execute("""
    
CREATE TABLE IF NOT EXISTS berlin_source_data.theaters (
    theater_id         VARCHAR(64) PRIMARY KEY,         -- Unique stable ID (e.g., hash or UUID)
    name               VARCHAR(255) NOT NULL,           -- Official name of the theatre or cinema
    name_key           VARCHAR(255),                    -- Normalized lowercase name (slug, key)
    place_type         VARCHAR(50),                     -- Type: "cinema", "theatre", etc.
    operator           VARCHAR(255),                    -- Organization or company operating it
    opening_hours      VARCHAR(255),                    -- OSM-style hours string
    wheelchair         VARCHAR(50),                     -- Accessibility info: "yes", "no", "limited"
    screen             INTEGER,                         -- Number of screens (for cinemas)
    website            VARCHAR(255),                    -- Official website URL
    phone              VARCHAR(100),                    -- Contact phone number
    email              VARCHAR(255),                    -- Contact email address
    addr_full          VARCHAR(255),                    -- Full formatted address
    addr_street        VARCHAR(255),                    -- Street name
    addr_housenumber   VARCHAR(50),                     -- House or building number
    addr_postcode      VARCHAR(20),                     -- Postal code
    addr_city          VARCHAR(100),                    -- City (usually "Berlin")
    addr_country       VARCHAR(100),                    -- Country (usually "Germany")
    theatre_tags       TEXT,                            -- Raw tags or classification info from OSM/Wikidata
    theatre_category   VARCHAR(100),                    -- Derived label: e.g., "performing arts", "independent cinema"
    district_id        VARCHAR(10),                     -- LOR district code
    district           VARCHAR(100),                    -- LOR district name
    neighborhood_id    VARCHAR(10),                     -- LOR neighborhood (Ortsteil) code
    longitude          DECIMAL(9,6),                    -- WGS84 coordinate (lon)
    latitude           DECIMAL(9,6),                    -- WGS84 coordinate (lat)
    last_updated       TIMESTAMP DEFAULT CURRENT_TIMESTAMP,  -- Timestamp when data last updated
    CONSTRAINT district_id_fk FOREIGN KEY (district_id)
        REFERENCES berlin_source_data.districts(district_id)
        ON DELETE RESTRICT
        ON UPDATE CASCADE
);
    """)



In [30]:
# Confirm that berlin_source_data.theaters is now listed in the catalog.
import pandas as pd

pd.read_sql(
    sql="""
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_schema='berlin_source_data' AND table_name='theaters';
""",
    con=conn,
)



  pd.read_sql("""


Unnamed: 0,table_schema,table_name
0,berlin_source_data,theaters


In [None]:

import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# 1) Read CSV and fix column types for screen-Integer, not float
df = pd.read_csv("/Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/theaters_berlin_db_ready.csv")

# If "screen" exists, convert to integer safely
# NOTE(review): missing or unparseable screen counts are stored as 0 rather
# than NULL — confirm that this is the intended meaning for venues without data.
if "screen" in df.columns:
    df["screen"] = pd.to_numeric(df["screen"], errors="coerce").fillna(0).astype(int)

# 2) Connect via SQLAlchemy (using your existing tunnel)
# The password is read from PGPASSWORD (or prompted for) instead of being
# stored in the notebook, and both credentials are URL-escaped so special
# characters cannot break the connection URL.
# NOTE: the password that used to be hard-coded in this cell should be rotated.
engine = create_engine(
    "postgresql+psycopg2://"
    f"{quote_plus(os.environ.get('PGUSER', 'marianna_gokova'))}:"
    f"{quote_plus(os.environ.get('PGPASSWORD') or getpass('Database password: '))}"
    "@127.0.0.1:5433/layereddb?sslmode=require"
)

# 3) Load directly
df.to_sql(
    "theaters",
    schema="berlin_source_data",
    con=engine,
    if_exists="append",  # add data to existing table
    index=False
)

# 4) Check result
pd.read_sql("SELECT COUNT(*) FROM berlin_source_data.theaters;", engine)


Unnamed: 0,count
0,282


# 🧩 Sanity check block

In [33]:
import pandas as pd


def _fetch(sql):
    """Run one check query through the shared engine and return a DataFrame."""
    return pd.read_sql(sql, engine)


# Check 1: total number of rows in the table
total = _fetch("""
SELECT COUNT(*) AS total_rows
FROM berlin_source_data.theaters;
""")

# Check 2: rows whose district_id is missing or has no match in districts
missing_fk = _fetch("""
SELECT COUNT(*) AS missing_districts
FROM berlin_source_data.theaters t
LEFT JOIN berlin_source_data.districts d
  ON d.district_id = t.district_id
WHERE d.district_id IS NULL OR t.district_id IS NULL;
""")

# Check 3: theater_id values that occur more than once
dups_id = _fetch("""
SELECT COUNT(*) AS duplicate_ids
FROM (
  SELECT theater_id
  FROM berlin_source_data.theaters
  GROUP BY theater_id
  HAVING COUNT(*) > 1
) dup;
""")

# Check 4: name + street + house number combinations that occur more than once
dups_name = _fetch("""
SELECT COUNT(*) AS duplicate_name_address
FROM (
  SELECT name, addr_street, addr_housenumber
  FROM berlin_source_data.theaters
  GROUP BY name, addr_street, addr_housenumber
  HAVING COUNT(*) > 1
) dup;
""")

# Check 5: number of venues per district, largest first
by_district = _fetch("""
SELECT district_id, COUNT(*) AS num_theaters
FROM berlin_source_data.theaters
GROUP BY district_id
ORDER BY num_theaters DESC;
""")

# Check 6: number of venues per place_type, largest first
by_type = _fetch("""
SELECT place_type, COUNT(*) AS num
FROM berlin_source_data.theaters
GROUP BY place_type
ORDER BY num DESC;
""")

# Check 7: rows with missing coordinates, and rows outside the
# lon 5..20 / lat 47..56 bounding box
geo_check = _fetch("""
SELECT
  COUNT(*) FILTER (WHERE longitude IS NULL OR latitude IS NULL) AS missing_coordinates,
  COUNT(*) FILTER (WHERE longitude < 5 OR longitude > 20 OR latitude < 47 OR latitude > 56) AS out_of_bounds
FROM berlin_source_data.theaters;
""")

# Report: single-row summaries first, then the two distributions (top 10 each)
print("===== Sanity Check Summary =====")
for summary_frame in (total, missing_fk, dups_id, dups_name, geo_check):
    display(summary_frame)

print("===== Distribution by District =====")
display(by_district.head(10))

print("===== Distribution by Place Type =====")
display(by_type.head(10))


===== Sanity Check Summary =====


Unnamed: 0,total_rows
0,282


Unnamed: 0,missing_districts
0,0


Unnamed: 0,duplicate_ids
0,0


Unnamed: 0,duplicate_name_address
0,6


Unnamed: 0,missing_coordinates,out_of_bounds
0,0,0


===== Distribution by District =====


Unnamed: 0,district_id,num_theaters
0,11001001,63
1,11002002,44
2,11004004,38
3,11003003,29
4,11007007,22
5,11009009,22
6,11008008,20
7,11006006,15
8,11010010,9
9,11011011,8


===== Distribution by Place Type =====


Unnamed: 0,place_type,num
0,theatre,190
1,cinema,92
