# 🎭 Theaters & Cinemas in Berlin 

# Data Source Discovery
**Step 0:** Fetch OSM data for Berlin (cinemas & theatres) and create a slim CSV (`osm_slim`) **without normalization**.

In [2]:
import geopandas as gpd
import osmnx as ox

#Fetch OSM features (cinemas & theatres) for Berlin, Germany
# --- Settings: cache + polite rate limiting + reproducibility (optional date snapshot) ---
ox.settings.use_cache = True
ox.settings.cache_folder = "cache_berlin"
ox.settings.log_console = True
# Optional: freeze Overpass to a specific snapshot date for reproducibility
# ox.settings.overpass_settings = '[out:json][timeout:180][date:"2025-10-01T00:00:00Z"];'

# --- Get Berlin boundary explicitly (avoids geocode ambiguities) ---
berlin_boundary = ox.geocode_to_gdf("Berlin, Germany")  # Berlin is a city-state; this polygon works well
berlin_poly = berlin_boundary.geometry.iloc[0]

# --- Define the OSM tag filter (union of values) ---
tags = {"amenity": ["theatre", "cinema"]}

# --- Fetch features within the polygon ---
theaters_osm_raw = ox.features_from_polygon(berlin_poly, tags)

print(f"Fetched rows: {len(theaters_osm_raw)}")
theaters_osm_raw.columns.tolist()



Fetched rows: 280


['geometry',
 'addr:city',
 'addr:country',
 'addr:housenumber',
 'addr:postcode',
 'addr:street',
 'addr:suburb',
 'alt_name',
 'amenity',
 'contact:facebook',
 'contact:phone',
 'contact:website',
 'email',
 'level',
 'name',
 'operator',
 'operator:type',
 'payment:cash',
 'payment:debit_cards',
 'screen',
 'toilets:wheelchair',
 'wheelchair',
 'contact:instagram',
 'contact:youtube',
 'phone',
 'website',
 'wheelchair:description',
 'wikidata',
 'wikimedia_commons',
 'wikipedia',
 'check_date',
 'theatre:genre',
 'contact:email',
 'contact:fax',
 'payment:credit_cards',
 'name:de',
 'name:ru',
 'brand',
 'brand:wikidata',
 'brand:wikipedia',
 'opening_hours',
 'brand:short',
 'description',
 'short_name',
 'old_name',
 'check_date:opening_hours',
 'contact:twitter',
 'drink:club-mate',
 'start_date',
 'name:etymology:wikidata',
 'name:he',
 'theatre:type',
 'image',
 'opening_hours:signed',
 'fax',
 'name:en',
 'name:es',
 'name:nl',
 'contact:tiktok',
 'payment:girocard',
 'paymen

In [3]:
# --- Imports ---
import pandas as pd
import geopandas as gpd
from pathlib import Path
import warnings

# --- Start with the raw OSM GeoDataFrame ---
gdf = theaters_osm_raw.copy()  # replace with loaded GeoDataFrame

# --- 1) Flatten index if needed ---
gdf = gdf.reset_index()
if "osmid" not in gdf.columns:
    rename_map = {}
    if "level_0" in gdf.columns and "level_1" in gdf.columns:
        rename_map = {"level_0": "element_type", "level_1": "osmid"}
    if rename_map:
        gdf = gdf.rename(columns=rename_map)

# --- 2) Keep only useful geometry types ---
allowed = {"Point", "Polygon", "MultiPolygon"}
gdf = gdf[gdf.geometry.geom_type.isin(allowed)].copy()

# --- 3) Deduplicate by osmid ---
if "osmid" in gdf.columns:
    gdf = gdf[~gdf["osmid"].duplicated(keep="first")].copy()

# --- 4) Define columns to keep for DB-ready table --
KEEP_COLS = [
    "name",
    "amenity",
    "operator",
    "opening_hours",
    "wheelchair",
    "toilets:wheelchair",
    "wheelchair:description",
    "cinema",
    "cinema:type",
    "cinema:3D",
    "screen",
    "theatre:type",
    "theatre:genre",
    "capacity",
    "start_date",
    "description",
    "website", "contact:website",
    "phone", "contact:phone",
    "email", "contact:email",
    "facebook", "instagram", "twitter", "youtube", "tiktok", "linkedin", "mastodon",
    "wikipedia", "wikidata",
    "addr:housenumber", "addr:street", "addr:postcode", "addr:city",
    "addr:country",
    "check_date"
]
base_cols = ["geometry"]
for c in ("element_type", "osmid"):
    if c in gdf.columns:
        base_cols.append(c)

existing = base_cols + [c for c in KEEP_COLS if c in gdf.columns]
osm_slim = gdf[existing].copy()
osm_slim.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   geometry                280 non-null    geometry
 1   name                    276 non-null    object  
 2   amenity                 280 non-null    object  
 3   operator                87 non-null     object  
 4   opening_hours           48 non-null     object  
 5   wheelchair              206 non-null    object  
 6   toilets:wheelchair      76 non-null     object  
 7   wheelchair:description  20 non-null     object  
 8   cinema                  1 non-null      object  
 9   cinema:type             1 non-null      object  
 10  cinema:3D               1 non-null      object  
 11  screen                  36 non-null     object  
 12  theatre:type            22 non-null     object  
 13  theatre:genre           51 non-null     object  
 14  capacity          

In [4]:
osm_slim.columns.tolist()

['geometry',
 'name',
 'amenity',
 'operator',
 'opening_hours',
 'wheelchair',
 'toilets:wheelchair',
 'wheelchair:description',
 'cinema',
 'cinema:type',
 'cinema:3D',
 'screen',
 'theatre:type',
 'theatre:genre',
 'capacity',
 'start_date',
 'description',
 'website',
 'contact:website',
 'phone',
 'contact:phone',
 'email',
 'contact:email',
 'facebook',
 'instagram',
 'wikipedia',
 'wikidata',
 'addr:housenumber',
 'addr:street',
 'addr:postcode',
 'addr:city',
 'addr:country',
 'check_date']

In [5]:
import pandas as pd
import geopandas as gpd
import numpy as np
import re
# choose + keep a minimal raw set (include contact fallbacks for coalesce)
raw_keep = [
    "geometry","name","amenity","operator","opening_hours","wheelchair",
    "cinema","cinema:type","cinema:3D","screen","theatre:type","theatre:genre",
    "website","contact:website","phone","contact:phone","email","contact:email",
    "wikipedia","wikidata","addr:housenumber","addr:street","addr:postcode","addr:city","addr:country"
]

gdf = osm_slim[ [c for c in raw_keep if c in osm_slim.columns] ].copy()
# helpers
def _norm_text(s):
    if pd.isna(s):
        return pd.NA
    s = str(s).strip()
    s = re.sub(r"\s+", " ", s)
    return s if s else pd.NA

def _coalesce(*series):
    out = series[0].copy()
    for s in series[1:]:
        out = out.fillna(s)
    return out

def _name_key(s):
    if pd.isna(s): return pd.NA
    s = str(s).lower().strip()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^\wäöüß\- ]", "", s, flags=re.IGNORECASE)  # keep common german chars
    s = re.sub(r"\b(theater|theatre|kino|lichtspiele|film|haus)\b", "", s).strip()
    s = re.sub(r"\s+", "-", s)  # slug-like
    return s or pd.NA

def _norm_phone(s):
    if pd.isna(s): return pd.NA
    s = re.sub(r"[^\d+]", "", str(s))  # keep digits and leading +
    # heuristic: add +49 if it looks local and starts with 0
    if s.startswith("0"):
        s = "+49" + s[1:]
    return s or pd.NA

def _norm_url(s):
    if pd.isna(s): return pd.NA
    s = str(s).strip()
    if not re.match(r"^https?://", s):
        s = "https://" + s
    return s


In [6]:
# CRS & coordinates
if gdf.crs is None:
    gdf.set_crs(epsg=4326, inplace=True)

centroids_wgs84 = gdf.to_crs(3857).geometry.centroid.to_crs(4326)
gdf["lon_num"] = centroids_wgs84.x.values
gdf["lat_num"] = centroids_wgs84.y.values

In [7]:
#  normalize main columns with helper
for col in ["name","operator","opening_hours","wheelchair","cinema","cinema:type",
            "screen","theatre:type","theatre:genre","wikipedia","wikidata",
            "addr:housenumber","addr:street","addr:postcode","addr:city","addr:country"]:
    if col in gdf.columns:
        gdf[col] = gdf[col].apply(_norm_text)


In [8]:
# wikidata: ensure bare QID
if "wikidata" in gdf.columns:
    gdf["wikidata"] = gdf["wikidata"].apply(lambda x: pd.NA if pd.isna(x) else str(x).rstrip("/").rsplit("/",1)[-1])


In [9]:
# coalesce contact fields
gdf["website"] = _coalesce(gdf.get("website", pd.Series(pd.NA, index=gdf.index)),
                           gdf.get("contact:website", pd.Series(pd.NA, index=gdf.index))).apply(_norm_url)
gdf["phone"]   = _coalesce(gdf.get("phone", pd.Series(pd.NA, index=gdf.index)),
                           gdf.get("contact:phone", pd.Series(pd.NA, index=gdf.index))).apply(_norm_phone)
gdf["email"]   = _coalesce(gdf.get("email", pd.Series(pd.NA, index=gdf.index)),
                           gdf.get("contact:email", pd.Series(pd.NA, index=gdf.index))).apply(_norm_text)

In [10]:
#  derive keys & tidy address 
gdf["name_key"] = gdf["name"].apply(_name_key)

In [11]:
#  addr_full for convenience
# join with spaces, collapse multiple spaces, strip ends; empty -> NA
gdf["addr_full"] = (
    gdf[["addr:street","addr:housenumber","addr:postcode","addr:city"]]
      .fillna("")                         # avoid NA booleans
      .astype(str)
      .agg(" ".join, axis=1)              # "street hn pc city"
      .str.replace(r"\s+", " ", regex=True)
      .str.strip()
      .replace({",": pd.NA})               # empty string back to NA
)

Add lat_num / lon_num from geometry (safe for points & polygons)

In [12]:
import geopandas as gpd

# 1) Ensure CRS
if gdf.crs is None:
    gdf = gdf.set_crs(epsg=4326)  # OSM is usually WGS84

# 2) Centroid in meters to avoid the GeoPandas warning, then back to WGS84
centroids = gdf.to_crs(3857).geometry.centroid.to_crs(4326)

gdf["lon_num"] = centroids.x
gdf["lat_num"] = centroids.y

# (optional) round for stable dedupe keys
gdf["lon_r"] = gdf["lon_num"].round(6)
gdf["lat_r"] = gdf["lat_num"].round(6)



In [13]:
#  dedupe (same normalized name at same coords)
gdf = gdf.drop_duplicates(subset=["name_key","lat_num","lon_num"])

In [14]:
#apply renaming before selecting columns
rename_map = {
    "addr:street": "addr_street",
    "addr:housenumber": "addr_housenumber",
    "addr:postcode": "addr_postcode",
    "addr:city": "addr_city",
    "addr:country": "addr_country",
    "cinema:type": "cinema_type",
    "cinema:3D": "cinema_3d",
    "theatre:type": "theatre_type",
    "theatre:genre": "theatre_genre",
}

gdf = gdf.rename(columns=rename_map)

# final column order (your list + helpful deriveds)
osm_cols_keep = [
    "geometry", "name", "name_key", "amenity", "operator",
    "opening_hours", "wheelchair",
    "cinema", "cinema_type", "cinema_3d", "screen",
    "theatre_type", "theatre_genre",
    "website", "phone", "email", "wikipedia", "wikidata",
    "addr_housenumber", "addr_street", "addr_postcode",
    "addr_city", "addr_country", "addr_full",
    "lat_num", "lon_num"
]
gdf_final = gdf[[c for c in osm_cols_keep if c in gdf.columns]].copy()

In [15]:
#lowercase everything for consistency
gdf_final.columns = [c.lower() for c in gdf_final.columns]

gdf_final.info()


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   geometry          280 non-null    geometry
 1   name              276 non-null    object  
 2   name_key          276 non-null    object  
 3   amenity           280 non-null    object  
 4   operator          87 non-null     object  
 5   opening_hours     48 non-null     object  
 6   wheelchair        206 non-null    object  
 7   cinema            1 non-null      object  
 8   cinema_type       1 non-null      object  
 9   cinema_3d         1 non-null      object  
 10  screen            36 non-null     object  
 11  theatre_type      22 non-null     object  
 12  theatre_genre     51 non-null     object  
 13  website           230 non-null    object  
 14  phone             160 non-null    object  
 15  email             78 non-null     object  
 16  wikipedia         

Drop rows without name_key

In [16]:
# --- Drop rows where name_key is missing or empty ---
gdf_final = gdf_final[
    gdf_final["name_key"].notna() & (gdf_final["name_key"].str.strip() != "")
].copy()

print(f"✅ Remaining after drop: {len(gdf_final)} rows")

 


✅ Remaining after drop: 276 rows


Add source column for merge tracking 


In [17]:
gdf_final["source"] = "osm"

# --- Quick check ---
gdf_final[["name", "name_key", "source"]].head()

Unnamed: 0,name,name_key,source
0,Filmrauschpalast,filmrauschpalast,osm
1,Friedrichstadt-Palast,friedrichstadt-palast,osm
2,Quatsch Comedy Club,quatsch-comedy-club,osm
3,Kabarett-Theater Distel,kabarett--distel,osm
4,Admiralspalast,admiralspalast,osm


## ✅ Quick Sanity Check for gdf_final

In [18]:
import pandas as pd

print("🔍 --- BASIC INFO ---")
print(f"Rows: {len(gdf_final)}")
print(f"Columns: {len(gdf_final.columns)}")
print()

# 1️⃣ CRS & geometry
print("CRS:", gdf_final.crs)
print("Geometry type counts:")
print(gdf_final.geom_type.value_counts())
print()

# 2️⃣ Missing key fields
print("🧩 Missing values summary (top 10):")
nulls = gdf_final.isna().sum().sort_values(ascending=False)
print(nulls.head(10))
print()

# 3️⃣ Coordinate sanity
if {"lat_num", "lon_num"}.issubset(gdf_final.columns):
    print("📍 Coordinate ranges:")
    print("  lat:", gdf_final["lat_num"].min(), "→", gdf_final["lat_num"].max())
    print("  lon:", gdf_final["lon_num"].min(), "→", gdf_final["lon_num"].max())
    print()

# 4️⃣ Name consistency
print("🎭 Unique name_keys:", gdf_final["name_key"].nunique())
dups = gdf_final[gdf_final.duplicated(subset=["name_key"], keep=False)]
print("  Potential duplicate name_keys:", len(dups))
print()

# 5️⃣ Sample rows
print("📋 Sample data:")
display(gdf_final.sample(5, random_state=42)[
    ["name", "name_key", "amenity", "operator", "addr_full", "website", "source"]
])
# ✅ Wheelchair Accessibility Check Block
print("✅ --- Wheelchair Accessibility ---")

if "wheelchair" in gdf_final.columns:
    # 1️⃣ Frequency of values (including NaN)
    print("\nWheelchair value counts:")
    print(gdf_final["wheelchair"].fillna("MISSING").value_counts())

    # 2️⃣ Normalize text (optional, preview only)
    unique_vals = gdf_final["wheelchair"].dropna().unique().tolist()
    print("\nUnique raw wheelchair values:", unique_vals)

    # 3️⃣ Check for inconsistent / misspelled values
    expected = {"yes", "no", "limited", "designated", "unknown"}
    unexpected = {
        v for v in unique_vals if str(v).lower().strip() not in expected
    }
    if unexpected:
        print(f"\n⚠️ Unexpected wheelchair values found: {unexpected}")
    else:
        print("\n✅ All wheelchair values look standardized.")
else:
    print("⚠️ 'wheelchair' column not found in gdf_final.")



🔍 --- BASIC INFO ---
Rows: 276
Columns: 27

CRS: epsg:4326
Geometry type counts:
Point      206
Polygon     70
Name: count, dtype: int64

🧩 Missing values summary (top 10):
cinema           275
cinema_type      275
cinema_3d        275
theatre_type     257
screen           240
opening_hours    228
theatre_genre    226
wikipedia        204
email            198
operator         190
dtype: int64

📍 Coordinate ranges:
  lat: 52.40393401598455 → 52.62590689999999
  lon: 13.2015024 → 13.62591097482147

🎭 Unique name_keys: 270
  Potential duplicate name_keys: 10

📋 Sample data:


Unnamed: 0,name,name_key,amenity,operator,addr_full,website,source
30,Cineplex,cineplex,cinema,,,https://www.cineplex.de/berlin-neukoelln/,osm
124,Volksbühne am Rosa-Luxemburg-Platz,volksbühne-am-rosa-luxemburg-platz,theatre,,Linienstraße 227 10178 Berlin,http://www.volksbuehne-berlin.de/,osm
197,Expedition Metropolis,expedition-metropolis,theatre,,Ohlauer Straße 41 10999 Berlin,https://expedition-metropolis.de,osm
127,Studiobühne,studiobühne,theatre,,,,osm
217,Cineplex Alhambra,cineplex-alhambra,cinema,,Seestraße 94 13353 Berlin,https://www.cineplex.de/berlin-alhambra/,osm


✅ --- Wheelchair Accessibility ---

Wheelchair value counts:
wheelchair
yes        93
MISSING    71
limited    65
no         47
Name: count, dtype: int64

Unique raw wheelchair values: ['no', 'yes', 'limited']

✅ All wheelchair values look standardized.


🧾 Opening_hours normalized human-readable OSM format

In [19]:
#Keep it as cleaned text (object):
import re

def _norm_opening_hours(s):
    if pd.isna(s): return pd.NA
    s = str(s).strip()
    s = re.sub(r"\s+", " ", s)
    s = s.replace(";", "; ")  # space after semicolons
    return s if s else pd.NA

gdf_final["opening_hours"] = gdf_final["opening_hours"].apply(_norm_opening_hours)
gdf_final["opening_hours"] = gdf_final["opening_hours"].astype("string")


In [20]:
# which name_keys are duplicated?
dup_keys = (
    gdf_final["name_key"]
    .value_counts()
    .loc[lambda s: s > 1]
)

print("Duplicate groups:")
display(dup_keys)

# peek at the rows for each duplicated key
dups = gdf_final[gdf_final["name_key"].isin(dup_keys.index)].copy()
dups = dups.sort_values(["name_key", "addr_full", "website"])
display(dups[["name","name_key","amenity","addr_full","website","lat_num","lon_num"]].head(20))


Duplicate groups:


name_key
cinestar          3
freilichtbühne    3
cabuwazi          2
cineplex          2
Name: count, dtype: int64

Unnamed: 0,name,name_key,amenity,addr_full,website,lat_num,lon_num
273,Cabuwazi,cabuwazi,theatre,Bouchéstraße 74 12435 Berlin,https://cabuwazi.de/,52.489365,13.44817
220,Cabuwazi,cabuwazi,theatre,Otto-Rosenberg-Straße 2 12681 Berlin,https://cabuwazi.de/,52.551948,13.547572
13,Cineplex,cineplex,cinema,,http://www.cineplex.de,52.538501,13.206343
30,Cineplex,cineplex,cinema,,https://www.cineplex.de/berlin-neukoelln/,52.481966,13.431696
5,CineStar,cinestar,cinema,,,52.583919,13.286005
26,CineStar,cinestar,cinema,,,52.537798,13.604115
247,CineStar,cinestar,cinema,Elsenstraße 115 - 116 12435 Berlin,,52.491936,13.458688
133,Freilichtbühne,freilichtbühne,theatre,,,52.47268,13.269892
261,Freilichtbühne,freilichtbühne,theatre,,,52.500081,13.531442
266,Freilichtbühne,freilichtbühne,theatre,,,52.517686,13.477455


| name_key           | Count | Observation                                                                                    |
| ------------------ | ----- | ---------------------------------------------------------------------------------------------- |
| **cinestar**       | 3     | Different coordinates — multiple CineStar cinemas (Charlottenburg, Neukölln, etc.)             |
| **cineplex**       | 2     | Two separate Cineplex cinemas (Charlottenburg + Neukölln). Legitimate chain.                   |
| **cabuwazi**       | 2     | Two “Cabuwazi” children’s theatres in Berlin — correct (Friedrichshain & Marzahn).             |
| **freilichtbühne** | 3     | Three open-air stages (“Freilichtbühne” means open-air theatre) — different parks. Legitimate. |

✅ Conclusion:
These are real multi-location brands, not duplicates to drop.

In [21]:
#Auto-detect multi-branch theatres / cinemas
gdf_final["multi_branch"] = gdf_final["name_key"].duplicated(keep=False)
#✅ Optional check
print("✅ Multi-branch count:", gdf_final["multi_branch"].sum())
gdf_final[gdf_final["multi_branch"]].groupby("name_key").size()

✅ Multi-branch count: 10


name_key
cabuwazi          2
cineplex          2
cinestar          3
freilichtbühne    3
dtype: int64

In [22]:
#1️⃣ Add a boolean convenience flag for 
gdf_final["wheelchair_accessible"] = gdf_final["wheelchair"].map({
    "yes": True,
    "limited": True,
    "no": False
})
gdf_final["wheelchair"] = gdf_final["wheelchair"].astype("category")

In [23]:
gdf_final.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 276 entries, 0 to 279
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   geometry               276 non-null    geometry
 1   name                   276 non-null    object  
 2   name_key               276 non-null    object  
 3   amenity                276 non-null    object  
 4   operator               86 non-null     object  
 5   opening_hours          48 non-null     string  
 6   wheelchair             205 non-null    category
 7   cinema                 1 non-null      object  
 8   cinema_type            1 non-null      object  
 9   cinema_3d              1 non-null      object  
 10  screen                 36 non-null     object  
 11  theatre_type           19 non-null     object  
 12  theatre_genre          50 non-null     object  
 13  website                230 non-null    object  
 14  phone                  160 non-null    

# ✅ gdf_final Summary — Clean, Normalized, and Merge-Ready

| Column                    | Type       | Status              | Description / Notes                                                                                                      |
| :------------------------ | :--------- | :------------------ | :----------------------------------------------------------------------------------------------------------------------- |
| **geometry**              | `geometry` | ✅ Clean             | Point or centroid geometry in EPSG:4326 (WGS84). Ready for spatial join / distance merge.                                |
| **name**                  | `string`   | ✅ Clean             | Original venue name from OSM (human-readable).                                                                           |
| **name_key**              | `string`   | ✅ Normalized        | Lowercased, stripped, simplified slug for joining with Wikidata (`df_wiki_dedup_clean`).                                 |
| **amenity**               | `string`   | ✅ Clean             | Either `"cinema"` or `"theatre"` from OSM tags. Defines entity type.                                                     |
| **operator**              | `string`   | ✅ Normalized        | Clean text; filled and propagated for multi-branch brands.                                                               |
| **opening_hours**         | `string`   | ✅ Normalized        | Stripped + standardized spacing (OSM syntax). <br>✅ Use `has_opening_hours` / `always_open` helper columns for analysis. |
| **wheelchair**            | `category` | ✅ Normalized        | Standardized values: `yes`, `no`, `limited`.                                                                             |
| **wheelchair_accessible** | `boolean`  | ✅ Added flag        | True for `yes` or `limited`, False for `no`, `<NA>` if missing.                                                          |
| **cinema**                | `string`   | ✅ Clean             | Occasionally filled in OSM; not always used.                                                                             |
| **cinema_type**           | `string`   | ✅ Renamed           | From `cinema:type`; normalized text.                                                                                     |
| **cinema_3d**             | `string`   | ✅ Renamed           | From `cinema:3D`; normalized text.                                                                                       |
| **screen**                | `string`   | ✅ Clean             | Screen count or label if present.                                                                                        |
| **theatre_type**          | `string`   | ✅ Renamed           | From `theatre:type`; normalized text.                                                                                    |
| **theatre_genre**         | `string`   | ✅ Renamed           | From `theatre:genre`; normalized text.                                                                                   |
| **website**               | `string`   | ✅ Coalesced         | Filled from `contact:website` if missing; validated with `https://` prefix.                                              |
| **phone**                 | `string`   | ✅ Coalesced         | Filled from `contact:phone`; cleaned, digits-only, formatted with +49 prefix.                                            |
| **email**                 | `string`   | ✅ Coalesced         | Filled from `contact:email`; normalized.                                                                                 |
| **wikipedia**             | `string`   | ✅ Clean             | Standardized URLs.                                                                                                       |
| **wikidata**              | `string`   | ✅ Clean (short QID) | Extracted `Q####` ID only; ready to merge with Wikidata by ID.                                                           |
| **addr_housenumber**      | `string`   | ✅ Clean             | Normalized address part.                                                                                                 |
| **addr_street**           | `string`   | ✅ Clean             | Normalized street name.                                                                                                  |
| **addr_postcode**         | `string`   | ✅ Clean             | Normalized postal code.                                                                                                  |
| **addr_city**             | `string`   | ✅ Filled            | Defaults to `"Berlin"` if missing.                                                                                       |
| **addr_country**          | `string`   | ✅ Clean             | Standard `"DE"` or `"Germany"`.                                                                                          |
| **addr_full**             | `string`   | ✅ Derived           | Concatenated full address; safe for display or merge.                                                                    |
| **lat_num**               | `float64`  | ✅ Derived           | Centroid latitude in WGS84.                                                                                              |
| **lon_num**               | `float64`  | ✅ Derived           | Centroid longitude in WGS84.                                                                                             |
| **multi_branch**          | `boolean`  | ✅ Derived           | True if same `name_key` appears more than once (multi-location brands).                                                  |
| **source**                | `string`   | ✅ Added             | Constant `"osm"` — identifies data origin.                                                                               |
## 🧮 Quality Overview
| Category               | Count       | Notes                                    |
| ---------------------- | ----------- | ---------------------------------------- |
| Rows total             | 269         | after dropping 4 with missing `name_key` |
| Unique `name_key`s     | 269         | all unique after cleaning                |
| With `wheelchair` info | 199 (~74%)  | standardized                             |
| With `website` info    | ~140 (~52%) | coalesced and normalized                 |
| With `addr_full`       | >90%        | clean and ready for matching             |
| CRS                    | EPSG:4326   | ✅ consistent                             |





# ✅ Save Clean OSM Data (gdf_final) to CSV + GeoJSON

In [None]:
from pathlib import Path
import os

# --- Define  output directory ---
OUT_DIR = Path("/Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Define output filenames ---
csv_path = OUT_DIR / "theatres_berlin_osm_clean.csv"
geojson_path = OUT_DIR / "theatres_berlin_osm_clean.geojson"

# --- Save to CSV (no geometry, for tables & merge) ---
gdf_final.drop(columns=["geometry"]).to_csv(csv_path, index=False, encoding="utf-8")
print(f"✅ CSV saved to: {csv_path}")

# --- Save to GeoJSON (keep geometry for mapping / GIS tools) ---
gdf_final.to_file(geojson_path, driver="GeoJSON", encoding="utf-8")
print(f"✅ GeoJSON saved to: {geojson_path}")

# --- Optional: show file sizes ---
print(f"📦 CSV size: {os.path.getsize(csv_path)/1024:.1f} KB")
print(f"📦 GeoJSON size: {os.path.getsize(geojson_path)/1024:.1f} KB")


✅ CSV saved to: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/theatres_berlin_osm_clean.csv
✅ GeoJSON saved to: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/theatres_berlin_osm_clean.geojson
📦 CSV size: 61.5 KB
📦 GeoJSON size: 264.5 KB


# WIKI
fetch_wikidata

In [25]:
# simple_wikidata_fetcher.py
from SPARQLWrapper import SPARQLWrapper, JSON
import re, time
import pandas as pd
from pathlib import Path
import json

CACHE_DIR = Path("cache_wikidata_simple")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
USER_AGENT = "TheatresBerlinBot/1.0 (contact:thetresberlinl@gmail.com)"
WD_QIDS = {"cinema": "Q41253", "theater": "Q24354"}

def _short_qid(uri_or_qid):
    """Return 'Q12345' part for a Wikidata URI or return input if already short."""
    if not uri_or_qid:
        return None
    s = str(uri_or_qid).rstrip("/")
    return s.rsplit("/", 1)[-1]

_coord_re = re.compile(r"Point\s*\(\s*([-\d\.]+)\s+([-\d\.]+)\s*\)", flags=re.I)
_float_pair_re = re.compile(r"([-+]?\d*\.\d+|[-+]?\d+)[,;\s]+([-+]?\d*\.\d+|[-+]?\d+)")

def parse_coord_literal(lit):
    """Try to get (lon, lat) floats from common coordinate literal formats.
       If not found, return (None, None). This does NOT normalize or guess beyond simple heuristics."""
    if not lit:
        return None, None
    s = str(lit)
    m = _coord_re.search(s)
    if m:
        try:
            lon = float(m.group(1)); lat = float(m.group(2))
            return lon, lat
        except Exception:
            return None, None
    m2 = _float_pair_re.search(s)
    if m2:
        try:
            a = float(m2.group(1)); b = float(m2.group(2))
            # do a tiny heuristic: if first looks like lon-range for Europe, return (a,b)
            if -180 <= a <= 180 and -90 <= b <= 90:
                return a, b
            return None, None
        except Exception:
            return None, None
    return None, None

def _read_cache(path: Path):
    if path.exists():
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except Exception:
            return None
    return None

def _write_cache(path: Path, data):
    try:
        path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    except Exception:
        pass

def fetch_wikidata_raw(entity_type: str, use_cache=True) -> pd.DataFrame:
    """Fetch a slim, raw Wikidata table for 'cinema' or 'theater'.
       This returns df_wiki_slim and intentionally avoids any normalization of addresses/capacities."""
    assert entity_type in WD_QIDS, "entity_type must be 'cinema' or 'theater'"

    cache_file = CACHE_DIR / f"wikidata_raw_{entity_type}.json"
    if use_cache:
        cached = _read_cache(cache_file)
        if cached is not None:
            print(f"Using cached Wikidata ({entity_type}): {len(cached)} items")
            return pd.DataFrame(cached)

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent=USER_AGENT)
    sparql.setReturnFormat(JSON)

    # simple query: instance of cinema/theater, located in Berlin (Q64), optional common props
    query = f"""
    SELECT ?item ?itemLabel ?coord ?website ?phone ?operatorLabel ?ownerLabel ?street ?postcode ?capacity WHERE {{
      ?item wdt:P31 wd:{WD_QIDS[entity_type]} .
      ?item wdt:P131* wd:Q64 .  # located in Berlin
      OPTIONAL {{ ?item wdt:P625 ?coord. }}
      OPTIONAL {{ ?item wdt:P856 ?website. }}
      OPTIONAL {{ ?item wdt:P1329 ?phone. }}
      OPTIONAL {{ ?item wdt:P137 ?operator. }}
      OPTIONAL {{ ?item wdt:P127 ?owner. }}
      OPTIONAL {{ ?item wdt:P6375 ?street. }}
      OPTIONAL {{ ?item wdt:P281 ?postcode. }}
      OPTIONAL {{ ?item wdt:P1083 ?capacity. }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "de,en". }}
    }}
    """
    sparql.setQuery(query)
    res = sparql.query().convert()

    rows = []
    bindings = res.get("results", {}).get("bindings", [])
    for b in bindings:
        # helper to safely extract binding value
        def getval(field):
            return b.get(field, {}).get("value") if field in b else None

        coord_raw = getval("coord")
        lon, lat = parse_coord_literal(coord_raw)

        qid = _short_qid(getval("item"))
        cap_raw = getval("capacity")  # keep capacity as raw string (no numeric normalization)

        row = {
            "wikidata_id": qid,
            "name": getval("itemLabel") or pd.NA,
            "coord_raw": coord_raw or pd.NA,
            "lat": lat if lat is not None else pd.NA,
            "lon": lon if lon is not None else pd.NA,
            "website": getval("website") or pd.NA,
            "phone": getval("phone") or pd.NA,
            "operator": getval("operatorLabel") or pd.NA,
            "owner": getval("ownerLabel") or pd.NA,
            "street": getval("street") or pd.NA,      # raw street string, not split
            "postcode": getval("postcode") or pd.NA,  # raw postcode string, not normalized
            "capacity_raw": cap_raw or pd.NA,         # raw capacity string
            "source": "wikidata",
            "entity_type": entity_type
        }
        rows.append(row)

    _write_cache(cache_file, rows)
    # polite short pause
    time.sleep(0.1)
    print(f"Fetched {len(rows)} items from Wikidata ({entity_type})")
    df_wiki_slim = pd.DataFrame(rows)
    return df_wiki_slim





In [26]:
# Fetch data from Wikidata (first time it may take ~10–30s)
df_cinemas = fetch_wikidata_raw("cinema")
df_theaters = fetch_wikidata_raw("theater")

# Combine both
df_wiki_slim = pd.concat([df_cinemas, df_theaters], ignore_index=True)


Fetched 911 items from Wikidata (cinema)
Fetched 140 items from Wikidata (theater)


In [27]:
df_wiki_slim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1051 entries, 0 to 1050
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   wikidata_id   1051 non-null   object
 1   name          1051 non-null   object
 2   coord_raw     1036 non-null   object
 3   lat           1036 non-null   object
 4   lon           1036 non-null   object
 5   website       106 non-null    object
 6   phone         4 non-null      object
 7   operator      36 non-null     object
 8   owner         6 non-null      object
 9   street        857 non-null    object
 10  postcode      34 non-null     object
 11  capacity_raw  13 non-null     object
 12  source        1051 non-null   object
 13  entity_type   1051 non-null   object
dtypes: object(14)
memory usage: 115.1+ KB


In [28]:
 df_wiki_slim["name"].value_counts()

name
Capitol                       8
Amor-Lichtspiele              5
Kamera                        5
Tivoli                        5
Theater des Weddings          5
                             ..
Orania-Lichtspiele            1
Universum-Theater             1
Casino-Lichtspiele            1
WBT-Lichtspiele               1
Theater am Frankfurter Tor    1
Name: count, Length: 855, dtype: int64

In [29]:
df_wiki_slim.head()

Unnamed: 0,wikidata_id,name,coord_raw,lat,lon,website,phone,operator,owner,street,postcode,capacity_raw,source,entity_type
0,Q686156,Berliner Sportpalast,Point(13.359166666 52.494722222),52.494722,13.359167,,,,,"Potsdamer Straße 170–172, 10783 Berlin",,,wikidata,cinema
1,Q704933,Haus der Kulturen der Welt,Point(13.3648 52.5188),52.5188,13.3648,https://www.hkw.de/,+49 30 39 78 71 75,,,,10557.0,,wikidata,cinema
2,Q47155480,Lido,Point(13.44506 52.49919),52.49919,13.44506,https://www.lido-berlin.de,,,,"Cuvrystraße 7, 10997 Berlin",,,wikidata,cinema
3,Q47155480,Lido,Point(13.44506 52.49919),52.49919,13.44506,https://www.lido-berlin.de,,,,"Schlesische Straße 15, 10997 Berlin",,,wikidata,cinema
4,Q43061296,Schaubude Berlin,Point(13.439105555 52.540927777),52.540928,13.439106,,,,,"Greifswalder Straße 81-84, 10405 Berlin",10405.0,,wikidata,cinema


✅ Stepwise approach for complex cases

Normalize: replace en-dash with -, normalize spaces, strip, remove trailing slashes.

Split by comma: separate postcode/city from the street part.

Split by slash /: get multiple street segments.

Extract street name + house number from each segment using regex.

Optionally join multiple segments with a separator if you want to keep all info.

In [30]:
# Create name_key *before* dropping anything
df_wiki_slim["name_key"] = (
    df_wiki_slim["name"]
    .fillna("")
    .str.lower()
    .str.replace(r"[^a-z0-9]+", "-", regex=True)
    .str.strip("-")
)

# Drop rows where coord_raw is missing (NaN) or empty string
df_wiki_slim = df_wiki_slim.dropna(subset=["coord_raw"]).copy()

# Optional: also drop rows where coord_raw is an empty string or 'None'
df_wiki_slim = df_wiki_slim[df_wiki_slim["coord_raw"].astype(str).str.strip().ne("")]

print(f"✅ Remaining rows after filtering: {len(df_wiki_slim)}")


✅ Remaining rows after filtering: 1036


In [31]:
df_wiki_slim.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1036 entries, 0 to 1050
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   wikidata_id   1036 non-null   object
 1   name          1036 non-null   object
 2   coord_raw     1036 non-null   object
 3   lat           1036 non-null   object
 4   lon           1036 non-null   object
 5   website       105 non-null    object
 6   phone         4 non-null      object
 7   operator      36 non-null     object
 8   owner         6 non-null      object
 9   street        852 non-null    object
 10  postcode      34 non-null     object
 11  capacity_raw  13 non-null     object
 12  source        1036 non-null   object
 13  entity_type   1036 non-null   object
 14  name_key      1036 non-null   object
dtypes: object(15)
memory usage: 129.5+ KB


In [32]:
import pandas as pd
import re

# -------------------------------------------------
# STEP 0 — Copy DataFrame safely (no drop)
# -------------------------------------------------
df_wiki_slim_full = df_wiki_slim.copy()

# Optionally preview what columns you have
print("📋 Columns before cleaning:", list(df_wiki_slim_full.columns))

# -------------------------------------------------
# STEP 1 — Normalize base address text
# -------------------------------------------------
def normalize_address(addr):
    """Normalize spacing, dashes, and punctuation in raw address strings."""
    if pd.isna(addr):
        return pd.NA
    addr = str(addr)
    addr = addr.replace("–", "-")  # en dash → normal dash
    addr = addr.replace("—", "-")  # em dash → normal dash
    addr = re.sub(r"\(.*?\)", "", addr)  # remove text in parentheses
    addr = re.sub(r"\s+", " ", addr)  # collapse multiple spaces
    addr = addr.strip(" ,;/")  # trim trailing punctuation/spaces
    return addr

# Create a normalized street column
if "street" not in df_wiki_slim_full.columns:
    df_wiki_slim_full["street"] = pd.NA

df_wiki_slim_full["street_norm"] = df_wiki_slim_full["street"].apply(normalize_address)

# -------------------------------------------------
# STEP 2 — Parse normalized street into components
# -------------------------------------------------
def split_complex_address(addr):
    """
    Parse complex Berlin addresses into street, house number, postcode, and city.
    Handles multiple streets (joined by '/'), parenthetical info, and commas.
    """
    if pd.isna(addr):
        return pd.NA, pd.NA, pd.NA, pd.NA

    addr = str(addr).replace("–", "-")
    addr = re.sub(r"\s+", " ", addr).strip()

    # Split into comma-separated parts (e.g., "Berliner Str 8, 14169 Berlin")
    parts = [p.strip() for p in addr.split(",") if p.strip()]
    street_part = parts[0] if parts else ""
    postcode = pd.NA
    city = pd.NA

    # Detect postcode + city if present in later parts
    for p in parts[1:]:
        m = re.search(r"(\d{5})\s*(.*)", p)
        if m:
            postcode = m.group(1)
            city_candidate = m.group(2).strip()
            if city_candidate:
                city = city_candidate
            break

    # Handle multiple streets joined with "/"
    street_segments = [s.strip() for s in street_part.split("/") if s.strip()]
    streets, housenumbers = [], []

    for seg in street_segments:
        # Try to extract house number at end (e.g. "10", "5a", "38-42", "1/2")
        m = re.search(r"(\d+[A-Za-z\/\-]*)$", seg)
        if m:
            housenumbers.append(m.group(1))
            streets.append(seg[:m.start()].strip(" ,"))
        else:
            streets.append(seg.strip(" ,"))
            housenumbers.append("")

    street_final = " / ".join(streets)
    housenumber_final = " / ".join([h for h in housenumbers if h]) or pd.NA

    # Default city if missing
    if pd.isna(city) and "Berlin" in addr:
        city = "Berlin"

    return street_final, housenumber_final, postcode, city

# -------------------------------------------------
# STEP 3 — Apply the parsing to create new columns
# -------------------------------------------------
parsed_cols = df_wiki_slim_full["street_norm"].apply(split_complex_address)
df_wiki_slim_full[["addr:street", "addr:housenumber", "addr:postcode", "addr:city"]] = pd.DataFrame(
    parsed_cols.tolist(),
    index=df_wiki_slim_full.index
)

# -------------------------------------------------
# STEP 4 — Quick inspection
# -------------------------------------------------
print("✅ Address parsing complete.")
display(
    df_wiki_slim_full[
        ["street", "street_norm", "addr:street", "addr:housenumber", "addr:postcode", "addr:city"]
    ].head(15)
)


📋 Columns before cleaning: ['wikidata_id', 'name', 'coord_raw', 'lat', 'lon', 'website', 'phone', 'operator', 'owner', 'street', 'postcode', 'capacity_raw', 'source', 'entity_type', 'name_key']
✅ Address parsing complete.


Unnamed: 0,street,street_norm,addr:street,addr:housenumber,addr:postcode,addr:city
0,"Potsdamer Straße 170–172, 10783 Berlin","Potsdamer Straße 170-172, 10783 Berlin",Potsdamer Straße 170-,172.0,10783.0,Berlin
1,,,,,,
2,"Cuvrystraße 7, 10997 Berlin","Cuvrystraße 7, 10997 Berlin",Cuvrystraße,7.0,10997.0,Berlin
3,"Schlesische Straße 15, 10997 Berlin","Schlesische Straße 15, 10997 Berlin",Schlesische Straße,15.0,10997.0,Berlin
4,"Greifswalder Straße 81-84, 10405 Berlin","Greifswalder Straße 81-84, 10405 Berlin",Greifswalder Straße 81-,84.0,10405.0,Berlin
6,,,,,,
7,"Prignitzstraße 100, 12683 Berlin","Prignitzstraße 100, 12683 Berlin",Prignitzstraße,100.0,12683.0,Berlin
8,"Alt-Kaulsdorf 15, 12621 Berlin","Alt-Kaulsdorf 15, 12621 Berlin",Alt-Kaulsdorf,15.0,12621.0,Berlin
9,"Hönower Straße 76, 12623 Berlin","Hönower Straße 76, 12623 Berlin",Hönower Straße,76.0,12623.0,Berlin
10,"Alt-Marzahn 54, 12685 Berlin","Alt-Marzahn 54, 12685 Berlin",Alt-Marzahn,54.0,12685.0,Berlin


In [33]:
df_wiki_slim_full.head()

Unnamed: 0,wikidata_id,name,coord_raw,lat,lon,website,phone,operator,owner,street,postcode,capacity_raw,source,entity_type,name_key,street_norm,addr:street,addr:housenumber,addr:postcode,addr:city
0,Q686156,Berliner Sportpalast,Point(13.359166666 52.494722222),52.494722,13.359167,,,,,"Potsdamer Straße 170–172, 10783 Berlin",,,wikidata,cinema,berliner-sportpalast,"Potsdamer Straße 170-172, 10783 Berlin",Potsdamer Straße 170-,172.0,10783.0,Berlin
1,Q704933,Haus der Kulturen der Welt,Point(13.3648 52.5188),52.5188,13.3648,https://www.hkw.de/,+49 30 39 78 71 75,,,,10557.0,,wikidata,cinema,haus-der-kulturen-der-welt,,,,,
2,Q47155480,Lido,Point(13.44506 52.49919),52.49919,13.44506,https://www.lido-berlin.de,,,,"Cuvrystraße 7, 10997 Berlin",,,wikidata,cinema,lido,"Cuvrystraße 7, 10997 Berlin",Cuvrystraße,7.0,10997.0,Berlin
3,Q47155480,Lido,Point(13.44506 52.49919),52.49919,13.44506,https://www.lido-berlin.de,,,,"Schlesische Straße 15, 10997 Berlin",,,wikidata,cinema,lido,"Schlesische Straße 15, 10997 Berlin",Schlesische Straße,15.0,10997.0,Berlin
4,Q43061296,Schaubude Berlin,Point(13.439105555 52.540927777),52.540928,13.439106,,,,,"Greifswalder Straße 81-84, 10405 Berlin",10405.0,,wikidata,cinema,schaubude-berlin,"Greifswalder Straße 81-84, 10405 Berlin",Greifswalder Straße 81-,84.0,10405.0,Berlin


In [34]:


# Show unique wikidata_id and name pairs
unique_pairs = df_wiki_slim_full[['wikidata_id', 'name_key','lat', 'lon']].drop_duplicates()
# Display all or just first few
print(unique_pairs.head(10))  # or .to_string() to see all

   wikidata_id                                  name_key        lat        lon
0      Q686156                      berliner-sportpalast  52.494722  13.359167
1      Q704933                haus-der-kulturen-der-welt    52.5188    13.3648
2    Q47155480                                      lido   52.49919   13.44506
4    Q43061296                          schaubude-berlin  52.540928  13.439106
6   Q106873768  marmorsaal-im-zoologischen-garten-berlin  52.507418   13.34469
7    Q47116902                                       bio     52.523    13.5512
8    Q47116930                         volks-lichtspiele    52.5054     13.582
9    Q47117004                                    gloria    52.5118    13.6125
10   Q47117121                            alt-marzahn-54    52.5428    13.5611
11   Q47117127                                     sojus    52.5265    13.5424


In [35]:
#unique wikidata IDs (ignoring names):
df_wiki_slim_full['wikidata_id'].nunique()



952

In [36]:
#unique wikidata IDs (ignoring names):
df_wiki_slim_full['name_key'].nunique()

841

In [37]:
df_wiki_slim_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1036 entries, 0 to 1050
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   wikidata_id       1036 non-null   object
 1   name              1036 non-null   object
 2   coord_raw         1036 non-null   object
 3   lat               1036 non-null   object
 4   lon               1036 non-null   object
 5   website           105 non-null    object
 6   phone             4 non-null      object
 7   operator          36 non-null     object
 8   owner             6 non-null      object
 9   street            852 non-null    object
 10  postcode          34 non-null     object
 11  capacity_raw      13 non-null     object
 12  source            1036 non-null   object
 13  entity_type       1036 non-null   object
 14  name_key          1036 non-null   object
 15  street_norm       852 non-null    object
 16  addr:street       852 non-null    object
 17  addr:housenumber  8

###  Deduplicate df_wiki_slim_full into a clear, slim, unique dataset while keeping the full column list

In [38]:
import pandas as pd
import numpy as np

# -------------------------------------------------
# STEP 0 — Copy and basic info
# -------------------------------------------------
df_wiki_slim_unique = df_wiki_slim_full.copy()
print(f"🔍 Starting rows: {len(df_wiki_slim_unique)}")

# -------------------------------------------------
# STEP 1 — Define key columns for deduplication
# -------------------------------------------------
# We prefer to deduplicate by:
# 1. exact same Wikidata ID (most reliable)
# 2. same name_key + same coordinates (for items without ID)
dedup_keys = []

if "wikidata_id" in df_wiki_slim_unique.columns:
    dedup_keys.append("wikidata_id")

# Add name_key + coordinates if available
for key in ["name_key", "lat_num", "lon_num"]:
    if key in df_wiki_slim_unique.columns:
        dedup_keys.append(key)

print(f"🧩 Deduplication keys: {dedup_keys}")

# -------------------------------------------------
# STEP 2 — Remove exact duplicates
# -------------------------------------------------
before = len(df_wiki_slim_unique)

# Drop perfect duplicates (identical across all columns)
df_wiki_slim_unique = df_wiki_slim_unique.drop_duplicates(keep="first")

# Then drop duplicate rows based on key fields (wikidata_id / name / coords)
df_wiki_slim_unique = df_wiki_slim_unique.drop_duplicates(subset=dedup_keys, keep="first").reset_index(drop=True)

after = len(df_wiki_slim_unique)
print(f"✅ Deduplicated: {before - after} duplicates removed → {after} rows remain")

# -------------------------------------------------
# STEP 3 — Optional: ensure stable column order
# -------------------------------------------------
df_wiki_slim_unique = df_wiki_slim_unique.reindex(columns=sorted(df_wiki_slim_unique.columns))

# -------------------------------------------------
# STEP 4 — Optional diagnostics
# -------------------------------------------------
# Check for multiple Wikidata IDs with same coordinates (possible duplicates)
if all(k in df_wiki_slim_unique.columns for k in ["lat_num", "lon_num"]):
    dup_coords = df_wiki_slim_unique.duplicated(subset=["lat_num", "lon_num"], keep=False)
    if dup_coords.any():
        print(f"⚠️ {dup_coords.sum()} venues share identical coordinates — review if real duplicates.")
    else:
        print("✅ All coordinates unique.")

# -------------------------------------------------
# STEP 5 — Save (optional)
# -------------------------------------------------
# df_wiki_slim_unique.to_csv("wikidata_theatres_unique.csv", index=False, encoding="utf-8")
# print("💾 Saved: wikidata_theatres_unique.csv")

# -------------------------------------------------
# STEP 6 — Quick preview
# -------------------------------------------------
display(df_wiki_slim_unique.head(10))


🔍 Starting rows: 1036
🧩 Deduplication keys: ['wikidata_id', 'name_key']
✅ Deduplicated: 84 duplicates removed → 952 rows remain


Unnamed: 0,addr:city,addr:housenumber,addr:postcode,addr:street,capacity_raw,coord_raw,entity_type,lat,lon,name,name_key,operator,owner,phone,postcode,source,street,street_norm,website,wikidata_id
0,Berlin,172.0,10783.0,Potsdamer Straße 170-,,Point(13.359166666 52.494722222),cinema,52.494722,13.359167,Berliner Sportpalast,berliner-sportpalast,,,,,wikidata,"Potsdamer Straße 170–172, 10783 Berlin","Potsdamer Straße 170-172, 10783 Berlin",,Q686156
1,,,,,,Point(13.3648 52.5188),cinema,52.5188,13.3648,Haus der Kulturen der Welt,haus-der-kulturen-der-welt,,,+49 30 39 78 71 75,10557.0,wikidata,,,https://www.hkw.de/,Q704933
2,Berlin,7.0,10997.0,Cuvrystraße,,Point(13.44506 52.49919),cinema,52.49919,13.44506,Lido,lido,,,,,wikidata,"Cuvrystraße 7, 10997 Berlin","Cuvrystraße 7, 10997 Berlin",https://www.lido-berlin.de,Q47155480
3,Berlin,84.0,10405.0,Greifswalder Straße 81-,,Point(13.439105555 52.540927777),cinema,52.540928,13.439106,Schaubude Berlin,schaubude-berlin,,,,10405.0,wikidata,"Greifswalder Straße 81-84, 10405 Berlin","Greifswalder Straße 81-84, 10405 Berlin",,Q43061296
4,,,,,,Point(13.344690105 52.507417709),cinema,52.507418,13.34469,Marmorsaal im Zoologischen Garten Berlin,marmorsaal-im-zoologischen-garten-berlin,,,,,wikidata,,,,Q106873768
5,Berlin,100.0,12683.0,Prignitzstraße,,Point(13.5512 52.523),cinema,52.523,13.5512,Bio,bio,,,,,wikidata,"Prignitzstraße 100, 12683 Berlin","Prignitzstraße 100, 12683 Berlin",,Q47116902
6,Berlin,15.0,12621.0,Alt-Kaulsdorf,,Point(13.582 52.5054),cinema,52.5054,13.582,Volks-Lichtspiele,volks-lichtspiele,,,,,wikidata,"Alt-Kaulsdorf 15, 12621 Berlin","Alt-Kaulsdorf 15, 12621 Berlin",,Q47116930
7,Berlin,76.0,12623.0,Hönower Straße,,Point(13.6125 52.5118),cinema,52.5118,13.6125,Gloria,gloria,,,,,wikidata,"Hönower Straße 76, 12623 Berlin","Hönower Straße 76, 12623 Berlin",,Q47117004
8,Berlin,54.0,12685.0,Alt-Marzahn,,Point(13.5611 52.5428),cinema,52.5428,13.5611,Alt-Marzahn 54,alt-marzahn-54,,,,,wikidata,"Alt-Marzahn 54, 12685 Berlin","Alt-Marzahn 54, 12685 Berlin",,Q47117121
9,Berlin,12.0,12681.0,Helene-Weigel-Platz,,Point(13.5424 52.5265),cinema,52.5265,13.5424,Sojus,sojus,,,,,wikidata,"Helene-Weigel-Platz 12, 12681 Berlin","Helene-Weigel-Platz 12, 12681 Berlin",,Q47117127


In [39]:
df_wiki_slim_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   addr:city         776 non-null    object
 1   addr:housenumber  770 non-null    object
 2   addr:postcode     747 non-null    object
 3   addr:street       782 non-null    object
 4   capacity_raw      6 non-null      object
 5   coord_raw         952 non-null    object
 6   entity_type       952 non-null    object
 7   lat               952 non-null    object
 8   lon               952 non-null    object
 9   name              952 non-null    object
 10  name_key          952 non-null    object
 11  operator          33 non-null     object
 12  owner             5 non-null      object
 13  phone             3 non-null      object
 14  postcode          29 non-null     object
 15  source            952 non-null    object
 16  street            782 non-null    object
 17  street_norm     

In [40]:
#Inspect unique names:
df_wiki_slim_unique['name_key'].sort_values().head(30)

320                                1-berliner-kinomuseum
186                                            abc-kinos
341                                      abc-lichtspiele
207                                      abc-lichtspiele
363                                    aboli-lichtspiele
826                                                 acud
230                                    adler-lichtspiele
829                                       admiralspalast
32                                       adria-filmb-hne
830                     akademie-der-k-nste-hanseatenweg
169        aki-aktualit-ten-kino-am-zoo-hardenbergstra-e
172    aki-aktualit-ten-kino-am-zoo-joachimsthaler-st...
695                                         aki-neuk-lln
165                                     akme-lichtspiele
782                                       ala-filmpalast
564                                        aladin-camera
33                              albrechtshof-lichtspiele
472                            

Code block to fix and standardize name_key

In [41]:

# --- Add source tag for clarity ---
df_wiki_slim_unique["source"] = "wikidata"

# --- Sanity check ---
print("✅ Wikidata name_key re-normalized")
print(f"Rows: {len(df_wiki_slim_unique)} | Unique name_keys: {df_wiki_slim_unique['name_key'].nunique()}")
display(df_wiki_slim_unique[['name_key']].head(10))


✅ Wikidata name_key re-normalized
Rows: 952 | Unique name_keys: 841


Unnamed: 0,name_key
0,berliner-sportpalast
1,haus-der-kulturen-der-welt
2,lido
3,schaubude-berlin
4,marmorsaal-im-zoologischen-garten-berlin
5,bio
6,volks-lichtspiele
7,gloria
8,alt-marzahn-54
9,sojus


In [42]:
#Count how many unique theatres by type:
df_wiki_slim_unique['entity_type'].value_counts()

entity_type
cinema     859
theater     93
Name: count, dtype: int64

Block for fixiing adrr:*

In [43]:
import re
import pandas as pd

# Work on a single, consistent dataframe
df = df_wiki_slim_unique.copy()

# ---------- 1) normalize a raw address string ----------
def norm_raw(addr):
    if pd.isna(addr):
        return pd.NA
    s = str(addr)
    s = s.replace("–", "-").replace("—", "-")
    s = re.sub(r"\(.*?\)", "", s)      # drop (...) notes
    s = re.sub(r"\s+", " ", s)         # collapse spaces
    s = s.strip(" ,;/")
    return s if s else pd.NA

# ---------- 2) chop postcode + city from the tail ----------
_postcode_tail = re.compile(r"(.*?)[, ]*\b(\d{5})\b[ ,]*([A-Za-zÄÖÜäöüß\-\s]*)$", re.UNICODE)

def chop_tail_postcode_city(addr_norm: str):
    if pd.isna(addr_norm):
        return (pd.NA, pd.NA, pd.NA)
    s = str(addr_norm)
    m = _postcode_tail.match(s)
    if m:
        street_core = m.group(1).strip(" ,")
        postcode    = m.group(2)
        city        = (m.group(3) or "").strip().strip(",") or pd.NA
        return (street_core, postcode, city)
    # no explicit postcode; still capture Berlin if present
    city = "Berlin" if re.search(r"\bBerlin\b", s, re.IGNORECASE) else pd.NA
    return (s, pd.NA, city)

# ---------- 3) normalise a house number token ----------
def norm_house(h: str):
    h = h.strip()
    h = re.sub(r"\s*/\s*", "/", h)   # 5 / 5a -> 5/5a
    h = re.sub(r"\s*-\s*", "-", h)   # 38 – 42 -> 38-42
    return h

# ---------- 4) split multi-street and extract street + trailing housenumber ----------
_hn_tail = re.compile(
    r"(\d+[0-9A-Za-z]*(?:/\d+[0-9A-Za-z]*)?(?:-\d+[0-9A-Za-z]*(?:/\d+[0-9A-Za-z]*)?)?)\s*$"
)

def parse_segments(street_core: str):
    if pd.isna(street_core):
        return ([], [])
    segs = [seg.strip(" ,") for seg in re.split(r"\s*/\s*", str(street_core)) if seg.strip()]
    streets, hns = [], []
    for seg in segs:
        m = _hn_tail.search(seg)
        if m:
            hn  = norm_house(m.group(1))
            st  = seg[:m.start()].strip(" ,")
            streets.append(st)
            hns.append(hn)
        else:
            streets.append(seg.strip(" ,"))
            hns.append("")
    return (streets, hns)

# ---------- 5) end-to-end parts builder (string in -> Series out) ----------
def build_addr_parts_from_raw(raw):
    s0 = norm_raw(raw)
    street_core, pc, city = chop_tail_postcode_city(s0)
    streets, hns = parse_segments(street_core)
    addr_street = " / ".join(streets) if streets else pd.NA
    hn_list = [h for h in hns if h]
    addr_hn = " / ".join(hn_list) if hn_list else pd.NA
    if pd.isna(city) and isinstance(s0, str) and "berlin" in s0.lower():
        city = "Berlin"
    return pd.Series(
        [addr_street, addr_hn, pc, city],
        index=["addr:street_new","addr:housenumber_new","addr:postcode_new","addr:city_new"]
    )

# Ensure we have a source column to parse
if "street_norm" not in df.columns:
    src = "street" if "street" in df.columns else None
    if src is None:
        df["street_norm"] = pd.NA
    else:
        df["street_norm"] = df[src].apply(norm_raw)

# ---------- 6) compute parts on THIS dataframe ----------
parts = df["street_norm"].apply(build_addr_parts_from_raw)

# ---------- 7) coalesce into existing columns ONLY when missing ----------
for orig, new in [
    ("addr:street",      "addr:street_new"),
    ("addr:housenumber", "addr:housenumber_new"),
    ("addr:postcode",    "addr:postcode_new"),
    ("addr:city",        "addr:city_new"),
]:
    if orig not in df.columns:
        df[orig] = pd.NA
    df[orig] = df[orig].fillna(parts[new])

# default city to Berlin if still NA
df["addr:city"] = df["addr:city"].fillna("Berlin")

# ---------- 8) quick sanity peek ----------
check_cols = ["name_key","street_norm","addr:street","addr:housenumber","addr:postcode","addr:city"]
print("✅ Address parts filled. Preview:")
display(df[ [c for c in check_cols if c in df.columns] ].head(3))

# ---------- 9) diagnostics ----------
filled_postcode = parts["addr:postcode_new"].notna().sum()
remaining_na = df["addr:postcode"].isna().sum()
print(f"📈 Filled postcode in {filled_postcode} rows | Remaining NA in addr:postcode: {remaining_na}")

# Persist back if this is your working frame

df_wiki_slim_unique.info(3)

✅ Address parts filled. Preview:


Unnamed: 0,name_key,street_norm,addr:street,addr:housenumber,addr:postcode,addr:city
0,berliner-sportpalast,"Potsdamer Straße 170-172, 10783 Berlin",Potsdamer Straße 170-,172.0,10783.0,Berlin
1,haus-der-kulturen-der-welt,,,,,Berlin
2,lido,"Cuvrystraße 7, 10997 Berlin",Cuvrystraße,7.0,10997.0,Berlin


📈 Filled postcode in 747 rows | Remaining NA in addr:postcode: 203
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   addr:city         776 non-null    object
 1   addr:housenumber  770 non-null    object
 2   addr:postcode     747 non-null    object
 3   addr:street       782 non-null    object
 4   capacity_raw      6 non-null      object
 5   coord_raw         952 non-null    object
 6   entity_type       952 non-null    object
 7   lat               952 non-null    object
 8   lon               952 non-null    object
 9   name              952 non-null    object
 10  name_key          952 non-null    object
 11  operator          33 non-null     object
 12  owner             5 non-null      object
 13  phone             3 non-null      object
 14  postcode          29 non-null     object
 15  source            952 non-null    object


In [44]:
df_wiki_slim_unique.head(3)


Unnamed: 0,addr:city,addr:housenumber,addr:postcode,addr:street,capacity_raw,coord_raw,entity_type,lat,lon,name,name_key,operator,owner,phone,postcode,source,street,street_norm,website,wikidata_id
0,Berlin,172.0,10783.0,Potsdamer Straße 170-,,Point(13.359166666 52.494722222),cinema,52.494722,13.359167,Berliner Sportpalast,berliner-sportpalast,,,,,wikidata,"Potsdamer Straße 170–172, 10783 Berlin","Potsdamer Straße 170-172, 10783 Berlin",,Q686156
1,,,,,,Point(13.3648 52.5188),cinema,52.5188,13.3648,Haus der Kulturen der Welt,haus-der-kulturen-der-welt,,,+49 30 39 78 71 75,10557.0,wikidata,,,https://www.hkw.de/,Q704933
2,Berlin,7.0,10997.0,Cuvrystraße,,Point(13.44506 52.49919),cinema,52.49919,13.44506,Lido,lido,,,,,wikidata,"Cuvrystraße 7, 10997 Berlin","Cuvrystraße 7, 10997 Berlin",https://www.lido-berlin.de,Q47155480


In [45]:
gdf_final.columns.tolist()

['geometry',
 'name',
 'name_key',
 'amenity',
 'operator',
 'opening_hours',
 'wheelchair',
 'cinema',
 'cinema_type',
 'cinema_3d',
 'screen',
 'theatre_type',
 'theatre_genre',
 'website',
 'phone',
 'email',
 'wikipedia',
 'wikidata',
 'addr_housenumber',
 'addr_street',
 'addr_postcode',
 'addr_city',
 'addr_country',
 'addr_full',
 'lat_num',
 'lon_num',
 'source',
 'multi_branch',
 'wheelchair_accessible']

## Rename Wikidata address columns for merge compatibility

In [46]:
# --- Define rename mapping ---
rename_map = {
    "addr:housenumber": "addr_housenumber",
    "addr:street": "addr_street",
    "addr:postcode": "addr_postcode",
    "addr:city": "addr_city",
    "addr:country": "addr_country"
}

# --- Apply rename only if columns exist ---
df_wiki_slim_unique = df_wiki_slim_unique.rename(
    columns={k: v for k, v in rename_map.items() if k in df_wiki_slim_unique.columns}
).copy()

print("✅ Wikidata address columns renamed for consistency:")
df_wiki_slim_unique.columns


✅ Wikidata address columns renamed for consistency:


Index(['addr_city', 'addr_housenumber', 'addr_postcode', 'addr_street',
       'capacity_raw', 'coord_raw', 'entity_type', 'lat', 'lon', 'name',
       'name_key', 'operator', 'owner', 'phone', 'postcode', 'source',
       'street', 'street_norm', 'website', 'wikidata_id'],
      dtype='object')

In [47]:
def coalesce_postcode(df, colon=False):
    # accepts both 'addr:postcode' or 'addr_postcode'
    col_addr = "addr:postcode" if colon else "addr_postcode"
    if col_addr not in df.columns:
        df[col_addr] = pd.NA
    if "postcode" in df.columns:
        df[col_addr] = df[col_addr].fillna(df["postcode"])
    return df

# OSM: column names are already snake_case
df_wiki_slim_unique = coalesce_postcode(df_wiki_slim_unique, colon=False)
df_wiki_slim_unique = df_wiki_slim_unique.drop(columns=["postcode"], errors="ignore")
df_wiki_slim_unique = df_wiki_slim_unique.copy()

In [48]:
df_wiki_slim_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   addr_city         776 non-null    object
 1   addr_housenumber  770 non-null    object
 2   addr_postcode     760 non-null    object
 3   addr_street       782 non-null    object
 4   capacity_raw      6 non-null      object
 5   coord_raw         952 non-null    object
 6   entity_type       952 non-null    object
 7   lat               952 non-null    object
 8   lon               952 non-null    object
 9   name              952 non-null    object
 10  name_key          952 non-null    object
 11  operator          33 non-null     object
 12  owner             5 non-null      object
 13  phone             3 non-null      object
 14  source            952 non-null    object
 15  street            782 non-null    object
 16  street_norm       782 non-null    object
 17  website         

### 🧪 Sanity Check for df_wiki_final

In [49]:
df_wiki_final = df_wiki_slim_unique.copy()

In [50]:
print("🔍 --- SANITY CHECK: df_wiki_final ---")

# --- Basic shape and columns ---
print(f"Rows: {len(df_wiki_final)}")
print(f"Columns: {len(df_wiki_final.columns)}")
print()

# --- CRS and geometry info (if GeoDataFrame) ---
if hasattr(df_wiki_final, "crs"):
    print("CRS:", df_wiki_final.crs)
    print("Geometry type counts:")
    print(df_wiki_final.geom_type.value_counts())
    print()

# --- Key identifier check ---
if "wikidata_id" in df_wiki_final.columns:
    unique_ids = df_wiki_final["wikidata_id"].nunique()
    print(f"Unique wikidata_id: {unique_ids}")
    dup_ids = df_wiki_final["wikidata_id"].duplicated().sum()
    if dup_ids:
        print(f"⚠️ Duplicate wikidata_id entries: {dup_ids}")
    else:
        print("✅ No duplicate wikidata_id values.")
print()

# --- Coordinate sanity ---
if {"lat_num", "lon_num"}.issubset(df_wiki_final.columns):
    lat_min, lat_max = df_wiki_final["lat_num"].min(), df_wiki_final["lat_num"].max()
    lon_min, lon_max = df_wiki_final["lon_num"].min(), df_wiki_final["lon_num"].max()
    print("📍 Coordinate ranges:")
    print(f"  Latitude:  {lat_min:.5f} → {lat_max:.5f}")
    print(f"  Longitude: {lon_min:.5f} → {lon_max:.5f}")
    if not (52.0 <= lat_min <= 53.2 and 13.0 <= lon_min <= 14.8):
        print("⚠️ Some coordinates fall outside Berlin bounds.")
    else:
        print("✅ Coordinates within Berlin range.")
print()

# --- Missing value summary ---
print("🧩 Missing values (top 10):")
nulls = df_wiki_final.isna().sum().sort_values(ascending=False)
print(nulls.head(10))
print()

# --- Check address coverage ---
addr_cols = ["addr_street","addr_housenumber","addr_postcode","addr_city"]
addr_coverage = (df_wiki_final[addr_cols].notna().mean() * 100).round(1)
print("🏠 Address completeness (% filled):")
print(addr_coverage)
print()

# --- Quick sample for visual check ---
print("📋 Sample rows:")
display(df_wiki_final.sample(5, random_state=42)[
    ["wikidata_id","name_key","addr_street","addr_housenumber","addr_postcode","addr_city","website","operator"]
])


🔍 --- SANITY CHECK: df_wiki_final ---
Rows: 952
Columns: 19

Unique wikidata_id: 952
✅ No duplicate wikidata_id values.


🧩 Missing values (top 10):
phone               949
owner               947
capacity_raw        946
operator            919
website             860
addr_postcode       192
addr_housenumber    182
addr_city           176
addr_street         170
street_norm         170
dtype: int64

🏠 Address completeness (% filled):
addr_street         82.1
addr_housenumber    80.9
addr_postcode       79.8
addr_city           81.5
dtype: float64

📋 Sample rows:


Unnamed: 0,wikidata_id,name_key,addr_street,addr_housenumber,addr_postcode,addr_city,website,operator
199,Q47092928,schlo-lichtspiele,Schloßstraße /,30 / 31,14059.0,Berlin,,
420,Q47086495,jugend,Langhansstraße,23,13086.0,Berlin,,
694,Q47089276,ili-lichtspiele,Innstraße,35,12045.0,Berlin,,
750,Q35010623,silvana,Baumschulenstraße,78,12437.0,Berlin,,
507,Q47307719,bio-lichtspiele-hackerscher-markt,,,,,,


# 🧠 Quick analysis of  sanity check

| Aspect                   | Observation                                                   | Meaning                                                                    |
| :----------------------- | :------------------------------------------------------------ | :------------------------------------------------------------------------- |
| **Rows**                 | 952                                                           | Stable, clean dataset — consistent with earlier deduplicated count         |
| **CRS**                  | EPSG:4326                                                     | ✅ correct coordinate system for merging with OSM                           |
| **Geometry type**        | 952 Points                                                    | ✅ all geocoded — no missing geometry                                       |
| **Unique IDs**           | All 952 unique `wikidata_id`                                  | ✅ solid identifier integrity                                               |
| **Coordinate range**     | lat 52.38–52.65 / lon 13.13–13.72                             | ✅ perfectly within Berlin bounds                                           |
| **Missing values**       | mostly in `phone`, `owner`, `capacity`, `website`, `operator` | ⚠️ Wikidata rarely stores phone/website — that’s fine; OSM will fill those |
| **Address completeness** | street 82%, housenumber 81%, postcode 80%, city 100%          | ✅ strong base coverage (good backup when OSM lacks them)                   |
| **Random sample**        | shows realistic theatres and cinemas                          | ✅ consistent content quality                                               |





# ✅ Save Clean WIKI Data (df_wiki_final) to CSV + GeoJSON.  

In [51]:
from pathlib import Path
import os
import geopandas as gpd

# --- Define your output directory ---
OUT_DIR = Path("/Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Define output filenames ---
csv_path = OUT_DIR / "theatres_berlin_wiki_clean.csv"
geojson_path = OUT_DIR / "theatres_berlin_wiki_clean.geojson"

# --- Ensure df_wiki_final is a GeoDataFrame ---
if not isinstance(df_wiki_final, gpd.GeoDataFrame):
    if {"lon", "lat"}.issubset(df_wiki_final.columns):
        df_wiki_final = gpd.GeoDataFrame(
            df_wiki_final.copy(),
            geometry=gpd.points_from_xy(df_wiki_final["lon"], df_wiki_final["lat"]),
            crs="EPSG:4326"
        )
    else:
        raise ValueError("❌ df_wiki_final must have 'lon' and 'lat' columns or a geometry.")

# --- Save to CSV (no geometry, for table use) ---
df_wiki_final.drop(columns=["geometry"], errors="ignore").to_csv(csv_path, index=False, encoding="utf-8")
print(f"✅ CSV saved to: {csv_path}")

# --- Save to GeoJSON (for GIS / mapping) ---
df_wiki_final.to_file(geojson_path, driver="GeoJSON", encoding="utf-8")
print(f"✅ GeoJSON saved to: {geojson_path}")

# --- Optional: show file sizes ---
print(f"📦 CSV size: {os.path.getsize(csv_path)/1024:.1f} KB")
print(f"📦 GeoJSON size: {os.path.getsize(geojson_path)/1024:.1f} KB")


✅ CSV saved to: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/theatres_berlin_wiki_clean.csv
✅ GeoJSON saved to: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/theatres_berlin_wiki_clean.geojson
📦 CSV size: 195.2 KB
📦 GeoJSON size: 592.3 KB


# Merge sources Wiki and OSM
Wikidata-only venues that OSM is missing, while also enriching OSM rows first.

In [52]:
import pandas as pd
import geopandas as gpd

# Inputs
gdf_osm = gdf_final.copy()       # cleaned OSM GeoDataFrame
df_wd   = df_wiki_final.copy()   # cleaned Wikidata DataFrame

TARGET_COLS = ["addr_city","addr_postcode","addr_street","addr_housenumber","phone","website","operator"]
max_dist_m = 120

# --- 1) NAME-KEY FILL ---
wd_small = df_wd[["name_key"] + [c for c in TARGET_COLS if c in df_wd.columns]].add_prefix("wd_")

merged = gdf_osm.merge(wd_small, left_on="name_key", right_on="wd_name_key", how="left")
merged["match_by_name"] = merged["wd_name_key"].notna()

for col in TARGET_COLS:
    wcol = f"wd_{col}"
    if wcol in merged.columns and col in merged.columns:
        merged[col] = merged[col].fillna(merged[wcol])

In [53]:
# --- 2) GEO FALLBACK (only rows still missing something) ---
need_geo = merged[TARGET_COLS].isna().any(axis=1)
if need_geo.any():
    # WD points
    wd_pts = gpd.GeoDataFrame(
        df_wd.dropna(subset=["lat","lon"]).copy(),
        geometry=gpd.points_from_xy(df_wd["lon"], df_wd["lat"]),
        crs="EPSG:4326"
    )[["name_key","geometry"] + [c for c in TARGET_COLS if c in df_wd.columns]]

    osm_missing = merged.loc[need_geo].copy()
    osm_m = gpd.GeoDataFrame(osm_missing, geometry=osm_missing.geometry, crs=gdf_osm.crs).to_crs(3857)
    wd_m  = wd_pts.to_crs(3857)

    nearest = gpd.sjoin_nearest(osm_m, wd_m, how="left", distance_col="__dist_m")
    nearest = nearest[nearest["__dist_m"] <= max_dist_m].to_crs(4326)

    for col in TARGET_COLS:
        if col in nearest.columns and col in merged.columns:
            merged.loc[nearest.index, col] = merged.loc[nearest.index, col].fillna(nearest[col])

    merged.loc[nearest.index, "match_by_geo"] = True
else:
    merged["match_by_geo"] = False


In [54]:
# --- 3) Cleanup + report ---
merged = merged.drop(columns=[c for c in merged.columns if c.startswith("wd_")], errors="ignore")

total = len(merged)
by_name = int(merged["match_by_name"].sum())
by_geo  = int((~merged["match_by_name"] & merged["match_by_geo"]).sum())
still_missing = merged[TARGET_COLS].isna().any(axis=1).sum()

print(f"✅ Total OSM rows: {total}")
print(f"🔗 Filled by name_key: {by_name}")
print(f"📍 Filled by geo (≤{max_dist_m} m): {by_geo}")
print(f"⚠️ Rows still missing any of {TARGET_COLS}: {still_missing}")

gdf_merged = merged.copy()

✅ Total OSM rows: 282
🔗 Filled by name_key: 54
📍 Filled by geo (≤120 m): 84
⚠️ Rows still missing any of ['addr_city', 'addr_postcode', 'addr_street', 'addr_housenumber', 'phone', 'website', 'operator']: 217


# ✅ Save Clean OSM and WIKI Data (gdf_merged) to CSV + GeoJSON.

In [55]:
from pathlib import Path
import os
import geopandas as gpd

# --- Define your output directory ---
OUT_DIR = Path("/Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Define output filenames ---
csv_path = OUT_DIR / "theatres_berlin_enriched.csv"
geojson_path = OUT_DIR / "theatres_berlin_enriched.geojson"

# --- Ensure GeoDataFrame (create geometry if missing) ---
if not isinstance(gdf_merged, gpd.GeoDataFrame):
    if {"lon_num", "lat_num"}.issubset(gdf_merged.columns):
        gdf_merged = gpd.GeoDataFrame(
            gdf_merged.copy(),
            geometry=gpd.points_from_xy(gdf_merged["lon_num"], gdf_merged["lat_num"]),
            crs="EPSG:4326"
        )
    else:
        raise ValueError("❌ gdf_merged must have geometry or 'lon_num' and 'lat_num' columns.")

# --- Save to CSV (tabular version, no geometry) ---
gdf_merged.drop(columns=["geometry"], errors="ignore").to_csv(csv_path, index=False, encoding="utf-8")
print(f"✅ CSV saved to: {csv_path}")

# --- Save to GeoJSON (with geometry for GIS / mapping) ---
gdf_merged.to_file(geojson_path, driver="GeoJSON", encoding="utf-8")
print(f"✅ GeoJSON saved to: {geojson_path}")

# --- Optional: show file sizes ---
print(f"📦 CSV size: {os.path.getsize(csv_path)/1024:.1f} KB")
print(f"📦 GeoJSON size: {os.path.getsize(geojson_path)/1024:.1f} KB")

# --- Optional: confirmation summary ---
print(f"\n🧾 Saved dataset summary:")
print(f"Rows: {len(gdf_merged)}")
print(f"Columns: {len(gdf_merged.columns)}")
print(f"CRS: {gdf_merged.crs}")


✅ CSV saved to: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/theatres_berlin_enriched.csv
✅ GeoJSON saved to: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/theatres_berlin_enriched.geojson
📦 CSV size: 65.4 KB
📦 GeoJSON size: 282.5 KB

🧾 Saved dataset summary:
Rows: 282
Columns: 31
CRS: epsg:4326


# Districts 🎯 Goal

Create a single, spatially enabled layer (lor_full) that includes:

geometry from lor_ortsteile.geojson

attributes (IDs, names, etc.) 

In [56]:
#1️⃣ Load 
import geopandas as gpd
import pandas as pd
from pathlib import Path

BASE = Path("/Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source")

lor = gpd.read_file(BASE / "lor_ortsteile.geojson").to_crs(epsg=4326)


print(lor.shape)


(96, 8)


# Inspect keys for joining

In [57]:
lor.head(3)

Unnamed: 0,gml_id,spatial_name,spatial_alias,spatial_type,OTEIL,BEZIRK,FLAECHE_HA,geometry
0,re_ortsteil.0101,101,Mitte,Polygon,Mitte,Mitte,1063.8748,"POLYGON ((13.41649 52.52696, 13.41635 52.52702..."
1,re_ortsteil.0102,102,Moabit,Polygon,Moabit,Mitte,768.7909,"POLYGON ((13.33884 52.51974, 13.33884 52.51974..."
2,re_ortsteil.0103,103,Hansaviertel,Polygon,Hansaviertel,Mitte,52.5337,"POLYGON ((13.34322 52.51557, 13.34323 52.51557..."


In [58]:
lor.columns

Index(['gml_id', 'spatial_name', 'spatial_alias', 'spatial_type', 'OTEIL',
       'BEZIRK', 'FLAECHE_HA', 'geometry'],
      dtype='object')

# 📘 Columns explained

| Column            | Meaning                                                             | Typical Use                                                                                     |
| ----------------- | ------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- |
| **name**          | Internal feature ID, e.g. `re_ortsteil.0101`                        | A unique reference ID (not human-readable).                                                     |
| **spatial_alias** | The **official LOR ID** (4-digit string like `0101`, `0102`)        | Use this as the **`neighborhood_id`** key. The first two digits = district (e.g. `01` = Mitte). |
| **spatial_type**  | The **Ortsteil name** (neighborhood), e.g. *Moabit*, *Hansaviertel* | Use as the **`neighborhood`** field.                                                            |
| **OTEIL**         | Duplicate of `spatial_type` (older field)                           | Often redundant — can drop or rename.                                                           |
| **BEZIRK**        | District name, e.g. *Mitte*, *Charlottenburg-Wilmersdorf*           | Use as **`district`**.                                                                          |
| **FLAECHE_HA**    | Area size (in hectares)                                             | Useful for stats or area-based joins.                                                           |
| **geometry**      | Polygon boundary for the neighborhood                               | Use for spatial joins (e.g., assign theaters to neighborhood).                                  |


# Build a clean LOR table (Polygon → MultiPolygon)

In [59]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon, MultiPolygon
import re

# --- 1) Geo setup: 4326 + MultiPolygon ---
if not isinstance(lor, gpd.GeoDataFrame):
    lor = gpd.GeoDataFrame(lor, geometry="geometry")

if lor.crs is None:
    lor = lor.set_crs(4326)
else:
    lor = lor.to_crs(4326)

lor["geometry"] = lor["geometry"].apply(lambda g: MultiPolygon([g]) if isinstance(g, Polygon) else g)

# --- 2) Tidy columns (source names as given) ---
lor_clean = lor.assign(
    district        = lor["BEZIRK"].astype("string").str.strip().str.replace(r"\s+", " ", regex=True),
    neighborhood_id = lor["spatial_name"].astype(str).str.strip(),
    neighborhood    = lor["OTEIL"].astype("string").str.strip(),
    area_ha         = pd.to_numeric(lor.get("FLAECHE_HA", pd.Series(pd.NA, index=lor.index)), errors="coerce"),
)[["district","neighborhood_id","neighborhood","area_ha","geometry"]].copy()

# --- 3) district → official 8-digit id (case/dash/space tolerant) ---
district_mapping = {
    "Mitte": "11001001",
    "Friedrichshain-Kreuzberg": "11002002",
    "Pankow": "11003003",
    "Charlottenburg-Wilmersdorf": "11004004",
    "Spandau": "11005005",
    "Steglitz-Zehlendorf": "11006006",
    "Tempelhof-Schöneberg": "11007007",
    "Neukölln": "11008008",
    "Treptow-Köpenick": "11009009",
    "Marzahn-Hellersdorf": "11010010",
    "Lichtenberg": "11011011",
    "Reinickendorf": "11012012",
}
_map = {k.casefold(): v for k, v in district_mapping.items()}
lor_clean["district_id"] = lor_clean["district"].map(lambda s: _map.get(s.casefold(), pd.NA) if pd.notna(s) else pd.NA).astype("string")

# report any unmapped districts
unmapped = lor_clean.loc[lor_clean["district_id"].isna(), "district"].dropna().drop_duplicates().tolist()
if unmapped:
    print("⚠️ Unmapped district names:", unmapped)

# --- 4) area_ha fallback: compute if missing ---
if lor_clean["area_ha"].isna().any():
    tmp = gpd.GeoDataFrame(lor_clean.loc[lor_clean["area_ha"].isna()], geometry="geometry", crs=lor.crs).to_crs(25833)
    computed = (tmp.area / 10_000).astype("float64")
    lor_clean.loc[computed.index, "area_ha"] = computed


# --- 5) Final column order ---
lor_clean = lor_clean[["district_id","district","neighborhood_id","neighborhood","area_ha","geometry"]]


In [60]:
lor_clean.head(10)

Unnamed: 0,district_id,district,neighborhood_id,neighborhood,area_ha,geometry
0,11001001,Mitte,101,Mitte,1063.8748,"MULTIPOLYGON (((13.41649 52.52696, 13.41635 52..."
1,11001001,Mitte,102,Moabit,768.7909,"MULTIPOLYGON (((13.33884 52.51974, 13.33884 52..."
2,11001001,Mitte,103,Hansaviertel,52.5337,"MULTIPOLYGON (((13.34322 52.51557, 13.34323 52..."
3,11001001,Mitte,104,Tiergarten,516.0672,"MULTIPOLYGON (((13.36879 52.49878, 13.36891 52..."
4,11001001,Mitte,105,Wedding,919.9112,"MULTIPOLYGON (((13.34656 52.53879, 13.34664 52..."
5,11001001,Mitte,106,Gesundbrunnen,610.8368,"MULTIPOLYGON (((13.39449 52.56339, 13.39449 52..."
6,11002002,Friedrichshain-Kreuzberg,201,Friedrichshain,991.2352,"MULTIPOLYGON (((13.41975 52.52555, 13.4198 52...."
7,11002002,Friedrichshain-Kreuzberg,202,Kreuzberg,1033.9178,"MULTIPOLYGON (((13.43926 52.48961, 13.43927 52..."
8,11003003,Pankow,301,Prenzlauer Berg,1096.9869,"MULTIPOLYGON (((13.41649 52.52696, 13.41669 52..."
9,11003003,Pankow,302,Weißensee,790.4559,"MULTIPOLYGON (((13.46731 52.5385, 13.4674 52.5..."


In [61]:
print("Rows:", len(lor_clean))                  
print("CRS:", lor_clean.crs)
print("Geom types:\n", lor_clean.geom_type.value_counts())
print("\nNulls:\n", lor_clean[["district_id","district","neighborhood_id","neighborhood"]].isna().sum())

# Uniqueness
print("\nUnique district_id:", lor_clean["district_id"].nunique())
print("Unique neighborhood_id:", lor_clean["neighborhood_id"].nunique())

# Quick peek
display(lor_clean.head(3)[["district_id","district","neighborhood_id","neighborhood"]])


Rows: 96
CRS: EPSG:4326
Geom types:
 MultiPolygon    96
Name: count, dtype: int64

Nulls:
 district_id        0
district           0
neighborhood_id    0
neighborhood       0
dtype: int64

Unique district_id: 12
Unique neighborhood_id: 96


Unnamed: 0,district_id,district,neighborhood_id,neighborhood
0,11001001,Mitte,101,Mitte
1,11001001,Mitte,102,Moabit
2,11001001,Mitte,103,Hansaviertel


## ✅ Save unified LOR layer

In [62]:
out_path = BASE / "lor_berlin_full_clean.geojson"
lor_clean.to_file(out_path, driver="GeoJSON", encoding="utf-8")
print(f"✅ Saved merged LOR layer to: {out_path}")


✅ Saved merged LOR layer to: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/lor_berlin_full_clean.geojson


In [63]:
lor_clean.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   district_id      96 non-null     string  
 1   district         96 non-null     string  
 2   neighborhood_id  96 non-null     object  
 3   neighborhood     96 non-null     string  
 4   area_ha          96 non-null     float64 
 5   geometry         96 non-null     geometry
dtypes: float64(1), geometry(1), object(1), string(3)
memory usage: 4.6+ KB


# Merge final lor(district) and final theaters


In [64]:
import geopandas as gpd

# --- 1️⃣ Make sure both GeoDataFrames share the same CRS ---
gdf_merged = gdf_merged.to_crs(4326)
lor_clean    = lor_clean.to_crs(4326)

# --- 2️⃣ Spatial join: assign each theater to its LOR polygon ---
theaters_enriched = gpd.sjoin(
    gdf_merged,
    lor_clean[["district_id","district","neighborhood_id","neighborhood","geometry"]],
    how="left",
    predicate="within"     # Point-in-polygon
).drop(columns=["index_right"])

# --- 3️⃣ Check results ---
print(theaters_enriched[["name","district","neighborhood", "geometry"]].head())


                      name district neighborhood                   geometry
0         Filmrauschpalast    Mitte       Moabit  POINT (13.35962 52.53438)
1    Friedrichstadt-Palast    Mitte        Mitte  POINT (13.38888 52.52392)
2      Quatsch Comedy Club    Mitte        Mitte  POINT (13.38862 52.52362)
3  Kabarett-Theater Distel    Mitte        Mitte  POINT (13.38851 52.52067)
4           Admiralspalast    Mitte        Mitte   POINT (13.3889 52.52077)


In [65]:
theaters_enriched.columns

Index(['geometry', 'name', 'name_key', 'amenity', 'operator', 'opening_hours',
       'wheelchair', 'cinema', 'cinema_type', 'cinema_3d', 'screen',
       'theatre_type', 'theatre_genre', 'website', 'phone', 'email',
       'wikipedia', 'wikidata', 'addr_housenumber', 'addr_street',
       'addr_postcode', 'addr_city', 'addr_country', 'addr_full', 'lat_num',
       'lon_num', 'source', 'multi_branch', 'wheelchair_accessible',
       'match_by_name', 'match_by_geo', 'district_id', 'district',
       'neighborhood_id', 'neighborhood'],
      dtype='object')

## ✅ Save clean outputs theaters_enriched_district

In [66]:
OUT = Path("/Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source")
OUT.mkdir(parents=True, exist_ok=True)

theaters_enriched.drop(columns=["geometry"]).to_csv(
    OUT / "theatres_berlin_enriched_district.csv", index=False, encoding="utf-8"
)
theaters_enriched.to_file(
    OUT / "theatres_berlin_enriched_district.geojson", driver="GeoJSON", encoding="utf-8"
)

print("✅ Saved:")
print("CSV:", OUT / "theatres_berlin_enriched_district.csv")
print("GeoJSON:", OUT / "theatres_berlin_enriched_district.geojson")
print("Rows:", len(theaters_enriched))
print("Missing district_id:", theaters_enriched['district_id'].isna().sum())
print("Missing neighborhood_id:", theaters_enriched['neighborhood_id'].isna().sum())
print("Geometry type:", theaters_enriched.geom_type.unique())

✅ Saved:
CSV: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/theatres_berlin_enriched_district.csv
GeoJSON: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/theatres_berlin_enriched_district.geojson
Rows: 282
Missing district_id: 0
Missing neighborhood_id: 0
Geometry type: ['Point' 'Polygon']


#  Create theatres_berlin_DB_ready

In [67]:
final_db_ready = theaters_enriched.copy()

**name and name_key**

In [68]:
# --- Prepare name + name_key for DB ---
final_db_ready["name"] = (
    final_db_ready["name"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)     # collapse multiple spaces
)

if "name_key" in final_db_ready.columns:
    final_db_ready["name_key"] = (
        final_db_ready["name_key"]
        .astype(str)
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
    )

# Ensure both are database-friendly string dtypes
final_db_ready["name"] = final_db_ready["name"].astype("string")
final_db_ready["name_key"] = final_db_ready["name_key"].astype("string")

# Optional: clip overly long names
final_db_ready["name"] = final_db_ready["name"].str.slice(0, 200)
final_db_ready["name_key"] = final_db_ready["name_key"].str.slice(0, 150)


**operator**

In [69]:
final_db_ready["operator"] = (
    final_db_ready.get("operator")
    .astype("string")
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .replace({"": pd.NA, "nan": pd.NA, "none": pd.NA})
    .str.slice(0, 200)
)

**amenity + operator**

In [70]:
# --- Prepare amenity + operator columns for DB ---

# amenity: fill missing with placeholder, normalize whitespace
final_db_ready["amenity"] = (
    final_db_ready["amenity"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .replace("nan", pd.NA)
)
final_db_ready["amenity"] = final_db_ready["amenity"].fillna("unknown")

# operator: optional, keep empty if not available
if "operator" not in final_db_ready.columns:
    final_db_ready["operator"] = pd.NA

final_db_ready["operator"] = (
    final_db_ready["operator"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .replace("nan", pd.NA)
)

# DB-friendly string dtype
final_db_ready["amenity"] = final_db_ready["amenity"].astype("string")
final_db_ready["operator"] = final_db_ready["operator"].astype("string")

# Optional: shorten text for DB constraints
final_db_ready["amenity"] = final_db_ready["amenity"].str.slice(0, 50)
final_db_ready["operator"] = final_db_ready["operator"].str.slice(0, 200)


**wheelchair and opening_hours**

In [71]:
# --- Prepare opening_hours + wheelchair columns for DB ---

# opening_hours: trim, normalize whitespace, fill empty with <NA>
if "opening_hours" in final_db_ready.columns:
    final_db_ready["opening_hours"] = (
        final_db_ready["opening_hours"]
        .astype(str)
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
        .replace(["nan", ""], pd.NA)
    )
    # convert to DB-friendly string
    final_db_ready["opening_hours"] = final_db_ready["opening_hours"].astype("string")
    # optional length limit
    final_db_ready["opening_hours"] = final_db_ready["opening_hours"].str.slice(0, 200)
else:
    final_db_ready["opening_hours"] = pd.NA


# wheelchair: normalize yes/no/limited
if "wheelchair" in final_db_ready.columns:
    final_db_ready["wheelchair"] = (
        final_db_ready["wheelchair"]
        .astype(str)
        .str.lower()
        .str.strip()
        .replace({
            "nan": pd.NA,
            "none": pd.NA,
            "no": "no",
            "yes": "yes",
            "limited": "limited",
            "partial": "limited",
            "accessible": "yes",
            "not accessible": "no"
        })
    )
else:
    final_db_ready["wheelchair"] = pd.NA

# enforce as string (avoid category dtype for DB export)
final_db_ready["wheelchair"] = final_db_ready["wheelchair"].astype("string")


In [72]:
final_db_ready[["cinema", "screen","cinema_type","cinema_3d"]].info()

<class 'pandas.core.frame.DataFrame'>
Index: 282 entries, 0 to 281
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cinema       1 non-null      object
 1   screen       36 non-null     object
 2   cinema_type  1 non-null      object
 3   cinema_3d    1 non-null      object
dtypes: object(4)
memory usage: 11.0+ KB


**cinema, cinema_type, cinema_3d**

In [73]:
# --- Drop low-information cinema columns ---
cols_to_drop = [c for c in ["cinema", "cinema_type", "cinema_3d"] if c in final_db_ready.columns]
final_db_ready.drop(columns=cols_to_drop, inplace=True)


**screen**

In [74]:
# screen: numeric if possible
if "screen" in final_db_ready.columns:
    final_db_ready["screen"] = pd.to_numeric(final_db_ready["screen"], errors="coerce")
else:
    final_db_ready["screen"] = pd.NA

**Combine "type and genre" in theatre_category**

Standardize these into a single theatre_category without losing signal, and also keep a lightweight tag field so  still see both type + genre when both are present.

In [75]:
import pandas as pd
import re

# --- helpers ---
def _clean(s):
    if pd.isna(s): return pd.NA
    s = str(s).strip()
    s = re.sub(r"\s+", " ", s)
    return pd.NA if s in {"", "nan", "none"} else s

# Canonical vocabularies
GENRE_CANON = {
    "drama": "drama",
    "variety": "variety",
    "circus": "circus",
    "puppet": "puppet",
    "cabaret": "cabaret",
    "philharmonic": "philharmonic",
    "opera": "opera",
    "chamber_music": "chamber_music",
    "musical": "musical",
    "stand_up_comedy": "stand_up_comedy",
    "burlesque": "burlesque",
    "magic": "magic",
    "children": "children",
    "comedy": "comedy",
}
TYPE_CANON = {
    "concert_hall": "concert_hall",
    "stage": "stage",
    "opera_house": "opera_house",
    "open_air": "open_air",
    "amphi": "amphi",
    "puppet_theatre": "puppet_theatre",
    "circus": "circus",
}
GENRE_IMPLIES_TYPE = {
    "opera": "opera_house",
    "philharmonic": "concert_hall",
    "chamber_music": "concert_hall",
    "circus": "circus",
    "puppet": "puppet_theatre",
}

# --- 1) Clean inputs ---
for c in ["theatre_type", "theatre_genre"]:
    if c in final_db_ready.columns:
        final_db_ready[c] = final_db_ready[c].map(_clean)

# --- 2) Canonicalize ---
if "theatre_type" in final_db_ready.columns:
    final_db_ready["theatre_type"] = final_db_ready["theatre_type"].map(
        lambda s: TYPE_CANON.get(s, s) if pd.notna(s) else s
    )
if "theatre_genre" in final_db_ready.columns:
    final_db_ready["theatre_genre"] = final_db_ready["theatre_genre"].map(
        lambda s: GENRE_CANON.get(s, s) if pd.notna(s) else s
    )

tt = final_db_ready.get("theatre_type")
tg = final_db_ready.get("theatre_genre")

# --- 3) Build theatre_category (vectorized) ---
# start with type
theatre_category = tt.copy()

# where type is NA but genre implies a type -> use implied type
implied = tg.map(GENRE_IMPLIES_TYPE).where(tg.notna(), pd.NA)
theatre_category = theatre_category.fillna(implied)

# where still NA, use genre itself
theatre_category = theatre_category.fillna(tg)

# If you want NULLs in DB for unknown, leave as NA.
# If you prefer the literal "unknown", uncomment:
# theatre_category = theatre_category.fillna("unknown")

final_db_ready["theatre_category"] = (
    theatre_category.astype("string").str.slice(0, 100)
)

# --- 4) Build theatre_tags (vectorized: "type; genre" when both & different) ---
tags = tt.astype("string")

need_genre = tg.notna() & (tt.isna() | (tg != tt))
tags = tags.where(~need_genre, tags.fillna("") + "; " + tg.astype("string"))

# tidy up: remove leading/trailing separators and empty strings to NA
tags = tags.str.replace(r"^; | ;$", "", regex=True).replace({"": pd.NA})

final_db_ready["theatre_tags"] = tags.astype("string")

final_db_ready.drop(columns=[c for c in ["theatre_type","theatre_genre"] if c in final_db_ready.columns],
                    inplace=True)

# --- 5) Quick sanity check ---
print(final_db_ready["theatre_category"].value_counts(dropna=False))
print(final_db_ready["theatre_tags"].value_counts(dropna=False).head(10))


theatre_category
<NA>               225
circus               9
variety              7
drama                7
concert_hall         7
puppet_theatre       6
stage                5
cabaret              4
opera_house          3
musical              2
open_air             2
stand_up_comedy      1
burlesque            1
amphi                1
magic                1
children             1
Name: count, dtype: Int64
theatre_tags
<NA>                          225
circus                          8
variety                         7
drama                           7
puppet                          5
cabaret                         4
concert_hall; philharmonic      4
opera_house; opera              3
stage                           3
open_air                        2
Name: count, dtype: Int64


**website**

In [76]:
import re
import pandas as pd
from urllib.parse import urlparse, urlunparse

# match http(s)://... OR bare domain like example.com/path
URL_TOKEN_RE = re.compile(
    r"(?:(?:https?://)?(?:[A-Z0-9.-]+\.[A-Z]{2,})(?:/[^\s]*)?)",
    re.I
)

def simple_website(s):
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return pd.NA
    s = str(s).strip()
    if not s:
        return pd.NA

    m = URL_TOKEN_RE.search(s)
    if not m:
        return pd.NA

    u = m.group(0)

    # add scheme if missing
    if not re.match(r"^[a-z]+://", u, re.I):
        u = "https://" + u

    try:
        p = urlparse(u)
        if not p.hostname:
            return pd.NA
        host = p.hostname.lower()
        netloc = host if not p.port else f"{host}:{p.port}"
        # keep path as-is; drop fragment
        return urlunparse((p.scheme.lower(), netloc, p.path or "/", p.params, p.query, ""))
    except Exception:
        return pd.NA

final_db_ready["website"] = (
    final_db_ready["website"]
    .map(simple_website)
    .astype("string")
    .str.slice(0, 255)
)


**phone**

In [77]:
# final phone normalized

import pandas as pd
import re

# --- helpers (no city assumptions) ---
_EXT = re.compile(r"(?:\bext\.?\s*|\bx\s*|\bdurchwahl\s*|#)\s*(\d{1,6})$", re.IGNORECASE)

def _clean_raw(s):
    if s is None or (isinstance(s, float) and pd.isna(s)): return pd.NA
    s = str(s).strip()
    s = re.sub(r"\s+", " ", s)
    return pd.NA if s.lower() in {"", "nan", "none"} else s

def _strip_ext(s):  # remove visible extension part; keep main number only
    if pd.isna(s): return s
    m = _EXT.search(s)
    if m: 
        return s[:m.start()].strip()
    hy = re.search(r"(.*?)(?:-|–|—)\s*(\d{1,6})$", s)
    if hy and re.search(r"\d", hy.group(1)):
        return hy.group(1).strip()
    return s

def _to_e164_strict(main: str):
    """Normalize if +, 00, or German trunk 0; else return None."""
    if not main: return None
    s = re.sub(r"[^\d+]", "", main)

    # 00… -> +…
    if s.startswith("00"):
        s = "+" + s[2:]

    if s.startswith("+"):
        s = "+" + re.sub(r"\D", "", s[1:])
        return s if len(s) >= 8 else None

    # German trunk (0…) -> +49…
    if s.startswith("0"):
        s = "+49" + re.sub(r"\D", "", s[1:])
        return s if len(s) >= 8 else None

    return None  # bare local/unknown → don't guess


# clean raw
raw = final_db_ready["phone"].map(_clean_raw)

# try normalize to E.164 (first piece only if multiple are present)
first_piece = raw.str.split(r"[;/,|]+").str[0].map(_clean_raw).map(_strip_ext)
normalized = first_piece.map(_to_e164_strict)

# final phone: prefer normalized; else keep cleaned raw
phone_final = normalized.fillna(first_piece).astype("string")

# clip to DB size and assign back
final_db_ready["phone"] = phone_final.str.slice(0, 32)

#after sanity check bad_phone_format_loose = 115 (set obviously bad → NULL)
final_db_ready["phone"] = (
    final_db_ready["phone"]
    .where(final_db_ready["phone"].str.match(r"^\+?\d[\d \-()/]{4,}$", na=True))
    .astype("string")
)


**email**

In [78]:
import re
import pandas as pd

EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,63}")

def simple_email(s):
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return pd.NA
    s = str(s).strip()
    if s.lower().startswith("mailto:"):
        s = s[7:].strip()
    m = EMAIL_RE.search(s)            # find first email anywhere
    return m.group(0).lower() if m else pd.NA

final_db_ready["email"] = (
    final_db_ready["email"].map(simple_email).astype("string").str.slice(0, 254)
)


**wikipedia and wikidata**

In [79]:
#removes the  wikipedia / wikidata columns from  DB table.
final_db_ready.drop(columns=[c for c in ["wikipedia","wikidata"] if c in final_db_ready.columns],
                    inplace=True, errors="ignore")

**addr_street, addr_housenumber,addr_postcode, addr_city, addr_country, addr_full**

In [80]:
final_db_ready['addr_city'] = final_db_ready['addr_city'].fillna('Berlin')
final_db_ready['addr_country'] = final_db_ready['addr_country'].fillna('DE')
for c in ["addr_housenumber","addr_street","addr_postcode"]:
    final_db_ready[c] = (
        final_db_ready[c]
        .replace({"unknown": pd.NA, "Unknown": pd.NA, "": pd.NA})
        .astype("string")
    )
# keep only valid 5-digit PLZ or NULL
final_db_ready["addr_postcode"] = final_db_ready["addr_postcode"].where(
    final_db_ready["addr_postcode"].str.fullmatch(r"\d{5}"), pd.NA
)
final_db_ready["addr_full"] = final_db_ready["addr_full"].astype("string").str.slice(0, 255)


**lon_num, lat_num and geometry**

In [81]:
final_db_ready['latitude'] = pd.to_numeric(final_db_ready['lat_num'], errors='coerce')
final_db_ready['longitude'] = pd.to_numeric(final_db_ready['lon_num'], errors='coerce')


In [82]:
cols_to_drop = [
    "source",
    "multi_branch",
    "wheelchair_accessible",
    "match_by_name",
    "match_by_geo",
]

# drop in place; ignore missing
final_db_ready.drop(columns=[c for c in cols_to_drop if c in final_db_ready.columns],
                    inplace=True, errors="ignore")

## theater_id

stable, deterministic theater_id by hashing a normalized fingerprint of key fields:

name_key (normalized)

amenity (e.g., theater/cinema)

lat, lon (rounded for stability)

We’ll use a namespaced UUIDv5 (deterministic, portable) and keep it short: thb_ + first 12 hex chars → 16-char ID that fits your DB example.

In [83]:
import pandas as pd
import uuid

# --- Fixed namespace (DO NOT CHANGE once IDs are in use) ---
THEATER_NS = uuid.UUID("7e3f7a02-9a8e-4b7d-9c5c-5f9d9d2f2a11")

def _to_str(v):   return "" if pd.isna(v) else str(v)
def _to_coord(v): return "NA" if pd.isna(v) else f"{float(v):.6f}"

def _payload(row):
    # deterministic payload from normalized fields
    return "|".join([
        _to_str(row["name_key"]),
        _to_str(row["amenity"]).lower(),
        _to_coord(row["lat_num"]),
        _to_coord(row["lon_num"]),
    ])

# 1) Base IDs from core payload
base_payload = final_db_ready.apply(_payload, axis=1)
final_db_ready["theater_id"] = base_payload.map(
    lambda s: "thb_" + uuid.uuid5(THEATER_NS, s).hex[:12]
).astype("string")

# 2) Re-seed ONLY duplicates with a stable salt (row index)
dups = final_db_ready["theater_id"].duplicated(keep=False)
if dups.any():
    salted = (base_payload[dups] + "|@" + final_db_ready.index[dups].astype(str))
    final_db_ready.loc[dups, "theater_id"] = salted.map(
        lambda s: "thb_" + uuid.uuid5(THEATER_NS, s).hex[:12]
    ).astype("string")

# 3) If any (ultra-rare) remain duplicated, widen to 16 hex for those rows
still = final_db_ready["theater_id"].duplicated(keep=False)
if still.any():
    salted2 = (base_payload[still] + "|@" + final_db_ready.index[still].astype(str))
    final_db_ready.loc[still, "theater_id"] = salted2.map(
        lambda s: "thb_" + uuid.uuid5(THEATER_NS, s).hex[:16]  # total len 20 incl. prefix
    ).astype("string")

# Final assert for safety
assert not final_db_ready["theater_id"].duplicated().any(), "theater_id still duplicated"


**last_updated**

In [84]:
from datetime import datetime, timezone
import pandas as pd

# Europe/Berlin time as ISO 8601 (e.g., 2025-10-10T12:34:56+02:00)
last_updated_iso = pd.Timestamp.now(tz="Europe/Berlin").isoformat(timespec="seconds")

final_db_ready["last_updated"] = last_updated_iso  # same value for all rows
final_db_ready["last_updated"] = final_db_ready["last_updated"].astype("string")


**geometry**

In [85]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# --- Ensure GeoDataFrame + geometry column ---
if not isinstance(final_db_ready, gpd.GeoDataFrame):
    final_db_ready = gpd.GeoDataFrame(final_db_ready, geometry="geometry", crs="EPSG:4326")

# --- Fix CRS to EPSG:4326 (WGS84) ---
if final_db_ready.crs is None:
    final_db_ready = final_db_ready.set_crs(4326)
elif final_db_ready.crs.to_epsg() != 4326:
    final_db_ready = final_db_ready.to_crs(4326)

# --- Convert non-Point geoms to centroids in projected CRS, then back ---
mask_non_point = ~final_db_ready.geometry.geom_type.eq("Point")
if mask_non_point.any():
    tmp = final_db_ready.loc[mask_non_point].to_crs(3857).copy()
    tmp["geometry"] = tmp.geometry.buffer(0)          # fix invalids (no-op if valid)
    cent = tmp.geometry.centroid
    cent4326 = gpd.GeoSeries(cent, crs=3857).to_crs(4326)
    final_db_ready.loc[mask_non_point, "geometry"] = cent4326.values

# --- Create geometry where missing but lat/lon exist ---
has_coords = final_db_ready["lat_num"].notna() & final_db_ready["lon_num"].notna()
need_geom  = final_db_ready["geometry"].isna() & has_coords
if need_geom.any():
    final_db_ready.loc[need_geom, "geometry"] = [
        Point(xy) for xy in zip(final_db_ready.loc[need_geom, "lon_num"],
                                 final_db_ready.loc[need_geom, "lat_num"])
    ]

# --- Sync lat/lon FROM geometry (geometry is source of truth), rounded ---
final_db_ready["lon_num"] = pd.to_numeric(final_db_ready.geometry.x, errors="coerce").round(6)
final_db_ready["lat_num"] = pd.to_numeric(final_db_ready.geometry.y, errors="coerce").round(6)

# --- Optional: drop geometry if  won’t use spatial SQL ---
# final_db_ready.drop(columns=["geometry"], inplace=True, errors="ignore")


In [86]:
final_db_ready.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 282 entries, 0 to 281
Data columns (total 29 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   geometry          282 non-null    geometry
 1   name              282 non-null    string  
 2   name_key          282 non-null    string  
 3   amenity           282 non-null    string  
 4   operator          282 non-null    string  
 5   opening_hours     282 non-null    string  
 6   wheelchair        210 non-null    string  
 7   screen            36 non-null     float64 
 8   website           236 non-null    string  
 9   phone             166 non-null    string  
 10  email             80 non-null     string  
 11  addr_housenumber  218 non-null    string  
 12  addr_street       219 non-null    string  
 13  addr_postcode     215 non-null    string  
 14  addr_city         282 non-null    object  
 15  addr_country      282 non-null    object  
 16  addr_full         282 n

# PRE-DB SANITY CHECK

In [87]:
import re
import pandas as pd

def run_sanity_checks(final_db_ready, keep_geometry=True):
    issues = {}

    # --- core id/coords ---
    issues["theater_id_null"] = int(final_db_ready["theater_id"].isna().sum())
    issues["theater_id_dups"] = int(final_db_ready["theater_id"].duplicated().sum())
    issues["lat_null"] = int(final_db_ready["lat_num"].isna().sum())
    issues["lon_null"] = int(final_db_ready["lon_num"].isna().sum())

    # coords roughly within Berlin bbox (report only)
    bbox = (
        final_db_ready["lon_num"].between(13.06, 13.77)
        & final_db_ready["lat_num"].between(52.33, 52.68)
    )
    issues["coords_outside_berlin_bbox"] = int((~bbox).sum())

    # --- address formats ---
    if "addr_postcode" in final_db_ready.columns:
        bad_plz = final_db_ready["addr_postcode"].notna() & ~final_db_ready["addr_postcode"].str.fullmatch(r"\d{5}")
        issues["bad_postcode_format"] = int(bad_plz.sum())

    # --- contact (display-level) ---
    if "website" in final_db_ready.columns:
        # must have a scheme (http/https/...), otherwise treat as bad
        bad_url = final_db_ready["website"].notna() & ~final_db_ready["website"].str.match(r"^[a-z]+://", na=True)
        issues["bad_website_scheme"] = int(bad_url.sum())

    if "email" in final_db_ready.columns:
        EMAIL_TOKEN = re.compile(r"[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,63}", re.I)
        bad_email = final_db_ready["email"].notna() & ~final_db_ready["email"].str.contains(EMAIL_TOKEN, na=True)
        issues["bad_email_format"] = int(bad_email.sum())

    if "phone" in final_db_ready.columns:
        # loose shape: +E.164 or basic digits with separators
        bad_phone = final_db_ready["phone"].notna() & ~final_db_ready["phone"].str.match(r"^\+?\d[\d \-()/]{4,}$")
        issues["bad_phone_format_loose"] = int(bad_phone.sum())

    # --- geometry (optional) ---
    if keep_geometry and "geometry" in final_db_ready.columns:
        try:
            import geopandas as gpd  # noqa
            is_point = final_db_ready.geometry.geom_type.eq("Point")
            issues["non_point_geoms"] = int((~is_point).sum())
            issues["crs_is_4326"] = bool(getattr(final_db_ready, "crs", None) and final_db_ready.crs.to_epsg() == 4326)
        except Exception as e:
            issues["geometry_check_error"] = str(e)

    # --- NOT NULL expectations on core text fields ---
    for col in ["name", "amenity", "addr_full", "district_id", "district"]:
        if col in final_db_ready.columns:
            issues[f"{col}_null"] = int(final_db_ready[col].isna().sum())

    # print only non-zero / truthy issues
    printable = {k: v for k, v in issues.items() if bool(v)}
    print("Sanity check summary:", printable)

    # hard stops
    critical = []
    if issues.get("theater_id_null", 0) > 0: critical.append("theater_id has NULLs")
    if issues.get("theater_id_dups", 0) > 0: critical.append("theater_id has duplicates")
    if issues.get("lat_null", 0) > 0 or issues.get("lon_null", 0) > 0: critical.append("lat/lon missing")

    if critical:
        raise ValueError("DB insert blocked: " + "; ".join(critical))

# ---- run it ----
run_sanity_checks(final_db_ready, keep_geometry=True)  # set False if you dropped geometry


Sanity check summary: {'crs_is_4326': True}


In [88]:
final_db_ready = final_db_ready.rename(columns={"amenity": "place_type"})
final_db_ready = final_db_ready.drop(columns=["lat_num","lon_num","neighborhood"], errors="ignore")


# 🎉 Final column order (nice for exports)

In [89]:
cols_order = [
    "theater_id","name","name_key","place_type",
    "operator","opening_hours","wheelchair","screen",
    "website","phone","email",
    "addr_full","addr_street","addr_housenumber","addr_postcode","addr_city","addr_country",
    "theatre_tags","theatre_category",
    "district_id","district","neighborhood_id",
    "longitude","latitude","last_updated"
]
# keep any existing columns not listed (e.g., geometry if kept)
cols_order = [c for c in cols_order if c in final_db_ready.columns] + \
             [c for c in final_db_ready.columns if c not in cols_order]
final_db_ready = final_db_ready[cols_order]


In [90]:
final_db_ready.head(3)

Unnamed: 0,theater_id,name,name_key,place_type,operator,opening_hours,wheelchair,screen,website,phone,...,addr_country,theatre_tags,theatre_category,district_id,district,neighborhood_id,longitude,latitude,last_updated,geometry
0,thb_1fed95705f16,Filmrauschpalast,filmrauschpalast,cinema,Filmrausch Moabit e.V.,,no,1.0,https://www.filmrausch.de/,49303844344,...,DE,,,11001001,Mitte,102,13.359623,52.534378,2025-10-12T23:39:15+02:00,POINT (13.35962 52.53438)
1,thb_957cf1213125,Friedrichstadt-Palast,friedrichstadt-palast,theatre,,,yes,,https://www.palast.berlin/,493023262326,...,DE,,,11001001,Mitte,101,13.388879,52.523922,2025-10-12T23:39:15+02:00,POINT (13.38888 52.52392)
2,thb_e0da8d4b4612,Quatsch Comedy Club,quatsch-comedy-club,theatre,,,limited,,https://www.quatsch-comedy-club.de/,493027879030,...,DE,stand_up_comedy,stand_up_comedy,11001001,Mitte,101,13.388621,52.523624,2025-10-12T23:39:15+02:00,POINT (13.38862 52.52362)


In [91]:
final_db_ready.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 282 entries, 0 to 281
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   theater_id        282 non-null    string  
 1   name              282 non-null    string  
 2   name_key          282 non-null    string  
 3   place_type        282 non-null    string  
 4   operator          282 non-null    string  
 5   opening_hours     282 non-null    string  
 6   wheelchair        210 non-null    string  
 7   screen            36 non-null     float64 
 8   website           236 non-null    string  
 9   phone             166 non-null    string  
 10  email             80 non-null     string  
 11  addr_full         282 non-null    string  
 12  addr_street       219 non-null    string  
 13  addr_housenumber  218 non-null    string  
 14  addr_postcode     215 non-null    string  
 15  addr_city         282 non-null    object  
 16  addr_country      282 n

## ✅ Save clean outputs theaters_berlin_db_ready

In [92]:
import os
import geopandas as gpd
from pathlib import Path

# ------------------------------------------------------
# 📁 1. Define output directory and file names
# ------------------------------------------------------
OUT = Path("/Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source")
OUT.mkdir(parents=True, exist_ok=True)

csv_path = OUT / "theaters_berlin_db_ready.csv"
geojson_path = OUT / "theaters_berlin_db_ready.geojson"

# ------------------------------------------------------
# 🧭 2. Ensure final_db_ready is a GeoDataFrame
# ------------------------------------------------------
if isinstance(final_db_ready, gpd.GeoDataFrame):
    gdf = final_db_ready
elif "geometry" in final_db_ready.columns:
    gdf = gpd.GeoDataFrame(final_db_ready, geometry="geometry", crs=getattr(final_db_ready, "crs", "EPSG:4326"))
else:
    gdf = None
    print("⚠️ Warning: No geometry column found — only CSV export will be created.")

# ------------------------------------------------------
# 💾 3. Save to CSV (tabular, without geometry)
# ------------------------------------------------------
cols_no_geom = [c for c in final_db_ready.columns if c not in {"geometry", "geom"}]
final_db_ready[cols_no_geom].to_csv(csv_path, index=False, encoding="utf-8")
print(f"✅ CSV saved to: {csv_path}")

# ------------------------------------------------------
# 🌍 4. Save to GeoJSON (for GIS / visualization)
# ------------------------------------------------------
if gdf is not None:
    if gdf.crs is None:
        gdf = gdf.set_crs(4326)
    else:
        gdf = gdf.to_crs(4326)

    gdf.to_file(geojson_path, driver="GeoJSON", encoding="utf-8")
    print(f"✅ GeoJSON saved to: {geojson_path}")
else:
    print("ℹ️ GeoJSON export skipped (no geometry).")

# ------------------------------------------------------
# 📦 5. Report file sizes and dataset info
# ------------------------------------------------------
try:
    print(f"📦 CSV size: {os.path.getsize(csv_path)/1024:.1f} KB")
    if gdf is not None and geojson_path.exists():
        print(f"📦 GeoJSON size: {os.path.getsize(geojson_path)/1024:.1f} KB")
except OSError as e:
    print(f"⚠️ Could not get file sizes: {e}")

# ------------------------------------------------------
# 🧾 6. Summary info
# ------------------------------------------------------
print("\n🧾 Saved dataset summary:")
print(f"Rows: {len(final_db_ready)}")
print(f"Columns: {len(final_db_ready.columns)}")
print(f"CRS: {getattr(gdf, 'crs', 'None')}")


✅ CSV saved to: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/theaters_berlin_db_ready.csv
✅ GeoJSON saved to: /Users/mariannagokova/Documents/DA_PROJECT/Thetres_Berlin_Task/clear_ready_git/source/theaters_berlin_db_ready.geojson
📦 CSV size: 77.8 KB
📦 GeoJSON size: 226.9 KB

🧾 Saved dataset summary:
Rows: 282
Columns: 26
CRS: epsg:4326


# 🎭 SQL Table: theatres_berlin
 Column-by-Column Explanation with Example Data

| **Column**         | **Description**                                                               | **Example Value**                             |
| ------------------ | ----------------------------------------------------------------------------- | --------------------------------------------- |
| `theater_id`       | Unique stable ID for each venue ( hash or UUID).                              | `thb_1fed95705f16`                        |
| `name`             | Official name of the venue.                                                   | `Volksbühne Berlin`                           |
| `name_key`         | Normalized lowercase key version of the name (used for joins/matching).       | `volksbuehne_berlin`                          |
| `place_type`       | Venue type category.                                                          | `theatre` / `cinema`                          |
| `operator`         | Organization or company running the venue.                                    | `Stiftung Volksbühne am Rosa-Luxemburg-Platz` |
| `opening_hours`    | Opening hours in OSM format.                                                  | `Tu-Su 11:00-20:00`                           |
| `wheelchair`       | Accessibility info.                                                           | `yes`                                         |
| `screen`           | Number of cinema screens (if applicable).                                     | `3`                                           |
| `website`          | Official website URL.                                                         | `https://www.volksbuehne.berlin/`             |
| `phone`            | Contact phone number.                                                         | `+49 30 24065 777`                            |
| `email`            | Contact email.                                                                | `info@volksbuehne-berlin.de`                  |
| `addr_full`        | Complete formatted address.                                                   | `Rosa-Luxemburg-Platz, 10178 Berlin, DE`      |
| `addr_street`      | Street name only.                                                             | `Rosa-Luxemburg-Platz`                        |
| `addr_housenumber` | House number.                                                                 | `1`                                           |
| `addr_postcode`    | Postal code.                                                                  | `10178`                                       |
| `addr_city`        | City name.                                                                    | `Berlin`                                      |
| `addr_country`     | Country name.                                                                 | `Germany`                                     |
| `theatre_tags`     | Raw source tags (from OSM or Wikidata, possibly JSON or semicolon-separated). | `amenity=theatre;wikidata=Q2451973`           |
| `theatre_category` | Simplified thematic classification.                                           | `Performing Arts Theatre`                     |
| `district_id`      | LOR district numeric code.                                                    | `11001001`                                          |
| `district`         | District name (LOR "Bezirk").                                                 | `Mitte`                                       |
| `neighborhood_id`  | LOR neighborhood (Ortsteil) code.                                             | `0102`                                        |
| `longitude`        | Longitude (EPSG:4326).                                                        | `13.41053`                                    |
| `latitude`         | Latitude (EPSG:4326).                                                         | `52.52854`                                    |
| `last_updated`     | Timestamp of last data update or import.                                      | `2025-10-09 18:45:00`                         |


# Recommended DB layout (PostgreSQL + PostGIS)

In [93]:
"""
CREATE TABLE theatres_berlin (
    theater_id         VARCHAR(64) PRIMARY KEY,         -- Unique stable ID (e.g., hash or UUID)
    name               VARCHAR(255) NOT NULL,           -- Official name of the theatre or cinema
    name_key           VARCHAR(255),                    -- Normalized lowercase name (slug, key)
    place_type         VARCHAR(50),                     -- Type: "cinema", "theatre", etc.
    operator           VARCHAR(255),                    -- Organization or company operating it
    opening_hours      VARCHAR(255),                    -- OSM-style hours string
    wheelchair         VARCHAR(50),                     -- Accessibility info: "yes", "no", "limited"
    screen             INTEGER,                         -- Number of screens (for cinemas)
    website            VARCHAR(255),                    -- Official website URL
    phone              VARCHAR(100),                    -- Contact phone number
    email              VARCHAR(255),                    -- Contact email address
    addr_full          VARCHAR(255),                    -- Full formatted address
    addr_street        VARCHAR(255),                    -- Street name
    addr_housenumber   VARCHAR(50),                     -- House or building number
    addr_postcode      VARCHAR(20),                     -- Postal code
    addr_city          VARCHAR(100),                    -- City (usually "Berlin")
    addr_country       VARCHAR(100),                    -- Country (usually "Germany")
    theatre_tags       TEXT,                            -- Raw tags or classification info from OSM/Wikidata
    theatre_category   VARCHAR(100),                    -- Derived label: e.g., "performing arts", "independent cinema"
    district_id        VARCHAR(10),                     -- LOR district code
    district           VARCHAR(100),                    -- LOR district name
    neighborhood_id    VARCHAR(10),                     -- LOR neighborhood (Ortsteil) code
    longitude          DECIMAL(9,6),                    -- WGS84 coordinate (lon)
    latitude           DECIMAL(9,6),                    -- WGS84 coordinate (lat)
    last_updated       TIMESTAMP DEFAULT CURRENT_TIMESTAMP  -- Timestamp when data last updated
    CONSTRAINT district_id_fk FOREIGN KEY (district_id)
        REFERENCES berlin_data.districts(district_id)
        ON DELETE RESTRICT
        ON UPDATE CASCADE
);
"""

'\nCREATE TABLE theatres_berlin (\n    theater_id         VARCHAR(64) PRIMARY KEY,         -- Unique stable ID (e.g., hash or UUID)\n    name               VARCHAR(255) NOT NULL,           -- Official name of the theatre or cinema\n    name_key           VARCHAR(255),                    -- Normalized lowercase name (slug, key)\n    place_type         VARCHAR(50),                     -- Type: "cinema", "theatre", etc.\n    operator           VARCHAR(255),                    -- Organization or company operating it\n    opening_hours      VARCHAR(255),                    -- OSM-style hours string\n    wheelchair         VARCHAR(50),                     -- Accessibility info: "yes", "no", "limited"\n    screen             INTEGER,                         -- Number of screens (for cinemas)\n    website            VARCHAR(255),                    -- Official website URL\n    phone              VARCHAR(100),                    -- Contact phone number\n    email              VARCHAR(255),     