# Scrape Enrichment Data using OpenStreetMap

In [None]:
import geopandas as gpd
import pandas as pd
from osmnx.features import features_from_polygon
from shapely.geometry import Polygon
import time

# Load Ortsteil polygons
gdf_boundary = gpd.read_file("../data/spatial_data/lor_ortsteile.geojson")

# OSMnx expects the query polygon in unprojected lat/lon (EPSG:4326).
# Reproject only when the file declares a different CRS; a missing CRS is left untouched.
if gdf_boundary.crs is not None and gdf_boundary.crs.to_epsg() != 4326:
    gdf_boundary = gdf_boundary.to_crs(epsg=4326)

# Define tags for POIs (OSM key -> accepted values)
tags = {
    "shop": ["supermarket"],
    "amenity": [
        "cafe", "restaurant", "bar", "nightclub",
        "school", "kindergarten", "university",
        "clinic", "hospital", "pharmacy"
    ],
    "leisure": ["park", "playground"],
    "landuse": ["grass", "forest"],
    "natural": ["wood"]
}

# Empty lists to collect data
pois_list = []          # one GeoDataFrame per successfully queried Ortsteil
failed_ortsteile = []   # names of Ortsteile whose request raised

# Loop through Ortsteile
for _, row in gdf_boundary.iterrows():
    ortsteil_name = row["OTEIL"]  # or possibly row["spatial_name"] if you use that one
    polygon = row["geometry"]

    try:
        print(f"🔎 Fetching POIs for: {ortsteil_name}")
        pois = features_from_polygon(polygon, tags)
        pois["ortsteil"] = ortsteil_name
        pois_list.append(pois)
    except Exception as e:
        # Deliberately broad: one failing Ortsteil (timeout, empty response, ...)
        # must not abort the whole scrape; it is recorded and reported instead.
        print(f"❌ Failed for {ortsteil_name}: {e}")
        failed_ortsteile.append(ortsteil_name)
    finally:
        # Polite pause between requests, also after a failure so that a run of
        # errors does not hit the API back-to-back.
        time.sleep(2)

# Save failed list (optional); UTF-8 because the names contain umlauts / ß
with open("failed_ortsteile.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{item}\n" for item in failed_ortsteile)

# Fail with a clear message instead of pandas' "No objects to concatenate"
if not pois_list:
    raise ValueError(
        "No POIs were fetched for any Ortsteil; see 'failed_ortsteile.txt' for the failed names."
    )

# Combine successful POIs
gdf_pois = gpd.GeoDataFrame(pd.concat(pois_list, ignore_index=True))

# Save to file
gdf_pois.to_file("berlin_pois.geojson", driver="GeoJSON")

print(f"\n✅ Saved {len(gdf_pois)} POIs to 'berlin_pois.geojson'")
print(f"⚠️  {len(failed_ortsteile)} Ortsteile failed and were saved to 'failed_ortsteile.txt'")

🔎 Fetching POIs for: Mitte
🔎 Fetching POIs for: Moabit
🔎 Fetching POIs for: Hansaviertel
🔎 Fetching POIs for: Tiergarten
🔎 Fetching POIs for: Wedding
🔎 Fetching POIs for: Gesundbrunnen
🔎 Fetching POIs for: Friedrichshain
🔎 Fetching POIs for: Kreuzberg
🔎 Fetching POIs for: Prenzlauer Berg
🔎 Fetching POIs for: Weißensee
🔎 Fetching POIs for: Blankenburg
🔎 Fetching POIs for: Heinersdorf
🔎 Fetching POIs for: Karow
🔎 Fetching POIs for: Stadtrandsiedlung Malchow
🔎 Fetching POIs for: Pankow
🔎 Fetching POIs for: Blankenfelde
🔎 Fetching POIs for: Buch
🔎 Fetching POIs for: Französisch Buchholz
🔎 Fetching POIs for: Niederschönhausen
🔎 Fetching POIs for: Rosenthal
🔎 Fetching POIs for: Wilhelmsruh
🔎 Fetching POIs for: Charlottenburg
🔎 Fetching POIs for: Wilmersdorf
🔎 Fetching POIs for: Schmargendorf
🔎 Fetching POIs for: Grunewald
🔎 Fetching POIs for: Westend
🔎 Fetching POIs for: Charlottenburg-Nord
🔎 Fetching POIs for: Halensee
🔎 Fetching POIs for: Spandau
🔎 Fetching POIs for: Haselhorst
🔎 Fetching 

# Create a CSV File of POI Counts per Subdistrict

In [None]:
import geopandas as gpd
import pandas as pd
from osmnx.features import features_from_polygon
from shapely.geometry import Polygon
import time

# Load existing Berlin POIs
gdf_pois = gpd.read_file("berlin_pois.geojson")

# Define tags for POIs (OSM key -> accepted values).
# The key order matters below: the first key with a value wins.
tags = {
    "shop": ["supermarket"],
    "amenity": [
        "cafe", "restaurant", "bar", "nightclub",
        "school", "kindergarten", "university",
        "clinic", "hospital", "pharmacy"
    ],
    "leisure": ["park", "playground"],
    "landuse": ["grass", "forest"],
    "natural": ["wood"]
}

# Flatten tag values into new columns: for every POI take the first tag key
# (in `tags` order) that has a non-null value, and that key's value.
# NOTE(review): the value is not checked against the requested list, so a POI
# fetched as amenity=cafe that also carries shop=bakery is counted as "bakery".
# Later cells group such extra values, so this behavior is kept as-is.
tag_keys = [key for key in tags if key in gdf_pois.columns]
main_tag = pd.Series(None, index=gdf_pois.index, dtype="object")
tag_value = pd.Series(None, index=gdf_pois.index, dtype="object")
for key in tag_keys:
    # Rows that have no tag assigned yet and carry a value for this key
    unassigned = main_tag.isna() & gdf_pois[key].notna()
    main_tag.loc[unassigned] = key
    tag_value.loc[unassigned] = gdf_pois.loc[unassigned, key].to_numpy()

gdf_pois["main_tag"] = main_tag
gdf_pois["tag_value"] = tag_value

# Group by Ortsteil and tag value: one row per Ortsteil, one count column per value
poi_counts = (
    gdf_pois.groupby(["ortsteil", "tag_value"])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# Persist the table; the clean-up section reads it back from this CSV
poi_counts.to_csv("berlin_poi_counts_per_ortsteil.csv", index=False)

Skipping field contact:phone:description: unsupported OGR type: 10
Skipping field opening_hours:checkin: unsupported OGR type: 10


# Clean up extracted data

In [8]:
import pandas as pd

# Read the per-Ortsteil POI count table
df = pd.read_csv("berlin_poi_counts_per_ortsteil.csv")

# For each POI type, count the subdistricts that contain at least one such POI
has_poi = df.drop(columns="ortsteil").gt(0)
poi_presence = has_poi.sum().sort_values(ascending=False)
print(poi_presence)

playground          96
school              96
grass               96
park                96
restaurant          95
kindergarten        95
supermarket         91
cafe                91
wood                88
pharmacy            88
forest              80
bar                 61
clinic              57
hospital            35
university          30
nightclub           29
bakery              23
garden              19
deli                 9
coffee               9
convenience          5
confectionery        5
alcohol              4
community_centre     4
pastry               4
dog_park             3
kiosk                3
tea                  3
ice_cream            3
pitch                2
hairdresser          2
books                2
bathing_place        2
animal_training      1
antiques             1
wine                 1
vacant               1
art                  1
travel_agency        1
theatre              1
tea;art              1
bakery;pastry        1
stationery           1
second_hand

In [9]:
# Feature groups: new aggregate column -> raw POI count columns folded into it
grouped_columns = {
    "cafes": ["cafe", "coffee", "tea"],
    "bakeries": ["bakery", "pastry", "bakery;pastry"],
    "green_space": ["park", "forest", "meadow", "wood", "grass"],
    "schools": ["school", "kindergarten", "university"],
    "medical": ["clinic", "hospital", "pharmacy"]
}

# Aggregate groups
for new_col, member_cols in grouped_columns.items():
    # Only sum members that exist: which raw values appear depends on the
    # scraped data, and a missing one would otherwise raise KeyError.
    present = [col for col in member_cols if col in df.columns]
    total = df[present].sum(axis=1) if present else 0
    # Keep an aggregate from a previous run of this cell, so that re-running it
    # (when the members are already dropped) leaves the values unchanged.
    if new_col in df.columns:
        total = total + df[new_col]
    df[new_col] = total
    df = df.drop(columns=present)

In [12]:
# Save as CSV
# NOTE(review): this writes to the same path that the clean-up cell above loads
# with pd.read_csv, so the raw per-tag counts on disk are replaced by the
# grouped table held in `df`.
df.to_csv("berlin_poi_counts_per_ortsteil.csv", index=False)

# Merge with subdistrict master table

In [18]:
import pandas as pd
import unicodedata

def normalize_ortsteil(name):
    """Return a canonical join key for an Ortsteil name.

    The name is Unicode-normalized, lowercased, stripped of surrounding
    whitespace, and German umlauts / sharp s are transliterated
    (ä -> ae, ö -> oe, ü -> ue, ß -> ss).

    Parameters
    ----------
    name : str or None/NaN
        Raw Ortsteil name. Non-string, non-null values are converted with ``str``.

    Returns
    -------
    str
        The normalized name, or ``""`` when ``name`` is null.
    """
    if pd.isnull(name):
        return ""
    # Compose "o" + combining diaeresis into a single "ö" so the replacements match
    name = unicodedata.normalize("NFC", str(name))
    # Lowercase before replacing so that Ä/Ö/Ü/ẞ are transliterated as well
    name = name.lower().strip()
    for umlaut, ascii_form in (("ä", "ae"), ("ö", "oe"), ("ü", "ue"), ("ß", "ss")):
        name = name.replace(umlaut, ascii_form)
    return name

# Load datasets
df_master = pd.read_csv("../data/cleaned_data/berlin_ortsteil_table.csv")
df_poi = pd.read_csv("berlin_poi_counts_per_ortsteil.csv")

# Normalize ortsteil names in both datasets
df_master["ortsteil_norm"] = df_master["ortsteil"].apply(normalize_ortsteil)
df_poi["ortsteil_norm"] = df_poi["ortsteil"].apply(normalize_ortsteil)

# Report master rows without a POI match: the left merge keeps them and the
# fill below turns their POI features into 0, which would otherwise go unnoticed.
unmatched = df_master.loc[
    ~df_master["ortsteil_norm"].isin(df_poi["ortsteil_norm"]), "ortsteil"
]
if not unmatched.empty:
    print(
        f"⚠️  {len(unmatched)} Ortsteile have no POI match and get 0 counts: "
        f"{sorted(unmatched.astype(str).unique())}"
    )

# Merge on normalized names
df_merged = df_master.merge(df_poi, how="left", on="ortsteil_norm", suffixes=("", "_poi"))

# Duplicate normalized names in the POI table would multiply master rows
if len(df_merged) != len(df_master):
    print(
        f"⚠️  Row count changed from {len(df_master)} to {len(df_merged)} during the merge; "
        "check for duplicate Ortsteil names in the POI table."
    )

# Drop helper columns
df_merged = df_merged.drop(columns=["ortsteil_norm", "ortsteil_poi"])

# Fill POI NaNs with 0
poi_columns = df_poi.columns.difference(['ortsteil', 'ortsteil_norm'])
df_merged[poi_columns] = df_merged[poi_columns].fillna(0).astype(int)

# Save result
# NOTE(review): the district/subdistrict merge section reads this file from
# '../data/master_tables/'; confirm the file is moved there or align the paths.
df_merged.to_csv("berlin_ortsteil_master_with_poi_features.csv", index=False)
print("✅ Merge successful!")

✅ Merge successful!


# Merge District- and Subdistrict-Level Master Tables

In [2]:
import pandas as pd

# Load the subdistrict-level and district-level master tables
master_table_paths = (
    '../data/master_tables/berlin_ortsteil_master_with_poi_features.csv',
    '../data/master_tables/berlin_bezirk_master_table.csv',
)
df_sub, df_dist = (pd.read_csv(path) for path in master_table_paths)

In [4]:
# Attach the district-level columns to every subdistrict row (left join on "bezirk")
df_full = pd.merge(df_sub, df_dist, how="left", on="bezirk")

# Write the combined master table
df_full.to_csv("berlin_final_master_table.csv", index=False)