In [5]:
import geopandas as gpd
import pandas as pd
from osmnx.features import features_from_polygon
from shapely.geometry import Polygon
import time

# Load Ortsteil polygons
gdf_boundary = gpd.read_file("../data/spatial_data/lor_ortsteile.geojson")

# Define tags for POIs
tags = {
    "shop": ["supermarket"],
    "amenity": [
        "cafe", "restaurant", "bar", "nightclub",
        "school", "kindergarten", "university",
        "clinic", "hospital", "pharmacy"
    ],
    "leisure": ["park", "playground"],
    "landuse": ["grass", "forest"],
    "natural": ["wood"]
}

# Empty lists to collect data
pois_list = []
failed_ortsteile = []

# Loop through Ortsteile
for i, row in gdf_boundary.iterrows():
    ortsteil_name = row['name'] if 'name' in row else f"ortsteil_{i}"
    polygon = row['geometry']
    
    try:
        print(f"🔎 Fetching POIs for: {ortsteil_name}")
        pois = features_from_polygon(polygon, tags)
        pois["ortsteil"] = ortsteil_name
        pois_list.append(pois)
        time.sleep(2)  # polite pause between requests
    except Exception as e:
        print(f"❌ Failed for {ortsteil_name}: {e}")
        failed_ortsteile.append(ortsteil_name)

# Save failed list (optional)
with open("failed_ortsteile.txt", "w") as f:
    for item in failed_ortsteile:
        f.write(f"{item}\n")

# Combine successful POIs
gdf_pois = gpd.GeoDataFrame(pd.concat(pois_list, ignore_index=True))

# Save to file
gdf_pois.to_file("berlin_pois.geojson", driver="GeoJSON")

print(f"\n✅ Saved {len(gdf_pois)} POIs to 'berlin_pois.geojson'")
print(f"⚠️  {len(failed_ortsteile)} Ortsteile failed and were saved to 'failed_ortsteile.txt'")

🔎 Fetching POIs for: ortsteil_0
🔎 Fetching POIs for: ortsteil_1
🔎 Fetching POIs for: ortsteil_2
🔎 Fetching POIs for: ortsteil_3
🔎 Fetching POIs for: ortsteil_4
🔎 Fetching POIs for: ortsteil_5
🔎 Fetching POIs for: ortsteil_6
🔎 Fetching POIs for: ortsteil_7
🔎 Fetching POIs for: ortsteil_8
🔎 Fetching POIs for: ortsteil_9
🔎 Fetching POIs for: ortsteil_10
🔎 Fetching POIs for: ortsteil_11
🔎 Fetching POIs for: ortsteil_12
🔎 Fetching POIs for: ortsteil_13
🔎 Fetching POIs for: ortsteil_14
🔎 Fetching POIs for: ortsteil_15
🔎 Fetching POIs for: ortsteil_16
🔎 Fetching POIs for: ortsteil_17
🔎 Fetching POIs for: ortsteil_18
🔎 Fetching POIs for: ortsteil_19
🔎 Fetching POIs for: ortsteil_20
❌ Failed for ortsteil_20: HTTPSConnectionPool(host='overpass-api.de', port=443): Read timed out. (read timeout=180)
🔎 Fetching POIs for: ortsteil_21
🔎 Fetching POIs for: ortsteil_22
🔎 Fetching POIs for: ortsteil_23
🔎 Fetching POIs for: ortsteil_24
🔎 Fetching POIs for: ortsteil_25
🔎 Fetching POIs for: ortsteil_26
🔎 F

In [7]:
import geopandas as gpd
from osmnx.features import features_from_polygon
import pandas as pd

# Load Ortsteil polygons
gdf_boundary = gpd.read_file("../data/spatial_data/lor_ortsteile.geojson")

# Index of the failed Ortsteil
index = 20

# Extract geometry and name
polygon = gdf_boundary.iloc[index].geometry
ortsteil_name = gdf_boundary.iloc[index]["OTEIL"]

# Define OSM tags
tags = {
    "amenity": ["restaurant", "bar", "cafe", "pub", "fast_food", "nightclub", 
                "school", "kindergarten", "university", "clinic", "hospital", "pharmacy"],
    "shop": ["supermarket"],
    "leisure": ["park", "playground"],
    "landuse": ["grass", "forest"],
    "natural": ["wood"]
}

# Retry fetching POIs
try:
    print(f"🔁 Retrying POIs for: {ortsteil_name}")
    pois = features_from_polygon(polygon, tags)
    pois["ortsteil"] = ortsteil_name
    pois["ortsteil_index"] = index
    pois.to_file("berlin_pois_ortsteil_20.geojson", driver="GeoJSON")
    print("✅ Successfully saved retry POIs.")
except Exception as e:
    print(f"❌ Retry failed: {e}")

🔁 Retrying POIs for: Wilhelmsruh
✅ Successfully saved retry POIs.


In [5]:
print(gdf_boundary.columns)

Index(['gml_id', 'spatial_name', 'spatial_alias', 'spatial_type', 'OTEIL',
       'BEZIRK', 'FLAECHE_HA', 'geometry'],
      dtype='object')


In [8]:
# Load existing Berlin POIs
gdf_all_pois = gpd.read_file("berlin_pois.geojson")

# Load the retry POIs
gdf_retry = gpd.read_file("berlin_pois_ortsteil_20.geojson")

# Concatenate
gdf_pois_combined = pd.concat([gdf_all_pois, gdf_retry], ignore_index=True)

# Optional: Save updated full POI file
gdf_pois_combined.to_file("berlin_pois_full.geojson", driver="GeoJSON")

Skipping field contact:phone:description: unsupported OGR type: 10
Skipping field opening_hours:checkin: unsupported OGR type: 10


In [9]:
# Flatten tag values into a new column
gdf_pois_combined["main_tag"] = gdf_pois_combined.apply(
    lambda row: next((k for k in tags.keys() if k in row and pd.notnull(row[k])), None), axis=1
)

gdf_pois_combined["tag_value"] = gdf_pois_combined.apply(
    lambda row: row[row["main_tag"]] if pd.notnull(row["main_tag"]) else None, axis=1
)

# Group by Ortsteil and tag value
poi_counts = gdf_pois_combined.groupby(["ortsteil", "tag_value"]).size().unstack(fill_value=0).reset_index()

# Optional: Save as CSV
poi_counts.to_csv("berlin_poi_counts_per_ortsteil.csv", index=False)

  super().__setitem__(key, value)
  super().__setitem__(key, value)


In [10]:
# Load the aggregated POI counts
df_poi_counts = pd.read_csv("berlin_poi_counts_per_ortsteil.csv")

# Rename tag columns
df_poi_counts.rename(columns={
    "supermarket": "subdistrict_num_supermarkets",
    "cafe": "subdistrict_num_cafes",
    "restaurant": "subdistrict_num_restaurants",
    "bar": "subdistrict_num_bars",
    "nightclub": "subdistrict_num_nightclubs",
    "park": "subdistrict_num_parks",
    "playground": "subdistrict_num_playgrounds",
    "grass": "subdistrict_num_grass_areas",
    "forest": "subdistrict_num_forests",
    "wood": "subdistrict_num_wood_areas",
    "school": "subdistrict_num_schools",
    "kindergarten": "subdistrict_num_kindergartens",
    "university": "subdistrict_num_universities",
    "clinic": "subdistrict_num_clinics",
    "hospital": "subdistrict_num_hospitals",
    "pharmacy": "subdistrict_num_pharmacies"
}, inplace=True)

# Optional: save clean version
df_poi_counts.to_csv("berlin_poi_counts_cleaned.csv", index=False)

In [14]:
# Load your master table
df_master = pd.read_csv("../data/master_tables/berlin_ortsteil_master_table.csv")

# Merge on Ortsteil name
df_final = df_master.merge(df_poi_counts, how="left", on="ortsteil")

# Fill missing POI values with 0
df_final.fillna(0, inplace=True)

# Save the final enriched dataset
df_final.to_csv("berlin_ortsteil_master_with_poi_features.csv", index=False)

In [13]:
print(df_master.columns.tolist())

['bezirk', 'ortsteil', 'subdistrict_population_age_0_5', 'subdistrict_population_age_5_10', 'subdistrict_population_age_10_15', 'subdistrict_population_age_15_20', 'subdistrict_population_age_20_25', 'subdistrict_population_age_25_30', 'subdistrict_population_age_30_35', 'subdistrict_population_age_35_40', 'subdistrict_population_age_40_45', 'subdistrict_population_age_45_50', 'subdistrict_population_age_50_55', 'subdistrict_population_age_55_60', 'subdistrict_population_age_60_65', 'subdistrict_population_age_65_70', 'subdistrict_population_age_70_75', 'subdistrict_population_age_75_80', 'subdistrict_population_age_80_85', 'subdistrict_population_age_85_90', 'subdistrict_population_age_90_95', 'subdistrict_population_age_95_plus', 'subdistrict_male_population', 'subdistrict_female_population', 'total_population', 'subdistrict_senior_population', 'subdistrict_youth_population', 'subdistrict_senior_share', 'subdistrict_youth_share', 'subdistrict_middle_age_population', 'subdistrict_avg_