## 🖼️🧩 Step 1: Gallery Data Modeling

### Import Libraries

In [120]:
# Import Libraries
import osmnx as ox # to fetch data from OpenStreetMap
import geopandas as gpd # to work with geospatial data
import pandas as pd
import numpy as np
import re
from geopy.geocoders import Nominatim
from tqdm import tqdm

### Create the tag to pull galleries from tourism section

In [211]:
# OSM tag filter: request only features tagged tourism=gallery.
tags = dict(tourism="gallery")

### Fetch Berlin Geometries

In [212]:
# Query OpenStreetMap for every feature in "Berlin, Germany" that matches `tags`.
gallery_raw = ox.features_from_place(query="Berlin, Germany", tags=tags)


### Display basic info

In [213]:
# Report how many features were returned, then preview the first rows.
n_gallery_entries = len(gallery_raw)
print(f"Number of gallery entries fetched: {n_gallery_entries}")
gallery_raw.head()

Number of gallery entries fetched: 322


Unnamed: 0_level_0,Unnamed: 1_level_0,geometry,addr:city,addr:country,addr:housenumber,addr:postcode,addr:street,addr:suburb,contact:phone,contact:website,craft,...,heritage:operator,lda:criteria,ref:lda,roof:levels,roof:shape,building:use,colour,indoor,type,name:ko
element,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,301107444,POINT (13.57134 52.40975),Berlin,DE,6.0,12524.0,Richterstra√üe,Bohnsdorf,+49 30 6764261,http://achimkuehn.de/,sculptor,...,,,,,,,,,,
node,370766098,POINT (13.31363 52.50313),Berlin,DE,57.0,10629.0,Mommsenstra√üe,Charlottenburg,,http://www.galeriezandi.com/,,...,,,,,,,,,,
node,410135935,POINT (13.42749 52.49333),Berlin,DE,31.0,12047.0,B√ºrknerstra√üe,Neuk√∂lln,,,,...,,,,,,,,,,
node,410692505,POINT (13.43706 52.49291),,,,,,,,,,...,,,,,,,,,,
node,410745800,POINT (13.37533 52.55509),Berlin,DE,16.0,13357.0,Schwedenstra√üe,Gesundbrunnen,+491708089110,http://www.kronenboden.de/?page_id=10,,...,,,,,,,,,,


### Save the raw data to CSV and GeoJSON files

- Define file paths

In [215]:
# Output locations for the unmodified OSM extract: one tabular copy, one geospatial copy.
raw_csv_path, raw_geojson_path = (
    "../sources/csv_files/gallery_raw.csv",
    "../sources/geojson_files/gallery_raw.geojson",
)

- Save csv & geojson to correct folders in sources

In [216]:
# Persist the raw extract as CSV.
# The (element, id) MultiIndex holds the OSM identifiers of every feature, so it
# is written out as leading columns; index=False would silently discard them and
# leave the raw file with no way to trace a row back to its OSM object.
gallery_raw.to_csv(raw_csv_path)

In [217]:
# Write the same raw GeoDataFrame to disk using the GeoJSON driver.
gallery_raw.to_file(raw_geojson_path, driver="GeoJSON")

### Remove the columns with 75% or more missing data

In [127]:
# Share of missing values per column, expressed as a percentage (0-100).
missing_percent = gallery_raw.isna().mean().mul(100)

# Columns that are populated often enough: strictly less than 75% missing.
columns_to_keep = missing_percent.loc[missing_percent.lt(75)].index

# Column subset of the raw frame; `gallery_raw` itself is not modified.
gallery_filtered = gallery_raw[columns_to_keep]

# Sanity check: dimensions plus a preview of the reduced frame.
print(gallery_filtered.shape)
gallery_filtered.head()

(322, 13)


Unnamed: 0_level_0,Unnamed: 1_level_0,geometry,addr:city,addr:country,addr:housenumber,addr:postcode,addr:street,addr:suburb,name,tourism,fee,opening_hours,wheelchair,website
element,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
node,301107444,POINT (13.57134 52.40975),Berlin,DE,6.0,12524.0,Richterstra√üe,Bohnsdorf,Atelier Achim K√ºhn,gallery,,,,
node,370766098,POINT (13.31363 52.50313),Berlin,DE,57.0,10629.0,Mommsenstra√üe,Charlottenburg,Galerie Zandi,gallery,no,Mo-Fr 11:00-18:00; Sa 11:00-16:00,no,
node,410135935,POINT (13.42749 52.49333),Berlin,DE,31.0,12047.0,B√ºrknerstra√üe,Neuk√∂lln,Studio f√ºr Gestaltung,gallery,no,"Th,Fr 13:00-18:00; Sa 12:00-16:00",yes,
node,410692505,POINT (13.43706 52.49291),,,,,,,La Girafe,gallery,no,,no,
node,410745800,POINT (13.37533 52.55509),Berlin,DE,16.0,13357.0,Schwedenstra√üe,Gesundbrunnen,kronenboden,gallery,,,yes,


### Check that no geometries are missing

In [128]:
# Count rows whose geometry is null.
n_missing_geometries = gallery_filtered.geometry.isna().sum()
print(f"Missing geometries: {n_missing_geometries}")

Missing geometries: 0


### Add Latitude & Longitude columns to filtered geodataframe

In [129]:
def _to_point(geom):
    """Return `geom` unchanged when it is a Point, otherwise its representative_point()."""
    if geom.geom_type == 'Point':
        return geom
    return geom.representative_point()


# Reproject to EPSG:4326 so that x/y can be read as longitude/latitude.
gallery_filtered = gallery_filtered.to_crs(epsg=4326)

# Reduce every non-point geometry to a single point so each row has one coordinate pair.
gallery_filtered['geometry'] = gallery_filtered['geometry'].apply(_to_point)

# y holds the latitude and x the longitude of each point.
point_geometries = gallery_filtered.geometry
gallery_filtered['latitude'] = point_geometries.y
gallery_filtered['longitude'] = point_geometries.x

### Verify that the latitude/longitude ranges are plausible for Berlin

In [130]:
# Print the min/max of each coordinate column as a quick plausibility check.
latitudes = gallery_filtered["latitude"]
print("Latitude range:", latitudes.min(), "to", latitudes.max())

longitudes = gallery_filtered["longitude"]
print("Longitude range:", longitudes.min(), "to", longitudes.max())

Latitude range: 52.4097499 to 52.6339917
Longitude range: 13.1459292 to 13.615677950110676


### Reset index and change column name 'id' to 'gallery_id'

In [131]:
# Turn the (element, id) index levels into columns, expose the id as a string
# `gallery_id`, and discard the `element` level.
# NOTE(review): once `element` is dropped, gallery_id alone identifies a row -
# confirm that ids cannot repeat across element types in this extract.
gallery_filtered = (
    gallery_filtered
    .reset_index()
    .rename(columns={"id": "gallery_id"})
    .assign(gallery_id=lambda frame: frame["gallery_id"].astype(str))
    .drop(columns=["element"], errors='ignore')
)

# Show the final list of columns.
print(gallery_filtered.columns.tolist())

['gallery_id', 'geometry', 'addr:city', 'addr:country', 'addr:housenumber', 'addr:postcode', 'addr:street', 'addr:suburb', 'name', 'tourism', 'fee', 'opening_hours', 'wheelchair', 'website', 'latitude', 'longitude']


In [132]:
# Summarise dtypes and non-null counts of the filtered frame.
gallery_filtered.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   gallery_id        322 non-null    object  
 1   geometry          322 non-null    geometry
 2   addr:city         189 non-null    object  
 3   addr:country      125 non-null    object  
 4   addr:housenumber  220 non-null    object  
 5   addr:postcode     203 non-null    object  
 6   addr:street       220 non-null    object  
 7   addr:suburb       128 non-null    object  
 8   name              311 non-null    object  
 9   tourism           322 non-null    object  
 10  fee               140 non-null    object  
 11  opening_hours     142 non-null    object  
 12  wheelchair        118 non-null    object  
 13  website           142 non-null    object  
 14  latitude          322 non-null    float64 
 15  longitude         322 non-null    float64 
dtypes: float64(2), geo

---

## 🖼️🔄 Step 2: Gallery Data Transformation

### Drop unnecessary columns
- `addr:city` and `addr:country` are not needed: every record is in Berlin, Germany.
- `tourism` is dropped because every row has the same value (`gallery`).
- `addr:suburb` is dropped because the neighbourhood is added later from the official boundaries, which guarantees that no row is left without one.

In [133]:
# Columns that carry no extra information for this dataset (see the notes above the cell).
redundant_columns = ['addr:city', 'addr:country', 'tourism', 'addr:suburb']
gallery_filtered = gallery_filtered.drop(columns=redundant_columns, errors='ignore')


### Copy the filtered dataframe (keeping the original for reference), then standardise the column names

In [134]:
# Work on a copy so `gallery_filtered` stays available unchanged.
gallery_cleaned = gallery_filtered.copy()

def clean_column(col):
    """Normalise a raw column label into a lowercase snake_case identifier.

    Steps, in order: trim surrounding whitespace, lowercase, remove every
    occurrence of 'addr:', turn spaces into underscores, and finally delete
    any character outside [a-z0-9_].

    Args:
        col: Original column label (string).

    Returns:
        The cleaned label.
    """
    normalised = col.strip().lower()
    normalised = normalised.replace('addr:', '').replace(' ', '_')
    return re.sub(r'[^a-z0-9_]', '', normalised)

# Apply the label normalisation to every column of the working copy.
gallery_cleaned.columns = list(map(clean_column, gallery_cleaned.columns))

### Rename columns for clarity

In [135]:
# More descriptive names for the address and name columns.
column_renames = {
    'postcode': 'postal_code',
    'housenumber': 'house_number',
    'name': 'gallery_name',
}
gallery_cleaned = gallery_cleaned.rename(columns=column_renames)

In [136]:
# Confirm the column names after cleaning and renaming.
gallery_cleaned.columns.tolist()

['gallery_id',
 'geometry',
 'house_number',
 'postal_code',
 'street',
 'gallery_name',
 'fee',
 'opening_hours',
 'wheelchair',
 'website',
 'latitude',
 'longitude']

### Check the unique values in yes/no columns to see if possible to change to True/False

In [137]:
# Inspect the distinct values of the yes/no style columns:
#   fee        - only yes/no (plus missing), so it can become a boolean
#   wheelchair - also contains 'limited', so it is left as text
for flag_column in ("fee", "wheelchair"):
    print(f"Unique {flag_column}:{gallery_cleaned[flag_column].unique()!s}")

Unique fee:[nan 'no' 'yes']
Unique wheelchair:[nan 'no' 'yes' 'limited']


In [138]:
# Convert the yes/no flag to booleans; any other value (including missing) becomes NaN.
# True/False map to themselves so that re-running this cell leaves already
# converted values intact instead of turning the whole column into NaN.
fee_to_bool = {"yes": True, "no": False, True: True, False: False}
gallery_cleaned["fee"] = gallery_cleaned["fee"].map(fee_to_bool)

### Fetch Districts & Neighbourhoods

- Load official Berlin districts GeoDataFrame

In [None]:
# Load the boundary polygons from the local GeoJSON file; per the preview below,
# each row has a locality (OTEIL), its district (BEZIRK) and a polygon geometry.
districts_gdf = gpd.read_file("../sources/geojson_files/lor_ortsteile.geojson")

In [140]:
# Preview the first two boundary rows to check the available columns.
districts_gdf.head(2)

Unnamed: 0,gml_id,spatial_name,spatial_alias,spatial_type,OTEIL,BEZIRK,FLAECHE_HA,geometry
0,re_ortsteil.0101,101,Mitte,Polygon,Mitte,Mitte,1063.8748,"POLYGON ((13.41649 52.52696, 13.41635 52.52702..."
1,re_ortsteil.0102,102,Moabit,Polygon,Moabit,Mitte,768.7909,"POLYGON ((13.33884 52.51974, 13.33884 52.51974..."


- Reproject GeoDataFrames to EPSG:4326 

In [None]:
# Put both layers in EPSG:4326 so the spatial join compares like with like.
gallery_cleaned, districts_gdf = (
    layer.to_crs(epsg=4326) for layer in (gallery_cleaned, districts_gdf)
)

- Spatial join with district(Bezirk) and Neighbourhood_id(spatial_name)

In [None]:
# Attach district name (BEZIRK) and neighbourhood code (spatial_name) to every
# gallery whose point lies within a boundary polygon. how="left" keeps all
# galleries; rows without a containing polygon get NaN in the joined columns.
boundary_columns = ["BEZIRK", "spatial_name", "geometry"]
gallery_df_district = gallery_cleaned.sjoin(
    districts_gdf[boundary_columns],
    how="left",
    predicate="within",
)

- Rename columns for clarity

In [None]:

# `index_right` is only the row label of the matched polygon added by the join;
# remove it, then give the joined columns descriptive names.
joined_column_names = {
    "BEZIRK": "district",
    "spatial_name": "neighbourhood_id",
}
gallery_df_district = (
    gallery_df_district
    .drop(columns=["index_right"])
    .rename(columns=joined_column_names)
)

- District mapping (official codes as strings)

In [None]:
# District codes keyed by district (Bezirk) name. The codes are kept as strings.
district_mapping = {
    'Mitte': '11001001',
    'Friedrichshain-Kreuzberg': '11002002',
    'Pankow': '11003003',
    'Charlottenburg-Wilmersdorf': '11004004',
    'Spandau': '11005005',
    'Steglitz-Zehlendorf': '11006006',
    'Tempelhof-Sch√∂neberg': '11007007',
    'Neuk√∂lln': '11008008',
    'Treptow-K√∂penick': '11009009',
    'Marzahn-Hellersdorf': '11010010',
    'Lichtenberg': '11011011',
    'Reinickendorf': '11012012'
}

# Derive district_id from the district name.
# The mapping values are already strings, so no cast is required. Omitting
# .astype(str) keeps galleries with a missing or unmapped district as real NaN
# rather than the literal text 'nan', which isnull()/info() would count as valid.
gallery_df_district['district_id'] = gallery_df_district['district'].map(district_mapping)

In [214]:
# Preview the joined frame with the district and neighbourhood columns.
gallery_df_district.head()

Unnamed: 0,gallery_id,geometry,house_number,postal_code,street,gallery_name,fee,opening_hours,wheelchair,website,latitude,longitude,district,neighbourhood_id,district_id,postal_code_from_geo,street_from_geo
0,301107444,POINT (13.57134 52.40975),6.0,12524.0,Richterstra√üe,Atelier Achim K√ºhn,,,,,52.40975,13.571342,Treptow-K√∂penick,908,11009009,12524,Richterstra√üe
1,370766098,POINT (13.31363 52.50313),57.0,10629.0,Mommsenstra√üe,Galerie Zandi,False,Mo-Fr 11:00-18:00; Sa 11:00-16:00,no,,52.503135,13.313631,Charlottenburg-Wilmersdorf,401,11004004,10629,Mommsenstra√üe
2,410135935,POINT (13.42749 52.49333),31.0,12047.0,B√ºrknerstra√üe,Studio f√ºr Gestaltung,False,"Th,Fr 13:00-18:00; Sa 12:00-16:00",yes,,52.493327,13.42749,Neuk√∂lln,801,11008008,12047,B√ºrknerstra√üe
3,410692505,POINT (13.43706 52.49291),,,,La Girafe,False,,no,,52.492914,13.437057,Friedrichshain-Kreuzberg,202,11002002,10999,Glogauer Stra√üe
4,410745800,POINT (13.37533 52.55509),16.0,13357.0,Schwedenstra√üe,kronenboden,,,yes,,52.555091,13.375326,Mitte,106,11001001,13357,Schwedenstra√üe


### Use reverse geocoding (Nominatim) to get postal code and street information

- The results go into new columns at the end of the frame, so they can be compared against the nulls in the original columns.

In [146]:
# Register tqdm's progress_apply on pandas objects (used by the apply step below).
tqdm.pandas()

# Initialize geocoder
# NOTE(review): no timeout or request throttling is configured here - confirm
# this is acceptable under the Nominatim usage policy before re-running.
geolocator = Nominatim(user_agent="gallery_locator")

# Define function to extract postal code and street
def get_postcode_and_street(row):
    """Reverse-geocode one row and return its postal code and street name.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing 'latitude'
            and 'longitude'.

    Returns:
        pd.Series of length 2: [postcode, street]. Either entry is None when
        the geocoder does not supply it; both are None when the lookup fails
        or returns no result.

    The lookup is best effort: ordinary errors (network problems, timeouts,
    malformed responses) are converted into [None, None] so that one bad row
    does not abort the whole apply. KeyboardInterrupt and SystemExit are not
    caught, so a long-running job can still be stopped.
    """
    try:
        location = geolocator.reverse((row['latitude'], row['longitude']), exactly_one=True)
    except Exception:
        return pd.Series([None, None])

    # No match for these coordinates.
    if location is None:
        return pd.Series([None, None])

    address = location.raw.get('address', {})
    postcode = address.get('postcode')
    # The street can be reported under several keys; take the first one present.
    street = address.get('road') or address.get('pedestrian') or address.get('footway') or address.get('street')
    return pd.Series([postcode, street])

# Reverse-geocode every row (with a progress bar) and store the two results
# in dedicated columns next to the original address fields.
geocoded_columns = ['postal_code_from_geo', 'street_from_geo']
reverse_geocoded = gallery_df_district.progress_apply(get_postcode_and_street, axis=1)
gallery_df_district[geocoded_columns] = reverse_geocoded


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 322/322 [07:41<00:00,  1.43s/it]


In [221]:
# Continue on a copy so the geocoded frame is kept as-is and the lookup need not be repeated.
gallery_full = gallery_df_district.copy()

### Check null count before replacing Nulls in postal code

In [222]:
# Null counts of the original address columns before any back-filling.
for address_column in ('postal_code', 'street'):
    print(gallery_full[[address_column]].isna().sum())

postal_code    119
dtype: int64
street    102
dtype: int64


### Replace the Nulls in the original columns

In [223]:
# Fill gaps in the original columns from their reverse-geocoded counterparts;
# values that are already present are left unchanged.
geocoded_fallbacks = {
    'postal_code': 'postal_code_from_geo',
    'street': 'street_from_geo',
}
for target_column, fallback_column in geocoded_fallbacks.items():
    gallery_full[target_column] = gallery_full[target_column].fillna(gallery_full[fallback_column])

### Check null values now to see if a difference

In [224]:
# Null counts of the same columns after back-filling, for comparison.
for address_column in ('postal_code', 'street'):
    print(gallery_full[[address_column]].isna().sum())

postal_code    2
dtype: int64
street    5
dtype: int64


### Data Cleaning & Normalization

- Replace empty strings with NaN so that they are counted as missing values

In [294]:
# Treat empty strings as missing so that the null checks pick them up.
gallery_full = gallery_full.replace('', np.nan)

gallery_full.head(2)

Unnamed: 0,gallery_id,house_number,postal_code,street,gallery_name,fee,opening_hours,wheelchair,website,latitude,longitude,neighbourhood_id,district_id
0,301107444,6,12524,richter stra√üe,atelier achim k√ºhn,,,,,52.40975,13.571342,908,11009009
1,370766098,57,10629,mommsen stra√üe,galerie zandi,False,mo-fr 11:00-18:00; sa 11:00-16:00,no,,52.503135,13.313631,401,11004004


### Normalize the street name column

In [295]:
# Street-type endings that are split off from the preceding word and capitalised.
_STREET_SUFFIXES = (
    'Stra√üe', 'Allee', 'Damm', 'Weg', 'Graben', 'Ufer',
    'Korso', 'Zeile', 'Promenade', 'Kiez', 'Platz',
)


def normalize_street_name(name):
    """Bring a street name into a uniform "<Name> <Suffix>" form.

    Processing order:
      1. Missing values are returned as NaN.
      2. Underscores and hyphens become spaces.
      3. A trailing stand-alone 'str' / 'str.' is expanded to the full word.
      4. A known street-type ending glued to the preceding text
         (e.g. 'Sonnenallee') is separated and capitalised ('Sonnen Allee').
         Endings that already follow whitespace are left untouched.
      5. Runs of whitespace are collapsed and the result is trimmed.

    Args:
        name: Street name string, or a missing value (None / NaN).

    Returns:
        The normalised street name, or NaN for missing input.
    """
    if pd.isna(name):
        return np.nan

    # Replace underscores and hyphens with spaces
    name = name.replace('_', ' ').replace('-', ' ')

    # Expand 'str.' or 'str' at the end; \b means it must start its own word.
    name = re.sub(r'\bstr\.?\s*$', ' Stra√üe', name, flags=re.IGNORECASE)

    # Insert a space before a glued-on street-type ending. The lookbehind skips
    # endings that are already separated, so at most one suffix is rewritten.
    for suffix in _STREET_SUFFIXES:
        name = re.sub(rf'(?<!\s){suffix}$', f' {suffix}', name, flags=re.IGNORECASE)

    # Remove extra spaces
    return re.sub(r'\s+', ' ', name).strip()

In [296]:
# Normalise every street name element by element (missing values stay missing).
gallery_full['street'] = gallery_full['street'].map(normalize_street_name)

### Drop unnecessary columns

In [297]:
# Helper columns that are not wanted in the final listing; names that are
# already absent are ignored.
helper_columns = ['geometry', 'district', 'postal_code_from_geo', 'street_from_geo']
gallery_full.drop(columns=helper_columns, errors='ignore', inplace=True)

### Convert all text in columns to lowercase to avoid any duplications

In [298]:
def _strip_lower(value):
    """Trim and lowercase strings; return any non-string value (NaN, bool, number) unchanged."""
    if isinstance(value, str):
        return value.strip().lower()
    return value


# Free-text columns to normalise.
# NOTE(review): "website" is lowercased as well - confirm that none of the URLs
# depend on case-sensitive paths.
text_cols = ["gallery_name", "street", "website", "opening_hours", "wheelchair"]

for col in text_cols:
    if col not in gallery_full.columns:
        continue
    gallery_full[col] = gallery_full[col].apply(_strip_lower)

### Check column data types
    - All have correct type allocated so no need to change anything

In [299]:
# Review dtypes and non-null counts of the cleaned frame.
gallery_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 311 entries, 0 to 320
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gallery_id        311 non-null    object 
 1   house_number      216 non-null    object 
 2   postal_code       309 non-null    object 
 3   street            307 non-null    object 
 4   gallery_name      311 non-null    object 
 5   fee               139 non-null    object 
 6   opening_hours     142 non-null    object 
 7   wheelchair        118 non-null    object 
 8   website           142 non-null    object 
 9   latitude          311 non-null    float64
 10  longitude         311 non-null    float64
 11  neighbourhood_id  311 non-null    object 
 12  district_id       311 non-null    object 
dtypes: float64(2), object(11)
memory usage: 34.0+ KB


### Remove duplicates

- This run contains no duplicates, but the check is kept in case future runs introduce some.

In [300]:
# Count the rows that are exact duplicates of an earlier row (all columns equal).
gallery_full.duplicated().sum()

np.int64(0)

In [301]:
# Display the rows flagged as duplicates (an empty frame means there are none).
gallery_full[gallery_full.duplicated()]

Unnamed: 0,gallery_id,house_number,postal_code,street,gallery_name,fee,opening_hours,wheelchair,website,latitude,longitude,neighbourhood_id,district_id


In [302]:
# Remove duplicate rows, then renumber the index from 0 without keeping the old labels.
deduplicated = gallery_full.drop_duplicates()
gallery_full = deduplicated.reset_index(drop=True)

### Remove row if name missing

In [303]:
# A listing without a name is not usable, so rows missing `gallery_name` are removed.
required_columns = ['gallery_name']
gallery_full = gallery_full.dropna(subset=required_columns)

### Reorder column names to be clearer
    - Leave geometry and district off as columns no longer needed

In [304]:
# Final column order: identity, address, location ids, contact/access details, coordinates.
listing_columns = [
    "gallery_id", "gallery_name",
    "house_number", "street",
    "neighbourhood_id", "district_id", "postal_code",
    "website", "opening_hours", "wheelchair", "fee",
    "latitude", "longitude",
]
gallery_listings = gallery_full[listing_columns]

In [305]:
# Export the final listing; the positional row index is not written.
listings_csv_path = "../sources/csv_files/gallery_listings.csv"
gallery_listings.to_csv(listings_csv_path, index=False)

### Final Summary of cleaned and Transformed Data

In [306]:
# Dimensions of the final listing.
n_rows, n_columns = gallery_listings.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_columns}")

Number of rows: 311
Number of columns: 13


In [307]:
# Names of the columns that made it into the final listing.
remaining_columns = gallery_listings.columns.tolist()
print("\nRemaining columns:", remaining_columns, sep="\n")


Remaining columns:
['gallery_id', 'gallery_name', 'house_number', 'street', 'neighbourhood_id', 'district_id', 'postal_code', 'website', 'opening_hours', 'wheelchair', 'fee', 'latitude', 'longitude']


In [308]:
# Absolute number of missing values per column in the final listing.
missing = gallery_listings.isna().sum()
print("\nMissing values after cleaning and transforming :", missing, sep="\n")


Missing values after cleaning and transforming :
gallery_id            0
gallery_name          0
house_number         95
street                4
neighbourhood_id      0
district_id           0
postal_code           2
website             169
opening_hours       169
wheelchair          193
fee                 172
latitude              0
longitude             0
dtype: int64


In [309]:
# Percentage of missing values per column of the final listing, worst first.
# Note: this rebinds `missing_percent`, which earlier held the figures for the raw frame.
missing_percent = gallery_listings.isna().mean().mul(100)
missing_percent_ranked = missing_percent.sort_values(ascending=False)
print(missing_percent_ranked)

wheelchair          62.057878
fee                 55.305466
website             54.340836
opening_hours       54.340836
house_number        30.546624
street               1.286174
postal_code          0.643087
gallery_id           0.000000
gallery_name         0.000000
neighbourhood_id     0.000000
district_id          0.000000
latitude             0.000000
longitude            0.000000
dtype: float64


In [310]:
# Final check of dtypes and non-null counts for the exported listing.
gallery_listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311 entries, 0 to 310
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gallery_id        311 non-null    object 
 1   gallery_name      311 non-null    object 
 2   house_number      216 non-null    object 
 3   street            307 non-null    object 
 4   neighbourhood_id  311 non-null    object 
 5   district_id       311 non-null    object 
 6   postal_code       309 non-null    object 
 7   website           142 non-null    object 
 8   opening_hours     142 non-null    object 
 9   wheelchair        118 non-null    object 
 10  fee               139 non-null    object 
 11  latitude          311 non-null    float64
 12  longitude         311 non-null    float64
dtypes: float64(2), object(11)
memory usage: 31.7+ KB


In [311]:
# Preview the first rows of the final listing.
gallery_listings.head()

Unnamed: 0,gallery_id,gallery_name,house_number,street,neighbourhood_id,district_id,postal_code,website,opening_hours,wheelchair,fee,latitude,longitude
0,301107444,atelier achim k√ºhn,6.0,richter stra√üe,908,11009009,12524,,,,,52.40975,13.571342
1,370766098,galerie zandi,57.0,mommsen stra√üe,401,11004004,10629,,mo-fr 11:00-18:00; sa 11:00-16:00,no,False,52.503135,13.313631
2,410135935,studio f√ºr gestaltung,31.0,b√ºrkner stra√üe,801,11008008,12047,,"th,fr 13:00-18:00; sa 12:00-16:00",yes,False,52.493327,13.42749
3,410692505,la girafe,,glogauer stra√üe,202,11002002,10999,,,no,False,52.492914,13.437057
4,410745800,kronenboden,16.0,schweden stra√üe,106,11001001,13357,,,yes,,52.555091,13.375326
