## üèõÔ∏èüß© Step 1 : Museum Data Modelling

### Import Libraries

In [177]:
# Import Libraries
import osmnx as ox # to fetch data from OpenStreetMap
import geopandas as gpd # to work with geospatial data
import pandas as pd
import numpy as np
import re
from geopy.geocoders import Nominatim
from tqdm import tqdm

### Pull raw data for museums tag

In [178]:
tags = {
        "tourism": "museum"
       }

### Fetch Berlin Geometries

In [179]:
museum_raw = ox.features_from_place("Berlin, Germany", tags)


### Display basic info

In [180]:
print(f"Number of museum entries fetched: {len(museum_raw)}")
museum_raw.head()

Number of museum entries fetched: 244


Unnamed: 0_level_0,Unnamed: 1_level_0,geometry,addr:housenumber,addr:street,email,fee,image,name,name:en,name:fr,name:ru,...,name:tr,nickname,type,alt_name:it,construction_year,photography,name:ko,changing_table:count,changing_table:location,changing_table:location:notes
element,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,73696610,POINT (13.36299 52.50761),13-14,Stauffenbergstra√üe,sekretariat@gdw-berlin.de,no,http://commons.wikimedia.org/wiki/File:Bendele...,Gedenkst√§tte Deutscher Widerstand,Memorial to the German Resistance,Memorial de la R√©sistance Allemande,–ú–µ–º–æ—Ä–∏–∞–ª –ù–µ–º–µ—Ü–∫–æ–≥–æ –°–æ–ø—Ä–æ—Ç–∏–≤–ª–µ–Ω–∏—è,...,,,,,,,,,,
node,84644782,POINT (13.38221 52.50487),90,Stresemannstra√üe,,no,,"Dokumentationszentrum Flucht, Vertreibung, Ver...",,,,...,,,,,,,,,,
node,259855486,POINT (13.43277 52.51048),3,Koppenstra√üe,info@s-bahn-museum.de,yes,,Berliner S-Bahn-Museum,,,,...,,,,,,,,,,
node,268591806,POINT (13.28894 52.45836),,,,yes,,Das Museum der Dom√§ne Dahlem,,,,...,,,,,,,,,,
node,281391655,POINT (13.48748 52.51451),103,Ruschestra√üe,,yes,,Stasimuseum,Stasi Museum,,,...,,,,,,,,,,


### Save the raw data to a csv file

- Define file paths

In [186]:
raw_csv_path = "../sources/csv_files/museum_raw.csv"
raw_geojson_path = "../sources/geojson_files/museum_raw.geojson"

- Save csv & geojson to correct folders in sources

In [187]:
museum_raw.to_csv(raw_csv_path, index=False )

In [188]:
museum_raw.to_file(raw_geojson_path, driver="GeoJSON")

### Remove the columns with 75% or more missing data

In [92]:
# Step 1: Calculate percentage of missing values per column
missing_percent = museum_raw.isnull().mean() * 100

# Step 2: Identify columns with less than 75% missing
columns_to_keep = missing_percent[missing_percent < 75].index

# Step 3: Create a new filtered DataFrame
museum_filtered = museum_raw[columns_to_keep]

# Check shape and preview
print(museum_filtered.shape)
museum_filtered.head()

(244, 22)


Unnamed: 0_level_0,Unnamed: 1_level_0,geometry,addr:housenumber,addr:street,fee,name,opening_hours,tourism,website,wheelchair,wikidata,...,addr:postcode,addr:suburb,toilets:wheelchair,addr:country,fee:icom_member,museum,operator,contact:phone,contact:website,building
element,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,73696610,POINT (13.36299 52.50761),13-14,Stauffenbergstra√üe,no,Gedenkst√§tte Deutscher Widerstand,Mo-Fr 09:00-18:00; Sa-Su 10:00-18:00,museum,https://www.gdw-berlin.de/,limited,Q880798,...,,,,,,,,,,
node,84644782,POINT (13.38221 52.50487),90,Stresemannstra√üe,no,"Dokumentationszentrum Flucht, Vertreibung, Ver...",Tu-Su 10:00-19:00; Mo off,museum,https://www.flucht-vertreibung-versoehnung.de/,yes,,...,10963.0,Kreuzberg,yes,,,,,,,
node,259855486,POINT (13.43277 52.51048),3,Koppenstra√üe,yes,Berliner S-Bahn-Museum,"We 12:00-16:00, Th,Fr 15:00-20:00, Su 14:00-18:00",museum,https://s-bahn-museum.de/,yes,,...,10243.0,Friedrichshain,,DE,unknown,railway,Berliner S-Bahn-Museum gGmbH,,,
node,268591806,POINT (13.28894 52.45836),,,yes,Das Museum der Dom√§ne Dahlem,Mo-Su 10:00-18:00; Th off,museum,https://www.domaene-dahlem.de/home/,limited,,...,,,,,no,open_air,,,,
node,281391655,POINT (13.48748 52.51451),103,Ruschestra√üe,yes,Stasimuseum,Mo-Fr 10:00-18:00; Sa-Su 11:00-18:00,museum,,yes,Q570472,...,10365.0,Lichtenberg,,DE,yes,history,ASTAK e.V.,+49 30 55368 54,https://www.stasimuseum.de/,


### Check no Geometries missing

In [93]:
print("Missing geometries:", museum_filtered.geometry.isna().sum())

Missing geometries: 0


### Add Latitude & Longitude columns to filtered geodataframe

In [94]:
# Reproject and extract lat/lon
museum_filtered = museum_filtered.to_crs(epsg=4326)
museum_filtered['geometry'] = museum_filtered['geometry'].apply(
    lambda geom: geom if geom.geom_type == 'Point' else geom.representative_point()
)
museum_filtered['latitude'] = museum_filtered.geometry.y
museum_filtered['longitude'] = museum_filtered.geometry.x

### Verify the lat/lon amounts are correct

In [95]:
print("Latitude range:", museum_filtered["latitude"].min(), "to", museum_filtered["latitude"].max())

print("Longitude range:", museum_filtered["longitude"].min(), "to", museum_filtered["longitude"].max())

Latitude range: 52.3878178 to 52.615457250000006
Longitude range: 13.13197421866466 to 13.6623573


### Reset index and change column name 'id' to 'museum_id'

In [96]:
museum_filtered = museum_filtered.reset_index()

# Rename the "id" column to "museum_id"
museum_filtered = museum_filtered.rename(columns={"id": "museum_id"})  

# Drop the redundant column "element"
museum_filtered= museum_filtered.drop(columns=["element"],errors='ignore')

# Show final list of columns
print(museum_filtered.columns.tolist())

['museum_id', 'geometry', 'addr:housenumber', 'addr:street', 'fee', 'name', 'opening_hours', 'tourism', 'website', 'wheelchair', 'wikidata', 'wikipedia', 'addr:city', 'addr:postcode', 'addr:suburb', 'toilets:wheelchair', 'addr:country', 'fee:icom_member', 'museum', 'operator', 'contact:phone', 'contact:website', 'building', 'latitude', 'longitude']


### Set the museum_id to string

In [None]:
museum_filtered["museum_id"] = museum_filtered["museum_id"].astype(str)

In [98]:
museum_filtered.info(verbose=True)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   museum_id           244 non-null    object  
 1   geometry            244 non-null    geometry
 2   addr:housenumber    173 non-null    object  
 3   addr:street         179 non-null    object  
 4   fee                 180 non-null    object  
 5   name                242 non-null    object  
 6   opening_hours       193 non-null    object  
 7   tourism             244 non-null    object  
 8   website             124 non-null    object  
 9   wheelchair          157 non-null    object  
 10  wikidata            143 non-null    object  
 11  wikipedia           95 non-null     object  
 12  addr:city           166 non-null    object  
 13  addr:postcode       173 non-null    object  
 14  addr:suburb         129 non-null    object  
 15  toilets:wheelchair  64 non-null 

---

## üèõÔ∏èüîÑ Step 2 : Museum Data Transformation

### Drop unnecessary columns
    - Dont need Berlin and DE as user knows they are looking for data in Berlin, Germany
    - Drop tourism column as all Museum
    - Drop suburb as it is the same as neighborhood which will be added later to make sure nothing is missing

In [99]:
museum_filtered.drop(columns=['addr:city', 'addr:country', 'tourism', 'addr:suburb'], errors='ignore', inplace=True)


### Copy the filtered database for record purposes and then standardise column names

In [100]:
museum_cleaned = museum_filtered.copy()

def clean_column(col):
    col = col.strip().lower().replace('addr:', '')
    col = col.replace(' ', '_')
    col = re.sub(r'[^a-z0-9_]', '', col)
    return col

museum_cleaned.columns = [clean_column(col) for col in museum_cleaned.columns]

### Rename columns for clarity

In [101]:
museum_cleaned.rename(columns={
    'postcode': 'postal_code',
    'housenumber': 'house_number',
    'name': 'museum_name',
    'toiletswheelchair': 'toilets_wheelchair_accessible',
    'feeicom_member': 'fee_icom_member',
    'contactphone': 'phone',
    'contactwebsite': 'contact_website',
    'museum': 'museum_type'
}, inplace=True)

In [102]:
museum_cleaned.columns.tolist()

['museum_id',
 'geometry',
 'house_number',
 'street',
 'fee',
 'museum_name',
 'opening_hours',
 'website',
 'wheelchair',
 'wikidata',
 'wikipedia',
 'postal_code',
 'toilets_wheelchair_accessible',
 'fee_icom_member',
 'museum_type',
 'operator',
 'phone',
 'contact_website',
 'building',
 'latitude',
 'longitude']

### Check the unique values

In [103]:
print("Unique fee:"  + str(museum_cleaned.fee.unique()))                                                        
print("Unique fee icom member:"  + str(museum_cleaned.fee_icom_member.unique()))                                                        
print("Unique wheelchair:"  + str(museum_cleaned.wheelchair.unique()))                                         
print("Unique toilets wheelchair accessible:"  + str(museum_cleaned.toilets_wheelchair_accessible.unique()))    
print("Unique museum_type:"  + str(museum_cleaned.museum_type.unique()))  
print("Unique 'building':"  + str(museum_cleaned['building'].unique())) 


Unique fee:['no' 'yes' nan]
Unique fee icom member:[nan 'unknown' 'no' 'yes' 'discount']
Unique wheelchair:['limited' 'yes' 'no' nan]
Unique toilets wheelchair accessible:[nan 'yes' 'no']
Unique museum_type:[nan 'railway' 'open_air' 'history' 'museum' 'local' 'nature' 'computer'
 'person' 'wax' 'art' 'children' 'living_history' 'archaeological'
 'technology' 'art,history,nature' 'science']
Unique 'building':[nan 'museum' 'civic' 'yes' 'church' 'public' 'apartments' 'commercial'
 'residential' 'bunker' 'industrial' 'government' 'detached' 'container']


### Fetch Districts & Neighbourhoods
- Load official Berlin districts GeoDataFrame

In [None]:
districts_gdf = gpd.read_file("../sources/geojson_files/lor_ortsteile.geojson")

In [105]:
districts_gdf.head(2)

Unnamed: 0,gml_id,spatial_name,spatial_alias,spatial_type,OTEIL,BEZIRK,FLAECHE_HA,geometry
0,re_ortsteil.0101,101,Mitte,Polygon,Mitte,Mitte,1063.8748,"POLYGON ((13.41649 52.52696, 13.41635 52.52702..."
1,re_ortsteil.0102,102,Moabit,Polygon,Moabit,Mitte,768.7909,"POLYGON ((13.33884 52.51974, 13.33884 52.51974..."


### Reproject GeoDataFrames to EPSG:4326 

In [None]:
museum_cleaned = museum_cleaned.to_crs(epsg=4326)
districts_gdf = districts_gdf.to_crs(epsg=4326)

### Spatial join with District Name and Neighbourhood_id(spatial_name)

In [None]:
museum_df_district = gpd.sjoin(
    museum_cleaned,
    districts_gdf[["BEZIRK", "spatial_name","geometry"]],
    how="left",
    predicate="within"
)

### Rename columns for clarity

In [None]:
museum_df_district = museum_df_district.rename(columns={
    "BEZIRK": "district",
    "spatial_name": "neighbourhood_id"
}).drop(columns=["index_right"])  # drop district_number if not needed

### District mapping (official codes as strings)

In [None]:
district_mapping = {
    'Mitte': '11001001',
    'Friedrichshain-Kreuzberg': '11002002',
    'Pankow': '11003003',
    'Charlottenburg-Wilmersdorf': '11004004',
    'Spandau': '11005005',
    'Steglitz-Zehlendorf': '11006006',
    'Tempelhof-Sch√∂neberg': '11007007',
    'Neuk√∂lln': '11008008',
    'Treptow-K√∂penick': '11009009',
    'Marzahn-Hellersdorf': '11010010',
    'Lichtenberg': '11011011',
    'Reinickendorf': '11012012'
}

# Apply mapping to create district_id column (string)
museum_df_district['district_id'] = museum_df_district['district'].map(district_mapping).astype(str)

In [110]:
museum_df_district.head(2)

Unnamed: 0,museum_id,geometry,house_number,street,fee,museum_name,opening_hours,website,wheelchair,wikidata,...,museum_type,operator,phone,contact_website,building,latitude,longitude,district,neighbourhood_id,district_id
0,73696610,POINT (13.36299 52.50761),13-14,Stauffenbergstra√üe,no,Gedenkst√§tte Deutscher Widerstand,Mo-Fr 09:00-18:00; Sa-Su 10:00-18:00,https://www.gdw-berlin.de/,limited,Q880798,...,,,,,,52.507605,13.362986,Mitte,104,11001001
1,84644782,POINT (13.38221 52.50487),90,Stresemannstra√üe,no,"Dokumentationszentrum Flucht, Vertreibung, Ver...",Tu-Su 10:00-19:00; Mo off,https://www.flucht-vertreibung-versoehnung.de/,yes,,...,,,,,,52.504871,13.382214,Friedrichshain-Kreuzberg,202,11002002


### Used reverse Geocoding and Nominatim to get Postal code & Street info
    - Created new columns at end to check against nulls in original columns

In [111]:
tqdm.pandas()

# Initialize geocoder
geolocator = Nominatim(user_agent="museum_locator")

# Define function to extract postal code and street
def get_postcode_and_street(row):
    try:
        location = geolocator.reverse((row['latitude'], row['longitude']), exactly_one=True)
        address = location.raw.get('address', {})
        postcode = address.get('postcode')
        street = address.get('road') or address.get('pedestrian') or address.get('footway') or address.get('street')
        return pd.Series([postcode, street])
    except:
        return pd.Series([None, None])

# Apply to your DataFrame
museum_df_district[['postal_code_from_geo', 'street_from_geo']] = museum_df_district.progress_apply(get_postcode_and_street, axis=1)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 244/244 [06:28<00:00,  1.59s/it]


In [268]:
museum_full = museum_df_district.copy()

### Check null count before replacing Nulls in postal code

In [269]:
print(museum_full[['postal_code']].isnull().sum())
print(museum_full[['street']].isnull().sum())

postal_code    71
dtype: int64
street    65
dtype: int64


### Replace the Nulls in the original columns

In [270]:
museum_full['postal_code'] = museum_full['postal_code'].fillna(museum_full['postal_code_from_geo'])
museum_full['street'] = museum_full['street'].fillna(museum_full['street_from_geo'])

### Check null values now to see if a difference

In [271]:
print(museum_full[['postal_code']].isnull().sum())
print(museum_full[['street']].isnull().sum())

postal_code    0
dtype: int64
street    0
dtype: int64


### Replace empty strings with NaN

In [272]:
museum_full.replace('', np.nan, inplace=True)

museum_full.head(2)

Unnamed: 0,museum_id,geometry,house_number,street,fee,museum_name,opening_hours,website,wheelchair,wikidata,...,phone,contact_website,building,latitude,longitude,district,neighbourhood_id,district_id,postal_code_from_geo,street_from_geo
0,73696610,POINT (13.36299 52.50761),13-14,Stauffenbergstra√üe,no,Gedenkst√§tte Deutscher Widerstand,Mo-Fr 09:00-18:00; Sa-Su 10:00-18:00,https://www.gdw-berlin.de/,limited,Q880798,...,,,,52.507605,13.362986,Mitte,104,11001001,10785,Stauffenbergstra√üe
1,84644782,POINT (13.38221 52.50487),90,Stresemannstra√üe,no,"Dokumentationszentrum Flucht, Vertreibung, Ver...",Tu-Su 10:00-19:00; Mo off,https://www.flucht-vertreibung-versoehnung.de/,yes,,...,,,,52.504871,13.382214,Friedrichshain-Kreuzberg,202,11002002,10963,Stresemannstra√üe


### Replace yes/no with True/False in fee column

In [273]:
museum_full["fee"] =museum_full["fee"].map({"yes": True, "no": False})

### Check if website NaN then use data from contact_website if available

In [274]:
museum_full['website'] = museum_full['website'].fillna(museum_full['contact_website'])

### Normalize the street name column

In [275]:
def normalize_street_name(name):
    if pd.isna(name):
        return np.nan
    # Replace underscores with spaces
    name = name.replace('_', ' ').replace('-', ' ')
    # Replace 'str.' or 'str' at end with ' Stra√üe'
    name = re.sub(r'\bstr\.?\s*$', ' Stra√üe', name, flags=re.IGNORECASE)
    # Ensure space before 'stra√üe' if missing
    name = re.sub(r'(?<!\s)(stra√üe)$', r' Stra√üe', name, flags=re.IGNORECASE)
    # Ensure space before 'allee' if missing
    name = re.sub(r'(?<!\s)(allee)$', r' Allee', name, flags=re.IGNORECASE)
    # Ensure space before 'damm' if missing
    name = re.sub(r'(?<!\s)(damm)$', r' Damm', name, flags=re.IGNORECASE)
    # Ensure space before 'weg' if missing
    name = re.sub(r'(?<!\s)(weg)$', r' Weg', name, flags=re.IGNORECASE)
    # Ensure space before 'graben' if missing
    name = re.sub(r'(?<!\s)(graben)$', r' Graben', name, flags=re.IGNORECASE)
    # Ensure space before 'ufer' if missing
    name = re.sub(r'(?<!\s)(ufer)$', r' Ufer', name, flags=re.IGNORECASE)
    # Ensure space before 'korso' if missing
    name = re.sub(r'(?<!\s)(korso)$', r' Korso', name, flags=re.IGNORECASE)
    # Ensure space before 'zeile' if missing
    name = re.sub(r'(?<!\s)(zeile)$', r' Ziele', name, flags=re.IGNORECASE)
    # Ensure space before 'promenade' if missing
    name = re.sub(r'(?<!\s)(promenade)$', r' Promenade', name, flags=re.IGNORECASE)
    # Ensure space before 'kiez' if missing
    name = re.sub(r'(?<!\s)(kiez)$', r' Kiez', name, flags=re.IGNORECASE)
    # Ensure space before 'platz' if missing
    name = re.sub(r'(?<!\s)(platz)$', r' Platz', name, flags=re.IGNORECASE)
    # Ensure space before 'steig' if missing
    name = re.sub(r'(?<!\s)(steig)$', r' Steig', name, flags=re.IGNORECASE)
    # Remove extra spaces
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [276]:
museum_full['street'] = museum_full['street'].apply(normalize_street_name)

### Check and Normalize the telephone numbers

In [277]:
museum_full[museum_full['phone'].notna()]['phone'].head()

4      +49 30 55368 54
9     +49 30 200090300
16     +49-30-49910517
18     +49 30 20938591
20      +49 30 2291760
Name: phone, dtype: object

In [278]:
import pandas as pd
import numpy as np
import re

def normalize_phone_number(phone):
    if pd.isna(phone):
        return np.nan

    # Remove all non-digit and plus characters except leading '+'
    phone = re.sub(r'[^\d+]', '', phone)

    # Ensure it starts with +49
    if phone.startswith('0049'):
        phone = '+49' + phone[4:]
    elif phone.startswith('49'):
        phone = '+49' + phone[2:]
    elif not phone.startswith('+49'):
        phone = '+49' + phone.lstrip('0')

    # Insert space after country code and area code if it's Berlin (30)
    phone = re.sub(r'^\+49\s?30', '+49 30 ', phone)

    # Remove any remaining hyphens or double spaces
    phone = re.sub(r'\s+', ' ', phone).strip()

    return phone

In [279]:
museum_full['normalized_phone'] = museum_full['phone'].apply(normalize_phone_number)

In [280]:
museum_full[museum_full[['phone', 'normalized_phone']].notna().all(axis=1)][['phone', 'normalized_phone']].head()

Unnamed: 0,phone,normalized_phone
4,+49 30 55368 54,+49 30 5536854
9,+49 30 200090300,+49 30 200090300
16,+49-30-49910517,+49 30 49910517
18,+49 30 20938591,+49 30 20938591
20,+49 30 2291760,+49 30 2291760


### Drop unnecessary columns

In [281]:
museum_full.drop(columns=['geometry', 'district', 'postal_code_from_geo', 'street_from_geo', 'wikidata', 'fee_icon_member', 'contact_website', 'phone'], errors='ignore', inplace=True)

### Convert all text in columns to lowercase to avoid any duplications

In [282]:
text_cols = ['museum_name', 'street', 'website', 'opening_hours', 'wheelchair', 'wikipedia', 'toilets_wheelchair_accessible', 'museum_type', 'building', 'operator']

for col in text_cols:
    if col in museum_full.columns:
        museum_full[col] = museum_full[col].apply(
            lambda x: x.strip().lower() if isinstance(x, str) else x
        )

### Check column data types**
    - All have correct type allocated so no need to change anything

In [283]:
museum_full.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 244 entries, 0 to 243
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   museum_id                      244 non-null    object 
 1   house_number                   173 non-null    object 
 2   street                         244 non-null    object 
 3   fee                            180 non-null    object 
 4   museum_name                    242 non-null    object 
 5   opening_hours                  193 non-null    object 
 6   website                        203 non-null    object 
 7   wheelchair                     157 non-null    object 
 8   wikipedia                      95 non-null     object 
 9   postal_code                    244 non-null    object 
 10  toilets_wheelchair_accessible  64 non-null     object 
 11  fee_icom_member                69 non-null     object 
 12  museum_type                    71 non-null     

### Remove duplicates
    - In this case we have no duplicates but will keep this check for future runs incase

- See how many duplicate rows exist

In [284]:
museum_full.duplicated().sum()

np.int64(0)

- Display the actual duplicate rows

In [285]:
museum_full[museum_full.duplicated()]

Unnamed: 0,museum_id,house_number,street,fee,museum_name,opening_hours,website,wheelchair,wikipedia,postal_code,toilets_wheelchair_accessible,fee_icom_member,museum_type,operator,building,latitude,longitude,neighbourhood_id,district_id,normalized_phone


- Remove duplicate rows and reset index

In [286]:
museum_full = museum_full.drop_duplicates().reset_index(drop=True)

### Remove row if name missing

In [287]:
museum_full = museum_full.dropna(subset=['museum_name'])

### Reorder column names to be clearer

In [288]:
museum_listings = museum_full[['museum_id', 'museum_name', 'house_number', 'street',  'neighbourhood_id', 'district_id', 'postal_code', 'website', 'normalized_phone', 'museum_type',
                               'operator', 'building', 'wikipedia', 'opening_hours', 'wheelchair', 'toilets_wheelchair_accessible', 'fee', 'latitude', 'longitude']]

In [289]:
museum_listings.to_csv("../sources/csv_files/museum_listings.csv", index=False)

### Final Summary of cleaned and Transformed Data

In [290]:
print(f"Number of rows: {museum_listings.shape[0]}")
print(f"Number of columns: {museum_listings.shape[1]}")

Number of rows: 242
Number of columns: 19


In [291]:
print("\nRemaining columns:")
print(museum_listings.columns.tolist())


Remaining columns:
['museum_id', 'museum_name', 'house_number', 'street', 'neighbourhood_id', 'district_id', 'postal_code', 'website', 'normalized_phone', 'museum_type', 'operator', 'building', 'wikipedia', 'opening_hours', 'wheelchair', 'toilets_wheelchair_accessible', 'fee', 'latitude', 'longitude']


In [292]:
missing = museum_listings.isnull().sum()
print("\nMissing values after cleaning and transforming :")
print(missing)


Missing values after cleaning and transforming :
museum_id                          0
museum_name                        0
house_number                      70
street                             0
neighbourhood_id                   0
district_id                        0
postal_code                        0
website                           39
normalized_phone                 179
museum_type                      172
operator                         168
building                         172
wikipedia                        147
opening_hours                     50
wheelchair                        85
toilets_wheelchair_accessible    178
fee                               62
latitude                           0
longitude                          0
dtype: int64


In [293]:
missing_percent = museum_listings.isnull().mean() * 100
print(missing_percent.sort_values(ascending=False))

normalized_phone                 73.966942
toilets_wheelchair_accessible    73.553719
museum_type                      71.074380
building                         71.074380
operator                         69.421488
wikipedia                        60.743802
wheelchair                       35.123967
house_number                     28.925620
fee                              25.619835
opening_hours                    20.661157
website                          16.115702
latitude                          0.000000
museum_id                         0.000000
museum_name                       0.000000
postal_code                       0.000000
district_id                       0.000000
neighbourhood_id                  0.000000
street                            0.000000
longitude                         0.000000
dtype: float64


In [294]:
museum_listings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 242 entries, 0 to 243
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   museum_id                      242 non-null    object 
 1   museum_name                    242 non-null    object 
 2   house_number                   172 non-null    object 
 3   street                         242 non-null    object 
 4   neighbourhood_id               242 non-null    object 
 5   district_id                    242 non-null    object 
 6   postal_code                    242 non-null    object 
 7   website                        203 non-null    object 
 8   normalized_phone               63 non-null     object 
 9   museum_type                    70 non-null     object 
 10  operator                       74 non-null     object 
 11  building                       70 non-null     object 
 12  wikipedia                      95 non-null     object 


In [295]:
museum_listings.head()

Unnamed: 0,museum_id,museum_name,house_number,street,neighbourhood_id,district_id,postal_code,website,normalized_phone,museum_type,operator,building,wikipedia,opening_hours,wheelchair,toilets_wheelchair_accessible,fee,latitude,longitude
0,73696610,gedenkst√§tte deutscher widerstand,13-14,stauffenberg stra√üe,104,11001001,10785,https://www.gdw-berlin.de/,,,,,de:gedenkst√§tte deutscher widerstand,mo-fr 09:00-18:00; sa-su 10:00-18:00,limited,,False,52.507605,13.362986
1,84644782,"dokumentationszentrum flucht, vertreibung, ver...",90,stresemann stra√üe,202,11002002,10963,https://www.flucht-vertreibung-versoehnung.de/,,,,,"de:stiftung flucht, vertreibung, vers√∂hnung",tu-su 10:00-19:00; mo off,yes,yes,False,52.504871,13.382214
2,259855486,berliner s-bahn-museum,3,koppen stra√üe,201,11002002,10243,https://s-bahn-museum.de/,,railway,berliner s-bahn-museum ggmbh,,,"we 12:00-16:00, th,fr 15:00-20:00, su 14:00-18:00",yes,,True,52.510475,13.432765
3,268591806,das museum der dom√§ne dahlem,,k√∂nigin luise stra√üe,605,11006006,14195,https://www.domaene-dahlem.de/home/,,open_air,,,,mo-su 10:00-18:00; th off,limited,,True,52.458357,13.288944
4,281391655,stasimuseum,103,rusche stra√üe,1103,11011011,10365,https://www.stasimuseum.de/,+49 30 5536854,history,astak e.v.,,de:forschungs- und gedenkst√§tte normannenstra√üe,mo-fr 10:00-18:00; sa-su 11:00-18:00,yes,,True,52.514507,13.487485
