# 1. Install required libraries and packages

In [2]:
pip install osmnx geopandas pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## 1.1 install packages

In [None]:
import osmnx as ox
import geopandas as gpd
import pandas as pd
import requests
import time
import logging
from sqlalchemy import create_engine, text
import psycopg2
import warnings

warnings.filterwarnings("ignore")

# Display all rows and columns in pandas DataFrames
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)


In [261]:
# disable disk caching
ox.settings.use_cache = False

# 2. get hospital data from OSM

In [329]:
#tags = {"amenity": "hospital"}
#tags = {
 #    "amenity": ["hospital","clinic"],  # hospitals and clinics
 #    "healthcare": ["hospital","clinic"],  # healthcare tagging
#}

##2.1 get hospital data from amenity hospital

In [330]:
tags1 = {
     "amenity": ["hospital"],  # hospitals and clinics
}
hospitalh_gdf = ox.features_from_place("Berlin, Germany", tags1)


In [331]:
hospitalh_gdf.info() 

<class 'geopandas.geodataframe.GeoDataFrame'>
MultiIndex: 58 entries, ('node', np.int64(5228591787)) to ('way', np.int64(1294595368))
Data columns (total 89 columns):
 #   Column                                 Non-Null Count  Dtype   
---  ------                                 --------------  -----   
 0   geometry                               58 non-null     geometry
 1   addr:city                              50 non-null     object  
 2   addr:housenumber                       50 non-null     object  
 3   addr:postcode                          50 non-null     object  
 4   addr:street                            50 non-null     object  
 5   amenity                                58 non-null     object  
 6   emergency                              35 non-null     object  
 7   healthcare                             58 non-null     object  
 8   healthcare:speciality                  19 non-null     object  
 9   name                                   58 non-null     object  
 10  

hospitals: 58 entries

In [375]:
print(hospitalh_gdf[["name"]])

                                                                                          name
element  id                                                                                   
node     5228591787                                                     Klinik Schöneberg GmbH
         10573646542                                                            Checkpoint BLN
         13108440442                                         Notaufnahme Park-Klinik Weißensee
         13162221796  Gemeinschaftspraxis Michael Balschin, Vadim Rubinstein, Irina Rabinovich
relation 7715945                                                 Franziskus-Krankenhaus Berlin
         8667439                                                            Wichernkrankenhaus
         16389059                        Evangelisches Krankenhaus Königin Elisabeth Herzberge
         17381892                                           Charité – Campus Benjamin Franklin
way      4610129                                  

## 2.2 get clinic data from amenity hospital

In [335]:
tags2 = {
     "amenity": ["clinic"],  # hospitals and clinics
}
hospitalc_gdf = ox.features_from_place("Berlin, Germany", tags2)

In [336]:
hospitalc_gdf.info() 

<class 'geopandas.geodataframe.GeoDataFrame'>
MultiIndex: 180 entries, ('node', np.int64(669088712)) to ('way', np.int64(1433767664))
Data columns (total 88 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   geometry                   180 non-null    geometry
 1   addr:city                  115 non-null    object  
 2   addr:country               82 non-null     object  
 3   addr:housenumber           129 non-null    object  
 4   addr:postcode              121 non-null    object  
 5   addr:street                132 non-null    object  
 6   addr:suburb                81 non-null     object  
 7   amenity                    180 non-null    object  
 8   check_date                 21 non-null     object  
 9   healthcare                 180 non-null    object  
 10  name                       174 non-null    object  
 11  wheelchair                 75 non-null     object  
 12  opening_hours              61 

clinic: 180 entries

In [376]:
print(hospitalc_gdf[["name"]])

                                                                                                                name
element  id                                                                                                         
node     669088712                                                                            Ärztehaus Johannisthal
         694302689                                                                       Ärztehaus am Roedeliusplatz
         872223518                                                                                         Ärztehaus
         874357625                                                                                MVZ am Moritzplatz
         911925361                                                                           Ärztehaus Damerowstraße
         933513566                                                                        Ärztehaus am Schlachtensee
         1038248666                                             

### 2.2.1 delete rows with inpropriate entries like Ärzte, Praxis, etc,

following occurrences are not hospitals:

Ärztehaus|Ärzte|Tagesklinik|MVZ|Praxis|Nierenzentrum|Nierenzentrum|Ostkreuz|Zahnklinik|Medico|Frauenheilkunde|CPC|Rückenzentrum|Augenheilkunde|Facharztzentrum|Fertility|Ästhetikwelt|ZAR|Hochschulambulanz|Schlaflabor|ÄrtzeZentrum|Polikum|Gesundheitszentrum|Institut|Hals-, Nasen-, Ohrenheilkunde|Sport-Gesundheitspark|Funktionsdiagnostik|Beratungszentrum|Gynäkologie Treptow|Pharmakovigilanz- und|Gerinnungszentrum|Endokrinologie|Orthomed|open.med|Augenarzt|Checkpoint|MRT-Akademie|M1|Notaufnahme|copv.berlin|Biberburg|Gesundheitsforum|Hausärztlich-Internistisches|Kimderwunsch|Versorungszentrum|OP-Zentrum|Tagesklinik|VitaHaus|Arztpraxen|Daheim|Roseneck|Psychiatrisch-Psychotherapeutische|Schlaf-Atem-Zentrum|Psychotherapie|Medical Center|Ambulatorium|Notaufnahme|Checkpoint

In [378]:
hospitalc_df = hospitalc_gdf[
    hospitalc_gdf["name"].notna() &
    ~hospitalc_gdf["name"].str.contains("Ärztehaus|Ärzte|Tagesklinik|MVZ|Praxis|Nierenzentrum|Nierenzentrum|Ostkreuz|Zahnklinik|Medico|Frauenheilkunde|CPC|Rückenzentrum|Augenheilkunde|Facharztzentrum|Fertility|Ästhetikwelt|ZAR|Hochschulambulanz|Schlaflabor|ÄrtzeZentrum|Polikum|Gesundheitszentrum|Institut|Hals-, Nasen-, Ohrenheilkunde|Sport-Gesundheitspark|Funktionsdiagnostik|Beratungszentrum|Gynäkologie Treptow|Pharmakovigilanz- und|Gerinnungszentrum|Endokrinologie|Orthomed|open.med|Augenarzt|Checkpoint|MRT-Akademie|M1|Notaufnahme|copv.berlin|Biberburg|Gesundheitsforum|Hausärztlich-Internistisches|Kimderwunsch|Versorungszentrum|OP-Zentrum|Tagesklinik|VitaHaus|Arztpraxen|Daheim|Roseneck|Psychiatrisch-Psychotherapeutische|Schlaf-Atem-Zentrum|Psychotherapie|Medical Center|Ambulatorium|Notaufnahme|Checkpoint|Checkpoint BLN", case=False, na=False)]

In [379]:
hospitalc_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
MultiIndex: 22 entries, ('node', np.int64(3374664287)) to ('way', np.int64(546797635))
Data columns (total 88 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   geometry                   22 non-null     geometry
 1   addr:city                  10 non-null     object  
 2   addr:country               6 non-null      object  
 3   addr:housenumber           11 non-null     object  
 4   addr:postcode              11 non-null     object  
 5   addr:street                11 non-null     object  
 6   addr:suburb                6 non-null      object  
 7   amenity                    22 non-null     object  
 8   check_date                 3 non-null      object  
 9   healthcare                 22 non-null     object  
 10  name                       22 non-null     object  
 11  wheelchair                 4 non-null      object  
 12  opening_hours              3 no

after deletion of clincis 22 rows

## 2.3 union the 2 dataframes

In [380]:
union_df = pd.concat([hospitalh_gdf, hospitalc_df]).drop_duplicates().reset_index(drop=True)
union_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 80 entries, 0 to 79
Columns: 119 entries, geometry to contact:suburb
dtypes: geometry(1), object(118)
memory usage: 74.5+ KB


unioned df has 80 entries

## 2.4 keep only relevant columns

In [381]:

hospital_df = union_df[["name", "geometry", "operator", "brand", "addr:city", "addr:street", "addr:housenumber", "addr:postcode", "addr:suburb", "phone","email", "website", "wheelchair", "toilets:wheelchair", "beds", "emergency", "healthcare:speciality","opening_hours","source"]]
hospital_df.head(3)

Unnamed: 0,name,geometry,operator,brand,addr:city,addr:street,addr:housenumber,addr:postcode,addr:suburb,phone,email,website,wheelchair,toilets:wheelchair,beds,emergency,healthcare:speciality,opening_hours,source
0,Klinik Schöneberg GmbH,POINT (13.34455 52.4988),Klinik Schöneberg GmbH,,Berlin,Fuggerstraße,23,10777.0,,+49 30 23601-0,,https://klinik-schoeneberg.de/,no,,,no,anaesthetics;urology;plastic_surgery;surgery;orthopaedics,,
1,Checkpoint BLN,POINT (13.42456 52.48584),,,Berlin,Hermannstraße,256-258,12049.0,,,,,yes,,,yes,infectious_diseases,,
2,Notaufnahme Park-Klinik Weißensee,POINT (13.45058 52.55501),,,,,,,,,,,,,,,,,


In [382]:
hospital_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   name                   80 non-null     object  
 1   geometry               80 non-null     geometry
 2   operator               40 non-null     object  
 3   brand                  3 non-null      object  
 4   addr:city              60 non-null     object  
 5   addr:street            61 non-null     object  
 6   addr:housenumber       61 non-null     object  
 7   addr:postcode          61 non-null     object  
 8   addr:suburb            40 non-null     object  
 9   phone                  25 non-null     object  
 10  email                  6 non-null      object  
 11  website                36 non-null     object  
 12  wheelchair             51 non-null     object  
 13  toilets:wheelchair     11 non-null     object  
 14  beds                   2 non-null   

In [366]:
# Ensure geometry type is Point for lat/lon extraction

hospital_df = hospital_df.to_crs(epsg=4326)

## 2.5 extract longitude and latitude

In [383]:
hospital_df['geometry'] = hospital_df['geometry'].apply(lambda geom: geom if geom.geom_type == 'Point' else geom.representative_point())
#Extract latitude and longitude
hospital_df["latitude"] = hospital_df.geometry.y
hospital_df["longitude"] = hospital_df.geometry.x
hospital_df.head(3)

Unnamed: 0,name,geometry,operator,brand,addr:city,addr:street,addr:housenumber,addr:postcode,addr:suburb,phone,email,website,wheelchair,toilets:wheelchair,beds,emergency,healthcare:speciality,opening_hours,source,latitude,longitude
0,Klinik Schöneberg GmbH,POINT (13.34455 52.4988),Klinik Schöneberg GmbH,,Berlin,Fuggerstraße,23,10777.0,,+49 30 23601-0,,https://klinik-schoeneberg.de/,no,,,no,anaesthetics;urology;plastic_surgery;surgery;orthopaedics,,,52.498799,13.344552
1,Checkpoint BLN,POINT (13.42456 52.48584),,,Berlin,Hermannstraße,256-258,12049.0,,,,,yes,,,yes,infectious_diseases,,,52.485835,13.42456
2,Notaufnahme Park-Klinik Weißensee,POINT (13.45058 52.55501),,,,,,,,,,,,,,,,,,52.555011,13.450579


# 3. select required columns

In [384]:
#add country=Germany
hospital_df['country'] = 'Germany'

In [354]:
# Select the columns  and rename accordingly

selected_columns = [
         "name", "operator", "brand", "country", "addr:city", "addr:street", "addr:housenumber", "addr:postcode", "addr:suburb", 
         "phone","email", "website", "wheelchair", "toilets:wheelchair", "beds", "emergency", "healthcare:speciality",
         "opening_hours","latitude", "longitude","geometry", "source"]
      

In [385]:
hospital_df = hospital_df[selected_columns]
hospital_df.head(3)

Unnamed: 0,name,operator,brand,country,addr:city,addr:street,addr:housenumber,addr:postcode,addr:suburb,phone,email,website,wheelchair,toilets:wheelchair,beds,emergency,healthcare:speciality,opening_hours,latitude,longitude,geometry,source
0,Klinik Schöneberg GmbH,Klinik Schöneberg GmbH,,Germany,Berlin,Fuggerstraße,23,10777.0,,+49 30 23601-0,,https://klinik-schoeneberg.de/,no,,,no,anaesthetics;urology;plastic_surgery;surgery;orthopaedics,,52.498799,13.344552,POINT (13.34455 52.4988),
1,Checkpoint BLN,,,Germany,Berlin,Hermannstraße,256-258,12049.0,,,,,yes,,,yes,infectious_diseases,,52.485835,13.42456,POINT (13.42456 52.48584),
2,Notaufnahme Park-Klinik Weißensee,,,Germany,,,,,,,,,,,,,,,52.555011,13.450579,POINT (13.45058 52.55501),


In [386]:
nan_names = hospital_df[hospital_df["name"].isna()]

print(nan_names)
print(len(nan_names), "rows with NaN name")

Empty GeoDataFrame
Columns: [name, operator, brand, country, addr:city, addr:street, addr:housenumber, addr:postcode, addr:suburb, phone, email, website, wheelchair, toilets:wheelchair, beds, emergency, healthcare:speciality, opening_hours, latitude, longitude, geometry, source]
Index: []
0 rows with NaN name


In [387]:
print("\nTop 100 hospitals:")
print(hospital_df["name"].value_counts().head(100))


Top 100 hospitals:
name
Klinik Schöneberg GmbH                                                                            1
Checkpoint BLN                                                                                    1
Klinik für Anästhesiologie und operative Intensivmedizin                                          1
Klinik für MIC Minimal Invasive Chirurgie                                                         1
Klinik Schöneberg                                                                                 1
Evangelische Elisabeth Klinik                                                                     1
Klinikum im Friedrichshain                                                                        1
Arona Klinik für Altersmedizin                                                                    1
Sankt-Marien-Krankenhaus                                                                          1
Krankenhaus Bethel                                                         

In [388]:
rename_map = {
    "addr:street": "street",
    "addr:housenumber": "housenumber",
    "addr:postcode": "postcode",
    "addr:city": "city",
    "addr:suburb": "neighborhood",
    "healthcare:speciality": "speciality",
    "toilets:wheelchair": "toilets_wheelchair"}

In [389]:
# Rename the columns
hospital_df = hospital_df.rename(columns=rename_map)
hospital_df.head(3)

Unnamed: 0,name,operator,brand,country,city,street,housenumber,postcode,neighborhood,phone,email,website,wheelchair,toilets_wheelchair,beds,emergency,speciality,opening_hours,latitude,longitude,geometry,source
0,Klinik Schöneberg GmbH,Klinik Schöneberg GmbH,,Germany,Berlin,Fuggerstraße,23,10777.0,,+49 30 23601-0,,https://klinik-schoeneberg.de/,no,,,no,anaesthetics;urology;plastic_surgery;surgery;orthopaedics,,52.498799,13.344552,POINT (13.34455 52.4988),
1,Checkpoint BLN,,,Germany,Berlin,Hermannstraße,256-258,12049.0,,,,,yes,,,yes,infectious_diseases,,52.485835,13.42456,POINT (13.42456 52.48584),
2,Notaufnahme Park-Klinik Weißensee,,,Germany,,,,,,,,,,,,,,,52.555011,13.450579,POINT (13.45058 52.55501),


In [390]:
hospital_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   name                80 non-null     object  
 1   operator            40 non-null     object  
 2   brand               3 non-null      object  
 3   country             80 non-null     object  
 4   city                60 non-null     object  
 5   street              61 non-null     object  
 6   housenumber         61 non-null     object  
 7   postcode            61 non-null     object  
 8   neighborhood        40 non-null     object  
 9   phone               25 non-null     object  
 10  email               6 non-null      object  
 11  website             36 non-null     object  
 12  wheelchair          51 non-null     object  
 13  toilets_wheelchair  11 non-null     object  
 14  beds                2 non-null      object  
 15  emergency           38 non-null   

# 4. retrieve district and district_id and neighborhood for missings

In [391]:
from geopy.geocoders import Nominatim
from time import sleep

In [392]:


def fetch_location_info(df, lat_col="latitude", lon_col="longitude", level="district", user_agent="berlin-venues-scraper/1.0", delay=1):
    """
    Fetch district or neighborhood from Nominatim for a DataFrame with lat/lon columns.
    
    Parameters:
        df: pd.DataFrame with latitude and longitude columns
        lat_col, lon_col: names of the lat/lon columns
        level: "district" or "neighborhood"
        user_agent: User-Agent string for Nominatim
        delay: delay in seconds between requests
    
    Returns:
        pd.Series with district or neighborhood names
    """
    def get_info(lat, lon):
        url = "https://nominatim.openstreetmap.org/reverse"
        params = {"lat": lat, "lon": lon, "format": "json", "addressdetails": 1}
        headers = {"User-Agent": user_agent}
        try:
            r = requests.get(url, params=params, headers=headers, timeout=10)
            r.raise_for_status()
            data = r.json()
            address = data.get("address", {})

            if level == "district":
                # Only official Bezirke
                return (
                    address.get("city_district")
                    or address.get("borough")
                    or address.get("county")
                    or address.get("state_district")
                )
            elif level == "neighborhood":
                # Include suburb / neighbourhood / city_district
                return (
                    address.get("suburb")
                    or address.get("city_district")
                    or address.get("borough")
                    or address.get("neighbourhood")
                )
            else:
                return None
        except requests.exceptions.RequestException as e:
            logging.warning(f"Error fetching {level} for ({lat}, {lon}): {e}")
            return None

    # Apply with throttling
    results = []
    for i, row in df.iterrows():
        lat, lon = row[lat_col], row[lon_col]
        if pd.notna(lat) and pd.notna(lon):
            results.append(get_info(lat, lon))
            time.sleep(delay)
        else:
            results.append(None)
    return pd.Series(results, index=df.index)

# Usage examples:
hospital_df["district"] = fetch_location_info(hospital_df, level="district")
hospital_df["neighborhood_new"] = fetch_location_info(hospital_df, level="neighborhood")


In [393]:
hospital_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   name                80 non-null     object  
 1   operator            40 non-null     object  
 2   brand               3 non-null      object  
 3   country             80 non-null     object  
 4   city                60 non-null     object  
 5   street              61 non-null     object  
 6   housenumber         61 non-null     object  
 7   postcode            61 non-null     object  
 8   neighborhood        40 non-null     object  
 9   phone               25 non-null     object  
 10  email               6 non-null      object  
 11  website             36 non-null     object  
 12  wheelchair          51 non-null     object  
 13  toilets_wheelchair  11 non-null     object  
 14  beds                2 non-null      object  
 15  emergency           38 non-null   

In [394]:
# District mapping (official codes as strings)
district_mapping = {
    'Mitte': '11001001',
    'Friedrichshain-Kreuzberg': '11002002',
    'Pankow': '11003003',
    'Charlottenburg-Wilmersdorf': '11004004',
    'Spandau': '11005005',
    'Steglitz-Zehlendorf': '11006006',
    'Tempelhof-Schöneberg': '11007007',
    'Neukölln': '11008008',
    'Treptow-Köpenick': '11009009',
    'Marzahn-Hellersdorf': '11010010',
    'Lichtenberg': '11011011',
    'Reinickendorf': '11012012'
}

# Apply mapping to create district_id column
hospital_df['district_id'] = (
    hospital_df['district']
    .map(district_mapping)
    .astype(str)
)
 

In [395]:
#for missing fill with retrieve information from neighborhood and drop the temporary column
hospital_df["neighborhood"] = hospital_df["neighborhood"].fillna(hospital_df["neighborhood_new"])
hospital_df.drop(columns=["neighborhood_new"], inplace=True)
hospital_df.head(10)

Unnamed: 0,name,operator,brand,country,city,street,housenumber,postcode,neighborhood,phone,email,website,wheelchair,toilets_wheelchair,beds,emergency,speciality,opening_hours,latitude,longitude,geometry,source,district,district_id
0,Klinik Schöneberg GmbH,Klinik Schöneberg GmbH,,Germany,Berlin,Fuggerstraße,23,10777.0,Schöneberg,+49 30 23601-0,,https://klinik-schoeneberg.de/,no,,,no,anaesthetics;urology;plastic_surgery;surgery;orthopaedics,,52.498799,13.344552,POINT (13.34455 52.4988),,Tempelhof-Schöneberg,11007007
1,Checkpoint BLN,,,Germany,Berlin,Hermannstraße,256-258,12049.0,Neukölln,,,,yes,,,yes,infectious_diseases,,52.485835,13.42456,POINT (13.42456 52.48584),,Neukölln,11008008
2,Notaufnahme Park-Klinik Weißensee,,,Germany,,,,,Weißensee,,,,,,,,,,52.555011,13.450579,POINT (13.45058 52.55501),,Pankow,11003003
3,"Gemeinschaftspraxis Michael Balschin, Vadim Rubinstein, Irina Rabinovich",,,Germany,Berlin,Reinickendorfer Straße,45,13347.0,Wedding,030 / 46 50 78 36,,,no,,,,general,,52.549734,13.368747,POINT (13.36875 52.54973),,Mitte,11001001
4,Franziskus-Krankenhaus Berlin,Thuiner Franziskanerinnen,,Germany,Berlin,Budapester Straße,15-19,10787.0,Tiergarten,,,,yes,,,yes,,,52.506031,13.345056,POINT (13.34506 52.50603),,Mitte,11001001
5,Wichernkrankenhaus,Johannesstift Diakonie,,Germany,Berlin,Schönwalder Allee,26/51,13587.0,Hakenfelde,,,,yes,,,no,,,52.569471,13.192006,POINT (13.19201 52.56947),,Spandau,11005005
6,Evangelisches Krankenhaus Königin Elisabeth Herzberge,Evangelisches Krankenhaus Königin Elisabeth Herzberge gGmbH,,Germany,Berlin,Herzbergstraße,79,10365.0,Lichtenberg,,,,yes,,,yes,,,52.527922,13.508992,POINT (13.50899 52.52792),,Lichtenberg,11011011
7,Charité – Campus Benjamin Franklin,Charité,,Germany,Berlin,Hindenburgdamm,30,12203.0,Lichterfelde,,,https://www.charite.de/die_charite/campi/campus_benjamin_franklin/,yes,yes,,yes,general,,52.441908,13.320544,POINT (13.32054 52.44191),,Steglitz-Zehlendorf,11006006
8,Jüdisches Krankenhaus Berlin,,,Germany,Berlin,Heinz-Galinski-Straße,1,13347.0,Gesundbrunnen,+49 30 49940,,https://www.juedisches-krankenhaus.de/,yes,yes,,yes,,,52.555377,13.370214,POINT (13.37021 52.55538),,Mitte,11001001
9,DRK Kliniken Berlin Mitte,Deutsches Rotes Kreuz Schwesternschaft Berlin,Deutsches Rotes Kreuz,Germany,Berlin,Drontheimer Straße,39-40,13359.0,Gesundbrunnen,+49 30 30356000,,,yes,,,yes,,,52.557402,13.375736,POINT (13.37574 52.5574),,Mitte,11001001


# Step 2: review created data frame

## 2.1 How many rows and columns?

In [396]:


print("Rows, Columns:", hospital_df.shape)

Rows, Columns: (80, 24)


## 2.2 missing values per columns

In [397]:
missing_count = hospital_df.isna().sum().sort_values(ascending=False)
print(missing_count)

beds                  78
source                78
brand                 77
opening_hours         75
email                 74
toilets_wheelchair    69
phone                 55
speciality            49
website               44
emergency             42
operator              40
wheelchair            29
city                  20
postcode              19
housenumber           19
street                19
longitude              0
district               0
geometry               0
name                   0
latitude               0
neighborhood           0
country                0
district_id            0
dtype: int64


In [398]:
# Number of rows (observations, hospitals)
# I need this to compute percentages of missing values below

row_count = len(hospital_df)
print(row_count)

80


In [399]:
# Build table with counts and % of missing values
# What does pd.DataFrame({...}) do? It converts that dictionary into a DataFrame (like an Excel table).
# The keys become column names.
# The values become column data.

missing = pd.DataFrame({
    "missing_count": missing_count,
    "missing_pct": (missing_count / row_count * 100).round(1)
}).sort_values(by="missing_pct", ascending=False)

print(missing)

                    missing_count  missing_pct
beds                           78         97.5
source                         78         97.5
brand                          77         96.2
opening_hours                  75         93.8
email                          74         92.5
toilets_wheelchair             69         86.2
phone                          55         68.8
speciality                     49         61.3
website                        44         55.0
emergency                      42         52.5
operator                       40         50.0
wheelchair                     29         36.2
city                           20         25.0
postcode                       19         23.8
housenumber                    19         23.8
street                         19         23.8
longitude                       0          0.0
district                        0          0.0
geometry                        0          0.0
name                            0          0.0
latitude     

# 3. Decision for keeping columns no/yes
opening_hours                     56         96.6.    drop   
source                            56         96.6.    
beds                              56         96.6.    need to be populated (join by ?), use old datasets  
brand                             55         94.8
email                             53         91.4
toilets:wheelchair                48         82.8

# 4. Handling of missing value / normalization


In [400]:
# Replace NaN with "unknown" and standardize values

text_cols = ["name", "street", "city", "country", "website", "operator", "brand", "phone", "email", "source", "beds","housenumber", "postcode","emergency","wheelchair", "speciality" ,'opening_hours']
for col in text_cols:
    if col in hospital_df.columns:
        hospital_df[col] = hospital_df[col].astype(str).str.strip()
        hospital_df[col] = hospital_df[col].replace({"nan": "unknown", "none": "unknown", "null": "unknown"})   
hospital_df.head(10)

Unnamed: 0,name,operator,brand,country,city,street,housenumber,postcode,neighborhood,phone,email,website,wheelchair,toilets_wheelchair,beds,emergency,speciality,opening_hours,latitude,longitude,geometry,source,district,district_id
0,Klinik Schöneberg GmbH,Klinik Schöneberg GmbH,unknown,Germany,Berlin,Fuggerstraße,23,10777,Schöneberg,+49 30 23601-0,unknown,https://klinik-schoeneberg.de/,no,,unknown,no,anaesthetics;urology;plastic_surgery;surgery;orthopaedics,unknown,52.498799,13.344552,POINT (13.34455 52.4988),unknown,Tempelhof-Schöneberg,11007007
1,Checkpoint BLN,unknown,unknown,Germany,Berlin,Hermannstraße,256-258,12049,Neukölln,unknown,unknown,unknown,yes,,unknown,yes,infectious_diseases,unknown,52.485835,13.42456,POINT (13.42456 52.48584),unknown,Neukölln,11008008
2,Notaufnahme Park-Klinik Weißensee,unknown,unknown,Germany,unknown,unknown,unknown,unknown,Weißensee,unknown,unknown,unknown,unknown,,unknown,unknown,unknown,unknown,52.555011,13.450579,POINT (13.45058 52.55501),unknown,Pankow,11003003
3,"Gemeinschaftspraxis Michael Balschin, Vadim Rubinstein, Irina Rabinovich",unknown,unknown,Germany,Berlin,Reinickendorfer Straße,45,13347,Wedding,030 / 46 50 78 36,unknown,unknown,no,,unknown,unknown,general,unknown,52.549734,13.368747,POINT (13.36875 52.54973),unknown,Mitte,11001001
4,Franziskus-Krankenhaus Berlin,Thuiner Franziskanerinnen,unknown,Germany,Berlin,Budapester Straße,15-19,10787,Tiergarten,unknown,unknown,unknown,yes,,unknown,yes,unknown,unknown,52.506031,13.345056,POINT (13.34506 52.50603),unknown,Mitte,11001001
5,Wichernkrankenhaus,Johannesstift Diakonie,unknown,Germany,Berlin,Schönwalder Allee,26/51,13587,Hakenfelde,unknown,unknown,unknown,yes,,unknown,no,unknown,unknown,52.569471,13.192006,POINT (13.19201 52.56947),unknown,Spandau,11005005
6,Evangelisches Krankenhaus Königin Elisabeth Herzberge,Evangelisches Krankenhaus Königin Elisabeth Herzberge gGmbH,unknown,Germany,Berlin,Herzbergstraße,79,10365,Lichtenberg,unknown,unknown,unknown,yes,,unknown,yes,unknown,unknown,52.527922,13.508992,POINT (13.50899 52.52792),unknown,Lichtenberg,11011011
7,Charité – Campus Benjamin Franklin,Charité,unknown,Germany,Berlin,Hindenburgdamm,30,12203,Lichterfelde,unknown,unknown,https://www.charite.de/die_charite/campi/campus_benjamin_franklin/,yes,yes,unknown,yes,general,unknown,52.441908,13.320544,POINT (13.32054 52.44191),unknown,Steglitz-Zehlendorf,11006006
8,Jüdisches Krankenhaus Berlin,unknown,unknown,Germany,Berlin,Heinz-Galinski-Straße,1,13347,Gesundbrunnen,+49 30 49940,unknown,https://www.juedisches-krankenhaus.de/,yes,yes,unknown,yes,unknown,unknown,52.555377,13.370214,POINT (13.37021 52.55538),unknown,Mitte,11001001
9,DRK Kliniken Berlin Mitte,Deutsches Rotes Kreuz Schwesternschaft Berlin,Deutsches Rotes Kreuz,Germany,Berlin,Drontheimer Straße,39-40,13359,Gesundbrunnen,+49 30 30356000,unknown,unknown,yes,,unknown,yes,unknown,unknown,52.557402,13.375736,POINT (13.37574 52.5574),unknown,Mitte,11001001


In [401]:
# Standardize column names

hospital_df.columns = hospital_df.columns.str.lower().str.strip().str.replace(" ", "_").str.replace("-", "_")

# Convert certain columns to correct type

hospital_df["housenumber"] = hospital_df["housenumber"].astype(str)   # ensure text

hospital_df["postcode"] = hospital_df["postcode"].astype(str)         # keep leading zeros

# Normalize yes/no columns into Boolean (True/False)

hospital_df["wheelchair"] = hospital_df["wheelchair"].map({"yes": True, "no": False})

# Make text values consistent (lowercase to avoid duplicates )
 

In [402]:
hospital_df 

Unnamed: 0,name,operator,brand,country,city,street,housenumber,postcode,neighborhood,phone,email,website,wheelchair,toilets_wheelchair,beds,emergency,speciality,opening_hours,latitude,longitude,geometry,source,district,district_id
0,Klinik Schöneberg GmbH,Klinik Schöneberg GmbH,unknown,Germany,Berlin,Fuggerstraße,23,10777,Schöneberg,+49 30 23601-0,unknown,https://klinik-schoeneberg.de/,False,,unknown,no,anaesthetics;urology;plastic_surgery;surgery;orthopaedics,unknown,52.498799,13.344552,POINT (13.34455 52.4988),unknown,Tempelhof-Schöneberg,11007007
1,Checkpoint BLN,unknown,unknown,Germany,Berlin,Hermannstraße,256-258,12049,Neukölln,unknown,unknown,unknown,True,,unknown,yes,infectious_diseases,unknown,52.485835,13.42456,POINT (13.42456 52.48584),unknown,Neukölln,11008008
2,Notaufnahme Park-Klinik Weißensee,unknown,unknown,Germany,unknown,unknown,unknown,unknown,Weißensee,unknown,unknown,unknown,,,unknown,unknown,unknown,unknown,52.555011,13.450579,POINT (13.45058 52.55501),unknown,Pankow,11003003
3,"Gemeinschaftspraxis Michael Balschin, Vadim Rubinstein, Irina Rabinovich",unknown,unknown,Germany,Berlin,Reinickendorfer Straße,45,13347,Wedding,030 / 46 50 78 36,unknown,unknown,False,,unknown,unknown,general,unknown,52.549734,13.368747,POINT (13.36875 52.54973),unknown,Mitte,11001001
4,Franziskus-Krankenhaus Berlin,Thuiner Franziskanerinnen,unknown,Germany,Berlin,Budapester Straße,15-19,10787,Tiergarten,unknown,unknown,unknown,True,,unknown,yes,unknown,unknown,52.506031,13.345056,POINT (13.34506 52.50603),unknown,Mitte,11001001
5,Wichernkrankenhaus,Johannesstift Diakonie,unknown,Germany,Berlin,Schönwalder Allee,26/51,13587,Hakenfelde,unknown,unknown,unknown,True,,unknown,no,unknown,unknown,52.569471,13.192006,POINT (13.19201 52.56947),unknown,Spandau,11005005
6,Evangelisches Krankenhaus Königin Elisabeth Herzberge,Evangelisches Krankenhaus Königin Elisabeth Herzberge gGmbH,unknown,Germany,Berlin,Herzbergstraße,79,10365,Lichtenberg,unknown,unknown,unknown,True,,unknown,yes,unknown,unknown,52.527922,13.508992,POINT (13.50899 52.52792),unknown,Lichtenberg,11011011
7,Charité – Campus Benjamin Franklin,Charité,unknown,Germany,Berlin,Hindenburgdamm,30,12203,Lichterfelde,unknown,unknown,https://www.charite.de/die_charite/campi/campus_benjamin_franklin/,True,yes,unknown,yes,general,unknown,52.441908,13.320544,POINT (13.32054 52.44191),unknown,Steglitz-Zehlendorf,11006006
8,Jüdisches Krankenhaus Berlin,unknown,unknown,Germany,Berlin,Heinz-Galinski-Straße,1,13347,Gesundbrunnen,+49 30 49940,unknown,https://www.juedisches-krankenhaus.de/,True,yes,unknown,yes,unknown,unknown,52.555377,13.370214,POINT (13.37021 52.55538),unknown,Mitte,11001001
9,DRK Kliniken Berlin Mitte,Deutsches Rotes Kreuz Schwesternschaft Berlin,Deutsches Rotes Kreuz,Germany,Berlin,Drontheimer Straße,39-40,13359,Gesundbrunnen,+49 30 30356000,unknown,unknown,True,,unknown,yes,unknown,unknown,52.557402,13.375736,POINT (13.37574 52.5574),unknown,Mitte,11001001


In [220]:


# Connect to postgres DB
user_name='heike_reichert'
password='0BseedGaL9sMu6X8'

In [221]:
# Conection
host = 'localhost'
port = '5433'
database = 'layereddb'
schema='berlin_source_data'

#connection to db after you opened tunnel
engine = create_engine(f'postgresql+psycopg2://{user_name}:{password}@{host}:{port}/{database}')

In [None]:
##let's query test data!
query = f"""
SELECT * from berlin_source_data.hospitals 
"""

# Execute the query
#with engine.connect() as conn:
#    df= pd.read_sql(text(query), conn)
#    #conn.commit()  # commit the transaction
#df

Unnamed: 0,district_id,name,address,coordinates,latitude,longitude,locality,district,distance,beds,cases
0,11001001,St. Hedwig-Krankenhaus Berlin,Große Hamburger Straße 5-11\n10115 Berlin,"52.52582662684028, 13.396515830691005",52.525827,13.396516,Alexanderplatz,Mitte,0.8,427,64515
1,11001001,Charité - Universitätsmedizin Berlin,Charitéplatz 1\n10117 Berlin,"52.52662465412624, 13.376658301525385",52.526625,13.376658,Alexanderplatz,Mitte,2.2,3011,1891343
2,11002002,Vivantes Klinikum im Friedrichshain,Landsberger Allee 49\n10249 Berlin,"52.52476641204036, 13.43904818247062",52.524766,13.439048,Karl-Marx-Allee-Nord,Friedrichshain-Kreuzberg,2.4,956,111119
3,11003003,Psychiatrisch-Psychotherapeutische Tagesklinik...,Diesterwegstr. 32\n10405 Berlin,"52.5413743809574, 13.430949",52.541374,13.430949,Prenzlauer Berg Süd,Pankow,2.7,21,2004
4,11002002,Vivantes Klinikum Am Urban,Dieffenbachstraße 1\n10967 Berlin,"52.494275898889505, 13.40892918516296",52.494276,13.408929,Tempelhofer Vorstadt,Friedrichshain-Kreuzberg,2.9,607,65066
...,...,...,...,...,...,...,...,...,...,...,...
70,11006006,Kliniken im Theodor-Wenzel-Werk,Potsdamer Chaussee 69\n14129 Berlin,"52.42317751331279, 13.205861907349076",52.423178,13.205862,Zehlendorf Südwest,Steglitz-Zehlendorf,17.3,335,6765
71,11005005,Evangelisches Waldkrankenhaus Spandau,Stadtrandstraße 555\n13589 Berlin,"52.56496347506942, 13.156077626981746",52.564963,13.156078,Falkenhagener Feld,Spandau,17.4,514,96256
72,11005005,MEDIAN Klinik Berlin Kladow Fachkrankenhaus fü...,Kladower Damm 223\n14089 Berlin,"52.46258606091385, 13.161695133614485",52.462586,13.161695,Gatow / Kladow,Spandau,17.6,35,103
73,11005005,Gemeinschaftskrankenhaus Havelhöhe,Kladower Damm 221\n14089 Berlin,"52.46146749258415, 13.159668084007798",52.461467,13.159668,Gatow / Kladow,Spandau,17.8,400,22660


In [None]:
#df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   district_id  75 non-null     object 
 1   name         75 non-null     object 
 2   address      75 non-null     object 
 3   coordinates  75 non-null     object 
 4   latitude     75 non-null     float64
 5   longitude    75 non-null     float64
 6   locality     75 non-null     object 
 7   district     75 non-null     object 
 8   distance     75 non-null     float64
 9   beds         75 non-null     int64  
 10  cases        75 non-null     int64  
dtypes: float64(3), int64(2), object(6)
memory usage: 6.6+ KB


In [None]:
#df

Unnamed: 0,district_id,name,address,coordinates,latitude,longitude,locality,district,distance,beds,cases
0,11001001,St. Hedwig-Krankenhaus Berlin,Große Hamburger Straße 5-11\n10115 Berlin,"52.52582662684028, 13.396515830691005",52.525827,13.396516,Alexanderplatz,Mitte,0.8,427,64515
1,11001001,Charité - Universitätsmedizin Berlin,Charitéplatz 1\n10117 Berlin,"52.52662465412624, 13.376658301525385",52.526625,13.376658,Alexanderplatz,Mitte,2.2,3011,1891343
2,11002002,Vivantes Klinikum im Friedrichshain,Landsberger Allee 49\n10249 Berlin,"52.52476641204036, 13.43904818247062",52.524766,13.439048,Karl-Marx-Allee-Nord,Friedrichshain-Kreuzberg,2.4,956,111119
3,11003003,Psychiatrisch-Psychotherapeutische Tagesklinik...,Diesterwegstr. 32\n10405 Berlin,"52.5413743809574, 13.430949",52.541374,13.430949,Prenzlauer Berg Süd,Pankow,2.7,21,2004
4,11002002,Vivantes Klinikum Am Urban,Dieffenbachstraße 1\n10967 Berlin,"52.494275898889505, 13.40892918516296",52.494276,13.408929,Tempelhofer Vorstadt,Friedrichshain-Kreuzberg,2.9,607,65066
...,...,...,...,...,...,...,...,...,...,...,...
70,11006006,Kliniken im Theodor-Wenzel-Werk,Potsdamer Chaussee 69\n14129 Berlin,"52.42317751331279, 13.205861907349076",52.423178,13.205862,Zehlendorf Südwest,Steglitz-Zehlendorf,17.3,335,6765
71,11005005,Evangelisches Waldkrankenhaus Spandau,Stadtrandstraße 555\n13589 Berlin,"52.56496347506942, 13.156077626981746",52.564963,13.156078,Falkenhagener Feld,Spandau,17.4,514,96256
72,11005005,MEDIAN Klinik Berlin Kladow Fachkrankenhaus fü...,Kladower Damm 223\n14089 Berlin,"52.46258606091385, 13.161695133614485",52.462586,13.161695,Gatow / Kladow,Spandau,17.6,35,103
73,11005005,Gemeinschaftskrankenhaus Havelhöhe,Kladower Damm 221\n14089 Berlin,"52.46146749258415, 13.159668084007798",52.461467,13.159668,Gatow / Kladow,Spandau,17.8,400,22660


In [226]:
# try to join 2 datasets on name, longitude and latitude
query = f"""
SELECT *, 'old' as datab from berlin_source_data.hospitals 
"""

# Execute the query
with engine.connect() as conn:
    df= pd.read_sql(text(query), conn)
    #conn.commit()  # commit the transaction
df

Unnamed: 0,district_id,name,address,coordinates,latitude,longitude,locality,district,distance,beds,cases,datab
0,11001001,St. Hedwig-Krankenhaus Berlin,Große Hamburger Straße 5-11\n10115 Berlin,"52.52582662684028, 13.396515830691005",52.525827,13.396516,Alexanderplatz,Mitte,0.8,427,64515,old
1,11001001,Charité - Universitätsmedizin Berlin,Charitéplatz 1\n10117 Berlin,"52.52662465412624, 13.376658301525385",52.526625,13.376658,Alexanderplatz,Mitte,2.2,3011,1891343,old
2,11002002,Vivantes Klinikum im Friedrichshain,Landsberger Allee 49\n10249 Berlin,"52.52476641204036, 13.43904818247062",52.524766,13.439048,Karl-Marx-Allee-Nord,Friedrichshain-Kreuzberg,2.4,956,111119,old
3,11003003,Psychiatrisch-Psychotherapeutische Tagesklinik...,Diesterwegstr. 32\n10405 Berlin,"52.5413743809574, 13.430949",52.541374,13.430949,Prenzlauer Berg Süd,Pankow,2.7,21,2004,old
4,11002002,Vivantes Klinikum Am Urban,Dieffenbachstraße 1\n10967 Berlin,"52.494275898889505, 13.40892918516296",52.494276,13.408929,Tempelhofer Vorstadt,Friedrichshain-Kreuzberg,2.9,607,65066,old
...,...,...,...,...,...,...,...,...,...,...,...,...
70,11006006,Kliniken im Theodor-Wenzel-Werk,Potsdamer Chaussee 69\n14129 Berlin,"52.42317751331279, 13.205861907349076",52.423178,13.205862,Zehlendorf Südwest,Steglitz-Zehlendorf,17.3,335,6765,old
71,11005005,Evangelisches Waldkrankenhaus Spandau,Stadtrandstraße 555\n13589 Berlin,"52.56496347506942, 13.156077626981746",52.564963,13.156078,Falkenhagener Feld,Spandau,17.4,514,96256,old
72,11005005,MEDIAN Klinik Berlin Kladow Fachkrankenhaus fü...,Kladower Damm 223\n14089 Berlin,"52.46258606091385, 13.161695133614485",52.462586,13.161695,Gatow / Kladow,Spandau,17.6,35,103,old
73,11005005,Gemeinschaftskrankenhaus Havelhöhe,Kladower Damm 221\n14089 Berlin,"52.46146749258415, 13.159668084007798",52.461467,13.159668,Gatow / Kladow,Spandau,17.8,400,22660,old


In [403]:
# Make a deep copy
df_old = df.copy(deep=True)


In [404]:
# Replace substr in name according to OSM name
# Define replacements: key = string to replace, value = replacement
replacements = {
    "ARONA Klinik für Altersmedizin Berlin-Marzahn & Tagesklinik für Geriatrie": "Arona Klinik für Altersmedizin",
    "Alexianer St. Joseph-Krankenhaus Berlin-Weißensee": "Alexianer St. Joseph Krankenhaus Berlin-Weißensee",
    "Augenklinik Berlin-Marzahn GmbH": "Augenklinik Berlin Marzahn",
    "DRK Kliniken Berlin Köpenick": "DRK Kliniken Berlin - Köpenick",
    "Deutsches Herzzentrum der Charité (DHZC) ": "Deutsches Herzzentrum der Charité - Standort Augustenburger Platz",
    "Charité - Universitätsmedizin Berlin": "Campus Charité Mitte",
    "Evangelisches Geriatriezentrum Berlin gGmbH (EGZB)": "Evangelisches Geriatriezentrum Berlin",
    "Evangelisches Krankenhaus Hubertus Krankenhausbetriebs gGmbH": "Evangelisches Krankenhaus Hubertus",
    "Ev. Krankenhaus Königin Elisabeth Herzberge": "Evangelisches Krankenhaus Königin Elisabeth Herzberge",
    "Havelklinik GmbH & Co. KG": "Havelklinik",
    "Heiligenfeld Kliniken GmbH Berlin": "Heiligenfeld Klinik Berlin",
    "Helios Klinikum Berlin-Buch": "HELIOS Klinikum Berlin Buch",
    "HELIOS Klinikum Emil von Behring GmbH": "Helios Klinikum Emil von Behring",
    "Immanuel Krankenhaus Berlin - Standort Wannsee": "Immanuel Krankenhaus Berlin",
    "Jüdisches Krankenhaus Berlin - Stiftung des bürgerlichen Rechts": "Jüdisches Krankenhaus Berlin",
    "Klinik \"Helle Mitte\" GmbH": "Klinik Helle Mitte",
    "Klinik Schöneberg GmbH - Fuggerstraße": "Klinik Schöneberg",
    "Klinik für MIC": "Klinik für MIC Minimal Invasive Chirurgie",
    "Krankenhaus Bethel Berlin": "Krankenhaus Bethel",
    "Krankenhaus Waldfriede e.V.": "Krankenhaus Waldfriede",
    "Martin Luther Krankenhaus, Berlin": "Martin-Luther-Krankenhaus",
    "Caritas-Klinik Maria Heimsuchung Berlin-Pankow": "Maria Heimsuchung Caritas Klinikum Pankow",
    "Sankt Gertrauden-Krankenhaus GmbH": "Sankt Gertrauden-Krankenhaus",
    "Schlosspark-Klinik GmbH": "Schlosspark-Klinik",
    "St. Hedwig-Krankenhaus Berlin": "St. Hedwig-Krankenhaus",
    "St.Joseph Krankenhaus": "St. Joseph Krankenhaus",
    "Vivantes Klinikum Am Urban": "Vivantes Klinikum am Urban",
    "Wichernkrankenhaus gGmbH": "Wichernkrankenhaus",
    "Caritas-Klinik St. Anna (vormals Malteser-Krankenhaus Berlin-Charlottenburg)": "Malteser-Krankenhaus",
    "Caritas-Klinik Dominikus": "Dominikus-Krankenhaus",
    "Vivantes Auguste-Viktoria-Klinikum": "Auguste-Viktoria-Klinikum",
    "BG Klinikum Unfallkrankenhaus Berlin gGmbH": "Poliklinik am ukb",
    "Vivantes Ida-Wolff-Krankenhaus GmbH": "Ida-Wolff-Krankenhaus",
    "Sana Paulinenkrankenhaus gGmbH": "Paulinen Krankenhaus",
    "St. Marien-Krankenhaus Berlin": "Sankt-Marien-Krankenhaus",
    "Vivantes Klinikum im Friedrichshain": "Klinikum im Friedrichshain"
    }

# Function to apply all replacements
def replace_multiple(text, replacements):
    if pd.isna(text):
        return text
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text

# Apply the function to the 'name' column
df_old["name_clean"] = df_old["name"].apply(lambda x: replace_multiple(x, replacements))


In [405]:
df_old

Unnamed: 0,district_id,name,address,coordinates,latitude,longitude,locality,district,distance,beds,cases,datab,name_clean
0,11001001,St. Hedwig-Krankenhaus Berlin,Große Hamburger Straße 5-11\n10115 Berlin,"52.52582662684028, 13.396515830691005",52.525827,13.396516,Alexanderplatz,Mitte,0.8,427,64515,old,St. Hedwig-Krankenhaus
1,11001001,Charité - Universitätsmedizin Berlin,Charitéplatz 1\n10117 Berlin,"52.52662465412624, 13.376658301525385",52.526625,13.376658,Alexanderplatz,Mitte,2.2,3011,1891343,old,Campus Charité Mitte
2,11002002,Vivantes Klinikum im Friedrichshain,Landsberger Allee 49\n10249 Berlin,"52.52476641204036, 13.43904818247062",52.524766,13.439048,Karl-Marx-Allee-Nord,Friedrichshain-Kreuzberg,2.4,956,111119,old,Klinikum im Friedrichshain
3,11003003,Psychiatrisch-Psychotherapeutische Tagesklinik St. Martha Prenzlauer Berg,Diesterwegstr. 32\n10405 Berlin,"52.5413743809574, 13.430949",52.541374,13.430949,Prenzlauer Berg Süd,Pankow,2.7,21,2004,old,Psychiatrisch-Psychotherapeutische Tagesklinik St. Martha Prenzlauer Berg
4,11002002,Vivantes Klinikum Am Urban,Dieffenbachstraße 1\n10967 Berlin,"52.494275898889505, 13.40892918516296",52.494276,13.408929,Tempelhofer Vorstadt,Friedrichshain-Kreuzberg,2.9,607,65066,old,Vivantes Klinikum am Urban
5,11001001,Bundeswehrkrankenhaus Berlin,Scharnhosrtstraße 13\n10115 Berlin,"52.535402359877416, 13.370445000000002",52.535402,13.370445,Alexanderplatz,Mitte,2.9,367,11522,old,Bundeswehrkrankenhaus Berlin
6,11001001,Evangelische Elisabeth Klinik,Lützowstraße 24-26\n10785 Berlin,"52.5038038841173, 13.364952542327249",52.503804,13.364953,Tiergarten Süd,Mitte,3.3,145,40651,old,Evangelische Elisabeth Klinik
7,11002002,Entgiftungskrankenhaus Count Down,Frankfurter Allee 40\n10247 Berlin,"52.51487928580479, 13.462392084654498",52.514879,13.462392,Frankfurter Allee Nord,Friedrichshain-Kreuzberg,3.9,12,359,old,Entgiftungskrankenhaus Count Down
8,11001001,Deutsches Herzzentrum der Charité (DHZC),Augustenburger Platz 1\n13353 Berlin,"52.542487124999056, 13.347066370325916",52.542487,13.347066,Parkviertel,Mitte,4.3,473,0,old,Deutsches Herzzentrum der Charité - Standort Augustenburger Platz
9,11001001,Franziskus-Krankenhaus Berlin,Budapester Str. 15-19\n10787 Berlin,"52.50637062379895, 13.34512454232725",52.506371,13.345125,Tiergarten Süd,Mitte,4.3,185,25698,old,Franziskus-Krankenhaus Berlin


In [406]:
# Replace only if name_clean is not null
df_old["name"] = df_old["name_clean"].combine_first(df_old["name"])

# Drop the helper column if you don’t need it anymore
df_old = df_old.drop(columns=["name_clean"])


In [407]:
df_old

Unnamed: 0,district_id,name,address,coordinates,latitude,longitude,locality,district,distance,beds,cases,datab
0,11001001,St. Hedwig-Krankenhaus,Große Hamburger Straße 5-11\n10115 Berlin,"52.52582662684028, 13.396515830691005",52.525827,13.396516,Alexanderplatz,Mitte,0.8,427,64515,old
1,11001001,Campus Charité Mitte,Charitéplatz 1\n10117 Berlin,"52.52662465412624, 13.376658301525385",52.526625,13.376658,Alexanderplatz,Mitte,2.2,3011,1891343,old
2,11002002,Klinikum im Friedrichshain,Landsberger Allee 49\n10249 Berlin,"52.52476641204036, 13.43904818247062",52.524766,13.439048,Karl-Marx-Allee-Nord,Friedrichshain-Kreuzberg,2.4,956,111119,old
3,11003003,Psychiatrisch-Psychotherapeutische Tagesklinik St. Martha Prenzlauer Berg,Diesterwegstr. 32\n10405 Berlin,"52.5413743809574, 13.430949",52.541374,13.430949,Prenzlauer Berg Süd,Pankow,2.7,21,2004,old
4,11002002,Vivantes Klinikum am Urban,Dieffenbachstraße 1\n10967 Berlin,"52.494275898889505, 13.40892918516296",52.494276,13.408929,Tempelhofer Vorstadt,Friedrichshain-Kreuzberg,2.9,607,65066,old
5,11001001,Bundeswehrkrankenhaus Berlin,Scharnhosrtstraße 13\n10115 Berlin,"52.535402359877416, 13.370445000000002",52.535402,13.370445,Alexanderplatz,Mitte,2.9,367,11522,old
6,11001001,Evangelische Elisabeth Klinik,Lützowstraße 24-26\n10785 Berlin,"52.5038038841173, 13.364952542327249",52.503804,13.364953,Tiergarten Süd,Mitte,3.3,145,40651,old
7,11002002,Entgiftungskrankenhaus Count Down,Frankfurter Allee 40\n10247 Berlin,"52.51487928580479, 13.462392084654498",52.514879,13.462392,Frankfurter Allee Nord,Friedrichshain-Kreuzberg,3.9,12,359,old
8,11001001,Deutsches Herzzentrum der Charité - Standort Augustenburger Platz,Augustenburger Platz 1\n13353 Berlin,"52.542487124999056, 13.347066370325916",52.542487,13.347066,Parkviertel,Mitte,4.3,473,0,old
9,11001001,Franziskus-Krankenhaus Berlin,Budapester Str. 15-19\n10787 Berlin,"52.50637062379895, 13.34512454232725",52.506371,13.345125,Tiergarten Süd,Mitte,4.3,185,25698,old


In [408]:
# Merge on 3 columns
merged = pd.merge(hospital_df, df_old, on=[ "name"], how="outer")

merged

Unnamed: 0,name,operator,brand,country,city,street,housenumber,postcode,neighborhood,phone,email,website,wheelchair,toilets_wheelchair,beds_x,emergency,speciality,opening_hours,latitude_x,longitude_x,geometry,source,district_x,district_id_x,district_id_y,address,coordinates,latitude_y,longitude_y,locality,district_y,distance,beds_y,cases,datab
0,Alexianer - Müllerstraße,,,,,,,,,,,,,,,,,,,,,,,,11001001.0,Müllerstraße 56-58\n13349 Berlin,"52.55438754582734, 13.346307830691007",52.554388,13.346308,Parkviertel,Mitte,5.5,18.0,159.0,old
1,"Alexianer Krankenhaus Hedwigshöhe, Tageskliniken St. Helena und St. Christophorus Edisonstraße",,,,,,,,,,,,,,,,,,,,,,,,11009009.0,Edisonstraße 15\n12459 Berlin,"52.46389571043177, 13.5141119153455",52.463896,13.514112,Oberschöneweide,Treptow-Köpenick,9.7,30.0,216.0,old
2,Alexianer St. Joseph Krankenhaus Berlin-Weißensee,Alexianer St. Joseph Berlin-Weißensee GmbH,unknown,Germany,unknown,unknown,unknown,unknown,Weißensee,unknown,unknown,unknown,True,,unknown,yes,unknown,unknown,52.550464,13.469168,POINT (13.46917 52.55046),unknown,Pankow,11003003.0,11003003.0,Gartenstr. 1\n13088 Berlin,"52.551043378153665, 13.468248288363757",52.551043,13.468248,Weißensee Ost,Pankow,5.5,323.0,17473.0,old
3,Arona Klinik für Altersmedizin,DZG Berlin Betriebs GmbH & Co. KG,unknown,Germany,Berlin,Blumberger Damm,26,12683,Biesdorf,+49 30 5497570,kontakt@arona-kliniken.de,https://arona-kliniken.de/,True,,unknown,unknown,unknown,unknown,52.515605,13.564055,POINT (13.56405 52.5156),unknown,Marzahn-Hellersdorf,11010010.0,11010010.0,Blumberger Damm 2G\n12683 Berlin,"52.51527257230003, 13.563734436702791",52.515273,13.563734,Biesdorf,Marzahn-Hellersdorf,10.8,70.0,1638.0,old
4,Augenklinik Berlin Marzahn,unknown,unknown,Germany,Berlin,Brebacher Weg,15,12683,Biesdorf,unknown,unknown,unknown,,,unknown,unknown,unknown,unknown,52.514262,13.567573,POINT (13.56757 52.51426),unknown,Marzahn-Hellersdorf,11010010.0,11010010.0,Brebacher Weg 15\n12683 Berlin,"52.517334043273976, 13.569157169958448",52.517334,13.569157,Biesdorf,Marzahn-Hellersdorf,11.1,51.0,30560.0,old
5,Augenklinik im Ring-Center GmbH,,,,,,,,,,,,,,,,,,,,,,,,11002002.0,Frankfurter Allee 111\n10247 Berlin,"52.514456520970285, 13.47390945767275",52.514457,13.473909,Frankfurter Allee Nord,Friedrichshain-Kreuzberg,4.7,17.0,6549.0,old
6,Auguste-Viktoria-Klinikum,Vivantes,unknown,Germany,Berlin,Rubensstraße,125,12157,Schöneberg,+49 30 130200,unknown,unknown,True,,unknown,yes,urology;neurology;cardiology;vascular_surgery;psychiatry;gynaecology,unknown,52.462432,13.346391,POINT (13.34639 52.46243),unknown,Steglitz-Zehlendorf,11006006.0,11007007.0,Rubensstraße 125\n12157 Berlin,"52.46313171417537, 13.346381373018254",52.463132,13.346381,Friedenau,Tempelhof-Schöneberg,7.6,742.0,65235.0,old
7,"Auguste-Viktoria-Klinikum, Tagesklinik Psychosenpsychotherapie/ CBASP (Dominicusstraße)",,,,,,,,,,,,,,,,,,,,,,,,11007007.0,Dominicusstraße 5-9\n10823 Berlin,"52.48421208246301, 13.346080626981747",52.484212,13.346081,Schöneberg-Süd,Tempelhof-Schöneberg,5.6,20.0,103.0,old
8,Avicenna Klinik,unknown,unknown,Germany,Berlin,Paulsborner Straße,2,10709,Wilmersdorf,unknown,unknown,unknown,,,unknown,no,neurosurgery;orthopaedics,unknown,52.498674,13.306612,POINT (13.30661 52.49867),unknown,Charlottenburg-Wilmersdorf,11004004.0,,,,,,,,,,,
9,Bundeswehrkrankenhaus Berlin,unknown,unknown,Germany,Berlin,Scharnhorststraße,13,10115,Mitte,unknown,unknown,https://berlin.bwkrankenhaus.de,True,,unknown,yes,unknown,unknown,52.53567,13.37139,POINT (13.37139 52.53567),unknown,Mitte,11001001.0,11001001.0,Scharnhosrtstraße 13\n10115 Berlin,"52.535402359877416, 13.370445000000002",52.535402,13.370445,Alexanderplatz,Mitte,2.9,367.0,11522.0,old


In [409]:
#for idx, row in merged.iterrows():
 #   print(row["datab"], "-",row["name"], "-", row["street"], "-",row["address"], "-")

In [326]:
print("Rows, Columns:", merged.shape)

Rows, Columns: (105, 35)


In [410]:
#missing_count = merged.isna().sum().sort_values(ascending=False)
#print(missing_count)

In [411]:
row_count = len(merged)
print(row_count)

105


In [412]:
missing = pd.DataFrame({
    "missing_count": missing_count,
    "missing_pct": (missing_count / row_count * 100).round(1)
}).sort_values(by="missing_pct", ascending=False)

print(missing)

                    missing_count  missing_pct
beds                           78         74.3
source                         78         74.3
brand                          77         73.3
opening_hours                  75         71.4
email                          74         70.5
toilets_wheelchair             69         65.7
phone                          55         52.4
speciality                     49         46.7
website                        44         41.9
emergency                      42         40.0
operator                       40         38.1
wheelchair                     29         27.6
city                           20         19.0
postcode                       19         18.1
housenumber                    19         18.1
street                         19         18.1
longitude                       0          0.0
district                        0          0.0
geometry                        0          0.0
name                            0          0.0
latitude     

In [413]:
counts = merged["datab"].value_counts(dropna=False)  # dropna=False keeps NaN count
print(counts)

datab
old    75
NaN    30
Name: count, dtype: int64


In [414]:
filtered = merged[merged["datab"].str.contains("old", case=False, na=False)]
print(filtered)

                                                                                                                name  \
0                                                                                           Alexianer - Müllerstraße   
1                     Alexianer Krankenhaus Hedwigshöhe, Tageskliniken St. Helena und St. Christophorus Edisonstraße   
2                                                                  Alexianer St. Joseph Krankenhaus Berlin-Weißensee   
3                                                                                     Arona Klinik für Altersmedizin   
4                                                                                         Augenklinik Berlin Marzahn   
5                                                                                    Augenklinik im Ring-Center GmbH   
6                                                                                          Auguste-Viktoria-Klinikum   
7                            Auguste-Vik

In [415]:
# Make a deep copy
merged_copy = merged.copy(deep=True)


In [416]:
# If Status == 'old' AND Name == 'John', set Category = 'Special'

merged_copy.loc[(merged_copy["datab"] == "old") & (merged_copy["street"].notna())& (merged_copy["neighborhood"].notna()), "datab"] = "both"
merged_copy.loc[(merged_copy["datab"].isna()) , "datab"] = "new"
counts = merged_copy["datab"].value_counts(dropna=False)  # dropna=False keeps NaN count
print(counts)


datab
both    50
new     30
old     25
Name: count, dtype: int64


In [417]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)


# Move column "datab" to the front
col = "datab"
merged_copy = merged_copy[[col] + [datab for datab in merged_copy.columns if datab != col]]

 
merged_copy

Unnamed: 0,datab,name,operator,brand,country,city,street,housenumber,postcode,neighborhood,phone,email,website,wheelchair,toilets_wheelchair,beds_x,emergency,speciality,opening_hours,latitude_x,longitude_x,geometry,source,district_x,district_id_x,district_id_y,address,coordinates,latitude_y,longitude_y,locality,district_y,distance,beds_y,cases
0,old,Alexianer - Müllerstraße,,,,,,,,,,,,,,,,,,,,,,,,11001001.0,Müllerstraße 56-58\n13349 Berlin,"52.55438754582734, 13.346307830691007",52.554388,13.346308,Parkviertel,Mitte,5.5,18.0,159.0
1,old,"Alexianer Krankenhaus Hedwigshöhe, Tageskliniken St. Helena und St. Christophorus Edisonstraße",,,,,,,,,,,,,,,,,,,,,,,,11009009.0,Edisonstraße 15\n12459 Berlin,"52.46389571043177, 13.5141119153455",52.463896,13.514112,Oberschöneweide,Treptow-Köpenick,9.7,30.0,216.0
2,both,Alexianer St. Joseph Krankenhaus Berlin-Weißensee,Alexianer St. Joseph Berlin-Weißensee GmbH,unknown,Germany,unknown,unknown,unknown,unknown,Weißensee,unknown,unknown,unknown,True,,unknown,yes,unknown,unknown,52.550464,13.469168,POINT (13.46917 52.55046),unknown,Pankow,11003003.0,11003003.0,Gartenstr. 1\n13088 Berlin,"52.551043378153665, 13.468248288363757",52.551043,13.468248,Weißensee Ost,Pankow,5.5,323.0,17473.0
3,both,Arona Klinik für Altersmedizin,DZG Berlin Betriebs GmbH & Co. KG,unknown,Germany,Berlin,Blumberger Damm,26,12683,Biesdorf,+49 30 5497570,kontakt@arona-kliniken.de,https://arona-kliniken.de/,True,,unknown,unknown,unknown,unknown,52.515605,13.564055,POINT (13.56405 52.5156),unknown,Marzahn-Hellersdorf,11010010.0,11010010.0,Blumberger Damm 2G\n12683 Berlin,"52.51527257230003, 13.563734436702791",52.515273,13.563734,Biesdorf,Marzahn-Hellersdorf,10.8,70.0,1638.0
4,both,Augenklinik Berlin Marzahn,unknown,unknown,Germany,Berlin,Brebacher Weg,15,12683,Biesdorf,unknown,unknown,unknown,,,unknown,unknown,unknown,unknown,52.514262,13.567573,POINT (13.56757 52.51426),unknown,Marzahn-Hellersdorf,11010010.0,11010010.0,Brebacher Weg 15\n12683 Berlin,"52.517334043273976, 13.569157169958448",52.517334,13.569157,Biesdorf,Marzahn-Hellersdorf,11.1,51.0,30560.0
5,old,Augenklinik im Ring-Center GmbH,,,,,,,,,,,,,,,,,,,,,,,,11002002.0,Frankfurter Allee 111\n10247 Berlin,"52.514456520970285, 13.47390945767275",52.514457,13.473909,Frankfurter Allee Nord,Friedrichshain-Kreuzberg,4.7,17.0,6549.0
6,both,Auguste-Viktoria-Klinikum,Vivantes,unknown,Germany,Berlin,Rubensstraße,125,12157,Schöneberg,+49 30 130200,unknown,unknown,True,,unknown,yes,urology;neurology;cardiology;vascular_surgery;psychiatry;gynaecology,unknown,52.462432,13.346391,POINT (13.34639 52.46243),unknown,Steglitz-Zehlendorf,11006006.0,11007007.0,Rubensstraße 125\n12157 Berlin,"52.46313171417537, 13.346381373018254",52.463132,13.346381,Friedenau,Tempelhof-Schöneberg,7.6,742.0,65235.0
7,old,"Auguste-Viktoria-Klinikum, Tagesklinik Psychosenpsychotherapie/ CBASP (Dominicusstraße)",,,,,,,,,,,,,,,,,,,,,,,,11007007.0,Dominicusstraße 5-9\n10823 Berlin,"52.48421208246301, 13.346080626981747",52.484212,13.346081,Schöneberg-Süd,Tempelhof-Schöneberg,5.6,20.0,103.0
8,new,Avicenna Klinik,unknown,unknown,Germany,Berlin,Paulsborner Straße,2,10709,Wilmersdorf,unknown,unknown,unknown,,,unknown,no,neurosurgery;orthopaedics,unknown,52.498674,13.306612,POINT (13.30661 52.49867),unknown,Charlottenburg-Wilmersdorf,11004004.0,,,,,,,,,,
9,both,Bundeswehrkrankenhaus Berlin,unknown,unknown,Germany,Berlin,Scharnhorststraße,13,10115,Mitte,unknown,unknown,https://berlin.bwkrankenhaus.de,True,,unknown,yes,unknown,unknown,52.53567,13.37139,POINT (13.37139 52.53567),unknown,Mitte,11001001.0,11001001.0,Scharnhosrtstraße 13\n10115 Berlin,"52.535402359877416, 13.370445000000002",52.535402,13.370445,Alexanderplatz,Mitte,2.9,367.0,11522.0
