## 🛠 Step 2: Data Transformation

### Public Parks (Grünenanlage) Berlin

In [13]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from time import sleep
import psycopg2
from sqlalchemy import create_engine, text
import re
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load your CSV
df = pd.read_csv("/Users/dianaterraza/Desktop/webeet.io/layered-populate-data-pool-da/recreational_zones/sources/public_parks.csv", sep=';')

In [5]:
df.head()

Unnamed: 0,Technischer Schlüssel,Schlüssel,Objektnummer,Bezirk,Ortsteil,Art der Grünanlage,Name der Grünanlage,Namenszusatz der Grünanlage,Baujahr,letztes Sanierungsjahr,Größe in m² (Kataster),Widmung,Nummer des Planungsraumes,Name des Planungsraumes
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Grünanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,-,-,1699150,gewidmet,12400721,Frohnau Ost
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,Lübars,Grünanlage,Klötzbecken bis Zabel-Krüger-Damm,einschl. Klötzbecken,-,-,5222460,gewidmet,12500929,Lübars
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Grünanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",-,-,-,301200,gewidmet,12400722,Hermsdorf West
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,Lübars,Grünanlage,"Wittenauer Str., südl. AEG-Siedlung",-,-,-,337420,gewidmet,12500929,Lübars
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Grünanlage,Kuhnpromenade u. Lindauer Allee 59/61,-,-,-,312200,gewidmet,12100206,Humboldtstraße


In [6]:
df.columns

Index(['Technischer Schlüssel', 'Schlüssel', 'Objektnummer', 'Bezirk',
       'Ortsteil', 'Art der Grünanlage', 'Name der Grünanlage',
       'Namenszusatz der Grünanlage', 'Baujahr', 'letztes Sanierungsjahr',
       'Größe in m² (Kataster)', 'Widmung', 'Nummer des Planungsraumes',
       'Name des Planungsraumes'],
      dtype='object')

### Rename the Columns 

In [7]:
df.rename(columns={
    'Technischer Schlüssel': 'Technical ID',
    'Schlüssel': 'Key',
    'Objektnummer': 'Object Number',
    'Bezirk': 'neighborhood',
    'Ortsteil': 'Locality',
    'Art der Grünanlage': 'Type of Green Space',
    'Name der Grünanlage': 'Green Space Name',
    'Namenszusatz der Grünanlage': 'Name Extension',
    'Baujahr': 'Year Built',
    'letztes Sanierungsjahr': 'Last Renovation Year',
    'Größe in m² (Kataster)': 'Size sqm',
    'Widmung': 'Dedication',
    'Nummer des Planungsraumes': 'Planning Area Number',
    'Name des Planungsraumes': 'Planning Area Name'
}, inplace=True)


In [8]:
df.columns


Index(['Technical ID', 'Key', 'Object Number', 'neighborhood', 'Locality',
       'Type of Green Space', 'Green Space Name', 'Name Extension',
       'Year Built', 'Last Renovation Year', 'Size sqm', 'Dedication',
       'Planning Area Number', 'Planning Area Name'],
      dtype='object')

### Check for null values 

In [9]:
print(df.isnull().sum())

Technical ID            0
Key                     0
Object Number           0
neighborhood            0
Locality                0
Type of Green Space     0
Green Space Name        0
Name Extension          0
Year Built              0
Last Renovation Year    0
Size sqm                0
Dedication              0
Planning Area Number    0
Planning Area Name      0
dtype: int64


### Change the DataType of the columns for analysis 

In [10]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Technical ID          2556 non-null   object
 1   Key                   2556 non-null   object
 2   Object Number         2556 non-null   object
 3   neighborhood          2556 non-null   object
 4   Locality              2556 non-null   object
 5   Type of Green Space   2556 non-null   object
 6   Green Space Name      2556 non-null   object
 7   Name Extension        2556 non-null   object
 8   Year Built            2556 non-null   object
 9   Last Renovation Year  2556 non-null   object
 10  Size sqm              2556 non-null   object
 11  Dedication            2556 non-null   object
 12  Planning Area Number  2556 non-null   object
 13  Planning Area Name    2556 non-null   object
dtypes: object(14)
memory usage: 279.7+ KB


In [11]:
df.head()

Unnamed: 0,Technical ID,Key,Object Number,neighborhood,Locality,Type of Green Space,Green Space Name,Name Extension,Year Built,Last Renovation Year,Size sqm,Dedication,Planning Area Number,Planning Area Name
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Grünanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,-,-,1699150,gewidmet,12400721,Frohnau Ost
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,Lübars,Grünanlage,Klötzbecken bis Zabel-Krüger-Damm,einschl. Klötzbecken,-,-,5222460,gewidmet,12500929,Lübars
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Grünanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",-,-,-,301200,gewidmet,12400722,Hermsdorf West
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,Lübars,Grünanlage,"Wittenauer Str., südl. AEG-Siedlung",-,-,-,337420,gewidmet,12500929,Lübars
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Grünanlage,Kuhnpromenade u. Lindauer Allee 59/61,-,-,-,312200,gewidmet,12100206,Humboldtstraße


In [12]:
# Replace "-" with NaN before processing
df["Size sqm"] = df["Size sqm"].replace("-", np.nan)

# Convert "Size sqm" to float: remove thousand separators (dots), replace decimal commas with dots
df["Size sqm"] = df["Size sqm"] \
    .str.replace(".", "", regex=False) \
    .str.replace(",", ".", regex=False) \
    .astype(float)

# Convert year columns to numeric (invalid entries like "-" will become NaN)
df["Year Built"] = pd.to_numeric(df["Year Built"], errors="coerce")
df["Last Renovation Year"] = pd.to_numeric(df["Last Renovation Year"], errors="coerce")

# Convert planning area number to numeric (if needed)
df["Planning Area Number"] = pd.to_numeric(df["Planning Area Number"], errors="coerce")

# Convert selected text columns to category type to reduce memory usage
cat_columns = [
    "neighborhood", "Locality", "Type of Green Space",
    "Green Space Name", "Name Extension", "Dedication",
    "Planning Area Name"
]
df[cat_columns] = df[cat_columns].astype("category")



In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Technical ID          2556 non-null   object  
 1   Key                   2556 non-null   object  
 2   Object Number         2556 non-null   object  
 3   neighborhood          2556 non-null   category
 4   Locality              2556 non-null   category
 5   Type of Green Space   2556 non-null   category
 6   Green Space Name      2556 non-null   category
 7   Name Extension        2556 non-null   category
 8   Year Built            362 non-null    float64 
 9   Last Renovation Year  167 non-null    float64 
 10  Size sqm              2553 non-null   float64 
 11  Dedication            2556 non-null   category
 12  Planning Area Number  2555 non-null   float64 
 13  Planning Area Name    2556 non-null   category
dtypes: category(7), float64(4), object(3)
memory usage: 315.

### Handle Missing Values after change the datatype

In [14]:
df.isna().sum()

Technical ID               0
Key                        0
Object Number              0
neighborhood               0
Locality                   0
Type of Green Space        0
Green Space Name           0
Name Extension             0
Year Built              2194
Last Renovation Year    2389
Size sqm                   3
Dedication                 0
Planning Area Number       1
Planning Area Name         0
dtype: int64

* Fill missing values (imputation) For numeric columns:

In [15]:
df["Size sqm"] = df["Size sqm"].fillna(df["Size sqm"].median())
df["Year Built"] = df["Year Built"].fillna(df["Year Built"].median())
df["Planning Area Number"] = df["Planning Area Number"].fillna(df["Planning Area Number"].median())
df["Last Renovation Year"] = df["Last Renovation Year"].fillna(0)  # or use median if applicable


In [16]:
df.isna().sum()

Technical ID            0
Key                     0
Object Number           0
neighborhood            0
Locality                0
Type of Green Space     0
Green Space Name        0
Name Extension          0
Year Built              0
Last Renovation Year    0
Size sqm                0
Dedication              0
Planning Area Number    0
Planning Area Name      0
dtype: int64

### Create Address1 Column with Green Space Name

In [17]:
df['Address1'] = (
    df['Green Space Name'].astype(str) + ", Berlin, Germany" #change the datatype before apply the geocoding
)

df.head()

Unnamed: 0,Technical ID,Key,Object Number,neighborhood,Locality,Type of Green Space,Green Space Name,Name Extension,Year Built,Last Renovation Year,Size sqm,Dedication,Planning Area Number,Planning Area Name,Address1
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Grünanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,1995.0,0.0,16991.5,gewidmet,12400721.0,Frohnau Ost,"Im Fischgrund, ""Rosenanger"", Berlin, Germany"
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,Lübars,Grünanlage,Klötzbecken bis Zabel-Krüger-Damm,einschl. Klötzbecken,1995.0,0.0,52224.6,gewidmet,12500929.0,Lübars,"Klötzbecken bis Zabel-Krüger-Damm, Berlin, Ger..."
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Grünanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",-,1995.0,0.0,3012.0,gewidmet,12400722.0,Hermsdorf West,"Heidenheimer Str. (ab Friedrichsthaler Weg), W..."
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,Lübars,Grünanlage,"Wittenauer Str., südl. AEG-Siedlung",-,1995.0,0.0,3374.2,gewidmet,12500929.0,Lübars,"Wittenauer Str., südl. AEG-Siedlung, Berlin, G..."
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Grünanlage,Kuhnpromenade u. Lindauer Allee 59/61,-,1995.0,0.0,3122.0,gewidmet,12100206.0,Humboldtstraße,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,..."


### Lets look for duplicates 

In [18]:
# Count duplicate Address values
duplicates = df['Address1'].duplicated().sum()
print(f"Found {duplicates} duplicate addresses.")

Found 19 duplicate addresses.


### Create a unique address DataFrame

In [19]:
unique_addresses = df[['Address1']].drop_duplicates().copy()
unique_addresses

Unnamed: 0,Address1
0,"Im Fischgrund, ""Rosenanger"", Berlin, Germany"
1,"Klötzbecken bis Zabel-Krüger-Damm, Berlin, Ger..."
2,"Heidenheimer Str. (ab Friedrichsthaler Weg), W..."
3,"Wittenauer Str., südl. AEG-Siedlung, Berlin, G..."
4,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,..."
...,...
2550,"Weddingplatz, Berlin, Germany"
2551,"Essener Park, Berlin, Germany"
2553,"Leopoldplatz an der Alten Nazarethkirche, Berl..."
2554,"Mollstr. 15-18, Berlin, Germany"


### Geocode only the sample_df (10 rows) of unique addresses using OpenStreetMap’s Nominatim API 

In [20]:
sample_df = unique_addresses.loc[0:10]
sample_df

Unnamed: 0,Address1
0,"Im Fischgrund, ""Rosenanger"", Berlin, Germany"
1,"Klötzbecken bis Zabel-Krüger-Damm, Berlin, Ger..."
2,"Heidenheimer Str. (ab Friedrichsthaler Weg), W..."
3,"Wittenauer Str., südl. AEG-Siedlung, Berlin, G..."
4,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,..."
5,"Avenue Charles de Gaulle 32-33, Berlin, Germany"
6,"Platz der US-Berlin-Brigaden WG, Berlin, Germany"
7,"Schünemannweg N, Berlin, Germany"
8,"Grabens. Hlgs., Lindengraben, Berlin, Germany"
9,"BAB, Überbauung Tunnel Tegel, Berlin, Germany"


In [21]:
geolocator = Nominatim(user_agent="berlin-geocoder")

def geocode_address(address):
    try:
        location = geolocator.geocode(address)
        sleep(1)
        if location:
            return pd.Series([location.latitude, location.longitude])
        else: 
            return pd.Series([None, None])
    except:
        return pd.Series([None, None])

In [22]:
# Geocode sample_df
sample_df[['Latitude', 'Longitude']] = sample_df['Address1'].apply(geocode_address)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[['Latitude', 'Longitude']] = sample_df['Address1'].apply(geocode_address)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[['Latitude', 'Longitude']] = sample_df['Address1'].apply(geocode_address)


In [23]:
sample_df

Unnamed: 0,Address1,Latitude,Longitude
0,"Im Fischgrund, ""Rosenanger"", Berlin, Germany",,
1,"Klötzbecken bis Zabel-Krüger-Damm, Berlin, Ger...",,
2,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,
3,"Wittenauer Str., südl. AEG-Siedlung, Berlin, G...",,
4,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,...",,
5,"Avenue Charles de Gaulle 32-33, Berlin, Germany",52.601249,13.319022
6,"Platz der US-Berlin-Brigaden WG, Berlin, Germany",,
7,"Schünemannweg N, Berlin, Germany",52.444492,13.352584
8,"Grabens. Hlgs., Lindengraben, Berlin, Germany",,
9,"BAB, Überbauung Tunnel Tegel, Berlin, Germany",,


### Geocode the unique addresses of the entire dataset using OpenStreetMap’s Nominatim API 

In [24]:
geolocator = Nominatim(user_agent="berlin-geocoder")

def geocode_address(address):
    try:
        location = geolocator.geocode(address)
        sleep(1)
        if location:
            return pd.Series([location.latitude, location.longitude])
        else: 
            return pd.Series([None, None])
    except:
        return pd.Series([None, None])

In [25]:
# Geocode unique addresses
unique_addresses[['Latitude', 'Longitude']] = unique_addresses['Address1'].apply(geocode_address)

In [26]:
unique_addresses

Unnamed: 0,Address1,Latitude,Longitude
0,"Im Fischgrund, ""Rosenanger"", Berlin, Germany",,
1,"Klötzbecken bis Zabel-Krüger-Damm, Berlin, Ger...",,
2,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,
3,"Wittenauer Str., südl. AEG-Siedlung, Berlin, G...",,
4,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,...",,
...,...,...,...
2550,"Weddingplatz, Berlin, Germany",52.540801,13.369760
2551,"Essener Park, Berlin, Germany",52.524730,13.340990
2553,"Leopoldplatz an der Alten Nazarethkirche, Berl...",,
2554,"Mollstr. 15-18, Berlin, Germany",52.526567,13.417049


In [27]:
df.head()

Unnamed: 0,Technical ID,Key,Object Number,neighborhood,Locality,Type of Green Space,Green Space Name,Name Extension,Year Built,Last Renovation Year,Size sqm,Dedication,Planning Area Number,Planning Area Name,Address1
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Grünanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,1995.0,0.0,16991.5,gewidmet,12400721.0,Frohnau Ost,"Im Fischgrund, ""Rosenanger"", Berlin, Germany"
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,Lübars,Grünanlage,Klötzbecken bis Zabel-Krüger-Damm,einschl. Klötzbecken,1995.0,0.0,52224.6,gewidmet,12500929.0,Lübars,"Klötzbecken bis Zabel-Krüger-Damm, Berlin, Ger..."
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Grünanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",-,1995.0,0.0,3012.0,gewidmet,12400722.0,Hermsdorf West,"Heidenheimer Str. (ab Friedrichsthaler Weg), W..."
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,Lübars,Grünanlage,"Wittenauer Str., südl. AEG-Siedlung",-,1995.0,0.0,3374.2,gewidmet,12500929.0,Lübars,"Wittenauer Str., südl. AEG-Siedlung, Berlin, G..."
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Grünanlage,Kuhnpromenade u. Lindauer Allee 59/61,-,1995.0,0.0,3122.0,gewidmet,12100206.0,Humboldtstraße,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,..."


### For full workflow is recomendable to save the table into a csv for avoid the long wait (101 minutes) for run the geocoder with Nominatim API

1. After geocoding (takes 101 mins):

In [28]:
unique_addresses.to_csv("unique_addresses_geocoded.csv", index=False)

2. In future sessions (fast, ~2s):

In [29]:
unique_addresses = pd.read_csv("unique_addresses_geocoded.csv")

3. Then merge cleanly:

In [30]:
df = df.drop(columns=[col for col in df.columns if 'Latitude' in col or 'Longitude' in col], errors='ignore')
df = df.merge(unique_addresses[['Address1', 'Latitude', 'Longitude']], on='Address1', how='left')
df.head()

Unnamed: 0,Technical ID,Key,Object Number,neighborhood,Locality,Type of Green Space,Green Space Name,Name Extension,Year Built,Last Renovation Year,Size sqm,Dedication,Planning Area Number,Planning Area Name,Address1,Latitude,Longitude
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Grünanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,1995.0,0.0,16991.5,gewidmet,12400721.0,Frohnau Ost,"Im Fischgrund, ""Rosenanger"", Berlin, Germany",,
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,Lübars,Grünanlage,Klötzbecken bis Zabel-Krüger-Damm,einschl. Klötzbecken,1995.0,0.0,52224.6,gewidmet,12500929.0,Lübars,"Klötzbecken bis Zabel-Krüger-Damm, Berlin, Ger...",,
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Grünanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",-,1995.0,0.0,3012.0,gewidmet,12400722.0,Hermsdorf West,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,Lübars,Grünanlage,"Wittenauer Str., südl. AEG-Siedlung",-,1995.0,0.0,3374.2,gewidmet,12500929.0,Lübars,"Wittenauer Str., südl. AEG-Siedlung, Berlin, G...",,
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Grünanlage,Kuhnpromenade u. Lindauer Allee 59/61,-,1995.0,0.0,3122.0,gewidmet,12100206.0,Humboldtstraße,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,...",,


### Change column names to snake_case

In [31]:
df.columns

Index(['Technical ID', 'Key', 'Object Number', 'neighborhood', 'Locality',
       'Type of Green Space', 'Green Space Name', 'Name Extension',
       'Year Built', 'Last Renovation Year', 'Size sqm', 'Dedication',
       'Planning Area Number', 'Planning Area Name', 'Address1', 'Latitude',
       'Longitude'],
      dtype='object')

In [32]:
def to_snake_case(col):
    col = col.strip()  # remove leading/trailing whitespace
    col = re.sub(r'[\s\-]+', '_', col)  # replace spaces and hyphens with underscores
    col = re.sub(r'([a-z])([A-Z])', r'\1_\2', col)  # add underscore between camelCase words
    col = col.lower()
    return col

In [33]:
df.columns = [to_snake_case(col) for col in df.columns]


In [34]:
df.columns

Index(['technical_id', 'key', 'object_number', 'neighborhood', 'locality',
       'type_of_green_space', 'green_space_name', 'name_extension',
       'year_built', 'last_renovation_year', 'size_sqm', 'dedication',
       'planning_area_number', 'planning_area_name', 'address1', 'latitude',
       'longitude'],
      dtype='object')

### Save the final table in CSV Format 

In [10]:
df.to_csv('public_parks_transformed.csv', index=False)


In [9]:
df = pd.read_csv('/Users/dianaterraza/Desktop/webeet.io/layered-populate-data-pool-da/recreational_zones/sources/public_parks_transformed.csv')

In [15]:
df.head(10)

Unnamed: 0,technical_id,key,object_number,neighborhood,locality,type_of_green_space,green_space_name,name_extension,year_built,last_renovation_year,size_sqm,dedication,planning_area_number,planning_area_name,address1,latitude,longitude
0,00008100_001042bb,00008100:001042bb,37,Reinickendorf,Frohnau,Grünanlage,"Im Fischgrund, ""Rosenanger""",Rosenanger,1995.0,0.0,16991.5,gewidmet,12400721.0,Frohnau Ost,"Im Fischgrund, ""Rosenanger"", Berlin, Germany",,
1,00008100_00104621,00008100:00104621,1179,Reinickendorf,Lübars,Grünanlage,Klötzbecken bis Zabel-Krüger-Damm,einschl. Klötzbecken,1995.0,0.0,52224.6,gewidmet,12500929.0,Lübars,"Klötzbecken bis Zabel-Krüger-Damm, Berlin, Ger...",,
2,00008100_001044bd,00008100:001044bd,1074,Reinickendorf,Hermsdorf,Grünanlage,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",-,1995.0,0.0,3012.0,gewidmet,12400722.0,Hermsdorf West,"Heidenheimer Str. (ab Friedrichsthaler Weg), W...",,
3,00008100_00104620,00008100:00104620,1180,Reinickendorf,Lübars,Grünanlage,"Wittenauer Str., südl. AEG-Siedlung",-,1995.0,0.0,3374.2,gewidmet,12500929.0,Lübars,"Wittenauer Str., südl. AEG-Siedlung, Berlin, G...",,
4,00008100_00104438,00008100:00104438,476,Reinickendorf,Reinickendorf,Grünanlage,Kuhnpromenade u. Lindauer Allee 59/61,-,1995.0,0.0,3122.0,gewidmet,12100206.0,Humboldtstraße,"Kuhnpromenade u. Lindauer Allee 59/61, Berlin,...",,
5,00008100_00104357,00008100:00104357,35060,Reinickendorf,Wittenau,Grünanlage,Avenue Charles de Gaulle 32-33,hinter Nimrodstr. u. am Packereigraben,1995.0,0.0,8738.3,gewidmet,12500927.0,Wittenau Nord,"Avenue Charles de Gaulle 32-33, Berlin, Germany",,
6,00008100_00315c33,00008100:00315c33,103014,Steglitz-Zehlendorf,Lichterfelde,Grünanlage,Platz der US-Berlin-Brigaden WG,-,1995.0,0.0,2194.0,gewidmet,6300632.0,Schweizer Viertel,"Platz der US-Berlin-Brigaden WG, Berlin, Germany",,
7,00008100_000e3bb3,00008100:000e3bb3,102180,Steglitz-Zehlendorf,Steglitz,Grünanlage,Schünemannweg N,Schünemannweg 6A u.16A; Tuttlinger Weg,1995.0,0.0,3889.0,gewidmet,6100205.0,Südende,"Schünemannweg N, Berlin, Germany",52.444492,13.352584
8,00008100_00104488,00008100:00104488,599,Reinickendorf,Heiligensee,Grünanlage,"Grabens. Hlgs., Lindengraben",-,1995.0,0.0,12490.9,gewidmet,12400617.0,Alt-Heiligensee,"Grabens. Hlgs., Lindengraben, Berlin, Germany",,
9,00008100_00104409,00008100:00104409,714,Reinickendorf,Tegel,Grünanlage,"BAB, Überbauung Tunnel Tegel",Ernststr. - Waidmannsluster Damm,1995.0,0.0,29894.7,gewidmet,12500824.0,Ziekowstraße/Freie Scholle,"BAB, Überbauung Tunnel Tegel, Berlin, Germany",,


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   technical_id          2556 non-null   object 
 1   key                   2556 non-null   object 
 2   object_number         2556 non-null   object 
 3   neighborhood          2556 non-null   object 
 4   locality              2556 non-null   object 
 5   type_of_green_space   2556 non-null   object 
 6   green_space_name      2556 non-null   object 
 7   name_extension        2556 non-null   object 
 8   year_built            2556 non-null   float64
 9   last_renovation_year  2556 non-null   float64
 10  size_sqm              2556 non-null   float64
 11  dedication            2556 non-null   object 
 12  planning_area_number  2556 non-null   float64
 13  planning_area_name    2556 non-null   object 
 14  address1              2556 non-null   object 
 15  latitude             

In [56]:
from sqlalchemy.exc import DisconnectionError

def block_new_connections(*args, **kwargs):
    raise DisconnectionError("This engine has been disabled.")

engine.dispose()
engine.connect = block_new_connections

### Append to DB (Populate the Database)

In [58]:
# SQLAlchemy connection string format:
# postgresql+psycopg2://user:password@host:port/dbname

DATABASE_URL = (
    "postgresql+psycopg2://neondb_owner:npg_CeS9fJg2azZD"
    "@ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech:5432/neondb"
    "?sslmode=require"
)

# Create engine and establish connection
engine = create_engine(DATABASE_URL)

In [59]:
# Creating the new table with the specified schema
with engine.connect() as conn:
    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS test_berlin_data.green_spaces (
        technical_id VARCHAR(20) NOT NULL,
        key VARCHAR(20) NOT NULL,
        object_number INT,
        neighborhood VARCHAR(100) NOT NULL, -- FK (foreign key),
        locality VARCHAR(100) UNIQUE NOT NULL,
        type_of_green_space VARCHAR(100) NOT NULL,
        green_space_name VARCHAR(100) UNIQUE NOT NULL,
        name_extension VARCHAR(100) UNIQUE,
        year_built INT CHECK (year_built >= 1800 AND year_built <= 2025),
        last_renovation_year INT CHECK (last_renovation_year >= 1800 AND last_renovation_year <= 2025),
        size_sqm DECIMAL(9,6) NOT NULL,
        dedication VARCHAR(20),
        planning_area_number DECIMAL(9,6) NOT NULL,
        planning_area_name VARCHAR(20),
        address1 VARCHAR(225) UNIQUE NOT NULL,
        longitude DECIMAL(9,6),
        latitude DECIMAL(9,6),
        PRIMARY KEY (technical_id)
    );
    """))
conn.commit()

In [60]:
query = "SELECT * FROM test_berlin_data.green_spaces LIMIT 5;"
df = pd.read_sql(query, engine)
df.head()

ProgrammingError: (psycopg2.errors.UndefinedTable) relation "test_berlin_data.green_spaces" does not exist
LINE 1: SELECT * FROM test_berlin_data.green_spaces LIMIT 5;
                      ^

[SQL: SELECT * FROM test_berlin_data.green_spaces LIMIT 5;]
(Background on this error at: https://sqlalche.me/e/20/f405)