# Check and fix all past tables to get to one standard before joining into unioned table

### Most of the tables did not have the correct columns, column names or data types, so I first cleaned them up with this notebook and manually in DBeaver (ALTER or DELETE where needed)

#### Import libraries

In [1]:
import os
import warnings

import pandas as pd
import psycopg2
from sqlalchemy import create_engine, text, inspect
from sqlalchemy.engine import URL

warnings.filterwarnings("ignore")

#### Credentials

In [None]:
# Database credentials are read from the environment so that secrets are not
# saved inside the notebook. When a variable is unset the value falls back to
# the empty string that was previously hard-coded here.
user_name = os.environ.get('BERLIN_DB_USER', '')
password = os.environ.get('BERLIN_DB_PASSWORD', '')

#### Connect to berlin_source_data schema

In [3]:
# Connection settings (fill in before running)
host = ''
port = ''
database = ''
schema = ''

# Build the URL from its parts rather than with an f-string so that special
# characters in the user name or password (e.g. '@', '/', ':') are escaped
# correctly. The tunnel must already be open when the engine first connects.
db_url = URL.create(
    drivername='postgresql+psycopg2',
    username=user_name,
    password=password,
    host=host,
    port=int(port) if port else None,  # URL.create expects an integer port or None
    database=database,
)
engine = create_engine(db_url)

### Add neighborhood_id column to tables that lack it (joined on the neighborhood name)
- Added neighborhood_id column to bus_tram_stops, milieuschutz_protection_zones & short_term_listings manually

In [26]:
# Tables that carry a `neighborhood` name column but still need `neighborhood_id`.
tables_missing_neighborhood_id = [
    "gyms",
    "hospitals",
    "long_term_listings",
    "pharmacies",
    "short_term_listings",
    "ubahn",
    "universities",
    "venues",
]

# engine.begin() commits when the block succeeds and rolls everything back if
# any statement fails, so the tables are changed all together or not at all.
with engine.begin() as conn:
    for tbl in tables_missing_neighborhood_id:
        # IF NOT EXISTS keeps the cell re-runnable after a partial or earlier run.
        conn.execute(text(f"""
            ALTER TABLE {schema}.{tbl}
            ADD COLUMN IF NOT EXISTS neighborhood_id VARCHAR(20);
        """))
        # Look the id up in the neighborhoods table by matching on the name.
        conn.execute(text(f"""
            UPDATE {schema}.{tbl} AS t
            SET neighborhood_id = n.neighborhood_id
            FROM {schema}.neighborhoods AS n
            WHERE t.neighborhood = n.neighborhood;
        """))

print("Table has been altered.")

Table has been altered.


### Add id to ubahn table (primary key)

In [27]:
# Give ubahn a VARCHAR primary key `id`, numbered via a temporary SERIAL column.
# Skip the work when `id` is already there so the cell can be re-run safely.
ubahn_columns = {col["name"] for col in inspect(engine).get_columns("ubahn", schema=schema)}

if "id" in ubahn_columns:
    print("Column 'id' already exists - nothing to do.")
else:
    query = f"""

        -- 1. Add a temporary serial column
        ALTER TABLE {schema}.ubahn
        ADD COLUMN temp_id SERIAL;

        -- 2. Add the final VARCHAR id column
        ALTER TABLE {schema}.ubahn
        ADD COLUMN id VARCHAR(20);

        -- 3. Copy serial values into VARCHAR column
        UPDATE {schema}.ubahn
        SET id = temp_id::text;

        -- 4. Drop temporary column
        ALTER TABLE {schema}.ubahn
        DROP COLUMN temp_id;

        -- 5. Add primary key on id
        ALTER TABLE {schema}.ubahn
        ADD CONSTRAINT ubahn_pkey PRIMARY KEY (id);

    """
    # One transaction: either every step is applied or none is.
    with engine.begin() as conn:
        conn.execute(text(query))

    print("Column 'id' added.")

Column 'id' added.


- Check insert worked

In [11]:
# Preview a few rows to confirm the new columns are present and populated.
query = f"""
SELECT *
FROM {schema}.ubahn
LIMIT 5;
"""

# Read-only query, so there is nothing to commit.
with engine.connect() as conn:
    df = pd.read_sql(text(query), conn)
df


Unnamed: 0,name,line,latitude,longitude,postcode,neighborhood,district,district_id,neighborhood_id,id
0,Mohrenstraße,U2,52.51167,13.38472,10117,Mitte,Mitte,11001001,101,135
1,Mohrenstraße,U6,52.51167,13.38472,10117,Mitte,Mitte,11001001,101,136
2,Märkisches Museum,U2,52.511944,13.408889,10179,Mitte,Mitte,11001001,101,137
3,Möckernbrücke,U7,52.499167,13.382778,10963,Kreuzberg,Friedrichshain-Kreuzberg,11002002,202,138
4,Möckernbrücke,U1,52.499167,13.382778,10963,Kreuzberg,Friedrichshain-Kreuzberg,11002002,202,139


### Add geometry to missing tables


- Done manually with banks and bus_tram_stops

In [9]:
# Add a text `geometry` column to banks holding the WKT point built from each
# row's longitude/latitude. IF NOT EXISTS keeps the cell re-runnable.
add_geometry = f"""

ALTER TABLE {schema}.banks
ADD COLUMN IF NOT EXISTS geometry VARCHAR(255);

UPDATE {schema}.banks
SET geometry = ST_AsText(ST_SetSRID(ST_MakePoint(banks.longitude, banks.latitude), 4326));

"""
# engine.begin() commits on success and rolls back on error.
with engine.begin() as conn:
    conn.execute(text(add_geometry))

print("Column 'geometry' added.")

Column 'geometry' added.


- Created a loop because many tables were missing the geometry column, so this is easier than fixing each one by hand

In [None]:
# Tables to leave untouched. `skip_tables` is not defined in any visible cell
# (it only existed in the live kernel), so a fresh Restart & Run All would raise
# NameError. The two names below are the ones the recorded output reports as skipped.
# NOTE(review): the saved output is truncated - confirm no other tables belong here.
skip_tables = {"bike_lanes", "libraries"}

# Get all tables in the schema
inspector = inspect(engine)
tables = inspector.get_table_names(schema=schema)

print(f"Found {len(tables)} tables in {schema}")

for table_name in tables:
    # Skip tables in skip list
    if table_name in skip_tables:
        print(f"⊘ {table_name} - skipped")
        continue

    # Get columns in this table
    column_names = [col['name'] for col in inspector.get_columns(table_name, schema=schema)]

    # Nothing to do when the geometry column is already there
    if 'geometry' in column_names:
        print(f"✓ {table_name} - geometry column already exists")
        continue

    # The point is built from longitude/latitude, so both must be present
    if 'longitude' not in column_names or 'latitude' not in column_names:
        print(f"✗ {table_name} - missing longitude/latitude columns, skipping")
        continue

    # Add geometry column and populate it
    sql = f"""
        ALTER TABLE {schema}.{table_name}
        ADD COLUMN geometry VARCHAR(255);

        UPDATE {schema}.{table_name}
        SET geometry = ST_AsText(ST_SetSRID(ST_MakePoint({table_name}.longitude, {table_name}.latitude), 4326));
        """
    try:
        # One transaction per table: a failure rolls back only that table
        with engine.begin() as conn:
            conn.execute(text(sql))
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining tables
        print(f"✗ {table_name} - error: {str(e)}")
    else:
        print(f"✓ {table_name} - geometry column added and populated")

print("\nAll tables processed!")

Found 38 tables in berlin_source_data
✓ galleries - geometry column added and populated
✓ theaters - geometry column added and populated
✓ pools - geometry column added and populated
✓ public_artworks - geometry column added and populated
✓ religious_institutions - geometry column already exists
✓ government_offices - geometry column already exists
✓ bus_tram_stops - geometry column already exists
✓ parking_spaces - geometry column already exists
✓ malls - geometry column added and populated
✓ banks - geometry column already exists
✓ doctors - geometry column added and populated
✓ food_markets - geometry column added and populated
✓ social_clubs_activities - geometry column already exists
✓ veterinary_clinics - geometry column added and populated
✓ pharmacies - geometry column added and populated
✓ supermarkets - geometry column added and populated
⊘ bike_lanes - skipped
⊘ libraries - skipped
✓ venues - geometry column added and populated
✓ post_offices - geometry column added and popu

### Add neighbourhood using Nominatim

- Adjusted following tables
    - banks
    - dental_offices
    - s_bahn
    - schools

In [50]:
import requests
import time

table_name = "schools"  # Change this to your table

# Fetch the id and coordinates of every row in the table.
# Uses text() and an explicit connection, like the other read cells in this notebook.
query = f"SELECT id, latitude, longitude FROM {schema}.{table_name}"
with engine.connect() as conn:
    df = pd.read_sql(text(query), conn)
print(f"Processing {len(df)} rows...")

def get_neighbourhood(lat, lon):
    """Reverse geocode a coordinate with Nominatim and return a locality name.

    Args:
        lat: Latitude sent as the ``lat`` query parameter.
        lon: Longitude sent as the ``lon`` query parameter.

    Returns:
        The first non-empty value among the ``neighbourhood``, ``suburb`` and
        ``city_district`` fields of the response's ``address`` object, or
        ``None`` when a coordinate is missing, the HTTP status is not 200,
        none of those fields is present, or the request raises.
    """
    # A missing coordinate (None/NaN) cannot be geocoded; skip the network
    # round trip instead of sending "lat=nan" to the service.
    if pd.isna(lat) or pd.isna(lon):
        return None

    url = f"https://nominatim.openstreetmap.org/reverse?format=json&lat={lat}&lon={lon}"
    try:
        response = requests.get(url, timeout=10, headers={'User-Agent': 'BerlinDataApp'})
        if response.status_code != 200:
            return None
        address = response.json().get('address', {})
        # Try neighbourhood first, then suburb, then city_district
        neighbourhood = address.get('neighbourhood') or address.get('suburb') or address.get('city_district')
        return neighbourhood or None
    except Exception as e:
        # Best effort: one failed lookup must not abort the whole run
        print(f"Error for ({lat}, {lon}): {e}")
        return None

# Add neighbourhood column if it doesn't exist
with engine.begin() as conn:
    conn.execute(text(f"ALTER TABLE {schema}.{table_name} ADD COLUMN IF NOT EXISTS neighborhood VARCHAR(255)"))

# Process each row with rate limiting (1 request per second)
neighbourhoods = []
total_rows = len(df)
for position, (lat, lon) in enumerate(zip(df['latitude'], df['longitude'])):
    if position > 0:
        time.sleep(1)

    neighbourhoods.append(get_neighbourhood(lat, lon))

    if (position + 1) % 10 == 0:
        print(f"Processed {position + 1}/{total_rows} rows")

df['neighbourhood'] = neighbourhoods

# Only write rows that were actually resolved: a failed lookup returns None and
# would otherwise overwrite an existing neighborhood value with NULL.
resolved = df[df['neighbourhood'].notna()]

# .tolist() yields native Python values; iterating rows with iterrows() would
# upcast integer ids to floats because the coordinate columns are floats.
update_params = [
    {'neighbourhood': name, 'id': row_id}
    for row_id, name in zip(resolved['id'].tolist(), resolved['neighbourhood'].tolist())
]

# Update the database (all rows in one transaction)
if update_params:
    with engine.begin() as conn:
        conn.execute(
            text(f"UPDATE {schema}.{table_name} SET neighborhood = :neighbourhood WHERE id = :id"),
            update_params,
        )

print(f"✓ Updated {len(update_params)} of {len(df)} rows with neighbourhood data")
print("\nSample results:")
print(df[['id', 'latitude', 'longitude', 'neighbourhood']].head(10))

Processing 927 rows...
Processed 10/927 rows
Processed 20/927 rows
Processed 30/927 rows
Processed 40/927 rows
Processed 50/927 rows
Processed 60/927 rows
Processed 70/927 rows
Processed 80/927 rows
Processed 90/927 rows
Processed 100/927 rows
Processed 110/927 rows
Processed 120/927 rows
Processed 130/927 rows
Processed 140/927 rows
Processed 150/927 rows
Processed 160/927 rows
Processed 170/927 rows
Processed 180/927 rows
Processed 190/927 rows
Processed 200/927 rows
Processed 210/927 rows
Processed 220/927 rows
Processed 230/927 rows
Processed 240/927 rows
Processed 250/927 rows
Processed 260/927 rows
Processed 270/927 rows
Processed 280/927 rows
Processed 290/927 rows
Processed 300/927 rows
Processed 310/927 rows
Processed 320/927 rows
Processed 330/927 rows
Processed 340/927 rows
Processed 350/927 rows
Processed 360/927 rows
Processed 370/927 rows
Processed 380/927 rows
Processed 390/927 rows
Processed 400/927 rows
Processed 410/927 rows
Processed 420/927 rows
Processed 430/927 ro

### Add neighbourhood_id column - used neighborhood to join

- Run on following tables
    - banks
    - dental_offices
    - sbahn
    - schools

In [51]:

# Table to process. This cell is run once per table that was geocoded, so the
# name lives in one place instead of being edited inside the SQL.
target_table = "schools"

# IF NOT EXISTS keeps the cell re-runnable; the id is looked up in the
# neighborhoods table by matching on the neighborhood name.
alter_table_query = f"""

ALTER TABLE {schema}.{target_table}
ADD COLUMN IF NOT EXISTS neighborhood_id VARCHAR(20);

UPDATE {schema}.{target_table} AS t
SET neighborhood_id = n.neighborhood_id
FROM {schema}.neighborhoods AS n
WHERE t.neighborhood = n.neighborhood
;
"""
# engine.begin() commits on success and rolls back on error.
with engine.begin() as conn:
    conn.execute(text(alter_table_query))

print("Table has been altered.")

Table has been altered.


- Query the school table to check it worked

In [52]:
# Preview the rows with the lowest neighborhood_id to confirm the column was filled.
query = f"""
SELECT *
FROM {schema}.schools
order by neighborhood_id asc
LIMIT 5;
"""

# Read-only query, so there is nothing to commit.
with engine.connect() as conn:
    df = pd.read_sql(text(query), conn)
df

Unnamed: 0,id,bsn,name,school_type_de,ownership_en,school_category_de,school_category_en,district_id,district,quarter,...,students_m,teachers_total,teachers_f,teachers_m,startchancen_flag,longitude,latitude,geometry,neighborhood,neighborhood_id
0,39,01G49,49. Schule (Grundschule),Grundschule,Public,Grundschule,Primary School,11001001,Mitte,Mitte,...,89.0,17.0,17.0,0.0,False,13.424,52.508,POINT(13.424 52.508),Mitte,101
1,45,01K07,Hemingway-Schule,Integrierte Sekundarschule,Public,Integrierte Sekundarschule,Integrated Secondary School,11001001,Mitte,Mitte,...,259.0,54.0,33.0,21.0,True,13.392,52.53,POINT(13.392 52.53),Mitte,101
2,12,01G10,City-Grundschule,Grundschule,Public,Grundschule,Primary School,11001001,Mitte,Mitte,...,300.0,41.0,32.0,9.0,False,13.409,52.508,POINT(13.409 52.508),Mitte,101
3,36,01G46,Grundschule am Koppenplatz,Grundschule,Public,Grundschule,Primary School,11001001,Mitte,Mitte,...,381.0,56.0,41.0,15.0,False,13.398,52.528,POINT(13.398 52.528),Mitte,101
4,49,01P01,Evangelische Schule Berlin Mitte (Gemeinschaft...,Grundschule,Private,Privatschule,Private School,11001001,Mitte,Mitte,...,,,,,False,13.406,52.523,POINT(13.406 52.523),Mitte,101


### Using districts and neighborhoods tables - add the district or neighbourhood names to the other layers tables (id as reference)

In [None]:
# Look-up tables that supply the names; they must not be altered themselves.
reference_tables = {"districts", "neighborhoods"}

# Get all tables in the schema
inspector = inspect(engine)
tables = inspector.get_table_names(schema=schema)

print(f"Found {len(tables)} tables in {schema}\n")

for table_name in tables:
    if table_name in reference_tables:
        print(f"⊘ {table_name} - reference table, skipping")
        continue

    # Get columns in this table
    column_names = [col['name'] for col in inspector.get_columns(table_name, schema=schema)]

    # Check if district_id column exists
    if 'district_id' not in column_names:
        print(f"⊘ {table_name} - no district_id column, skipping")
        continue

    # Report "already exist" only when that is really the case, not after an error
    if 'district' in column_names and 'neighborhood' in column_names:
        print(f"✓ {table_name} - district and neighborhood columns already exist")
        continue

    # Add the district name, looked up by district_id
    if 'district' not in column_names:
        try:
            sql = f"""
            ALTER TABLE {schema}.{table_name}
            ADD COLUMN district VARCHAR(100);

            UPDATE {schema}.{table_name} t
            SET district = d.district
            FROM {schema}.districts d
            WHERE t.district_id = d.district_id;
            """

            with engine.begin() as conn:
                conn.execute(text(sql))

            print(f"✓ {table_name} - added district column")
        except Exception as e:
            print(f"✗ {table_name} - error adding district: {str(e)}")

    # Add the neighborhood name, looked up by neighborhood_id
    if 'neighborhood' not in column_names:
        if 'neighborhood_id' not in column_names:
            # Joining through district_id alone matches every neighborhood of the
            # district, and UPDATE ... FROM would then keep an arbitrary one.
            # Without a neighborhood_id there is no reliable way to pick the name.
            print(f"⊘ {table_name} - no neighborhood_id column, cannot derive neighborhood")
            continue

        try:
            # Both sides are cast to text so the join works whether the id
            # columns are stored as numbers or as strings.
            sql = f"""
            ALTER TABLE {schema}.{table_name}
            ADD COLUMN neighborhood VARCHAR(100);

            UPDATE {schema}.{table_name} t
            SET neighborhood = n.neighborhood
            FROM {schema}.neighborhoods n
            WHERE t.neighborhood_id::text = n.neighborhood_id::text;
            """

            with engine.begin() as conn:
                conn.execute(text(sql))

            print(f"✓ {table_name} - added neighborhood column")
        except Exception as e:
            print(f"✗ {table_name} - error adding neighborhood: {str(e)}")

print("\nAll tables processed!")

### Add latitude & longitude to milieuschutz_protection_zones

In [63]:
# Derive one representative point per protection zone from the centroid of its
# geometry and store it as latitude/longitude.
# NOTE(review): ST_Y/ST_X only give latitude/longitude if the geometry is stored
# in a lon/lat coordinate system - confirm the SRID of this table.
milieuschutz_sql = f'''
    ALTER TABLE {schema}.milieuschutz_protection_zones
    ADD COLUMN IF NOT EXISTS latitude DECIMAL(10, 6),
    ADD COLUMN IF NOT EXISTS longitude DECIMAL(10, 6);

    UPDATE {schema}.milieuschutz_protection_zones
    SET latitude = ST_Y(ST_Centroid(geometry)),
        longitude = ST_X(ST_Centroid(geometry));
'''

# engine.begin() commits on success and rolls back on error;
# IF NOT EXISTS keeps the cell re-runnable.
with engine.begin() as conn:
    conn.execute(text(milieuschutz_sql))

print("Latitude and longitude columns added (using centroid).")

Latitude and longitude columns added (using centroid).


### Correct districts for tables with old district numbers
    - veterinary_clinics

In [69]:
# Rebuild veterinary_clinics.district_id from the district name and enforce it
# with a foreign key to the districts table.
add_district_id = f"""

-- 1. Add district_id column if it does not exist
ALTER TABLE {schema}.veterinary_clinics
ADD COLUMN IF NOT EXISTS district_id VARCHAR(20);

-- 2. Drop the foreign key from an earlier run so the cell can be re-run
ALTER TABLE {schema}.veterinary_clinics
DROP CONSTRAINT IF EXISTS district_id_fk;

-- 3. Update district_id using the name-to-id mapping
--    (rows whose district name is not listed end up with NULL)
UPDATE {schema}.veterinary_clinics
SET district_id = CASE district
    WHEN 'Mitte' THEN '11001001'
    WHEN 'Friedrichshain-Kreuzberg' THEN '11002002'
    WHEN 'Pankow' THEN '11003003'
    WHEN 'Charlottenburg-Wilmersdorf' THEN '11004004'
    WHEN 'Spandau' THEN '11005005'
    WHEN 'Steglitz-Zehlendorf' THEN '11006006'
    WHEN 'Tempelhof-Schöneberg' THEN '11007007'
    WHEN 'Neukölln' THEN '11008008'
    WHEN 'Treptow-Köpenick' THEN '11009009'
    WHEN 'Marzahn-Hellersdorf' THEN '11010010'
    WHEN 'Lichtenberg' THEN '11011011'
    WHEN 'Reinickendorf' THEN '11012012'
    ELSE NULL
END;

-- 4. Add the foreign key constraint with specified rules
ALTER TABLE {schema}.veterinary_clinics
ADD CONSTRAINT district_id_fk
FOREIGN KEY (district_id)
REFERENCES {schema}.districts(district_id)
ON DELETE RESTRICT
ON UPDATE CASCADE;
"""

# One transaction: if the foreign key cannot be created, the update is rolled back too.
with engine.begin() as conn:
    conn.execute(text(add_district_id))

print("Column 'district_id' added.")

Column 'district_id' added.
