# Street Directory Rental Map 2024

## Environment and raw data

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load street directory 2024 CSV
df_2024 = pd.read_csv("../data/street_directory_rental_map_2024.csv")
df_2024.head()

Unnamed: 0,Straßenname1,Bezirk2 G3 Hausnr.4,Unnamed: 2,B5,WL6,Straßenname1.1,Bezirk2 G3,Hausnr.4,B5.1,WL6.1
0,A,,,,,Agricolastraße,Mitt W,28 - 33 A,F,gut
1,Aachener Straße,ChWi W 1,- 4 A,F,mittel,Ahlbecker Straße,Pank O,,K,gut
2,Aachener Straße,ChWi W 5,- 16,F,gut,Ahlbeerensteig,Span O,,K,einfach
3,Aachener Straße,ChWi W 17,- 26,F,mittel,Ahlener Weg,StZe W,,K,einfach
4,Aachener Straße,ChWi W 27,- 45,F,gut,Ahornallee,ChWi W,2,F,mittel


## Slit table

In [3]:
# Split into left and right halves
df_left = df_2024.iloc[:, 0:5].copy()   
df_right = df_2024.iloc[:, 5:12].copy() 

In [4]:
df_left.columns = ["street_name", "location_code", "house_number_range", "house_number_scheme", "classification"]
df_right.columns = ["street_name", "location_code", "house_number_range", "house_number_scheme", "classification"]

df_left = df_left.dropna(subset=["street_name"])
df_right = df_right.dropna(subset=["street_name"])

df_combined = pd.concat([df_left, df_right], ignore_index=True)

# First, ensure column is string and NaNs are handled
df_combined["location_code"] = df_combined["location_code"].astype(str).fillna("")

# Split into up to 3 parts
split_cols = df_combined["location_code"].str.split(" ", n=2, expand=True)

# Assign with fallbacks
df_combined["district_code"] = split_cols[0]
df_combined["street_side"] = split_cols[1]
df_combined["house_number"] = split_cols[2]  # will be NaN if missing

df_combined.drop(columns=["location_code"], inplace=True)

## NaN values 

In [5]:
df_combined.isnull().sum()

street_name                0
house_number_range      9624
house_number_scheme       26
classification            26
district_code              0
street_side              101
house_number           12809
dtype: int64

In [6]:
df_combined = df_combined[df_combined["classification"].notna()]

## Duplicates

In [7]:
df_combined.duplicated().sum()


np.int64(185)

In [8]:
df_combined = df_combined.drop_duplicates()

## House Number Ranges

In [9]:
# If house_number is missing, keep house_number_range only
df_combined["house_number_range"] = df_combined.apply(
    lambda row: f"{row['house_number']} {row['house_number_range']}".strip()
    if pd.notnull(row['house_number']) and pd.notnull(row['house_number_range'])
    else row['house_number_range'] if pd.isnull(row['house_number']) else str(row['house_number']),
    axis=1
)

# Drop the now redundant 'house_number' column
df_combined.drop(columns=["house_number"], inplace=True)

In [10]:
df_combined["house_number_range"] = df_combined["house_number_range"].fillna("whole street")

## House Number Scheme

In [11]:
scheme_map = {
    "K": "Complete street",
    "F": "Consecutive numbering",
    "G": "Even numbers only",
    "U": "Odd numbers only"
}

df_combined["house_number_scheme_label"] = df_combined["house_number_scheme"].map(scheme_map)

## District

In [12]:
district_map = {
    "ChWi": "Charlottenburg-Wilmersdorf",
    "FrKr": "Friedrichshain-Kreuzberg",
    "Lich": "Lichtenberg",
    "MaHe": "Marzahn-Hellersdorf",
    "Mitt": "Mitte",
    "Neuk": "Neukölln",
    "Pank": "Pankow",
    "Rein": "Reinickendorf",
    "Span": "Spandau",
    "StZe": "Steglitz-Zehlendorf",
    "TrKö": "Treptow-Köpenick",
    "TSch": "Tempelhof-Schöneberg"
}

df_combined["district"] = df_combined["district_code"].map(district_map)

In [13]:
# drop district code column
df_combined.drop(columns=["district_code"], inplace=True)

## Terretorial Side

In [14]:
side_map = {
    "W": "West (pre-2000)",
    "O": "East + West-Staaken (pre-2000)"
}
df_combined["street_side"] = df_combined["street_side"].map(side_map)

## Rename and reorder columns

In [15]:
# rename columns for clarity
df_combined.rename(columns={
    "house_number_scheme": "house_number_scheme_code",
    "house_number_scheme_label": "house_number_scheme",
    "street_side": "territorial_side"
}, inplace=True)

In [16]:
# reorder columns for clarity
columns_order = [
    "street_name", 
    "district",
    "territorial_side",
    "house_number_range",
    "house_number_scheme_code",
    "house_number_scheme",
    "classification"
]

df_combined = df_combined[columns_order]

## Data Overview

In [17]:
df_combined.head()

Unnamed: 0,street_name,district,territorial_side,house_number_range,house_number_scheme_code,house_number_scheme,classification
1,Aachener Straße,Charlottenburg-Wilmersdorf,West (pre-2000),1 - 4 A,F,Consecutive numbering,mittel
2,Aachener Straße,Charlottenburg-Wilmersdorf,West (pre-2000),5 - 16,F,Consecutive numbering,gut
3,Aachener Straße,Charlottenburg-Wilmersdorf,West (pre-2000),17 - 26,F,Consecutive numbering,mittel
4,Aachener Straße,Charlottenburg-Wilmersdorf,West (pre-2000),27 - 45,F,Consecutive numbering,gut
5,Aalemannufer,Spandau,West (pre-2000),whole street,K,Complete street,mittel


# Street District Rental Map 2023

## Environment and raw data

In [18]:
# import street directory 2023 CSV
df_2023 = pd.read_csv("../data/street_directory_rental_map_2023.csv", on_bad_lines='skip')

In [19]:
df_2023.head()

Unnamed: 0,Straßenname1,Bezirk2,G3,Hausnr.4,B5,WL6 WL7,Unnamed: 6,Straßenname1.1,Bezirk2.1,G3 Hausnr.4,Unnamed: 10,B5.1,WL6 WL7.1
0,A,,,,,,,Adalbertstraße,FrKr,W 9,- 23 A,F,einfach * Z
1,Aachener Straße,ChWi,W,1 - 8,F,mittel Z,,Adalbertstraße,FrKr,W 23 B,,F,einfach Z
2,Aachener Straße,ChWi,W,12 - 26,F,mittel * Z,,Adalbertstraße,FrKr,W 67,- 75,F,einfach * Z
3,Aachener Straße,ChWi,W,27 - 30,F,gut * Z,,Adalbertstraße,FrKr,W 75 A,,F,einfach Z
4,Aachener Straße,ChWi,W,31 - 44,F,gut Z,,Adalbertstraße,FrKr,W 76,- 81,F,einfach * Z


In [20]:
# Split into left and right halves
df_left_23 = df_2024.iloc[:, 0:5].copy()   
df_right_23 = df_2024.iloc[:, 5:12].copy() 

In [21]:
df_left_23.head()

Unnamed: 0,Straßenname1,Bezirk2 G3 Hausnr.4,Unnamed: 2,B5,WL6
0,A,,,,
1,Aachener Straße,ChWi W 1,- 4 A,F,mittel
2,Aachener Straße,ChWi W 5,- 16,F,gut
3,Aachener Straße,ChWi W 17,- 26,F,mittel
4,Aachener Straße,ChWi W 27,- 45,F,gut


In [22]:
# Rename columns for clarity
df_left_23.columns = ["street_name", "district", "house_number", "house_number_scheme_code", "classification"]

In [23]:
split_columns = df_left_23["district"].str.split(" ", n=2, expand=True)

print("Split columns shape:", split_columns.shape)
print(split_columns.head())

Split columns shape: (7095, 3)
      0    1    2
0   NaN  NaN  NaN
1  ChWi    W    1
2  ChWi    W    5
3  ChWi    W   17
4  ChWi    W   27


In [24]:
# Assign with fallbacks
df_left_23["district_code"] = split_columns[0]
df_left_23["territorial_side"] = split_columns[1]
df_left_23["house_number_range"] = split_columns[2]  # will be NaN if missing

In [28]:
df_left_23.head()

Unnamed: 0,street_name,house_number_scheme_code,classification,district_code,territorial_side,house_number_range
0,A,,,,,
1,Aachener Straße,F,mittel,ChWi,W,1 - 4 A
2,Aachener Straße,F,gut,ChWi,W,5 - 16
3,Aachener Straße,F,mittel,ChWi,W,17 - 26
4,Aachener Straße,F,gut,ChWi,W,27 - 45


In [26]:
# drop district column
df_left_23.drop(columns=["district"], inplace=True)

In [27]:
# If house_number is missing, keep house_number_range only
df_left_23["house_number_range"] = df_left_23.apply(
    lambda row: f"{row['house_number_range']} {row['house_number']}".strip()
    if pd.notnull(row['house_number_range']) and pd.notnull(row['house_number'])
    else row['house_number'] if pd.isnull(row['house_number_range']) else str(row['house_number_range']),
    axis=1
)

# Drop the now redundant 'house_number' column
df_left_23.drop(columns=["house_number"], inplace=True)

In [40]:
df_right_23.head()

Unnamed: 0,street_name,house_number_range,house_number_scheme_code,classification,district_code,territorial_side
0,Agricolastraße,28 - 33 A,F,gut,Mitt,W
1,Ahlbecker Straße,,K,gut,Pank,O
2,Ahlbeerensteig,,K,einfach,Span,O
3,Ahlener Weg,,K,einfach,StZe,W
4,Ahornallee,2,F,mittel,ChWi,W


In [35]:
# Rename columns for clarity
df_right_23.rename(columns={
    "Straßenname1.1": "street_name",
    "Bezirk2 G3": "district",
    "Hausnr.4": "house_number_range",
    "B5.1": "house_number_scheme_code",
    "WL6.1": "classification"
}, inplace=True)

In [39]:
# split district column in 2
split_colmns = df_right_23["district"].str.split(" ", n=2, expand=True)

df_right_23["district_code"] = split_colmns[0]
df_right_23["territorial_side"] = split_colmns[1]

# drop district column
df_right_23.drop(columns=["district"], inplace=True)

In [41]:
df_combined_23 = pd.concat([df_left_23, df_right_23], ignore_index=True)

In [55]:
df_combined_23.head()

Unnamed: 0,street_name,house_number_scheme_code,classification,territorial_side,house_number_range,house_number_scheme,district
1,Aachener Straße,F,mittel,W,1 - 4 A,Consecutive numbering,Charlottenburg-Wilmersdorf
2,Aachener Straße,F,gut,W,5 - 16,Consecutive numbering,Charlottenburg-Wilmersdorf
3,Aachener Straße,F,mittel,W,17 - 26,Consecutive numbering,Charlottenburg-Wilmersdorf
4,Aachener Straße,F,gut,W,27 - 45,Consecutive numbering,Charlottenburg-Wilmersdorf
5,Aalemannufer,K,mittel,W,,Complete street,Spandau


In [44]:
df_combined_23 = df_combined_23[df_combined_23["classification"].notna()]

In [46]:
# check duplicates
df_combined_23.duplicated().sum()

np.int64(186)

In [47]:
# drop duplicates
df_combined_23 = df_combined_23.drop_duplicates()

In [50]:
df_combined_23["house_number_scheme"] = df_combined_23["house_number_scheme_code"].map(scheme_map)

In [54]:
df_combined_23["district"] = df_combined_23["district_code"].map(district_map)
# drop district code column
df_combined_23.drop(columns=["district_code"], inplace=True)

In [56]:
# Reorder columns for clarity
columns_order_23 = [
    "street_name",
    "district",
    "territorial_side",
    "house_number_range",
    "house_number_scheme_code",
    "house_number_scheme",
    "classification"
]

df_combined_23 = df_combined_23[columns_order_23]

In [61]:
df_combined_23.head()

Unnamed: 0,street_name,district,territorial_side,house_number_range,house_number_scheme_code,house_number_scheme,classification
1,Aachener Straße,Charlottenburg-Wilmersdorf,West (pre-2000),1 - 4 A,F,Consecutive numbering,mittel
2,Aachener Straße,Charlottenburg-Wilmersdorf,West (pre-2000),5 - 16,F,Consecutive numbering,gut
3,Aachener Straße,Charlottenburg-Wilmersdorf,West (pre-2000),17 - 26,F,Consecutive numbering,mittel
4,Aachener Straße,Charlottenburg-Wilmersdorf,West (pre-2000),27 - 45,F,Consecutive numbering,gut
5,Aalemannufer,Spandau,West (pre-2000),,K,Complete street,mittel


In [60]:
df_combined_23["territorial_side"] = df_combined_23["territorial_side"].map(side_map)

In [62]:
# Export the cleaned data to CSV
output_path = "../data/cleaned_data/street_directory_cleaned_2023.csv"
df_combined_23.to_csv(output_path, index=False)

In [63]:
# Export df_comnined to CSV
output_path_combined = "../data/cleaned_data/street_directory_cleaned_2024.csv"
df_combined.to_csv(output_path_combined, index=False)

# Kaggle dataset

In [64]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("nofullnames33/berlin-rent-per-square-meter-by-bezirk-district")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/nofullnames33/berlin-rent-per-square-meter-by-bezirk-district?dataset_version_number=4...


100%|██████████| 3.03k/3.03k [00:00<00:00, 1.70MB/s]

Extracting files...
Path to dataset files: /Users/didodeboodt/.cache/kagglehub/datasets/nofullnames33/berlin-rent-per-square-meter-by-bezirk-district/versions/4





In [65]:
df_bezirk = pd.read_csv("../data/raw_data/bezirks.csv")
df_bezirk.head()

Unnamed: 0,borough,bezirk,minRent,avgRent,maxRent,minBuy,avgBuy,maxBuy,url
0,Charlottenburg (Charlottenburg),Charlottenburg-Wilmersdorf,8.85,16.18,28.55,2079,5065,11014,https://www.immobilienscout24.de/immobilienpre...
1,Grunewald (Wilmersdorf),Charlottenburg-Wilmersdorf,11.89,18.17,28.61,4412,7359,11834,https://www.immobilienscout24.de/immobilienpre...
2,Schmargendorf (Wilmersdorf),Charlottenburg-Wilmersdorf,13.54,17.81,29.07,4790,7006,11762,https://www.immobilienscout24.de/immobilienpre...
3,Wilmersdorf (Wilmersdorf),Charlottenburg-Wilmersdorf,13.59,15.77,28.04,3261,5259,11003,https://www.immobilienscout24.de/immobilienpre...
4,Friedrichshain (Friedrichshain),Friedrichshain-Kreuzberg,13.68,16.79,28.59,3175,5052,9776,https://www.immobilienscout24.de/immobilienpre...
