In [12]:
from pathlib import Path
import re
import numpy as np
import pandas as pd


# fester Pfad relativ zum Notebook
DATA = Path("../data")
CSV_0004 = DATA / "4000W-0004_de.csv"
CLEAN = DATA / "clean"
CLEAN.mkdir(exist_ok=True)

print("Datei:", CSV_0004.resolve(), "| existiert:", CSV_0004.exists())



Datei: D:\Hausverwaltung\rental-price-prediction-dashboard\data\4000W-0004_de.csv | existiert: True


In [5]:
df = pd.read_csv("../data/4000W-0004_de.csv", sep=";", header=None, names=["GKZ","Gemeindename","Miete_raw"], encoding="utf-8-sig")
df

Unnamed: 0,GKZ,Gemeindename,Miete_raw
0,010010000000,"Flensburg, Stadt",696
1,010020000000,"Kiel, Landeshauptstadt",764
2,010030000000,"Lübeck, Hansestadt",747
3,010040000000,"Neumünster, Stadt",622
4,010510011011,"Brunsbüttel, Stadt",574
...,...,...,...
10781,"1,60775E+11",Göpfersdorf,419
10782,"1,60775E+11",Langenleuba-Niederhain,44
10783,"1,60775E+11",Nobitz,469
10784,"1,60775E+11",Dobitschen,444


In [10]:
df["Gemeindename"] = df["Gemeindename"].astype(str).str.split(",", n=1).str[0].str.strip()
df

Unnamed: 0,GKZ,Gemeindename,Miete_raw
0,010010000000,Flensburg,696
1,010020000000,Kiel,764
2,010030000000,Lübeck,747
3,010040000000,Neumünster,622
4,010510011011,Brunsbüttel,574
...,...,...,...
10781,"1,60775E+11",Göpfersdorf,419
10782,"1,60775E+11",Langenleuba-Niederhain,44
10783,"1,60775E+11",Nobitz,469
10784,"1,60775E+11",Dobitschen,444


In [12]:
df["Gemeindename"] = df["Gemeindename"].astype(str).str.split("(", n=1).str[0].str.strip()
df

Unnamed: 0,GKZ,Gemeindename,Miete_raw
0,010010000000,Flensburg,696
1,010020000000,Kiel,764
2,010030000000,Lübeck,747
3,010040000000,Neumünster,622
4,010510011011,Brunsbüttel,574
...,...,...,...
10781,"1,60775E+11",Göpfersdorf,419
10782,"1,60775E+11",Langenleuba-Niederhain,44
10783,"1,60775E+11",Nobitz,469
10784,"1,60775E+11",Dobitschen,444


In [15]:
# Miete in float umwandeln
#    - Leerzeichen weg
#    - Euro/Einheiten weg
#    - deutsches Komma in Punkt umwandeln
def to_float_de(s):
    if pd.isna(s): 
        return None
    s = str(s)
    s = s.replace("€", "").replace("€/m²", "").strip()
    s = s.replace(".", "")      # tausenderpunkte weg (falls vorhanden)
    s = s.replace(",", ".")     # deutsches Komma -> Punkt
    try:
        return float(s)
    except:
        return None
df["Miete"] = df["Miete_raw"].map(to_float_de)
df

Unnamed: 0,GKZ,Gemeindename,Miete_raw,Miete
0,010010000000,Flensburg,696,6.96
1,010020000000,Kiel,764,7.64
2,010030000000,Lübeck,747,7.47
3,010040000000,Neumünster,622,6.22
4,010510011011,Brunsbüttel,574,5.74
...,...,...,...,...
10781,"1,60775E+11",Göpfersdorf,419,4.19
10782,"1,60775E+11",Langenleuba-Niederhain,44,4.40
10783,"1,60775E+11",Nobitz,469,4.69
10784,"1,60775E+11",Dobitschen,444,4.44


In [19]:
df = df.dropna(subset=["Miete"])
df

Unnamed: 0,GKZ,Gemeindename,Miete_raw,Miete
0,010010000000,Flensburg,696,6.96
1,010020000000,Kiel,764,7.64
2,010030000000,Lübeck,747,7.47
3,010040000000,Neumünster,622,6.22
4,010510011011,Brunsbüttel,574,5.74
...,...,...,...,...
10781,"1,60775E+11",Göpfersdorf,419,4.19
10782,"1,60775E+11",Langenleuba-Niederhain,44,4.40
10783,"1,60775E+11",Nobitz,469,4.69
10784,"1,60775E+11",Dobitschen,444,4.44


In [25]:
df = df.drop(columns=["Miete_raw"])

In [26]:
# --- 3. Datentypen prüfen ---
print("\nDatentypen:")
print(df.dtypes)


Datentypen:
GKZ              object
Gemeindename     object
Miete           float64
dtype: object


In [27]:
# --- 4. Fehlende Werte prüfen ---
print("\nFehlende Werte (in %):")
print(df.isna().mean().sort_values(ascending=False).head(10))


Fehlende Werte (in %):
GKZ             0.0
Gemeindename    0.0
Miete           0.0
dtype: float64


In [28]:
# --- 5. Basisstatistik ---
print("\nBasisstatistik:")
print(df.describe(include="all"))


Basisstatistik:
                GKZ  Gemeindename         Miete
count         10713         10713  10713.000000
unique         8439         10120           NaN
top     1,30715E+11  Neuenkirchen           NaN
freq            141            11           NaN
mean            NaN           NaN      5.776486
std             NaN           NaN      1.323485
min             NaN           NaN      0.090000
25%             NaN           NaN      4.830000
50%             NaN           NaN      5.540000
75%             NaN           NaN      6.460000
max             NaN           NaN     13.840000


In [30]:
# --- 6. Eindeutigkeit der GKZ prüfen ---
print("\nEindeutige GKZ:", df["GKZ"].nunique())
print("Duplikate?", df["GKZ"].duplicated().any())


Eindeutige GKZ: 8439
Duplikate? True


In [31]:
# 7) Übersicht drucken
print("Zeilen gesamt:", len(df))
print("Eindeutige GKZ:", df["GKZ"].nunique())
print("Hat Duplikate (GKZ)?", df["GKZ"].duplicated().any())
print("\nBeispiele:")
print(df.head(10))

Zeilen gesamt: 10713
Eindeutige GKZ: 8439
Hat Duplikate (GKZ)? True

Beispiele:
            GKZ Gemeindename  Miete
0  010010000000    Flensburg   6.96
1  010020000000         Kiel   7.64
2  010030000000       Lübeck   7.47
3  010040000000   Neumünster   6.22
4  010510011011  Brunsbüttel   5.74
5  010510044044        Heide   6.68
6  010515163003      Averlak   5.12
7  010515163010     Brickeln   5.73
8  010515163012     Buchholz   5.00
9  010515163016         Burg   5.67


In [32]:
# 8) Speichern
import os, re
out_dir = "../data/clean"
os.makedirs(out_dir, exist_ok=True)
out_path = f"{out_dir}/zensus_0004_clean.csv"
df.to_csv(out_path, index=False, encoding="utf-8")  # Komma-getrennt
print("\nGespeichert →", out_path)
print("Finale Spalten:", df.columns.tolist())


Gespeichert → ../data/clean/zensus_0004_clean.csv
Finale Spalten: ['GKZ', 'Gemeindename', 'Miete']


In [40]:
df5 = pd.read_csv("../data/4000W-0005_de.csv", sep=";", header=None, 
                  names=["GKZ","Gemeindename",
                         "Miete_raw",
                        "befor 1919",
                        "1919-1949",
                        "1950-1959",
                        "1960-1969",
                        "1970-1979",
                        "1980-1989",
                        "1990-1999",
                        "2000-2009",
                        "2010-2015",
                        " after 2016"
                        ], encoding="utf-8-sig")
df5

Unnamed: 0,GKZ,Gemeindename,Miete_raw,befor 1919,1919-1949,1950-1959,1960-1969,1970-1979,1980-1989,1990-1999,2000-2009,2010-2015,after 2016
0,GKZ,Gemeindename,Insgesamt,Vor 1919,1919 - 1949,1950 - 1959,1960 - 1969,1970 - 1979,1980 - 1989,1990 - 1999,2000 - 2009,2010 - 2015,2016 und später
1,010010000000,"Flensburg, Stadt",696,728,655,64,639,671,676,652,744,836,925
2,010020000000,"Kiel, Landeshauptstadt",764,796,747,765,72,701,782,74,848,905,1086
3,010030000000,"Lübeck, Hansestadt",747,806,759,729,69,694,79,705,779,851,964
4,010040000000,"Neumünster, Stadt",622,627,58,594,615,612,631,605,711,789,872
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10782,"1,60775E+11",Göpfersdorf,419,483,-,-,-,-,-,-,-,-,-
10783,"1,60775E+11",Langenleuba-Niederhain,44,4,473,521,466,457,368,436,435,375,532
10784,"1,60775E+11",Nobitz,469,45,461,459,465,485,452,485,563,657,496
10785,"1,60775E+11",Dobitschen,444,352,-,498,386,-,469,487,-,-,8


In [41]:
# --- 3. Datentypen prüfen ---
print("\nDatentypen:")
print(df5.dtypes)


Datentypen:
GKZ             object
Gemeindename    object
Miete_raw       object
befor 1919      object
1919-1949       object
1950-1959       object
1960-1969       object
1970-1979       object
1980-1989       object
1990-1999       object
2000-2009       object
2010-2015       object
 after 2016     object
dtype: object


In [45]:
# --- 4. Fehlende Werte prüfen ---
print("\nFehlende Werte (in %):")
print(df5.isna().mean().sort_values(ascending=False).head(15))


Fehlende Werte (in %):
GKZ             0.0
Gemeindename    0.0
Miete_raw       0.0
befor 1919      0.0
1919-1949       0.0
1950-1959       0.0
1960-1969       0.0
1970-1979       0.0
1980-1989       0.0
1990-1999       0.0
2000-2009       0.0
2010-2015       0.0
 after 2016     0.0
dtype: float64


In [43]:
# --- 5. Basisstatistik ---
print("\nBasisstatistik:")
print(df5.describe(include="all"))


Basisstatistik:
                GKZ  Gemeindename Miete_raw befor 1919 1919-1949 1950-1959  \
count         10787         10787     10787      10787     10787     10787   
unique         8480         10760       753        816       813       787   
top     1,30715E+11  Neuenkirchen      4,94          -         -         -   
freq            142             5        60        634      1267      1779   

       1960-1969 1970-1979 1980-1989 1990-1999 2000-2009 2010-2015  after 2016  
count      10787     10787     10787     10787     10787     10787       10787  
unique       786       793       815       820       875       950        1045  
top            -         -         -         -         -         -           -  
freq        1261      1469      1705      1193      2217      4000        3508  


In [44]:
# 7) Übersicht drucken
print("Zeilen gesamt:", len(df5))
print("Eindeutige GKZ:", df5["GKZ"].nunique())
print("Hat Duplikate (GKZ)?", df5["GKZ"].duplicated().any())
print("\nBeispiele:")
print(df5.head(10))

Zeilen gesamt: 10787
Eindeutige GKZ: 8480
Hat Duplikate (GKZ)? True

Beispiele:
            GKZ                   Gemeindename  Miete_raw befor 1919  \
0           GKZ                   Gemeindename  Insgesamt   Vor 1919   
1  010010000000               Flensburg, Stadt       6,96       7,28   
2  010020000000         Kiel, Landeshauptstadt       7,64       7,96   
3  010030000000             Lübeck, Hansestadt       7,47       8,06   
4  010040000000              Neumünster, Stadt       6,22       6,27   
5  010510011011             Brunsbüttel, Stadt       5,74       6,26   
6  010510044044                   Heide, Stadt       6,68       6,58   
7  010515163003                        Averlak       5,12       5,49   
8  010515163010                       Brickeln       5,73       6,51   
9  010515163012  Buchholz (Kreis Dithmarschen)          5        5,1   

     1919-1949    1950-1959    1960-1969    1970-1979    1980-1989  \
0  1919 - 1949  1950 - 1959  1960 - 1969  1970 - 1979  19

In [46]:
import pandas as pd
import os

# --- 0005: Baujahre ---
# 1) Einlesen (Delimiter automatisch erkennen, vorhandene Kopfzeile nutzen)
df5 = pd.read_csv("../data/4000W-0005_de.csv", sep=None, engine="python", dtype=str)

# 2) Spalten sauber benennen (Tippfehler ausbessern)
wanted_cols = [
    "GKZ", "Gemeindename", "Insgesamt",
    "vor_1919", "1919_1949", "1950_1959", "1960_1969",
    "1970_1979", "1980_1989", "1990_1999",
    "2000_2009", "2010_2015", "2016_plus"
]
# Mögliche Originalnamen → Zielnamen
rename_map = {
    "Miete_raw": "Insgesamt", "Insgesamt": "Insgesamt",
    "Vor 1919": "vor_1919", "befor 1919": "vor_1919",
    "1919-1949": "1919_1949", "1919 - 1949": "1919_1949",
    "1950-1959": "1950_1959", "1950 - 1959": "1950_1959",
    "1960-1969": "1960_1969", "1660-1969": "1960_1969",
    "1970-1979": "1970_1979", "1970 - 1979": "1970_1979",
    "1980-1989": "1980_1989", "1980 - 1989": "1980_1989",
    "1990-1999": "1990_1999", "1690-1999": "1990_1999",
    "2000-2009": "2000_2009", "2000 - 2009": "2000_2009",
    "2010-2015": "2010_2015", "2010 - 2015": "2010_2015",
    "2016 und später": "2016_plus", "after 2016": "2016_plus",
}
df5 = df5.rename(columns=rename_map)

# Falls die Datei noch andere/alte Kopfzeilenzeilen enthält: nur die gewünschten Spalten behalten
keep = [c for c in wanted_cols if c in df5.columns]
df5 = df5[keep].copy()

# 3) Zahlen konvertieren
num_cols = [c for c in df5.columns if c not in ("GKZ", "Gemeindename")]
for c in num_cols:
    df5[c] = (
        df5[c]
        .astype(str)
        .str.replace(r"[^0-9,.\-]", "", regex=True)  # Fußnoten/Leerzeichen raus
        .str.replace(",", ".", regex=False)
    )
    df5[c] = pd.to_numeric(df5[c], errors="coerce")

# 4) NaNs füllen: erst zeilenweise mit 'Insgesamt', dann Spaltenmedian
for c in num_cols:
    if c == "Insgesamt":
        continue
    df5[c] = df5[c].fillna(df5["Insgesamt"])
med = df5[num_cols].median(numeric_only=True)
df5[num_cols] = df5[num_cols].fillna(med)

print("0005 – Größe:", df5.shape)
print(df5.head(3))

# 5) Speichern
os.makedirs("../data/clean", exist_ok=True)
df5.to_csv("../data/clean/zensus_0005_clean.csv", index=False, encoding="utf-8")
print("Gespeichert → ../data/clean/zensus_0005_clean.csv")


0005 – Größe: (10786, 10)
             Gemeindename  Insgesamt  vor_1919  1919_1949  1950_1959  \
0        Flensburg, Stadt       6.96      7.28       6.55       6.40   
1  Kiel, Landeshauptstadt       7.64      7.96       7.47       7.65   
2      Lübeck, Hansestadt       7.47      8.06       7.59       7.29   

   1970_1979  1980_1989  2000_2009  2010_2015  2016_plus  
0       6.71       6.76       7.44       8.36       9.25  
1       7.01       7.82       8.48       9.05      10.86  
2       6.94       7.90       7.79       8.51       9.64  
Gespeichert → ../data/clean/zensus_0005_clean.csv


In [47]:
# --- 0008: Wohnfläche ---
df8 = pd.read_csv("../data/4000W-0008_de.csv", sep=None, engine="python", dtype=str)

# erwartete Basis
base_cols = ["GKZ", "Gemeindename", "Insgesamt"]
df8.columns = [c.strip() for c in df8.columns]

# numerische Spalten (alles außer GKZ/Gemeindename)
num8 = [c for c in df8.columns if c not in ("GKZ", "Gemeindename")]

for c in num8:
    df8[c] = (
        df8[c].astype(str)
        .str.replace(r"[^0-9,.\-]", "", regex=True)
        .str.replace(",", ".", regex=False)
    )
    df8[c] = pd.to_numeric(df8[c], errors="coerce")

for c in num8:
    if c == "Insgesamt":
        continue
    df8[c] = df8[c].fillna(df8["Insgesamt"])
med8 = df8[num8].median(numeric_only=True)
df8[num8] = df8[num8].fillna(med8)

print("0008 – Größe:", df8.shape)
print(df8.head(3))

df8.to_csv("../data/clean/zensus_0008_clean.csv", index=False, encoding="utf-8")
print("Gespeichert → ../data/clean/zensus_0008_clean.csv")


0008 – Größe: (10786, 20)
           ﻿GKZ            Gemeindename  Insgesamt  Unter 30 m²  30 - 39 m²  \
0  1.001000e+10        Flensburg, Stadt       6.96        11.18        8.25   
1  1.002000e+10  Kiel, Landeshauptstadt       7.64        10.34        8.55   
2  1.003000e+10      Lübeck, Hansestadt       7.47        10.01        8.06   

   40 - 49 m²  50 - 59 m²  60 - 69 m²  70 - 79 m²  80 - 89 m²  90 - 99 m²  \
0        6.85        6.87        6.67        6.57        6.63        6.60   
1        7.71        7.42        7.34        7.32        7.42        7.60   
2        7.54        7.33        7.12        7.08        7.41        7.62   

   100 - 109 m²  110 - 119 m²  120 - 129 m²  130 - 139 m²  140 - 149 m²  \
0          6.80          6.99          6.88          6.59          6.36   
1          7.72          7.85          7.92          7.52          7.43   
2          7.77          7.99          7.76          7.63          7.52   

   150 - 159 m²  160 - 169 m²  170 - 179 m²  18