In [12]:
from pathlib import Path
import re
import numpy as np
import pandas as pd


# Fixed paths, relative to the notebook
DATA = Path("../data")
CSV_0004 = DATA.joinpath("4000W-0004_de.csv")
CLEAN = DATA.joinpath("clean")
CLEAN.mkdir(exist_ok=True)  # no-op when the folder is already there

# Show the absolute location of the input file and whether it can be found.
print(f"Datei: {CSV_0004.resolve()} | existiert: {CSV_0004.exists()}")



Datei: D:\Hausverwaltung\rental-price-prediction-dashboard\data\4000W-0004_de.csv | existiert: True


In [5]:
# Load the raw export: semicolon-separated, no header row, UTF-8 with optional BOM.
# The path comes from the CSV_0004 constant of the config cell instead of a second
# hard-coded copy of the same string.
# Every column is read as text on purpose: GKZ is an identifier with leading zeros
# (e.g. "010010000000") and must never pass through numeric type inference, and
# Miete_raw is parsed explicitly in a later cell.
df = pd.read_csv(
    CSV_0004,
    sep=";",
    header=None,
    names=["GKZ", "Gemeindename", "Miete_raw"],
    dtype=str,
    encoding="utf-8-sig",
)
df

Unnamed: 0,GKZ,Gemeindename,Miete_raw
0,010010000000,"Flensburg, Stadt",696
1,010020000000,"Kiel, Landeshauptstadt",764
2,010030000000,"Lübeck, Hansestadt",747
3,010040000000,"Neumünster, Stadt",622
4,010510011011,"Brunsbüttel, Stadt",574
...,...,...,...
10781,"1,60775E+11",Göpfersdorf,419
10782,"1,60775E+11",Langenleuba-Niederhain,44
10783,"1,60775E+11",Nobitz,469
10784,"1,60775E+11",Dobitschen,444


In [10]:
# Keep only the part of the name before the first comma
# (e.g. "Flensburg, Stadt" -> "Flensburg") and trim surrounding whitespace.
# NOTE(review): astype(str) also stringifies missing values - confirm none are expected here.
name_text = df["Gemeindename"].astype(str)
before_comma = name_text.str.split(",", n=1).str[0]
df["Gemeindename"] = before_comma.str.strip()
df

Unnamed: 0,GKZ,Gemeindename,Miete_raw
0,010010000000,Flensburg,696
1,010020000000,Kiel,764
2,010030000000,Lübeck,747
3,010040000000,Neumünster,622
4,010510011011,Brunsbüttel,574
...,...,...,...
10781,"1,60775E+11",Göpfersdorf,419
10782,"1,60775E+11",Langenleuba-Niederhain,44
10783,"1,60775E+11",Nobitz,469
10784,"1,60775E+11",Dobitschen,444


In [12]:
# Cut everything from the first "(" onwards, then trim the leftover whitespace.
name_text = df["Gemeindename"].astype(str)
before_paren = name_text.str.split("(", n=1).str[0]
df["Gemeindename"] = before_paren.str.strip()
df

Unnamed: 0,GKZ,Gemeindename,Miete_raw
0,010010000000,Flensburg,696
1,010020000000,Kiel,764
2,010030000000,Lübeck,747
3,010040000000,Neumünster,622
4,010510011011,Brunsbüttel,574
...,...,...,...
10781,"1,60775E+11",Göpfersdorf,419
10782,"1,60775E+11",Langenleuba-Niederhain,44
10783,"1,60775E+11",Nobitz,469
10784,"1,60775E+11",Dobitschen,444


In [15]:
# Convert the rent to float:
#    - strip surrounding whitespace
#    - remove euro sign / unit suffix
#    - turn the German decimal comma into a dot
def to_float_de(s):
    """Parse a German-formatted number into a float.

    Parameters
    ----------
    s : str, number or missing value
        Text such as "6,96", "1.234,56" or "6,96 €/m²". Dots are treated as
        thousands separators and the comma as the decimal mark, so a
        dot-decimal string like "6.96" is read as 696.0.

    Returns
    -------
    float or None
        The parsed value; None when ``s`` is missing or cannot be parsed.
        Values that are already numeric are returned as ``float(s)``.
    """
    if pd.isna(s):
        return None
    # Already numeric: going through str() and removing "." would turn 6.96 into 696.0.
    if isinstance(s, (int, float, np.integer, np.floating)) and not isinstance(s, bool):
        return float(s)
    s = str(s)
    # Longest unit first: removing "€" first would leave "/m²" behind and make
    # the value unparseable.
    for unit in ("€/m²", "€"):
        s = s.replace(unit, "")
    s = s.strip()
    s = s.replace(".", "")      # drop thousands separators (if any)
    s = s.replace(",", ".")     # German decimal comma -> dot
    try:
        return float(s)
    except ValueError:
        # Not a number after cleaning; the caller treats None as "missing".
        return None
# Run the parser over every raw rent entry and store the result as a new column.
miete_parsed = df["Miete_raw"].map(to_float_de)
df["Miete"] = miete_parsed
df

Unnamed: 0,GKZ,Gemeindename,Miete_raw,Miete
0,010010000000,Flensburg,696,6.96
1,010020000000,Kiel,764,7.64
2,010030000000,Lübeck,747,7.47
3,010040000000,Neumünster,622,6.22
4,010510011011,Brunsbüttel,574,5.74
...,...,...,...,...
10781,"1,60775E+11",Göpfersdorf,419,4.19
10782,"1,60775E+11",Langenleuba-Niederhain,44,4.40
10783,"1,60775E+11",Nobitz,469,4.69
10784,"1,60775E+11",Dobitschen,444,4.44


In [19]:
# Keep only the rows that have a rent value.
has_rent = df["Miete"].notna()
df = df.loc[has_rent]
df

Unnamed: 0,GKZ,Gemeindename,Miete_raw,Miete
0,010010000000,Flensburg,696,6.96
1,010020000000,Kiel,764,7.64
2,010030000000,Lübeck,747,7.47
3,010040000000,Neumünster,622,6.22
4,010510011011,Brunsbüttel,574,5.74
...,...,...,...,...
10781,"1,60775E+11",Göpfersdorf,419,4.19
10782,"1,60775E+11",Langenleuba-Niederhain,44,4.40
10783,"1,60775E+11",Nobitz,469,4.69
10784,"1,60775E+11",Dobitschen,444,4.44


In [25]:
# The raw text column is no longer needed once "Miete" exists.
# errors="ignore" keeps the cell re-runnable: without it, running the cell a second
# time raises KeyError because the column is already gone.
df = df.drop(columns=["Miete_raw"], errors="ignore")

In [26]:
# --- 3. Check data types ---
print("\nDatentypen:", df.dtypes, sep="\n")


Datentypen:
GKZ              object
Gemeindename     object
Miete           float64
dtype: object


In [27]:
# --- 4. Check missing values ---
# isna().mean() is a fraction in [0, 1]; scale by 100 so the printed numbers
# actually match the "(in %)" label. Shows the ten columns with the most gaps.
missing_pct = df.isna().mean().mul(100).sort_values(ascending=False).head(10)
print("\nFehlende Werte (in %):")
print(missing_pct)


Fehlende Werte (in %):
GKZ             0.0
Gemeindename    0.0
Miete           0.0
dtype: float64


In [28]:
# --- 5. Summary statistics ---
# include="all" covers the text columns (count/unique/top/freq) as well as the numeric one.
summary = df.describe(include="all")
print("\nBasisstatistik:", summary, sep="\n")


Basisstatistik:
                GKZ  Gemeindename         Miete
count         10713         10713  10713.000000
unique         8439         10120           NaN
top     1,30715E+11  Neuenkirchen           NaN
freq            141            11           NaN
mean            NaN           NaN      5.776486
std             NaN           NaN      1.323485
min             NaN           NaN      0.090000
25%             NaN           NaN      4.830000
50%             NaN           NaN      5.540000
75%             NaN           NaN      6.460000
max             NaN           NaN     13.840000


In [30]:
# --- 6. Check whether GKZ is unique ---
# NOTE(review): the recorded outputs show GKZ values such as "1,60775E+11"; it looks
# like a spreadsheet export collapsed distinct keys into scientific notation, which
# would explain the duplicates - confirm against the original source file.
gkz = df["GKZ"]
print(f"\nEindeutige GKZ: {gkz.nunique()}")
print(f"Duplikate? {gkz.duplicated().any()}")


Eindeutige GKZ: 8439
Duplikate? True


In [31]:
# 7) Print an overview: row count, key uniqueness and the first ten rows
gkz_col = df["GKZ"]
print(f"Zeilen gesamt: {len(df)}")
print(f"Eindeutige GKZ: {gkz_col.nunique()}")
print(f"Hat Duplikate (GKZ)? {gkz_col.duplicated().any()}")
print("\nBeispiele:", df.head(10), sep="\n")

Zeilen gesamt: 10713
Eindeutige GKZ: 8439
Hat Duplikate (GKZ)? True

Beispiele:
            GKZ Gemeindename  Miete
0  010010000000    Flensburg   6.96
1  010020000000         Kiel   7.64
2  010030000000       Lübeck   7.47
3  010040000000   Neumünster   6.22
4  010510011011  Brunsbüttel   5.74
5  010510044044        Heide   6.68
6  010515163003      Averlak   5.12
7  010515163010     Brickeln   5.73
8  010515163012     Buchholz   5.00
9  010515163016         Burg   5.67


In [32]:
# 8) Save the cleaned table
import os
import re

out_dir = "../data/clean"
os.makedirs(out_dir, exist_ok=True)  # create the target folder if it is missing
out_path = out_dir + "/zensus_0004_clean.csv"
df.to_csv(out_path, index=False, encoding="utf-8")  # comma-separated
print(f"\nGespeichert → {out_path}")
print(f"Finale Spalten: {df.columns.tolist()}")


Gespeichert → ../data/clean/zensus_0004_clean.csv
Finale Spalten: ['GKZ', 'Gemeindename', 'Miete']
