In [126]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import ipywidgets as widgets
from IPython.display import display


Table of contents

Import and preprocessing

In [132]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import ipywidgets as widgets
from IPython.display import display

# Lade deine Daten (Pfade müssen angepasst werden)
df_location = pd.read_csv('filtered_property_location_clean.csv', delimiter=';')
df_features = pd.read_csv('df_property_features_encoded.csv', delimiter=';')

df_location ['YearBuilt'] = df_location ['YearBuilt'].str.split('-').str[0]  # Split and keep the first part
df_location ['YearBuilt'] = df_location ['YearBuilt'].fillna(0)               # Replace NaN values with 0
df_location ['YearBuilt'] = df_location ['YearBuilt'].astype(int) 




# Umbenennen für den Join
df_features.rename(columns={'ListingId': 'LocationId'}, inplace=True)

# Join der Datensätze
df = pd.merge(df_location, df_features, on='LocationId', how='left')

# Entferne die Timestamp-Spalte, falls vorhanden
if 'Timestamp' in df.columns:
    df.drop('Timestamp', axis=1, inplace=True)

# Überprüfe, ob die Spalte 'Canton' vorhanden ist
print(df.columns)












Index(['ListingId', 'Rooms', 'SquareMeter', 'Floor', 'Availability',
       'ObjectType', 'YearBuilt', 'Price', 'AdditionalCost', 'NetPrice',
       'LocationId', 'PropertyAdditionalFeaturesId', 'PropertyDescription',
       'Vendor', 'Canton', 'street', 'zip', 'longitude', 'latitude',
       'bfs_number', 'municipality', 'incometax_canton',
       'incometax_municipality', 'wealthtax_canton', 'wealthtax_municipality',
       'population_density', 'public_transport_count', 'supermarket_count',
       'foodandbeverage_count', 'Price_per_m2', 'Abwasseranschluss', 'Altbau',
       'Anfahrrampe LKW', 'Aussicht', 'Balkon / Terrasse',
       'Bauland erschlossen', 'Cheminée', 'Eckhaus', 'Estrich', 'Garage',
       'Gasanschluss', 'Gedeckt', 'Geschirrspüler', 'Haustiere erlaubt',
       'Hebebühne', 'Hochparterre', 'Im Baurecht', 'In Wohngemeinschaft',
       'Kabel-TV', 'Keller', 'Kinderfreundlich', 'Lift', 'Minergie Bauweise',
       'Minergie zertifiziert', 'Mit Gartenhaus', 'Mittelhaus', 

Schritt 3: Daten Joinen

In [128]:
# Rename 'ListingId' in df_features to 'LocationId' for the join
# df_features = df_features.rename(columns={"ListingId": "LocationId"})

# Join der Daten anhand der LocationId
# df_joined = pd.merge(df_location, df_features, on="LocationId", how="left")


Schritt 4: Daten vorbereiten

In [129]:
cantons = ['ZH', 'BE', 'LU', 'UR', 'SZ', 'OW', 'NW', 'GL', 'ZG', 'FR', 
           'SO', 'BS', 'BL', 'SH', 'AR', 'AI', 'SG', 'GR', 'AG', 'TG', 
           'TI', 'VD', 'VS', 'NE', 'GE', 'JU']

# Korrektur der YearBuilt-Spalte
#df['YearBuilt'] = pd.to_datetime(df['YearBuilt'], errors='coerce').dt.year
#df['YearBuilt'].fillna(df['YearBuilt'].median(), inplace=True)

# Überprüfung, ob die Umwandlung erfolgreich war
print(df['YearBuilt'].dtype)
print(df['YearBuilt'].head())

# Sicherstellen, dass YearBuilt als float behandelt wird
df['YearBuilt'] = df['YearBuilt'].astype(int)

# Konvertiere 'EG' zu 0 in der relevanten Spalte (hier angenommen, dass 'Floor' die relevante Spalte ist)
if 'Floor' in df.columns:
    df['Floor'] = df['Floor'].replace('EG', 0)
    df['Floor'] = df['Floor'].astype(float)

# OneHotEncoder vorbereiten mit allen Kantonen
encoder = OneHotEncoder(categories=[cantons], drop=None, handle_unknown='ignore')
encoded_features = encoder.fit_transform(df[['Canton']])

# Erstellen der Spaltennamen für die kodierten Features
feature_names = ['Canton_' + c for c in cantons]

# Erstellen des DataFrames mit kodierten Features
df_encoded = pd.DataFrame(encoded_features.toarray(), columns=feature_names)

# Hinzufügen der kodierten Kantone zurück zum DataFrame
df = df.join(df_encoded)

# Entfernen der originalen Canton-Spalte
df.drop('Canton', axis=1, inplace=True)

# Sicherstellen, dass alle numerischen Spalten tatsächlich numerisch sind
df['Rooms'] = df['Rooms'].astype(float)
df['SquareMeter'] = df['SquareMeter'].astype(float)

# Überprüfung der Datentypen
print(df.dtypes)
print(df['Rooms'])


df.to_clipboard()


int64
0    2020
1    2020
2    2020
3    2020
4    2020
Name: YearBuilt, dtype: int64
ListingId         int64
Rooms           float64
SquareMeter     float64
Floor           float64
Availability     object
                 ...   
Canton_VD       float64
Canton_VS       float64
Canton_NE       float64
Canton_GE       float64
Canton_JU       float64
Length: 94, dtype: object
0        2.5
1        2.5
2        2.5
3        2.5
4        2.5
        ... 
35179    2.5
35180    2.5
35181    5.0
35182    1.5
35183    1.5
Name: Rooms, Length: 35184, dtype: float64


Schritt 5 Modell erstellen und trainieren

In [131]:
df.to_clipboard()

# Vorbereitung der Features und der Zielvariable
X = df.drop('Price', axis=1)
y = df['Price']

df['Rooms']

# Aufteilung in Trainings- und Testdatensätze
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline für die Modellierung
model = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])


# Trainieren des Modells
model.fit(X_train, y_train)

# Modell evaluieren
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse}')


ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: '01.07.2024'

Modell bewerten

Schritt 7: Modell verwenden für Vorhersagen

In [None]:
def predict(year_built, rooms, square_meter, **kwargs):
    data = {'YearBuilt': [year_built], 'Rooms': [rooms], 'SquareMeter': [square_meter]}
    for key, value in kwargs.items():
        data[key] = [value]
    df_predict = pd.DataFrame(data)
    price_predicted = model.predict(df_predict)
    print(f"Predicted Price: {price_predicted[0]}")

widgets.interact(predict,
                 year_built=widgets.IntSlider(min=1900, max=2023, value=2010, description='Year Built:', style={'description_width': 'initial'}),
                 rooms=widgets.FloatSlider(min=1, max=20, step=0.5, value=3, description='Rooms:', style={'description_width': 'initial'}),
                 square_meter=widgets.FloatSlider(min=10, max=1000, step=10, value=100, description='Square Meter:', style={'description_width': 'initial'}),
                 **{f'Canton_{canton}': widgets.Checkbox(value=False, description=canton, disabled=False) for canton in cantons})


interactive(children=(IntSlider(value=2010, description='Year Built:', max=2023, min=1900, style=SliderStyle(d…

<function __main__.predict(year_built, rooms, square_meter, **kwargs)>