Um das Modell zu testen, ändern Sie den Dateinamen im Code-Teil unter "Unabhängige Validierung".

# Bibliotheken hinzufügen

In [None]:
# Data Science + Mathe Bibliotheken
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb

# Metriken
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error

from sklearn.utils.validation import check_array as check_arrays

# Validierung
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Modelle
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor

# Warnungen 
import warnings

# MAPE definieren

In [None]:
#https://www.statology.org/mape-python/

def mape(actual, pred):
    """Return the mean absolute percentage error of ``pred`` in percent.

    Args:
        actual: Array-like of observed values.
        pred: Array-like of predicted values.

    Returns:
        ``mean(|actual - pred| / |actual|) * 100``. Entries of ``actual``
        that are zero make the result ``inf`` or ``nan``.
    """
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    if actual.size == pred.size:
        # Compare element-wise even when one input is a column vector (n, 1)
        # and the other is flat (n,); plain broadcasting would silently
        # build an (n, n) matrix of unrelated pairs.
        actual = actual.ravel()
        pred = pred.ravel()
    return np.mean(np.abs((actual - pred) / actual)) * 100

In [None]:
# Load the raw data; keep_default_na=False keeps literal "NA" cells as
# strings instead of turning them into NaN.
housing_data = pd.read_csv(
    'DatenAusgegeben1.2_UTF8_manuell.csv',
    sep=";",
    keep_default_na=False,
)

# Silence every warning from here on.
warnings.filterwarnings('ignore')

# --- Cleaning and feature engineering ---

# Price per square metre, so that listings are easier to compare.
housing_data["Preisproqm"] = housing_data["Preis"].div(housing_data["Wohnflaeche in qm"])

# Months since 2136 (month 1 of year 2136 maps to 0).
housing_data["Monate"] = (
    housing_data["Verkaufsjahr"].sub(2136).mul(12)
    .add(housing_data["Verkaufsmonat"])
    .sub(1)
)

# Encode the heating-quality labels as numbers; any label that is not
# listed here becomes 5.
heating_codes = {"Schl": 1, "Ud": 2, "Ty": 3, "Gut": 4}
housing_data["Heizungsqualitaet"] = [
    heating_codes.get(label, 5) for label in housing_data["Heizungsqualitaet"]
]

# Condition: 1-10 is folded onto 1-5 (half the grade, rounded up). Slightly
# less precise, but easier to compare.
housing_data["Zustandf"] = [
    int(grade / 2) + grade % 2 for grade in housing_data["Zustand"]
]

# Shift every ordinal scale from 1..X down to 0..X-1.
for shifted, original in (
    ("HeizungsqualitaetN", "Heizungsqualitaet"),
    ("ZustandN", "Zustandf"),
    ("FassadeN", "Zustand Fassade"),
    ("KuecheN", "Kuechenqualitaet"),
):
    housing_data[shifted] = [level - 1 for level in housing_data[original]]

# Air conditioning: "Y" -> 1, everything else -> 0.
housing_data["Klimaanlage"] = [int(flag == "Y") for flag in housing_data["Klimaanlage"]]

# Drop the listings that hold the largest value of price, living area or
# plot size.
#
# The result has to be assigned back, otherwise the filter has no effect.
# The maximum is taken with .max() because Series.unique() returns values in
# order of first appearance, so its last element is only the maximum when
# the file happens to be sorted by that column.
# NOTE(review): the plot-size filter used to index the second-to-last unique
# value; here the maximum is removed for all three columns - confirm that
# this matches the intended outlier rule.
outlier_columns = ("Preis", "Wohnflaeche in qm", "Grundstueck in qm")
is_outlier = pd.Series(False, index=housing_data.index)
for column in outlier_columns:
    # Every maximum is taken on the unfiltered data, so the result does not
    # depend on the order of the columns.
    is_outlier |= housing_data[column] == housing_data[column].max()
# .copy() detaches the result so later column assignments do not write into
# a slice of the original frame.
housing_data = housing_data.loc[~is_outlier].copy()


# A garage type of "NA" means the listing has no garage; spell that out.
garage_type = housing_data["Garage Typ"]
housing_data["Garage Typ"] = [
    entry if entry != 'NA' else 'keine Garage' for entry in garage_type
]


# Show which columns are available after the preparation step.
print(housing_data.columns)

# Merkmale auswählen

In [None]:
# Read the model inputs (X) and the target (Y) from the data set.

# Features used as model inputs (X).
features = [ "Grundstueck in qm", "Wohnflaeche in qm","ZustandN","KuecheN","Klimaanlage", "Gebaut", "Garagenkapazitaet", "HeizungsqualitaetN", "Raeume"]
# Target column: the price (Y).
predict= "Preis"

# Independent copies, with the columns in the order given by `features`.
X = housing_data.loc[:, features].copy()

# Kept as a single-column DataFrame rather than a Series.
Y = housing_data.loc[:, [predict]].copy()

# Regression

In [None]:
# K-fold cross-validation of six regressors on degree-3 polynomial features.
#
# Per fold every model is fitted on the training rows and scored with R² on
# the held-out rows. Afterwards, for each model family, the fitted model of
# its best-scoring fold is kept as best_<name>.


def _fit_and_score(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training fold and return its R² on the test fold."""
    model.fit(x_train, y_train)
    # r2_score expects (y_true, y_pred); the score is not symmetric.
    return r2_score(y_test, model.predict(x_test))


split = 10
# random_state only has an effect together with shuffle=True; current
# scikit-learn releases reject a seed without shuffling.
Kfold = KFold(n_splits=split, shuffle=True, random_state=42)
kfold = Kfold.split(X, Y)
scoresr = []
scoresl = []
scoresri = []
scoresgbt = []
scoresrf = []
scoressgd = []
reg = [None]*split
lasso = [None]*split
ridge = [None]*split
gbt = [None]*split
rf = [None]*split
sgd = [None]*split

for k, (train, test) in enumerate(kfold):
    # Polynomial expansion: fitted on the training rows only and then merely
    # applied to the test rows.
    poly = PolynomialFeatures(degree=3)
    x_poly = poly.fit_transform(X.iloc[train, :])
    x_poly_test = poly.transform(X.iloc[test, :])

    # Flat 1-D targets instead of single-column frames.
    y_fold_train = Y.iloc[train].values.ravel()
    y_fold_test = Y.iloc[test].values.ravel()

    # Ordinary least squares
    reg[k] = linear_model.LinearRegression()
    scoresr.append(_fit_and_score(reg[k], x_poly, y_fold_train, x_poly_test, y_fold_test))

    # Lasso regression
    lasso[k] = linear_model.Lasso()
    scoresl.append(_fit_and_score(lasso[k], x_poly, y_fold_train, x_poly_test, y_fold_test))

    # Ridge regression
    ridge[k] = linear_model.Ridge()
    scoresri.append(_fit_and_score(ridge[k], x_poly, y_fold_train, x_poly_test, y_fold_test))

    # Gradient boosting trees. Squared error is the default loss, so the
    # removed alias "ls" does not have to be spelled out.
    gbt[k] = GradientBoostingRegressor(max_depth=4, n_estimators=100, learning_rate=0.15)
    scoresgbt.append(_fit_and_score(gbt[k], x_poly, y_fold_train, x_poly_test, y_fold_test))

    # Random forest. Squared error is the default criterion, so the removed
    # alias "mse" does not have to be spelled out.
    rf[k] = RandomForestRegressor(max_depth=10, n_estimators=10)
    scoresrf.append(_fit_and_score(rf[k], x_poly, y_fold_train, x_poly_test, y_fold_test))

    # Stochastic gradient descent.
    # NOTE(review): the polynomial features are not scaled here; SGD is
    # sensitive to feature scale, so this model may not converge.
    sgd[k] = SGDRegressor(eta0=0.5)
    scoressgd.append(_fit_and_score(sgd[k], x_poly, y_fold_train, x_poly_test, y_fold_test))


# Report the best fold score per model family and keep the model of that
# fold (not simply the model of the last fold).
print("Ordinary Squared Loss:" + str(np.sort(scoresr)[-1:]))
best = int(np.argmax(scoresr))
best_reg = reg[best]

print("Lasso:" + str(np.sort(scoresl)[-1:]))
best = int(np.argmax(scoresl))
best_lasso = lasso[best]

print("Ridge:" + str(np.sort(scoresri)[-1:]))
best = int(np.argmax(scoresri))
best_ridge = ridge[best]

print("Gradient Boosting Tree:" + str(np.sort(scoresgbt)[-1:]))
best = int(np.argmax(scoresgbt))
best_gbt = gbt[best]

print("Random Forest :" + str(np.sort(scoresrf)[-1:]))
best = int(np.argmax(scoresrf))
best_rf = rf[best]

print("Stochastic Gradiend Descend:" + str(np.sort(scoressgd)[-1:]))
best = int(np.argmax(scoressgd))
best_sgd = sgd[best]

# Vergleich von Modell und Testdaten

In [None]:
# Compare the chosen model with the held-out data.
# NOTE(review): X_test, y_test and best_model are defined in later cells of
# this notebook; those cells have to be run first.

# Predict once from the complete feature matrix: the model is used on the
# polynomial expansion of all features, so it cannot be fed one raw column.
y_pred_best = np.ravel(best_model.predict(poly.transform(X_test)))

for feature in features:
    # Blue: target values of the test rows, red: model predictions.
    plt.scatter(X_test[feature], y_test, c="blue")
    plt.scatter(X_test[feature], y_pred_best, c="red")
    plt.suptitle(feature)
    plt.show()

# Distribution of predicted / observed target; both sides are flattened so
# the division is element-wise.
plt.hist(y_pred_best / np.ravel(y_test))

# Vorhersage 

In [None]:
# "Grundstueck in qm", "Wohnflaeche in qm","ZustandN","KuecheN","Klimaanlage", "Gebaut", 
# "Garagenkapazitaet", "HeizungsqualitaetN", "Raeume"]

# Choose the model family with the highest mean R² over the folds and use
# the model that was kept for that family.
cv_candidates = [
    (np.mean(scoresr), best_reg),
    (np.mean(scoresl), best_lasso),
    (np.mean(scoresri), best_ridge),
    (np.mean(scoresgbt), best_gbt),
    (np.mean(scoresrf), best_rf),
    (np.mean(scoressgd), best_sgd),
]
# A NaN mean must never win the comparison, so it is ranked lowest.
best_model = max(
    cv_candidates,
    key=lambda candidate: -np.inf if np.isnan(candidate[0]) else candidate[0],
)[1]

# One example listing, with the values in the order of `features`.
sample = pd.DataFrame([[2000, 160, 4, 4, 1, 2130, 2, 4, 4]], columns=features)
# Apply the already fitted expansion; do not refit it on the sample.
y = best_model.predict(poly.transform(sample))

# predict() may return a 1-D or a 2-D array depending on how the model was
# fitted; flatten before taking the single value.
print(round(float(np.ravel(y)[0]), 2), "GC Dollar")

# Unabhängige Validierung

In [None]:
# housing_data2 = pd.read_csv('DatenAusgegeben1.2_UTF8_manuell.csv', sep=";", keep_default_na=False)


# housing_data2["Preisproqm"] = housing_data2["Preis"]/housing_data2["Wohnflaeche in qm"]

# # Monate seit 2136
# housing_data2["Monate"] = (housing_data2["Verkaufsjahr"]-2136)*12 + housing_data2["Verkaufsmonat"] -1

# # Werte der Heizungsqualitaet in Zahlen umwandeln
# housing_data2["Heizungsqualitaet"] = [1 if x=="Schl" else 2 if x=="Ud" else 3 if x=="Ty" else 4 if x=="Gut" else 5 for x in housing_data2["Heizungsqualitaet"]]

# # Zustand, 1-10 wird auf 1-5 gemapt, das ist zwar ein bisschen ungenauer, aber einfacher zu vergleichen
# housing_data2["Zustandf"] = [int(x/2) + x%2 for x in housing_data2["Zustand"]]

# # Alle Zustände werden von 1...X auf 0....X-1 geschoben
# housing_data2["HeizungsqualitaetN"] = [ x-1 for x in housing_data2["Heizungsqualitaet"]]
# housing_data2["ZustandN"] = [ x-1 for x in housing_data2["Zustandf"]]
# housing_data2["FassadeN"] = [ x-1 for x in housing_data2["Zustand Fassade"]]
# housing_data2["KuecheN"] = [ x-1 for x in housing_data2["Kuechenqualitaet"]]

# # Klimaanlage Ja/Nein  -> 1/0
# housing_data2["Klimaanlage"] = [1 if x == "Y" else 0 for x in housing_data2["Klimaanlage"]]

# # Besonders Große Daten beim Preis + Wohnfläche in qm rausschmeißen
# max_value = housing_data2["Preis"].unique()[-1:][0]
# housing_data2[housing_data2["Preis"] != max_value]

# max_value = housing_data2["Wohnflaeche in qm"].unique()[-1:][0]
# housing_data2[housing_data2["Wohnflaeche in qm"] != max_value]

# max_value = housing_data2["Grundstueck in qm"].unique()[-2:][0]
# housing_data2[housing_data2["Grundstueck in qm"] != max_value]

# # NA in Garage Typ wird zu "Keine Garage"
# housing_data2["Garage Typ"] = ['keine Garage' if x=='NA' else x for x in housing_data2["Garage Typ"]]


# X2 = {}
# for feature in features:
#     X2[feature] = housing_data2[feature]
# X2 = pd.DataFrame(X)

# Y2 = pd.DataFrame(housing_data2[predict])

# x2_poly = poly.fit_transform(X2.iloc[train, :])
# poly.fit(X2.iloc[train, :], Y2.iloc[train])
# x2_poly_test = poly.fit_transform(X2.iloc[test, :])

# # R2, MSE, RMSE, MAPE, MAX
# r2_test = r2_score(lasso[k].predict(x2_poly_test), Y2.iloc[test])
# mse_test = mean_squared_error(lasso[k].predict(x2_poly_test), Y2.iloc[test])
# rmse_test = mean_squared_error(lasso[k].predict(x2_poly_test), Y2.iloc[test], squared=False)
# mape_test = mape(lasso[k].predict(x2_poly_test).reshape(-1, 1), Y2.iloc[test])
# max_test = max_error(lasso[k].predict(x2_poly_test), Y2.iloc[test])

# print("R2 Wert:", r2_test)
# print("Mean Squared Error Wert:", mse_test)
# print("Negativer Mean Square Error Wert:", rmse_test)
# print("Mean Absolute Percentage Error Wert:", mape_test)
# print("Max Error Wert:", max_test)

In [None]:
# Prepare the data

# Most important features:
# air conditioning, heating quality, kitchen quality, condition, facade quality

# Split into training and test data (20 % held out). The fixed seed makes
# the split, and therefore the printed metrics, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Degree-3 polynomial expansion: fitted on the training rows and then only
# applied to the test rows.
poly = PolynomialFeatures(degree=3)
x_poly = poly.fit_transform(X_train)
x_poly_test = poly.transform(X_test)


print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# Regression

reg = linear_model.LinearRegression()
ridge = linear_model.Ridge(alpha=0.05)
lasso = linear_model.Lasso(alpha=0.05)

reg.fit(x_poly, y_train)
ridge.fit(x_poly, y_train)
lasso.fit(x_poly, y_train)

# Validation: every model predicts from the same transformed test matrix.
y_pred_reg = reg.predict(x_poly_test)
y_pred_ridge = ridge.predict(x_poly_test)
y_pred_lasso = lasso.predict(x_poly_test)


# One line per model (linear, ridge, lasso): R², mean squared error, max error.
for y_pred in (y_pred_reg, y_pred_ridge, y_pred_lasso):
    print(r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), max_error(y_test, y_pred))

In [None]:
# Histogram of predicted / observed target for the plain linear regression.
prediction_ratio = y_pred_reg / y_test
plt.hist(prediction_ratio)

In [None]:
# One figure per feature: test targets in blue, Lasso predictions in yellow.
for feature in features:
    feature_values = X_test[feature]
    plt.scatter(feature_values, y_test, c="blue")
    # Alternative overlays, currently switched off:
    #plt.scatter(feature_values, y_pred_reg, c="red")
    #plt.scatter(feature_values, y_pred_ridge, c="orange")
    plt.scatter(feature_values, y_pred_lasso, c="yellow")
    plt.suptitle(feature)
    plt.show()


In [None]:
#"Grundstueck in qm", "Wohnflaeche in qm","ZustandN","KuecheN","Klimaanlage", "Gebaut", #"Garagenkapazitaet", "HeizungsqualitaetN", "Raeume"]


# One example listing, with the values in the order of `features`.
sample = pd.DataFrame([[3000, 180, 4, 4, 1, 2130, 2, 2, 20]], columns=features)
# Apply the already fitted expansion; refitting it on the sample would
# overwrite the fitted transformer.
y = reg.predict(poly.transform(sample))
y