# Bibliotheken importieren

In [None]:
# Data Science + Mathe Bibliotheken
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb

# Metriken
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error

# Validierung
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Modelle
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor

# Warnungen 
import warnings

# Anomalien entfernen und Daten bereinigen

In [None]:
# Load the raw data set. keep_default_na=False keeps literal "NA" strings
# (e.g. in "Garage Typ") as text instead of converting them to NaN.
housing_data = pd.read_csv('DatenAusgegeben1.2_UTF8_manuell.csv', sep=";", keep_default_na=False)

# NOTE(review): this silences every warning for the rest of the session,
# including scikit-learn convergence and deprecation warnings.
warnings.filterwarnings('ignore')

# Cleaning and feature engineering

# Price per square metre, for easier comparison between houses
housing_data["Preisproqm"] = housing_data["Preis"] / housing_data["Wohnflaeche in qm"]

# Months elapsed since January 2136
housing_data["Monate"] = (housing_data["Verkaufsjahr"] - 2136) * 12 + housing_data["Verkaufsmonat"] - 1

# Convert heating quality labels to numbers; any label not listed maps to 5
heating_levels = {"Schl": 1, "Ud": 2, "Ty": 3, "Gut": 4}
housing_data["Heizungsqualitaet"] = [heating_levels.get(label, 5) for label in housing_data["Heizungsqualitaet"]]

# Condition: map 1-10 onto 1-5 (1,2 -> 1; 3,4 -> 2; ...). Slightly less
# precise, but easier to compare with the other five-level ratings.
housing_data["Zustandf"] = housing_data["Zustand"] // 2 + housing_data["Zustand"] % 2

# Shift all ratings from 1..X to 0..X-1
housing_data["HeizungsqualitaetN"] = housing_data["Heizungsqualitaet"] - 1
housing_data["ZustandN"] = housing_data["Zustandf"] - 1
housing_data["FassadeN"] = housing_data["Zustand Fassade"] - 1
housing_data["KuecheN"] = housing_data["Kuechenqualitaet"] - 1

# Air conditioning: "Y" -> 1, everything else -> 0
housing_data["Klimaanlage"] = (housing_data["Klimaanlage"] == "Y").astype(int)

# Drop extreme outliers in price, living area and plot size.
# The thresholds are computed on the unfiltered data and the rows are removed
# in one step; previously the filtered frames were computed but never
# assigned, so nothing was dropped. Series.unique() returns values in order of
# appearance, so the thresholds are taken from max() / sorted values instead.
max_price = housing_data["Preis"].max()
max_living_area = housing_data["Wohnflaeche in qm"].max()
# NOTE(review): the original picked the second-to-last unique plot size and
# dropped only rows equal to it; this keeps that rule on sorted values
# (second-largest distinct plot size). Confirm whether the largest plot
# should be removed as well.
plot_sizes = np.sort(housing_data["Grundstueck in qm"].unique())
second_largest_plot = plot_sizes[-2:][0]

keep_rows = (
    (housing_data["Preis"] != max_price)
    & (housing_data["Wohnflaeche in qm"] != max_living_area)
    & (housing_data["Grundstueck in qm"] != second_largest_plot)
)
housing_data = housing_data[keep_rows]


# "NA" in the garage type means the house has no garage
housing_data["Garage Typ"] = housing_data["Garage Typ"].replace('NA', 'keine Garage')


print(housing_data.keys())

# Merkmale auswählen

In [None]:
# Extract the predictors (X) and the target (Y) from the data set

# Columns used as model inputs (X)
features = [
    "Grundstueck in qm",
    "Wohnflaeche in qm",
    "ZustandN",
    "KuecheN",
    "Klimaanlage",
    "Gebaut",
    "Garagenkapazitaet",
    "HeizungsqualitaetN",
    "Raeume",
]
# Column to predict: the sale price (Y)
predict = "Preis"

# Build X column by column, in the order given by `features`
X = pd.DataFrame({name: housing_data[name] for name in features})

# Y is kept as a single-column DataFrame
Y = housing_data[predict].to_frame()

# Regression

In [None]:
# K-fold cross-validation
split = 10
# shuffle=True is required for random_state to take effect; current
# scikit-learn raises a ValueError when random_state is set without it.
Kfold = KFold(n_splits=split, shuffle=True, random_state=102)
kfold = Kfold.split(X, Y)

# Bookkeeping for all folds:
#   "data"   - per fold: raw and polynomial train/test inputs plus targets
#   "scores" - per model type: list of r2 scores, one per fold
#   "models" - per model type: list of fitted estimators, one per fold
#   "best"   - per model type: index of the best fold (filled in later)
model = {"data": [], "scores": {}, "models": {}, "best": {},
         "types": ["ols", "lasso", "ridge", "gbt", "rf", "sgd"]}
for mtype in model["types"]:
    model["scores"][mtype] = []
    model["models"][mtype] = []


for k, (train, test) in enumerate(kfold):
    # One fresh estimator of every type per fold
    model["models"]["ols"].append(linear_model.LinearRegression())
    model["models"]["lasso"].append(linear_model.Lasso())
    model["models"]["ridge"].append(linear_model.Ridge())
    # Squared-error loss / criterion are the defaults; the old spellings
    # loss="ls" and criterion="mse" are rejected by current scikit-learn.
    model["models"]["gbt"].append(GradientBoostingRegressor(
        max_depth=4, n_estimators=100, learning_rate=0.15))
    model["models"]["rf"].append(RandomForestRegressor(max_depth=10, n_estimators=10))
    # NOTE(review): SGD runs on unscaled degree-3 features with eta0=0.5;
    # consider standardising the inputs if its scores look degenerate.
    model["models"]["sgd"].append(SGDRegressor(eta0=0.5))

    x_train = X.iloc[train, :]
    x_test = X.iloc[test, :]
    y_train = Y.iloc[train]
    y_test = Y.iloc[test]

    # Polynomial regression: fit the feature expansion on the training fold
    # only and merely apply it to the test fold.
    poly = PolynomialFeatures(degree=3)
    x_poly = poly.fit_transform(x_train)
    x_poly_test = poly.transform(x_test)
    model["data"].append({"xpoly": x_poly, "x_train": x_train, "y_train": y_train,
                          "x_poly_test": x_poly_test, "x_test": x_test, "y_test": y_test})

    # Estimators expect a 1-D target; Y is a single-column DataFrame.
    y_fit = y_train.values.ravel()
    for mtype in model["types"]:
        curr_model = model["models"][mtype][k]
        curr_model.fit(x_poly, y_fit)
        # r2_score takes (y_true, y_pred); the score is not symmetric.
        score = r2_score(y_test, curr_model.predict(x_poly_test))
        model["scores"][mtype].append(score)

print("Done")

# Modelle Bewerten


In [None]:
# Find the best fold per model type and print the r2 scores

for mtype in model["types"]:
    scores = model["scores"][mtype]
    # Index of the fold with the highest score (first one on ties)
    best = int(np.argmax(scores))
    model["best"][mtype] = best
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    max_score = scores[best]
    print("rs_score von %s:\t%.3f +- %.3f (%.3f max)" % (mtype, mean_score, std_score, max_score))


print(model["best"])

# One histogram per model type: ratio of predicted to actual price on the
# test fold where that model type scored best.
fig, axs = plt.subplots(1, len(model["types"]), sharex=False, sharey=True, figsize=(15,6))
for ax, mtype in zip(np.atleast_1d(axs), model["types"]):
    best_fold = model["best"][mtype]
    best_model = model["models"][mtype][best_fold]
    fold_data = model["data"][best_fold]
    x_test = fold_data["x_test"]
    y_test = fold_data["y_test"]
    # Use the polynomial test features stored during cross-validation
    # instead of re-fitting the transformer on the test data.
    pred_y = np.reshape(best_model.predict(fold_data["x_poly_test"]), (-1, 1))

    ax.hist(pred_y / y_test)
    # Label each subplot with its model type so the panels can be told apart
    ax.set_xlabel(mtype)




# Vergleich von Model und Testdaten

In [None]:

[]

In [None]:
# Compare predictions with the held-out data of a cross-validation fold.
# The model type with the highest mean r2 over all folds is used, and of its
# fitted instances the one from its best-scoring fold.
best_type = max(model["types"], key=lambda name: np.mean(model["scores"][name]))
best_fold = int(np.argmax(model["scores"][best_type]))
best_model = model["models"][best_type][best_fold]

# Test inputs and targets of that fold, as stored during cross-validation.
# The model was fitted on polynomial features, so it must be given the
# polynomial test matrix, not a single raw column.
cv_fold = model["data"][best_fold]
cv_x_test = cv_fold["x_test"]
cv_y_test = np.ravel(cv_fold["y_test"])
cv_y_pred = np.ravel(best_model.predict(cv_fold["x_poly_test"]))

# Per feature: actual prices in blue, predicted prices in red
for feature in features:
    plt.scatter(cv_x_test[feature], cv_y_test, c="blue")
    plt.scatter(cv_x_test[feature], cv_y_pred, c="red")
    plt.suptitle(feature)
    plt.show()

# Distribution of predicted / actual price
plt.hist(cv_y_pred / cv_y_test)

# Vorhersage 

In [None]:
# "Grundstueck in qm", "Wohnflaeche in qm","ZustandN","KuecheN","Klimaanlage", "Gebaut", 
# "Garagenkapazitaet", "HeizungsqualitaetN", "Raeume"]

# Choose the best regression by r2: take the model type with the highest mean
# score over all folds, then its fitted instance from the best-scoring fold.
best_type = max(model["types"], key=lambda name: np.mean(model["scores"][name]))
best_model = model["models"][best_type][int(np.argmax(model["scores"][best_type]))]

# One house to price; the values follow the column order of `features`.
sample = pd.DataFrame([[2000, 160, 4, 4, 1, 2130, 2, 4, 4]], columns=features)

# Only apply the already fitted polynomial expansion; never re-fit it on the
# sample that is being predicted.
y = best_model.predict(poly.transform(sample))

# Predictions can be 1-D or a single column depending on how the model was
# fitted, so flatten before taking the first value.
print(round(float(np.ravel(y)[0]), 2), "GC Dollar")

# Veraltete Regression

In [None]:
# Prepare the data

# Most important features:
# air conditioning, heating quality, kitchen quality, condition, facade quality

# Split into training and test data
# NOTE(review): no random_state is set, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)

# Fit the polynomial expansion on the training data only, then apply it once
# to the test data (it was previously re-fitted on the test set for every
# prediction).
poly = PolynomialFeatures(degree=3)
x_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)


print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# Regression

reg = linear_model.LinearRegression()
ridge = linear_model.Ridge(alpha=0.05)
lasso = linear_model.Lasso(alpha=0.05)

reg.fit(x_poly, y_train)
ridge.fit(x_poly, y_train)
lasso.fit(x_poly, y_train)

# Validation
y_pred_reg = reg.predict(X_test_poly)
y_pred_ridge = ridge.predict(X_test_poly)
y_pred_lasso = lasso.predict(X_test_poly)


# One line per model (ols, ridge, lasso): r2, mean squared error, max error
for y_pred in (y_pred_reg, y_pred_ridge, y_pred_lasso):
    print(r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), max_error(y_test, y_pred))

In [None]:
# Histogram of predicted / actual price for the plain linear regression
pred_to_actual_ratio = y_pred_reg / y_test
plt.hist(pred_to_actual_ratio)

In [None]:
# One scatter plot per feature: actual test prices in blue, lasso predictions
# in yellow. Swap in y_pred_reg or y_pred_ridge to inspect the other models.
for column_name, column_values in X_test[features].items():
    plt.scatter(column_values, y_test, c="blue")
    plt.scatter(column_values, y_pred_lasso, c="yellow")
    plt.suptitle(column_name)
    plt.show()


In [None]:
#"Grundstueck in qm", "Wohnflaeche in qm","ZustandN","KuecheN","Klimaanlage", "Gebaut", #"Garagenkapazitaet", "HeizungsqualitaetN", "Raeume"]


# One house to price; the values follow the column order of `features`.
sample = pd.DataFrame([[3000, 180, 4, 4, 1, 2130, 2, 2, 20]], columns=features)

# Only apply the fitted polynomial expansion; do not re-fit it on the sample.
y = reg.predict(poly.transform(sample))
y