In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

# Wir laden unsere Daten
Hierfür benötigen wir ein Dataset

In [2]:
# Read the housing table and discard the columns listed below in one chained
# expression, then preview the first rows.
df = (
    pd.read_csv('house_prices.csv')
    .drop(columns=['id', 'date', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long'])
)
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1800,7503


In [3]:
# Separate the target column ('price') from the remaining feature columns.
x = df.drop('price', axis=1)
y = df['price']

# Hold out 20 % of the rows for testing. A fixed random_state makes the
# shuffle, and therefore every number computed from this split, repeatable
# across kernel restarts.
trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.2, random_state=42)

In [4]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both splits.
sc = StandardScaler()
scaler = sc.fit(trainX)

trainX_scaled, testX_scaled = (scaler.transform(split) for split in (trainX, testX))

# Lineare Regression
Für das Beispiel trainieren wir eine lineare Regression auf den Trainingsdaten und wenden sie anschließend auf die Testdaten an.

In [5]:
# Create the linear model and train it on the scaled training features.
lm = LinearRegression()
lm.fit(X=trainX_scaled, y=trainY)

In [6]:
# Predict prices for the scaled test features.
y_pred = lm.predict(X=testX_scaled)

In [7]:
# Put the true test targets next to the model's predictions; the frame keeps
# the index of testY.
df_results = testY.to_frame(name='Actual').assign(Predicted=y_pred)
df_results.head()

Unnamed: 0,Actual,Predicted
5132,507000.0,1036798.0
5824,386000.0,632363.5
6950,482000.0,513188.5
6837,299000.0,396395.2
6561,1337500.0,967735.9


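# Berechnung des MAPE
Die Berechnung machen wir zuerst manuell, um die einzelnen Schritte zu erläutern. Der große Unterschied zum MAE liegt darin, dass wir hier den absoluten Fehler durch den Betrag des tatsächlichen Werts teilen und so einen relativen (prozentualen) Fehler erhalten: $\mathrm{MAPE} = \frac{1}{n}\sum_{i=1}^{n}\frac{|y_i - \hat{y}_i|}{|y_i|}$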

In [41]:
import math

In [42]:
# Signed residual per row: true value minus predicted value.
df_results['error'] = df_results['Actual'].sub(df_results['Predicted'])
df_results.head()

Unnamed: 0,Actual,Predicted,actual_abs,error,error_abs,percentage_error,predicted_abs
5132,507000.0,1036798.0,507000.0,-529797.658953,529797.658953,51.099427,1036798.0
5824,386000.0,632363.5,386000.0,-246363.462546,246363.462546,38.959155,632363.5
6950,482000.0,513188.5,482000.0,-31188.470358,31188.470358,6.077391,513188.5
6837,299000.0,396395.2,299000.0,-97395.153708,97395.153708,24.570218,396395.2
6561,1337500.0,967735.9,1337500.0,369764.093037,369764.093037,38.209194,967735.9


In [43]:
# Magnitude of each residual, ignoring its sign.
df_results['error_abs'] = abs(df_results['error'])
df_results.head()

Unnamed: 0,Actual,Predicted,actual_abs,error,error_abs,percentage_error,predicted_abs
5132,507000.0,1036798.0,507000.0,-529797.658953,529797.658953,51.099427,1036798.0
5824,386000.0,632363.5,386000.0,-246363.462546,246363.462546,38.959155,632363.5
6950,482000.0,513188.5,482000.0,-31188.470358,31188.470358,6.077391,513188.5
6837,299000.0,396395.2,299000.0,-97395.153708,97395.153708,24.570218,396395.2
6561,1337500.0,967735.9,1337500.0,369764.093037,369764.093037,38.209194,967735.9


In [44]:
# Magnitude of each true value.
df_results['actual_abs'] = abs(df_results['Actual'])
df_results.head()

Unnamed: 0,Actual,Predicted,actual_abs,error,error_abs,percentage_error,predicted_abs
5132,507000.0,1036798.0,507000.0,-529797.658953,529797.658953,51.099427,1036798.0
5824,386000.0,632363.5,386000.0,-246363.462546,246363.462546,38.959155,632363.5
6950,482000.0,513188.5,482000.0,-31188.470358,31188.470358,6.077391,513188.5
6837,299000.0,396395.2,299000.0,-97395.153708,97395.153708,24.570218,396395.2
6561,1337500.0,967735.9,1337500.0,369764.093037,369764.093037,38.209194,967735.9


In [45]:
# Relative error per row: absolute error divided by the absolute true value.
# The result is a fraction (it is not multiplied by 100).
df_results['percentage_error'] = df_results['error_abs'].div(df_results['actual_abs'])
df_results.head()

Unnamed: 0,Actual,Predicted,actual_abs,error,error_abs,percentage_error,predicted_abs
5132,507000.0,1036798.0,507000.0,-529797.658953,529797.658953,1.044966,1036798.0
5824,386000.0,632363.5,386000.0,-246363.462546,246363.462546,0.638247,632363.5
6950,482000.0,513188.5,482000.0,-31188.470358,31188.470358,0.064706,513188.5
6837,299000.0,396395.2,299000.0,-97395.153708,97395.153708,0.325736,396395.2
6561,1337500.0,967735.9,1337500.0,369764.093037,369764.093037,0.276459,967735.9


In [46]:
# Add up the per-row relative errors (despite the variable name, this is not
# a sum of absolute errors).
sum_error_abs = df_results.loc[:, 'percentage_error'].sum()
sum_error_abs

1368.6070340847923

In [47]:
# Number of rows in the results table.
no_observations = df_results.shape[0]
no_observations

4323

In [48]:
# Mean absolute percentage error: the summed relative errors divided by the
# number of observations. The value is a fraction, not a percentage.
mape = sum_error_abs / no_observations
mape

0.31658733150238083

# In Python geht es viel schneller
Mit der sklearn-Bibliothek kann man ganz einfach die metrics importieren und damit den Mean Absolute Percentage Error berechnen. Das Ganze geht mit dem Befehl <span style="background-color:lightgrey;">metrics.mean_absolute_percentage_error(testY, y_pred)</span>

In [25]:
from sklearn import metrics

# Let sklearn compute the metric directly from the true and predicted values.
print(
    'Mean Absolute Percentage Error:',
    metrics.mean_absolute_percentage_error(y_true=testY, y_pred=y_pred),
)


Mean Absolute Percentage Error: 0.31658733150238083
