In [66]:
import joblib
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
df = joblib.load('./dataframes/final_all.pkl')

In [3]:
df = df[df['year']>=2010]

In [4]:
#convert year to datetime object and set as index

df['year'] = pd.to_datetime(df['year'], format='%Y')
df.set_index('year', inplace=True)

In [6]:
#drop column Bezirksname

df.drop('Bezirksname', axis=1, inplace=True)

In [7]:
#create dummies for urban_style

dummies = pd.get_dummies(df['urban_style'])
df = pd.concat([df, dummies], axis=1)
df.drop('urban_style', axis=1, inplace=True)
df.rename(columns={'Einfamilienhausgebiete': 'urban_style_family',
                    'Großsiedlungen der 60er - 80er Jahre': 'urban_style_60s',
                    'Siedlungsbau der 20er - 30er Jahre': 'urban_style_20s',
                    'Siedlungsbau der 50er Jahre': 'urban_style_50s',
                    'Siedlungsbau der 90er - 2000er Jahre': 'urban_style_90s',
                    'Verdichtete Blockrandbebauung': 'urban_style_old'
                    }, inplace=True)

In [8]:
#split in train, cv and test set

train_df = df[(df.index>='2010-01-01') & (df.index<'2017-01-01')]
val_df = df[(df.index=='2017-01-01') | (df.index=='2018-01-01')]
test_df = df[df.index=='2019-01-01']

num_features = df.shape[1]

### Functions

In [72]:
def split_dataset(df, label=None):
    X = df
    
    if label:
        X = df.drop(label, axis=1)
        y = df[label]
        
        return X, y

    return X

In [76]:
def scaling(df, train_df=df, mode='train'):
    scaler = StandardScaler()
    
    if mode=='train':
        df_scaled = scaler.fit_transform(df)
        
    else:
        scaler.fit_transform(train_df)
        df_scaled = scaler.transform(df)
        
    return df_scaled

### Predicting on sqm_price_all

In [77]:
#split datasets in features and target

X_train, y_train = split_dataset(train_df, label='sqm_price_all')
X_cv, y_cv = split_dataset(val_df, label='sqm_price_all')
X_test, y_test = split_dataset(test_df, label='sqm_price_all')

In [78]:
#scale dataframes

scaled_X_train = scaling(X_train, 'train')
scaled_X_cv = scaling(X_cv, X_train, 'cv')
scaled_X_test = scaling(X_test, X_train, 'test')

#### Linear Regression

In [79]:
reg = LinearRegression()
reg.fit(scaled_X_train, y_train)
predictions = reg.predict(scaled_X_cv)

print('Mean Squared Error: {}'.format(str(mean_squared_error(np.array(y_cv), predictions))))
print('R2 Score: {}'.format(str(r2_score(np.array(y_cv), predictions))))

Mean Squared Error: 0.030933482829500456
R2 Score: 0.9885529534608014


#### XGBOOST

In [58]:
#manual gridsearch

def grid_search_estimators(train_X, cv_X, train_y, cv_y):
    start = time.time()

    #Grid Search over n_estimators
    for i in [500, 1000]:
        
        #Initialize XGB
        reg = GradientBoostingRegressor(n_estimators=i)

        reg.fit(train_X, train_y)
        predictions = reg.predict(cv_X)

        result = 'n_estimators: {}'.format(i)
        print(result)
        print('-'*len(result))
        print('Mean Squared Error: {}'.format(str(mean_squared_error(np.array(cv_y), predictions))))
        print('R2 Score: {}'.format(str(r2_score(np.array(cv_y), predictions))))
        print('\n')

    print(f'Time: {time.time() - start}')

In [59]:
grid_search_estimators(scaled_X_train, scaled_X_cv, y_train, y_cv)

n_estimators: 500
-----------------
Mean Squared Error: 0.3378950891252702
R2 Score: 0.8749607074023064


n_estimators: 1000
------------------
Mean Squared Error: 0.33422907421428577
R2 Score: 0.8763173294008941


Time: 43.93093490600586


In [63]:
def grid_search_depth(train_X, cv_X, train_y, cv_y, n_estimators):
    start = time.time()

    for i in range(4, 10, 2):
        #Initialize XGB
        reg = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=i)

        reg.fit(train_X, train_y)
        predictions = reg.predict(cv_X)

        result = 'n_estimators: {} | max_depth: {}'.format(n_estimators, i)
        print(result)
        print('-'*len(result))
        print('Mean Squared Error: {}'.format(str(mean_squared_error(np.array(cv_y), predictions))))
        print('R2 Score: {}'.format(str(r2_score(np.array(cv_y), predictions))))
        print('\n')

    print(f'Time: {time.time() - start}')

In [64]:
grid_search_depth(scaled_X_train, scaled_X_cv, y_train, y_cv, 1000)

n_estimators: 1000 | max_depth: 4
---------------------------------
Mean Squared Error: 0.32857035906540605
R2 Score: 0.8784113572870628


n_estimators: 1000 | max_depth: 6
---------------------------------
Mean Squared Error: 0.38590473056252494
R2 Score: 0.857194566974744


n_estimators: 1000 | max_depth: 8
---------------------------------
Mean Squared Error: 0.3982570463682638
R2 Score: 0.8526235480993549


Time: 159.90856099128723


In [99]:
def xgb_gridsearch(train_X, train_y, test_X, test_y, n_splits=3, n_jobs=-1):
    '''
    Function performs GridSearch using TimeSeries CV
    X_train, y_train
    n_splits=number of splits in TimeSeriesCV; default:5
    n_jobs=default: -1
    '''
    
    model = GradientBoostingRegressor()

    tscv = TimeSeriesSplit(n_splits=n_splits)
    gsearch = GridSearchCV(estimator=model, cv=tscv,
                            param_grid=params, n_jobs=n_jobs)

    gsearch.fit(X_train, y_train)
    
    print("Best params were: {}".format(gsearch.best_params_))
    print('\n')
    
    best_model = gsearch.best_estimator_
    
    print('Mean Squared Error on cv set: {}'.format(str(mean_squared_error(np.array(test_y), best_model.predict(test_X)))))
    print('R2 Score on cv set: {}'.format(str(r2_score(np.array(test_y), best_model.predict(test_X)))))
    print('\n')
    print('Mean Squared Error on train set: {}'.format(str(mean_squared_error(np.array(train_y), best_model.predict(train_X)))))
    print('R2 Score on train set: {}'.format(str(r2_score(np.array(train_y), best_model.predict(train_X)))))
    

In [102]:
max_depth = [2, 3, 4]                   #depth of trees
n_estimators = [25, 50, 75, 100, 150]   #number of base learners

params = {
    'max_depth': max_depth,
    'n_estimators': n_estimators,
        }

In [103]:
xgb_gridsearch(scaled_X_train, y_train, scaled_X_cv, y_cv, n_splits=3, n_jobs=-1)

Best params were: {'max_depth': 4, 'n_estimators': 50}


Mean Squared Error on cv set: 20.635275080427967
R2 Score on cv set: -6.636157735512235


Mean Squared Error on train set: 7.876427299684027
R2 Score on train set: -1.921652613603852
