# Training model

In [49]:
#Import Packages
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
import tqdm

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)


import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import seaborn as sns

df = pd.read_csv('Boliga - Final for training.csv')
df = df.drop(columns = ['Kommune'])
df_sample = df.sample(500)

In [50]:
df.shape

(61618, 36)

In [51]:
#Split into X and Y 
df_dum = pd.get_dummies(df_sample, drop_first = True, columns = ['isForeclosure', 'Type'])
X = df_dum.drop(columns = ['price'])
y = df_dum['price']

KeyError: "['Kommune'] not in index"

### Split data into Test, Development, Validation and Train

In [38]:
from sklearn.model_selection import train_test_split

X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size = 0.5, random_state = 1)

In [40]:
y_train

55519     2795000
25611     1250000
8355      2150000
44988     1000000
57635      520000
43478     3695000
52878     1945000
49577     2590000
23539     1945000
46006     1445000
36739     1895000
1001      2950000
29432     3450000
32129     1448000
5478      2995000
39312     2397000
37021     4860000
5669      3450000
14070     1875000
17137      595000
34699     1395000
43302     3295000
24310     1895000
27521     1195000
48611     3495000
5335       795000
59566     1450000
20062      149000
17310      745000
47563      895000
57659      645000
46861     1535000
1233      2195000
48169     1295000
2669      1670314
32679     1875000
25739     1595000
41042      750000
26116      695000
26711      589000
41768      845000
2687      1995000
10932     1995000
41719     1075000
948       2275000
59229     2395000
56221     1685000
48722     1650000
17049      525000
59901     1595000
57496      995000
54495      395000
31723     1095000
11503     2695000
38408     3185000
50577     

### Scale, fit and transform data

In [None]:
from sklearn.preprocessing import StandardScaler

stdscale = StandardScaler(with_mean = 1, with_std = True)

#Scale explanatory variables
X_train_s = stdscale.fit_transform(X_train)
X_test_s = stdscale.transform(X_test)


#Scale target variable (because of regression model - no need when categorical)
y_train_s = stdscale.fit_transform(y_train)
y_test_s = stdscale.transform(y_test)


### Optimize hyperparameter with K-fold CV 

In [14]:
pd.options.display.max_rows = None
pd.options.display.max_columns = None
df

Unnamed: 0,price,basementSize,buildYear,Ejerudgift,isForeclosure,lotSize,rooms,size,Type,Relativ Ledighed,Grundskole,Gymnasiale uddannelser,Erhvervsfaglige uddannelser,KVU,MVU,Bacheloruddannelser,LVU,Kommunal_gennemsnitsinkomst_2017,Total_reported,Population_in_urban_development,Socioeconomic_index,expenses_per_school_student,average_class_size,expenses_sport_and_other_cultural_activities,lake_distance,forest_distance,doctor_distance,supermarket_distance,school_distance,daycare_distance,hospital_distance,train_distance,pharmacy_distance,library_distance,coast_distance,junction_distance
0,18750000,0,1767,16872,False,0,6.0,280,Ejerlejlighed,7.926682,13.754726,7.646977,19.268325,4.928181,17.977495,3.627649,26.658495,319745,0.18805,100.0,1.08,69710,22.8,1062,1.13204,7.22733,0.11111,0.24096,0.35005,0.39428,2.67332,1056.5,0.38665,0.92771,0.41379,4.65646
1,10500000,88,1880,5702,False,0,4.0,83,Ejerlejlighed,7.926682,13.754726,7.646977,19.268325,4.928181,17.977495,3.627649,26.658495,319745,0.18805,100.0,1.08,69710,22.8,1062,1.20441,7.30106,0.2986,0.59852,0.42607,0.43139,2.95715,1411.22,0.70159,1.23705,0.03508,4.94359
2,15000000,0,1755,9486,False,0,7.0,274,Ejerlejlighed,7.926682,13.754726,7.646977,19.268325,4.928181,17.977495,3.627649,26.658495,319745,0.18805,100.0,1.08,69710,22.8,1062,1.18899,7.36108,0.26852,0.71105,0.4861,0.40131,2.92707,1381.15,0.67141,1.29708,0.07032,4.91351
3,19995000,0,1880,17609,False,0,7.0,296,Ejerlejlighed,7.926682,13.754726,7.646977,19.268325,4.928181,17.977495,3.627649,26.658495,319745,0.18805,100.0,1.08,69710,22.8,1062,1.20441,7.30106,0.2986,0.59852,0.42607,0.43139,2.95715,1411.22,0.70159,1.23705,0.03508,4.94359
4,8750000,0,1873,5590,False,0,7.0,163,Ejerlejlighed,7.926682,13.754726,7.646977,19.268325,4.928181,17.977495,3.627649,26.658495,319745,0.18805,100.0,1.08,69710,22.8,1062,1.25292,7.23167,0.39571,0.64693,0.36048,0.5284,3.05426,1508.33,0.7986,1.16766,0.12353,5.0406
5,11500000,0,1873,5739,False,0,4.0,164,Ejerlejlighed,7.926682,13.754726,7.646977,19.268325,4.928181,17.977495,3.627649,26.658495,319745,0.18805,100.0,1.08,69710,22.8,1062,1.29041,7.15658,0.40142,0.73205,0.3662,0.53411,3.05997,1514.05,0.80431,1.09268,0.14719,5.04631
6,4495000,0,1870,4461,False,0,3.0,76,Ejerlejlighed,7.926682,13.754726,7.646977,19.268325,4.928181,17.977495,3.627649,26.658495,319745,0.18805,100.0,1.08,69710,22.8,1062,1.09059,6.95677,0.62901,0.81589,0.40758,0.7618,3.11336,1567.43,0.82495,0.89286,0.19007,5.0998
7,4494000,0,1872,9790,False,0,6.0,132,Andelsbolig,7.926682,13.754726,7.646977,19.268325,4.928181,17.977495,3.627649,26.658495,319745,0.18805,100.0,1.08,69710,22.8,1062,1.0755,6.94177,0.59185,0.68329,0.27518,0.7048,3.03906,1449.73,0.73779,0.87776,0.22029,5.0254
8,9500000,0,1900,5606,False,0,4.0,122,Ejerlejlighed,7.926682,13.754726,7.646977,19.268325,4.928181,17.977495,3.627649,26.658495,319745,0.18805,100.0,1.08,69710,22.8,1062,1.09335,6.95952,0.59336,0.77569,0.45853,0.72605,3.16441,1618.48,0.8277,0.89561,0.14033,5.15075
9,5995000,0,1796,2771,False,0,3.0,125,Ejerlejlighed,7.926682,13.754726,7.646977,19.268325,4.928181,17.977495,3.627649,26.658495,319745,0.18805,100.0,1.08,69710,22.8,1062,1.21384,6.74242,0.40809,0.26543,0.57113,0.53086,2.8054,1000.5,0.26553,0.80509,0.09079,4.85385


In [47]:
#Find optimized lambda
from sklearn.model_selection import KFold

lambdas =  np.logspace(-4, 4, 12) # define lambda - devide into 12 lambdas between 10**-4 and 10**4
kfolds = KFold(n_splits=5) # define nr. of folds

#Creates lists to append values of the loop:
mse_val_avr = []
mse_test_avr = []
lambda_list = []

#Outer-loop (Lambdas)
for lambda_ in tqdm.tqdm(lambdas):
#Inner-loop (Folds)
    mse_val_ = []
    mse_test_ = []
    #Split the dataset into 5 folds, and loop through each fold. 
    # In the iteration, the fold in question acts as the training-set. 
    for train_idx,val_idx in kfolds.split(X_dev, y_dev):
    # train model and compute MSE on test fold
        pipe_Lasso_CV = make_pipeline(PolynomialFeatures(include_bias=False, degree = 3), 
                        StandardScaler(with_mean = 1, with_std = True ),
                        Lasso(alpha = lambda_, random_state=1))
        # Assign X_train & y_train by extracting elements if the folds from X_dev and y_dev
        X_train, y_train = X_dev.iloc[train_idx], y_dev.iloc[train_idx]
        # Assign X_val & y_val by extracting elements of the folds from X_dev and y_dev
        X_val, y_val = X_dev.iloc[val_idx], y_dev.iloc[val_idx] 
        # Fit Lasso-model to training data
        pipe_Lasso_CV.fit(X_train, y_train)   
        #Calculate and append the mse into mse_val list:
        mse_val_.append(mse(pipe_Lasso_CV.predict(X_val), y_val))
        ##Calculate and append the mse into mse_test list:
        mse_test_.append(mse(pipe_Lasso_CV.predict(X_test), y_test))
        
    mse_val_avr.append(sum(mse_val_)/len(mse_val_))
    mse_test_avr.append(sum(mse_val_)/len(mse_val_))

df_val = pd.DataFrame(mse_val_avr, index = lambdas)
df_test = pd.DataFrame(mse_test_avr, index = lambdas)
df_join = pd.concat([df_test, df_val], axis=1)
df_with_index = df_join.reset_index()

#Convert to list
mses = df_with_index.values.tolist()



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:10<01:57, 10.71s/it][A
 17%|█▋        | 2/12 [00:21<01:47, 10.73s/it][A
 25%|██▌       | 3/12 [00:32<01:37, 10.79s/it][A
 33%|███▎      | 4/12 [00:42<01:25, 10.71s/it][A
 42%|████▏     | 5/12 [00:53<01:15, 10.72s/it][A
 50%|█████     | 6/12 [01:04<01:04, 10.70s/it][A
 58%|█████▊    | 7/12 [01:15<00:53, 10.73s/it][A
 67%|██████▋   | 8/12 [01:47<01:08, 17.11s/it][A
 75%|███████▌  | 9/12 [02:10<00:56, 18.99s/it][A
 83%|████████▎ | 10/12 [02:29<00:38, 19.04s/it][A
 92%|█████████▏| 11/12 [02:48<00:18, 18.84s/it][A
100%|██████████| 12/12 [03:06<00:00, 18.60s/it][A

In [48]:
df_join

Unnamed: 0,0,0.1
0.0001,25425700000000.0,25425700000000.0
0.000534,25423970000000.0,25423970000000.0
0.002848,25414720000000.0,25414720000000.0
0.015199,25365420000000.0,25365420000000.0
0.081113,25101570000000.0,25101570000000.0
0.432876,23708810000000.0,23708810000000.0
2.31013,17481110000000.0,17481110000000.0
12.328467,11948110000000.0,11948110000000.0
65.793322,8290316000000.0,8290316000000.0
351.119173,5729008000000.0,5729008000000.0
