# Training model

In [None]:
#Import Packages
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
import tqdm

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)


import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import seaborn as sns

# Load the prepared training data and discard the 'Kommune' column,
# which is not used as a feature further down.
df = pd.read_csv('Boliga - Final for training.csv').drop(columns=['Kommune'])

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each so the resulting dummies are not perfectly collinear.
df_dum = pd.get_dummies(
    df,
    columns=['isForeclosure', 'Type'],
    drop_first=True,
)

# Target is the price column; every remaining column is a feature.
y = df_dum['price']
X = df_dum.drop(columns=['price'])

### Split data into Test, Development, Validation and Train

In [None]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of the rows as the test set; the remaining 80%
# form the development set.
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=1,
)

# Second cut: split the development set in half into training and
# validation parts (same seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev,
    test_size=0.5,
    random_state=1,
)

### Scale, fit and transform data

In [None]:
from sklearn.preprocessing import StandardScaler

# Features and target get their own scaler. Re-fitting a single scaler on
# the target would overwrite the feature statistics, making it impossible
# to transform new feature data (or invert either scaling) afterwards.
stdscale = StandardScaler(with_mean=True, with_std=True)    # feature scaler
stdscale_y = StandardScaler(with_mean=True, with_std=True)  # target scaler

# Scale explanatory variables: fit on the training data only, then apply
# the same transformation to the test data to avoid information leakage.
X_train_s = stdscale.fit_transform(X_train)
X_test_s = stdscale.transform(X_test)

# Scale target variable (because of regression model - no need when
# categorical). StandardScaler only accepts 2-D input, so the 1-D target is
# reshaped into a single column for scaling and flattened back afterwards.
y_train_s = stdscale_y.fit_transform(
    np.asarray(y_train).reshape(-1, 1)
).ravel()
y_test_s = stdscale_y.transform(
    np.asarray(y_test).reshape(-1, 1)
).ravel()


### Optimize hyperparameter with K-fold CV 

In [None]:
# Find the optimal lambda (Lasso penalty) with K-fold cross-validation on
# the development set.
from sklearn.model_selection import KFold

lambdas = np.logspace(-4, 4, 12)  # 12 log-spaced candidates from 1e-4 to 1e4
kfolds = KFold(n_splits=5)  # number of folds

# Result containers, one entry per lambda:
mse_val_avr = []   # validation MSE averaged over the folds
mse_test_avr = []  # test MSE averaged over the folds
mse_val = []       # list of per-fold validation MSEs
mse_test = []      # list of per-fold test MSEs
lambda_list = []   # never filled in this cell; kept defined in case later code references it

# Outer loop (lambdas)
for lambda_ in tqdm.tqdm(lambdas):
    mse_val_ = []
    mse_test_ = []

    # Inner loop (folds): split the development set into 5 folds. In each
    # iteration one fold acts as the validation set and the remaining
    # folds are used for training.
    for train_idx, val_idx in kfolds.split(X_dev, y_dev):
        # A fresh pipeline per fold, so the polynomial expansion and the
        # scaler are fitted on that fold's training rows only.
        pipe_Lasso_CV = make_pipeline(
            PolynomialFeatures(include_bias=False, degree=3),
            StandardScaler(with_mean=True, with_std=True),
            Lasso(alpha=lambda_, random_state=1),
        )

        # KFold yields *positional* indices. y_dev keeps the shuffled row
        # labels from train_test_split, so it must be indexed with .iloc;
        # plain y_dev[idx] would do a label lookup instead.
        # Fold-local names are used so the earlier X_train / X_val split
        # (and the scaled arrays derived from it) is not overwritten.
        X_fold_train, y_fold_train = X_dev.iloc[train_idx], y_dev.iloc[train_idx]
        X_fold_val, y_fold_val = X_dev.iloc[val_idx], y_dev.iloc[val_idx]

        # Fit the Lasso model to the fold's training data
        pipe_Lasso_CV.fit(X_fold_train, y_fold_train)

        # MSE on the held-out fold and on the test set (y_true first)
        mse_val_.append(mse(y_fold_val, pipe_Lasso_CV.predict(X_fold_val)))
        mse_test_.append(mse(y_test, pipe_Lasso_CV.predict(X_test)))

    # Aggregate the folds for this lambda; the test average is computed
    # from the test MSEs, not from the validation MSEs.
    mse_val_avr.append(np.mean(mse_val_))
    mse_test_avr.append(np.mean(mse_test_))
    mse_val.append(mse_val_)
    mse_test.append(mse_test_)

# Tabulate the averaged errors with one row per lambda (lambda as index).
df_val = pd.DataFrame(data=mse_val_avr, index=lambdas)
df_test = pd.DataFrame(data=mse_test_avr, index=lambdas)

# Place test and validation errors side by side, then move the lambda
# index into an ordinary column.
df_join = pd.concat([df_test, df_val], axis='columns')
df_with_index = df_join.reset_index()

# Convert to a list of rows: [lambda, test MSE, validation MSE]
mses = df_with_index.to_numpy().tolist()
