# ML Pipeline - Model Training

In [None]:
# data wrangling
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt

# to save the model
import joblib 

# build model
from sklearn.linear_model import Lasso

# model evaluation
from sklearn.metrics import mean_squared_error, r2_score

# show every column when a DataFrame is displayed (None removes the limit)
pd.set_option('display.max_columns', None)

In [None]:
# read the train and test feature tables from disk, in that order
X_train, X_test = (pd.read_csv(path) for path in ('xtrain.csv', 'xtest.csv'))

# preview the first three training rows
X_train.head(3)

In [None]:
# read the train and test targets from disk, in that order
y_train, y_test = map(pd.read_csv, ('ytrain.csv', 'ytest.csv'))

# preview the first four training targets
y_train.head(4)

In [None]:
# load the pre-selected features saved by the feature selection notebook
features = pd.read_csv(filepath_or_buffer='selected_features.csv')

# preview the first two rows
features.head(n=2)

In [None]:
# keep only column '0' and turn it into a plain Python list;
# the list is used below to select columns of X_train / X_test
features = features.loc[:, '0'].tolist()

features

In [None]:
# restrict both splits to the selected feature columns
X_train, X_test = X_train[features], X_test[features]

### Regularised Linear Regression: Lasso

In [None]:
# fix the random seed for reproducibility
# (per the scikit-learn docs, Lasso only uses random_state when selection='random')
lin_model = Lasso(random_state=0, alpha=0.001)

# train on the selected features
lin_model.fit(X=X_train, y=y_train)

In [None]:
# evaluate the model:
# mse, rmse and r2 are reported for the train split and for the test split


def _print_scores(split, y_true, y_pred):
    """Print MSE, RMSE and R2 for one data split.

    Parameters
    ----------
    split : str
        Label prefixed to every printed metric, e.g. 'train' or 'test'.
    y_true : array-like
        Observed target values, on the scale the model was fitted on.
    y_pred : array-like
        Model predictions for the same rows, on that same scale.
    """
    # both sides are passed through np.exp before scoring; presumably the
    # target was log-transformed upstream -- TODO confirm
    actual = np.exp(y_true)
    predicted = np.exp(y_pred)

    mse = mean_squared_error(actual, predicted)

    print(f'{split} mse: {int(mse)}')

    # RMSE is taken as sqrt(MSE) rather than via `squared=False`, which is
    # deprecated and no longer accepted by newer scikit-learn releases
    print(f'{split} rmse: {int(np.sqrt(mse))}')

    print(f'{split} r2: {r2_score(actual, predicted)}')


# make prediction on train set
pred = lin_model.predict(X_train)
_print_scores('train', y_train, pred)

print('======================')

# make prediction on test set; score it against y_test, not y_train
pred = lin_model.predict(X_test)
_print_scores('test', y_test, pred)