## 04 Model Training

#### Import relevant libraries

In [1]:
import os
import sklearn as skl
import pandas as pd
import numpy as np

In [2]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

#### Load model training data

In [45]:
input_dir = os.path.join('..', 'data', 'processed')
df = pd.read_csv(os.path.join(input_dir, 'model_data.csv'))

In [46]:
df.head()

Unnamed: 0,respondent_id,target,x0,x1,x2,x3,x4,x5,x6,x7,x8
0,2.0,0.012967,0.268538,0.094636,27.887101,-0.244293,0,0.859689,0.801151,0.194781,0
1,3.0,0.019325,0.358297,0.122841,12.17258,0.219762,0,0.857886,0.0,1.0,0
2,6.0,0.018841,0.621016,0.00868,17.136237,-0.235897,0,0.84722,0.0,1.0,0
3,7.0,0.009183,0.274942,0.135522,32.556154,-0.086978,0,0.889972,0.853058,-0.00067,0
4,8.0,0.010767,0.52585,0.19005,18.140371,0.168089,0,0.835778,0.990068,0.009932,0


In [81]:
# Creating arrays and features for the target variable (expenditure / sales)

# 'Normalized Cost of Energy' is a placeholder for the column name with the target variable
target = 'target'

y = df[target]
x = df.drop(columns = ['target'])



# REMOVE ONCE x4, x8 IS DONE:
x = df.drop(columns = ['target','x4','x8'])
x

Unnamed: 0,respondent_id,x0,x1,x2,x3,x5,x6,x7
0,2.0,0.268538,0.094636,27.887101,-0.244293,0.859689,0.801151,0.194781
1,3.0,0.358297,0.122841,12.172580,0.219762,0.857886,0.000000,1.000000
2,6.0,0.621016,0.008680,17.136237,-0.235897,0.847220,0.000000,1.000000
3,7.0,0.274942,0.135522,32.556154,-0.086978,0.889972,0.853058,-0.000670
4,8.0,0.525850,0.190050,18.140371,0.168089,0.835778,0.990068,0.009932
...,...,...,...,...,...,...,...,...
111,281.0,0.000000,0.023523,33.495041,-0.001787,0.797537,0.000000,0.000000
112,288.0,0.496182,0.006634,18.193219,0.059214,0.886562,0.000000,0.000000
113,290.0,0.000000,0.000000,18.819757,0.000000,0.839007,0.000000,0.000000
114,403.0,0.408250,0.001236,14.202419,0.028103,0.883881,0.000000,0.000000


#### Scale predictor variables

In [82]:
# Import scale from sklearn
from sklearn.preprocessing import scale

# A scaling function that does not affect categorical variables such as respondent id,
# year, or regulation
def scale_function(X):
    x_scaled = X
    x_scaled = x_scaled.drop(columns = ['respondent_id'])
    x_scaled = scale(x_scaled)
    x_scaled['respondent_id'] = X['respondent_id']
    return x_scaled

# Tutorial recommends that we use one-hot numeric arrays, but I would
# like to avoid them as they create excessive columns

In [83]:
x = x.drop(columns = ['respondent_id'])

In [84]:
# Scale variables - use scale_function(X) if we use categorical variables
X_scaled = scale(x)

#### Split into test and train sets

In [85]:
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Create training and test sets (not quite sure how to use test_size and random_state)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=0)


In [86]:
np.save(os.path.join(input_dir, 'X_train'), X_train)
np.save(os.path.join(input_dir, 'X_test'), X_test)
np.save(os.path.join(input_dir, 'y_train'), y_train)
np.save(os.path.join(input_dir, 'y_test'), y_test)

#### Fit regression model

(a) Linear regression

In [87]:
# Import LinearRegression from sklearn
from sklearn.linear_model import LinearRegression

# Instantiate the regressor
reg_lin = LinearRegression()

# Fit the regressor to the training data
reg_lin.fit(X_train, y_train)

# Test the linear regression model
predicted = reg_lin.predict(X_test)
mae = mean_absolute_error(y_test, predicted)
print("mean absolute error:", mae)
r2 = r2_score(y_test, predicted)
print('r^2:', r2)

mean absolute error: 0.016743523368629488
r^2: -0.21507562961955906


In [88]:
print(reg_lin.intercept_, reg_lin.coef_, reg_lin.score(x, y))

0.00621143323778499 [-0.00045965  0.00992311  0.01107211 -0.00435233 -0.00182313 -0.00656704
  0.00082144] -629.7567615429899




(b) Random forest

In [12]:
# Import RandomForestRegressor from sklearn
from sklearn.ensemble import RandomForestRegressor

# Instantiate the regressor with default value for hyperparameters
reg_forest = RandomForestRegressor(random_state=None)

# Fit the regressor to the training data
reg_forest.fit(X_train, y_train)

RandomForestRegressor()

(c) Neural Network

In [13]:
# Import MLPRegressor from skleatn
from sklearn.neural_network import MLPRegressor

#Instantiate the regressor with default values
reg_neural = MLPRegressor(random_state=None)

# Fit the regressor to the training data
reg_neural.fit(X_train, y_train)

MLPRegressor()

#### Dump model_fit file(s)

In [89]:
# Import the dump function
from joblib import dump

In [90]:
# Save linear regression model to disk
filename = os.path.join(input_dir,'lin_model_2.joblib')
dump(reg_lin, open(filename, 'wb'))

In [15]:
# Save random forest model to disk
filename = os.path.join(input_dir,'forest_model_1.joblib')
dump(reg_forest, open(filename, 'wb'))

In [16]:
# Save neural network model to disk
filename = os.path.join(input_dir,'neural_model_1.joblib')
dump(reg_neural, open(filename, 'wb'))