## 04 Model Training

#### Import relevant libraries

In [26]:
import os
import sklearn as skl
import pandas as pd
import numpy as np
import pickle

#### Load model training data

In [2]:
input_dir = os.path.join('..', 'data', 'processed')
df = pd.read_csv(os.path.join(input_dir, 'model_data.csv'))

In [4]:
df.head()

Unnamed: 0,respondent_id,target,x0,x1,x2
0,2.0,0.012967,3.744372,26.887101,0.002868
1,3.0,0.019325,2.180698,11.17258,0.0
2,6.0,0.018841,5.320943,16.136237,0.0
3,7.0,0.009183,4.47552,31.556154,0.080693
4,8.0,0.010767,4.769553,17.140371,0.0


In [5]:
# Creating arrays and features for the target variable (expenditure / sales)

# 'Normalized Cost of Energy' is a placeholder for the column name with the target variable
target = 'target'

y = df[target]
x = df.drop(columns = [target])

#### Scale predictor variables

In [6]:
# Import scale from sklearn
from sklearn.preprocessing import scale

# A scaling function that does not affect categorical variables such as respondent id,
# year, or regulation
def scale_function(X):
    x_scaled = X
    x_scaled = x_scaled.drop(columns = ['respondent_id'])
    x_scaled = scale(x_scaled)
    x_scaled['respondent_id'] = X['respondent_id']
    return x_scaled

# Tutorial recommends that we use one-hot numeric arrays, but I would
# like to avoid them as they create excessive columns

In [14]:
x = x.drop(columns = ['respondent_id'])

In [15]:
# Scale variables - use scale_function(X) if we use categorical variables
X_scaled = scale(x)

#### Split into test and train sets

In [16]:
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Create training and test sets (not quite sure how to use test_size and random_state)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=None, random_state=None)


#### Fit regression model

(a) Linear regression

In [23]:
# Import LinearRegression from sklearn
from sklearn.linear_model import LinearRegression

# Instantiate the regressor
reg_lin = LinearRegression()

# Fit the regressor to the training data
reg_lin.fit(X_train, y_train)

# Test the linear regression model
predicted = reg_lin.predict(X_test)


In [24]:
print(reg_lin.intercept_, reg_lin.coef_, reg_lin.score(x, y))

0.009975381422283205 [-0.03913896  0.05021351 -0.00188548] -4123.3916482001005




(b) Random forest

In [10]:
# Import RandomForestRegressor from sklearn
from sklearn.ensemble import RandomForestRegressor

# Instantiate the regressor with default value for hyperparameters
reg_forest = RandomForestRegressor(random_state=None)

# Fit the regressor to the training data
reg_forest.fit(X_train, y_train)

NameError: name 'X_train' is not defined

(c) Neural Network

In [None]:
# Import MLPRegressor from skleatn
from sklearn.neural_network import MLPRegressor

#Instantiate the regressor with default values
reg_neural = MLPRegressor(random_state=None)

# Fit the regressor to the training data
reg_neural.fit(X_train, y_train)

#### Pickle model_fit file(s)

In [28]:
# Save linear regression model to disk
filename = os.path.join(input_dir,'lin_model_1.pkl')
pickle.dump(reg_lin, open(filename, 'wb'))

In [None]:
# Save random forest model to disk
filename = os.path.join(input_dir,'forest_model_1.pkl')
pickle.dump(reg_forest, open(filename, 'wb'))

In [None]:
# Save neural network model to disk
filename = os.path.join(input_dir,'neural_model_1.pkl')
pickle.dump(reg_neural, open(filename, 'wb'))