## Model Training (model v2)

#### Import relevant libraries

In [1]:
import os
import sklearn as skl
import pandas as pd
import numpy as np

#### Load model training data

In [2]:
# Relative path of the folder holding the processed inputs for model v2
input_dir = os.path.join('..', 'data', 'processed', 'model_v2')

# Read the model training table from that folder
model_data_path = os.path.join(input_dir, 'model_data_v2.csv')
df = pd.read_csv(model_data_path)

In [3]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,target,x01,x02,x03,x04,x05,x06,x07,x08,x09,c10
0,0.024761,0.277673,0.074652,76.26845,1.0,0.000764,0.780306,0.219694,0.0,0.105,vert_int
1,0.004686,0.535174,0.340114,56.872832,1.0,0.000791,0.82609,0.17391,0.0,0.105,vert_int
2,0.002276,0.590562,0.256538,39.581972,1.0,0.000789,0.847292,0.152708,0.0,0.105,vert_int
3,0.025767,0.532942,0.19032,48.594641,1.0,0.000893,0.714805,0.285195,0.0,0.105,vert_int
4,0.035918,0.544757,0.132016,63.100622,1.0,0.000833,0.788407,0.211593,0.0,0.105,vert_int


#### One-hot encoding categorical variables

In [4]:
# Categorical predictors are the columns whose name starts with 'c'
is_categorical = df.columns.str.startswith('c')
categorical_vars = df.columns[is_categorical]

# One-hot encode them, dropping the first level of each variable
c_vars = pd.get_dummies(df[categorical_vars], drop_first=True)
c_vars.head()

Unnamed: 0,c10_other_wires,c10_restructured,c10_vert_int
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


#### Scale predictor variables

In [5]:
from sklearn.preprocessing import scale

# Continuous predictors are the columns whose name starts with 'x'
is_continuous = df.columns.str.startswith('x')
continuous_vars = df.columns[is_continuous]

# Build a new frame in which every continuous column is replaced by its
# scaled version (each column is scaled on its own).
# NOTE(review): scale() is applied to all rows of df, i.e. before the
# train/test split made further down, so test rows contribute to the scaling
# statistics -- consider fitting the scaler on the training rows only.
x_vars = df[continuous_vars].assign(
    **{col_name: scale(df[col_name]) for col_name in continuous_vars}
)

x_vars.head()

Unnamed: 0,x01,x02,x03,x04,x05,x06,x07,x08,x09
0,-1.220731,-0.276929,0.220831,-0.082296,9.577562,1.744026,-0.008585,-0.720454,0.02726
1,-0.42194,0.536039,-0.164504,-0.082296,9.921034,1.873865,-0.097223,-0.720454,0.02726
2,-0.25012,0.280091,-0.508024,-0.082296,9.897009,1.933992,-0.13827,-0.720454,0.02726
3,-0.428864,0.077299,-0.328968,-0.082296,11.223231,1.558269,0.118226,-0.720454,0.02726
4,-0.392212,-0.101255,-0.040776,-0.082296,10.457989,1.766999,-0.024268,-0.720454,0.02726


#### Split into test and train sets

In [6]:
# Response variable
y = df['target']

# Predictor matrix: scaled continuous columns followed by the one-hot columns
# (both frames carry df's index, so they line up row by row)
X = pd.concat([x_vars, c_vars], axis=1)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Report the shapes of the train partition, then the test partition
for features, response in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, response.shape)

(852, 12) (852,)
(214, 12) (214,)


Save the train/test splits (as CSV and as NumPy `.npy` files) for use in the fit evaluation step

In [8]:
# Write each partition to its own CSV file in input_dir, without the index
csv_outputs = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
for csv_name, partition in csv_outputs.items():
    partition.to_csv(os.path.join(input_dir, csv_name), index=False)

In [9]:
# Also store each partition as a NumPy binary file in input_dir
# (np.save appends the '.npy' extension to these names)
npy_outputs = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for npy_name, partition in npy_outputs.items():
    np.save(os.path.join(input_dir, npy_name), partition)

#### Fit regression model

(a) Linear regression

In [10]:
import matplotlib.pyplot as plt

# Show the training targets that are missing; an empty Series means there are none
missing_target = y_train.isnull()
y_train[missing_target]

Series([], Name: target, dtype: float64)

In [11]:
# Import LinearRegression from sklearn
from sklearn.linear_model import LinearRegression

# Create the regressor and fit it to the training data in one step
# (fit returns the fitted estimator itself)
reg_lin = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test rows
predicted = reg_lin.predict(X_test)

# Show the fitted intercept followed by the coefficients
print(reg_lin.intercept_, reg_lin.coef_)

0.031210762884090144 [-0.00308202  0.00297225 -0.00712664 -0.0043552   0.00019937  0.00415474
  0.00357574 -0.00676372 -0.0007218  -0.04227457 -0.02044243 -0.01659954]


(b) Random forest

In [12]:
# Import RandomForestRegressor from sklearn
from sklearn.ensemble import RandomForestRegressor

# Create the regressor with default hyperparameters (only the seed is fixed)
# and fit it to the training data; fit returns the fitted estimator itself
reg_forest = RandomForestRegressor(random_state=123).fit(X_train, y_train)

# Display the importance assigned to each predictor column
reg_forest.feature_importances_

array([0.16289627, 0.06888482, 0.18715575, 0.14345853, 0.15467481,
       0.0667995 , 0.05931025, 0.02914767, 0.10035449, 0.01626613,
       0.00336824, 0.00768353])

(c) Neural Network

In [13]:
# Import MLPRegressor from sklearn
from sklearn.neural_network import MLPRegressor

# Create the regressor with default hyperparameters (only the seed is fixed)
reg_neural = MLPRegressor(random_state=123)

# Train on the training data; as the last expression of the cell this also
# displays the fitted estimator
reg_neural.fit(X=X_train, y=y_train)

MLPRegressor(random_state=123)

#### Dump model_fit file(s)

In [14]:
# Import the dump function
from joblib import dump

In [15]:
# Save linear regression model to disk
filename = os.path.join(input_dir, 'lin_model_v2_1.joblib')

# Open the target through a context manager so the file handle is flushed and
# closed as soon as the model has been written
with open(filename, 'wb') as model_file:
    dump(reg_lin, model_file)

In [16]:
# Save random forest model to disk
filename = os.path.join(input_dir, 'forest_model_v2_1.joblib')

# Open the target through a context manager so the file handle is flushed and
# closed as soon as the model has been written
with open(filename, 'wb') as model_file:
    dump(reg_forest, model_file)

In [17]:
# Save neural network model to disk
filename = os.path.join(input_dir, 'neural_model_v2_1.joblib')

# Open the target through a context manager so the file handle is flushed and
# closed as soon as the model has been written
with open(filename, 'wb') as model_file:
    dump(reg_neural, model_file)