## Model Training (model v01)

#### Import relevant libraries

In [1]:
import os
import sklearn as skl
import pandas as pd
import numpy as np

#### Load model training data

In [2]:
# Folder that holds model_data.csv; later cells in this notebook also write
# the train/test splits and the fitted models into this same folder
input_dir = os.path.join('..', 'data', 'processed', 'model_v1')

# Read the model training table into a DataFrame
model_data_path = os.path.join(input_dir, 'model_data.csv')
df = pd.read_csv(model_data_path)

In [3]:
# Preview the first five rows (the head() default) of the loaded table
df.head()

Unnamed: 0,target,x01,x02,x03,x04,x05,x06,x07,x08,x09,c10
0,0.019042,0.351208,0.110678,95.893125,1.0,0.475068,0.349169,0.114363,0.0,0.105,vert_int
1,0.015868,0.513455,0.476233,31.004672,1.0,0.288965,0.0,0.925652,0.0,0.1288,vert_int
2,0.022832,0.642341,0.039913,54.876484,1.000005,0.407213,0.645221,0.066756,10.487586,0.103387,vert_int
3,0.012631,0.419923,0.251899,77.037977,1.000554,0.375943,0.06144,0.057746,14.58546,0.11,vert_int
4,0.019896,0.610624,0.525933,65.810615,1.0,0.784067,0.929888,0.007142,0.0,0.099,vert_int


#### One-hot encode categorical variables

In [4]:
# Categorical columns are the ones whose name starts with 'c'
is_categorical = df.columns.str.startswith('c')
categorical_vars = df.columns[is_categorical]

# Expand them into indicator (dummy) columns; drop_first=True omits the first
# level of each variable so that level acts as the reference category
c_vars = pd.get_dummies(df[categorical_vars], drop_first=True)
c_vars.head()

Unnamed: 0,c10_other_wires,c10_restructured,c10_vert_int
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


#### Scale predictor variables

In [5]:
from sklearn.preprocessing import scale

# Continuous columns are the ones whose name starts with 'x'
is_continuous = df.columns.str.startswith('x')
continuous_vars = df.columns[is_continuous]

# Standardise every continuous column on its own with scale(), writing the
# results into a new frame so that df itself is left untouched.
# NOTE(review): the scaling statistics are computed over every row of df,
# i.e. before the train/test split performed later in this notebook, so the
# test rows influence the scaling — consider fitting on the training rows only.
x_vars = df[continuous_vars].assign(
    **{col_name: scale(df[col_name]) for col_name in continuous_vars}
)

x_vars.head()

Unnamed: 0,x01,x02,x03,x04,x05,x06,x07,x08,x09
0,-1.20826,-0.283548,0.592913,-0.178322,0.261409,-0.286949,-0.182708,-0.710241,0.02473
1,-0.490188,1.377255,-1.02104,-0.178322,-0.735858,-1.370272,3.645462,-0.710241,1.94389
2,0.080234,-0.60505,-0.427283,-0.178248,-0.102204,0.631572,-0.407347,0.048291,-0.105376
3,-0.904142,0.358051,0.123934,-0.170482,-0.269769,-1.179649,-0.449864,0.344676,0.427914
4,-0.060139,1.603053,-0.155321,-0.178322,1.917238,1.514773,-0.688644,-0.710241,-0.459092


#### Split into test and train sets

In [6]:
# Response: the 'target' column of the loaded table
y = df.loc[:, 'target']

# Design matrix: the scaled continuous columns followed by the dummy columns,
# matched on the row index (a left join, which is DataFrame.join's default)
X = x_vars.join(other=c_vars, how='left')

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the seed is fixed at 123
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Print the shapes of each (features, response) pair: training first, then test
for features, response in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, response.shape)

(92, 12) (92,)
(24, 12) (24,)


Save the train and test splits as CSV files for use in the fit evaluation step.

In [8]:
# Write each split to <name>.csv inside input_dir, leaving out the row index
csv_exports = (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for split_name, split_data in csv_exports:
    split_data.to_csv(os.path.join(input_dir, split_name + '.csv'), index=False)

In [9]:
# Also store each split as a NumPy array file in input_dir; np.save adds the
# '.npy' extension to the given path when it is missing
npy_exports = (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for array_name, array_data in npy_exports:
    np.save(os.path.join(input_dir, array_name), array_data)

#### Fit regression model

(a) Linear regression

In [10]:
from sklearn.linear_model import LinearRegression

# Create the linear regressor and fit it on the training split in one step
# (fit() returns the estimator itself)
reg_lin = LinearRegression().fit(X_train, y_train)

# Predict the response for the held-out test rows
predicted = reg_lin.predict(X_test)

# Show the fitted intercept followed by the array of fitted coefficients
print(reg_lin.intercept_, reg_lin.coef_)

0.002729871414864221 [-0.01651227  0.00059818 -0.00101134 -0.0028236  -0.00144282 -0.00148311
  0.00248967 -0.0035695  -0.00384262  0.01391957  0.02816026  0.00383816]


(b) Random forest

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters and the seed fixed at 123,
# fitted on the training split (fit() returns the estimator itself)
reg_forest = RandomForestRegressor(random_state=123).fit(X_train, y_train)

# Display the fitted feature importances as the cell output
reg_forest.feature_importances_

array([0.30062526, 0.02746705, 0.22348862, 0.10843147, 0.05933366,
       0.1028969 , 0.0864483 , 0.02783408, 0.04170127, 0.01547497,
       0.00285231, 0.00344612])

(c) Neural Network

In [12]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron regressor with default settings and the seed fixed
# at 123, fitted on the training split (fit() returns the estimator itself)
reg_neural = MLPRegressor(random_state=123).fit(X_train, y_train)

# Echo the fitted estimator as the cell output
reg_neural

MLPRegressor(random_state=123)

#### Dump model_fit file(s)

In [13]:
# Import the dump function
from joblib import dump

In [14]:
# Save linear regression model to disk
filename = os.path.join(input_dir, 'lin_model_3.joblib')

# Open the target inside a context manager so the file handle is flushed and
# closed as soon as the model has been written, instead of being left open
with open(filename, 'wb') as model_file:
    dump(reg_lin, model_file)

In [15]:
# Save random forest model to disk
filename = os.path.join(input_dir, 'forest_model_1.joblib')

# Open the target inside a context manager so the file handle is flushed and
# closed as soon as the model has been written, instead of being left open
with open(filename, 'wb') as model_file:
    dump(reg_forest, model_file)

In [16]:
# Save neural network model to disk
filename = os.path.join(input_dir, 'neural_model_1.joblib')

# Open the target inside a context manager so the file handle is flushed and
# closed as soon as the model has been written, instead of being left open
with open(filename, 'wb') as model_file:
    dump(reg_neural, model_file)