## 04 Model Training (v2)

#### Import relevant libraries

In [1]:
import os
import sklearn as skl
import pandas as pd
import numpy as np

#### Load model training data

In [2]:
# Folder that holds the processed inputs and outputs for model v2
model_v2_dir = os.path.join('..', 'data', 'processed', 'model_v2')
# Read the model training table stored in that folder
data_path = os.path.join(model_v2_dir, 'model_2_data.csv')
df = pd.read_csv(data_path)

In [3]:
# Preview the first five rows of the loaded table
df.head(n=5)

Unnamed: 0,target,x01,x02,x03,x04,x05,x06,x07,x08,c09
0,0.024761,0.277673,0.074652,76.26845,1.0,0.234956,0.780306,0.219694,0.0,vert_int
1,0.004686,0.535174,0.340114,56.872832,1.0,0.258488,0.82609,0.17391,0.0,vert_int
2,0.002276,0.590562,0.256538,39.581972,1.0,0.283575,0.847292,0.152708,0.0,vert_int
3,0.025767,0.532942,0.19032,48.594641,1.0,0.297706,0.714805,0.285195,0.0,vert_int
4,0.035918,0.544757,0.132016,63.100622,1.0,0.284825,0.788407,0.211593,0.0,vert_int


#### One-hot encoding categorical variables

In [4]:
# Categorical predictors are the columns whose name begins with 'c'
is_categorical = df.columns.str.startswith('c')
categorical_vars = df.columns[is_categorical]
# Expand them into indicator columns; drop_first=True drops the first level of
# each variable so that it acts as the baseline category
c_vars = pd.get_dummies(df[categorical_vars], drop_first=True)
c_vars.head()

Unnamed: 0,c09_other_wires,c09_restructured,c09_vert_int
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


#### Scale predictor variables

In [5]:
from sklearn.preprocessing import scale

# Continuous predictors are the columns whose name begins with 'x'
continuous_vars = df.columns[df.columns.str.startswith('x')]

# Standardise each continuous column on its own and collect the results in a
# new frame, leaving `df` untouched.
# NOTE(review): the scaling statistics are computed on the full dataset here,
# before the train/test split further down, so test rows influence how the
# training rows are scaled. Consider fitting the scaler on the training split only.
x_vars = df[continuous_vars].assign(
    **{col_name: scale(df[col_name]) for col_name in continuous_vars}
)

x_vars.head()

Unnamed: 0,x01,x02,x03,x04,x05,x06,x07,x08
0,-1.223755,-0.275048,0.225081,-0.081346,0.04923,1.749408,-0.006572,-0.717901
1,-0.42377,0.539339,-0.160312,-0.081346,0.059663,1.879485,-0.095378,-0.717901
2,-0.251693,0.282944,-0.503884,-0.081346,0.070785,1.939722,-0.136504,-0.717901
3,-0.430705,0.079798,-0.324801,-0.081346,0.077051,1.563311,0.120481,-0.717901
4,-0.393997,-0.099067,-0.036565,-0.081346,0.07134,1.772423,-0.022285,-0.717901


#### Split into test and train sets

In [6]:
# Predictor matrix: scaled continuous variables alongside the one-hot dummies
# (the join aligns the two frames on their row index)
X = x_vars.join(c_vars, how='left')
# Response variable
y = df['target']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Report the dimensions of the training and test partitions
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(856, 11) (856,)
(215, 11) (215,)


Save the train/test splits as CSV for use in the fit evaluation step

In [8]:
# Write every split to its own CSV file in the model v2 folder, without the row index
csv_outputs = [
    (X_train, 'X_train.csv'),
    (X_test, 'X_test.csv'),
    (y_train, 'y_train.csv'),
    (y_test, 'y_test.csv'),
]
for split, csv_name in csv_outputs:
    split.to_csv(os.path.join(model_v2_dir, csv_name), index=False)

In [9]:
# Store the same splits as NumPy binary files; np.save appends the '.npy'
# extension to each file name
npy_outputs = [
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
]
for split_name, split_data in npy_outputs:
    np.save(os.path.join(model_v2_dir, split_name), split_data)

#### Fit regression model

(a) Linear regression

In [10]:
# LinearRegression lives in sklearn's linear_model module
from sklearn.linear_model import LinearRegression

# Create the regressor and train it on the training split in one step
# (fit returns the fitted estimator itself)
reg_lin = LinearRegression().fit(X_train, y_train)

# Predict the held-out test rows with the fitted model
predicted = reg_lin.predict(X_test)

# Show the fitted intercept followed by one coefficient per column of X_train
print(reg_lin.intercept_, reg_lin.coef_)

-0.19965944134703173 [ 0.06999916  0.05312202 -0.45028993 -0.39870787  0.01638194  0.02441756
 -0.03542413 -0.14889198  1.80128843  0.78160843  0.2743148 ]


(b) Random forest

In [11]:
# RandomForestRegressor lives in sklearn's ensemble module
from sklearn.ensemble import RandomForestRegressor

# Create the forest with default hyperparameters (only the seed is fixed, for
# reproducibility) and train it on the training split in one step
reg_forest = RandomForestRegressor(random_state=123).fit(X_train, y_train)

# Display the importance of each feature, in the column order of X_train
reg_forest.feature_importances_

array([0.19530254, 0.02053476, 0.53014203, 0.15175003, 0.0348578 ,
       0.01129251, 0.01076165, 0.02652324, 0.01199936, 0.00225218,
       0.00458389])

(c) Neural Network

In [12]:
# MLPRegressor lives in sklearn's neural_network module
from sklearn.neural_network import MLPRegressor

# Create the network with default hyperparameters; only the seed is fixed
reg_neural = MLPRegressor(random_state=123)

# Train on the training split; as the last expression of the cell, the fitted
# estimator is also displayed
reg_neural.fit(X=X_train, y=y_train)



MLPRegressor(random_state=123)

#### Dump model_fit file(s)

In [13]:
# Import the dump function
from joblib import dump

In [14]:
# Save linear regression model to disk.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the model has been written, instead of being left open.
filename = os.path.join(model_v2_dir,'lin_model_v2_1.joblib')
with open(filename, 'wb') as model_file:
    dump(reg_lin, model_file)

In [15]:
# Save random forest model to disk.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the model has been written, instead of being left open.
filename = os.path.join(model_v2_dir,'forest_model_v2_1.joblib')
with open(filename, 'wb') as model_file:
    dump(reg_forest, model_file)

In [16]:
# Save neural network model to disk.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the model has been written, instead of being left open.
filename = os.path.join(model_v2_dir,'neural_model_v2_1.joblib')
with open(filename, 'wb') as model_file:
    dump(reg_neural, model_file)