## 04 Model Training

#### Import relevant libraries

In [1]:
import os
import sklearn as skl
import pandas as pd
import numpy as np

#### Load model training data

In [2]:
# Directory that holds the processed data set (relative to this notebook).
input_dir = os.path.join('..', 'data', 'processed')

# Read the modelling table into a DataFrame.
model_data_path = os.path.join(input_dir, 'model_data.csv')
df = pd.read_csv(model_data_path)

In [3]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,target,x01,x02,x03,x04,x05,x06,x07,x08,c09
0,0.012967,0.342183,0.106354,27.887101,0.859689,0.290916,0.778672,0.220809,0.0,vert_int
1,0.019325,0.486981,0.428369,12.17258,0.857886,0.916615,0.0,1.0,0.0,vert_int
2,0.018841,0.616083,0.035663,17.136237,0.84722,0.024628,0.0,1.00008,10.487586,vert_int
3,0.009183,0.412378,0.244196,32.556154,0.889972,0.325085,0.963355,0.0,14.58546,vert_int
4,0.010767,0.585865,0.474441,18.140371,0.835778,0.537077,0.990224,0.009776,0.0,vert_int


#### One-hot encoding categorical variables

In [4]:
# Categorical predictors are the columns whose name begins with 'c'.
is_categorical = df.columns.str.startswith('c')
categorical_vars = df.columns[is_categorical]

# Expand them into indicator columns, dropping the first level of each variable.
c_vars = pd.get_dummies(df[categorical_vars], drop_first=True)
c_vars.head()

Unnamed: 0,c09_other_wires,c09_restructured,c09_vert_int
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


#### Scale predictor variables

In [5]:
from sklearn.preprocessing import scale

# Continuous predictors are the columns whose name begins with 'x'.
continuous_vars = df.columns[df.columns.str.startswith('x')]

# Standardise every continuous column on its own with sklearn's `scale`.
# NOTE(review): the scaling statistics are computed from every row of `df`,
# i.e. before the train/test split performed later in this notebook, so the
# held-out rows influence the scaling. Confirm whether that is intended.
x_vars = pd.DataFrame(
    {name: scale(df[name]) for name in continuous_vars},
    index=df.index,
)

x_vars.head()

Unnamed: 0,x01,x02,x03,x04,x05,x06,x07,x08
0,-1.173803,-0.247177,0.680971,-0.257555,0.449758,1.710097,0.053862,-0.710241
1,-0.509805,1.416664,-1.042475,-0.283972,2.534275,-0.481968,2.202111,-0.710241
2,0.08222,-0.612436,-0.4981,-0.440246,-0.437378,-0.481968,2.202332,0.048291
3,-0.851911,0.465048,1.193036,0.186164,0.563595,2.230005,-0.554915,0.344676
4,-0.056351,1.654716,-0.387974,-0.60791,1.269845,2.305643,-0.527961,-0.710241


#### Split into test and train sets

In [6]:
# Predictor matrix: scaled continuous columns followed by the dummy columns,
# aligned on the shared row index.
X = x_vars.join(c_vars)

# Response variable.
y = df['target']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Report the shapes of the train pair, then of the test pair.
for features, response in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, response.shape)

(92, 11) (92,)
(24, 11) (24,)


Save the train/test splits as CSV files (the following cell also stores them as NumPy `.npy` files) for use in the fit-evaluation step.

In [8]:
# Write each split to the processed-data directory, leaving out the pandas index.
csv_outputs = (
    (X_train, 'X_train.csv'),
    (X_test, 'X_test.csv'),
    (y_train, 'y_train.csv'),
    (y_test, 'y_test.csv'),
)
for split, csv_name in csv_outputs:
    split.to_csv(os.path.join(input_dir, csv_name), index=False)

In [9]:
# Store each split as a NumPy binary file as well; np.save appends the
# '.npy' extension to file names that do not already carry it.
npy_outputs = (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for base_name, split in npy_outputs:
    np.save(os.path.join(input_dir, base_name), split)

#### Fit regression model

(a) Linear regression

In [10]:
# LinearRegression comes from sklearn's linear_model module
from sklearn.linear_model import LinearRegression

# Create the regressor and fit it to the training split in one step
# (fit() returns the estimator itself)
reg_lin = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test split
predicted = reg_lin.predict(X_test)

# Show the fitted intercept followed by the coefficient vector
print(reg_lin.intercept_, reg_lin.coef_)

0.04141000609225279 [-0.00397137  0.00127563  0.00497343 -0.00051518  0.00234758 -0.00012792
  0.00325411 -0.00589277 -0.03356462 -0.02877338 -0.03316382]


(b) Random forest

In [11]:
# RandomForestRegressor comes from sklearn's ensemble module
from sklearn.ensemble import RandomForestRegressor

# Default hyperparameters with a fixed seed, fitted to the training split
# (fit() returns the estimator itself)
reg_forest = RandomForestRegressor(random_state=123).fit(X_train, y_train)

# Display the importance the fitted forest assigns to each predictor
reg_forest.feature_importances_

array([0.26880003, 0.03905482, 0.27615246, 0.20993693, 0.06494005,
       0.00528364, 0.04070483, 0.06607236, 0.01128411, 0.00944684,
       0.00832392])

(c) Neural Network

In [12]:
# MLPRegressor comes from sklearn's neural_network module
from sklearn.neural_network import MLPRegressor

# Default hyperparameters with a fixed seed, fitted to the training split
# (fit() returns the estimator itself)
reg_neural = MLPRegressor(random_state=123).fit(X_train, y_train)

# Display the fitted estimator
reg_neural

MLPRegressor(random_state=123)

#### Dump model_fit file(s)

In [13]:
# Import the dump function
from joblib import dump

In [14]:
# Save linear regression model to disk
filename = os.path.join(input_dir,'lin_model_3.joblib')
dump(reg_lin, open(filename, 'wb'))

In [15]:
# Save random forest model to disk
filename = os.path.join(input_dir,'forest_model_1.joblib')
dump(reg_forest, open(filename, 'wb'))

In [16]:
# Save neural network model to disk
filename = os.path.join(input_dir,'neural_model_1.joblib')
dump(reg_neural, open(filename, 'wb'))