# Machine Learning with Random Forest

#### I ran this model through Amazon Sagemaker with an instance size of ml.c5d.9xlarge. This took about 9 hours to run.

In [7]:
# Some basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# For splitting our data
from sklearn.model_selection import train_test_split

# For some simple model building
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier

# This gets rid of those annoying default solver messages when fitting logistic regression
import warnings
warnings.filterwarnings('ignore')

# For cross-validation
from sklearn.model_selection import cross_val_score

# For setting up a temporary directory for caching pipeline results
from tempfile import mkdtemp

# Pipeline
from sklearn.pipeline import Pipeline

# Some scalers we'll try later
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

# For trying PCA later
from sklearn.decomposition import PCA

# For cross-validated grid search
from sklearn.model_selection import GridSearchCV

### 1. Import Data/Setup Train, Validation, Test set

In [12]:
train = pd.read_csv('model_train.csv')
test = pd.read_csv('model_test.csv')

In [13]:
# This is train datasett
X = train.drop('points', axis = 1)
y = train['points']

# This is train datasett
X_test = test.drop('points', axis = 1)
y_test = test['points']

In [14]:
# Import train_test_split package
from sklearn.model_selection import train_test_split

# Split data into train and test, where text_size is 30 percent, andsp train set is 70%
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=42, stratify = y)

### 2. Create a Baseline Model

In [15]:
# Baseline logistic regression
baseline_logreg = LogisticRegression(random_state=1).fit(X_train, y_train)

print(f'Accuracy on train set: {baseline_logreg.score(X_train, y_train)}')
print(f'Accuracy on remainder set: {baseline_logreg.score(X_val, y_val)}')
print(f'Accuracy on test set: {baseline_logreg.score(X_test, y_test)}')

Accuracy on train set: 0.827133547931224
Accuracy on remainder set: 0.8215358686890892
Accuracy on test set: 0.8222712351251539


### 3. Scale, Dimension Reduction, Hyperparameterization

In [17]:
# This packages allows us to save the model so that we can import it and use it later
#import joblib
from sklearn.externals import joblib

In [None]:
# Set up a directory to cache the pipeline results
cachedir = mkdtemp()

# Set up a pipeline
# The steps here act as placeholders and will be changed when we pass the pipeline into the grid search later
my_pipeline = Pipeline([('scaler', StandardScaler()), ('dim_reducer', PCA()), ('model', LogisticRegression())], memory=cachedir)

# n_estimators
n_estimator = [10, 100, 200, 400, 500, 600, 800, 900, 1000]

# Parameter grid
param_grid = [

    # l2 (default) with PCA
    {'scaler': [StandardScaler(), RobustScaler()],
     'dim_reducer': [PCA()],
     'dim_reducer__n_components': [10, 100, 200, 400, 500, 600, 800, 900, 1000],
     'model': [RandomForestClassifier(n_jobs = -1)],
     'model__n_estimators': n_estimator}
]

# Instantiate the log reg grid search
random_forest_gs = GridSearchCV(my_pipeline, param_grid=param_grid, cv=5, verbose=10)

# Fit the log reg grid search
fitted_randomforest_gs = random_forest_gs.fit(X_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] dim_reducer=PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__n_estimators=10, scaler=StandardScaler(copy=True, with_mean=True, with_std=True) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  dim_reducer=PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__n_estimators=10, scaler=StandardScaler(copy=True, with_mean=True, with_std=True), score=0.7277437588318417, total=  13.7s
[CV] dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.1s remaining:    0.0s


[CV]  dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__n_estimators=10, scaler=StandardScaler(copy=True, with_mean=True, with_std=True), score=0.7289000549579964, total=  11.8s
[CV] dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None,

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   28.2s remaining:    0.0s


[CV]  dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__n_estimators=10, scaler=StandardScaler(copy=True, with_mean=True, with_std=True), score=0.7261521551385727, total=  11.6s
[CV] dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None,

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   41.1s remaining:    0.0s


[CV]  dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__n_estimators=10, scaler=StandardScaler(copy=True, with_mean=True, with_std=True), score=0.72269765250844, total=  11.6s
[CV] dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, m

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   54.1s remaining:    0.0s


[CV]  dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__n_estimators=10, scaler=StandardScaler(copy=True, with_mean=True, with_std=True), score=0.7282506281407035, total=  11.2s
[CV] dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None,

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min remaining:    0.0s


[CV]  dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__n_estimators=10, scaler=RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True), score=0.739284032030146, total=  11.3s
[CV] dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, cri

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0s


[CV]  dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__n_estimators=10, scaler=RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True), score=0.7366726858757949, total=  11.3s
[CV] dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, cr

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.5min remaining:    0.0s


[CV]  dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__n_estimators=10, scaler=RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True), score=0.7395776085420429, total=  11.3s
[CV] dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, cr

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.7min remaining:    0.0s


[CV]  dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__n_estimators=10, scaler=RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True), score=0.734474366020256, total=  11.3s
[CV] dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, cri

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.0min remaining:    0.0s


[CV]  dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), model__n_estimators=10, scaler=RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True), score=0.742697864321608, total=  11.3s
[CV] dim_reducer=PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), dim_reducer__n_components=10, model=RandomForestClassifier(bootstrap=True, class_weight=None, cri

[Parallel(n_jobs=1)]: Done 810 out of 810 | elapsed: 431.1min finished


In [None]:
# Save to file in the current working directory
randomforest_ml = "randomforest_ml.pkl"
joblib.dump(fitted_randomforest_gs, randomforest_ml)

In [21]:
# View best estimator for Random Forest
# This is helpful for if I want to run the model again and didn't save the model
fitted_randomforest_gs.best_estimator_

Pipeline(memory='/tmp/tmporjlr3r4',
     steps=[('scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('dim_reducer', PCA(copy=True, iterated_power='auto', n_components=200, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('model', RandomForestClassifier(bootstrap=..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [22]:
print('Logistic Regression Accuracy Score for Train Set:', fitted_randomforest_gs.score(X_train, y_train))
print('Logistic Regression Accuracy Score for Validation Set:', fitted_randomforest_gs.score(X_val, y_val))
print('Logistic Regression Accuracy Score for Test Set:', fitted_randomforest_gs.score(X_test, y_test))

Logistic Regression Accuracy Score for Train Set: 0.9997801680144461
Logistic Regression Accuracy Score for Validation Set: 0.8124496226276837
Logistic Regression Accuracy Score for Test Set: 0.8181934755847353


In [23]:
# Save to file in the current working directory
randomforest_ml = "randomforest_ml.pkl"
joblib.dump(fitted_randomforest_gs, randomforest_ml)

['randomforest_ml.pkl']

In [26]:
## CONFUSION MATRIX
from sklearn.metrics import confusion_matrix

# Confusion Matrix
y_pred = fitted_randomforest_gs.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[17348,  3308],
       [ 3781, 14555]])

- Correctly predicted a bad score 17348
- Incorrectly predicted a bad score 3781
- Correctly predicted a good score 14555
- Incorrectly predicted a bad score 3309

In [27]:
# Accuracy Score
from sklearn.metrics import accuracy_score

# Precision Score
from sklearn.metrics import precision_score

# Recall Score
from sklearn.metrics import recall_score

# F1 Score
from sklearn.metrics import f1_score

print('Model Evaluation for Logistic Regression:')
print('Accuracy Score for Test data:', accuracy_score(y_test, y_pred))
print('Precision Score for Test data:', precision_score(y_test, y_pred))
print('Recall Score for Test data:', recall_score(y_test, y_pred))
print('F1 Score for Test data:', f1_score(y_test, y_pred))

Model Evaluation for Logistic Regression:
Accuracy Score for Test data: 0.8181934755847353
Precision Score for Test data: 0.814812741420814
Recall Score for Test data: 0.793793630017452
F1 Score for Test data: 0.8041658609353849
