In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

##Seaborn for fancy plots. 
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["figure.figsize"] = (8,8)

# 3950 Assignment 1: Part 2

For this assignment we want to use some sort of tree based model to classify the data below. We have a very small training set, so overfitting is a very real concern. 

Some specifics for this assignment:
<ul>
<li>Please paste in any outside functions you may use before submitting. E.g. if you're importing any functions from a util file, paste them in here for this. The reason for this is that it makes it massively easier for me when downloading a submission from everyone. Please put the blocks with those functions before they're called, so I can hit Run All to run the entire workbook. 
<li>
</ul>

In [108]:
df = pd.read_csv("training.csv")
df = df.drop(columns={"id"})
df["target"] = df["target"].astype("int32")
df.sample(10)

Unnamed: 0,target,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,var_200
216,1,0.417,0.362,0.084,0.182,0.281,0.244,0.266,0.4,0.582,...,0.63,0.592,0.659,0.037,0.852,0.501,0.462,0.29,0.357,0.437
53,0,0.089,0.392,0.351,0.807,0.627,0.04,0.869,0.119,0.343,...,0.774,0.484,0.508,0.412,0.458,0.981,0.616,0.539,0.977,0.979
121,0,0.536,0.905,0.252,0.787,0.642,0.129,0.196,0.353,0.003,...,0.632,0.062,0.324,0.319,0.677,0.508,0.492,0.546,0.051,0.47
52,0,0.14,0.667,0.745,0.472,0.598,0.735,0.462,0.802,0.405,...,0.435,0.729,0.588,0.225,0.362,0.74,0.43,0.329,0.411,0.766
3,0,0.681,0.245,0.909,0.785,0.738,0.57,0.692,0.411,0.182,...,0.219,0.691,0.261,0.031,0.968,0.353,0.798,0.104,0.944,0.09
173,0,0.443,0.999,0.527,0.291,0.14,0.608,0.848,0.132,0.454,...,0.882,0.236,0.031,0.512,0.155,0.904,0.813,0.054,0.117,0.509
78,1,0.223,0.513,0.924,0.788,0.092,0.925,0.876,0.618,0.79,...,0.805,0.268,0.024,0.761,0.221,0.767,0.539,0.424,0.859,0.227
210,0,0.782,0.908,0.52,0.649,0.557,0.385,0.934,0.124,0.229,...,0.697,0.212,0.334,0.402,0.58,0.839,0.872,0.778,0.735,0.779
104,1,0.706,0.219,0.331,0.979,0.274,0.117,0.093,0.584,0.038,...,0.08,0.657,0.22,0.08,0.542,0.967,0.445,0.193,0.096,0.885
81,1,0.028,0.706,0.018,0.984,0.564,0.831,0.697,0.032,0.414,...,0.085,0.511,0.551,0.914,0.061,0.85,0.977,0.557,0.624,0.935


In [109]:
#Check for missing
df.isna().sum().sort_values(ascending=False)

target     0
var_138    0
var_128    0
var_129    0
var_130    0
          ..
var_70     0
var_71     0
var_72     0
var_73     0
var_200    0
Length: 201, dtype: int64

Create a trial run to see what a default forrest looks like. 

In [110]:

y_trial = np.array(df["target"]).reshape(-1,1)
X_trial = np.array(df.drop(columns={"target"}))
X_trainT, X_testT, y_trainT, y_testT = train_test_split(X_trial, y_trial.ravel(), test_size=.3)

trial_forrest = RandomForestClassifier()
trial_pipe = [('scale', StandardScaler()),('forest', trial_forrest) ]
pipe = Pipeline(trial_pipe)
# The pipeline can be used as any other estimator
# and avoids leaking the test set into the train set
pipe.fit(X_trainT, y_trainT)
print("Score:", pipe.score(X_testT, y_testT))
trial_depths = [estimator.tree_.max_depth for estimator in trial_forrest.estimators_]
print("Avg Depth:", np.mean(trial_depths))

Score: 0.5733333333333334
Avg Depth: 8.22


Create model using grid search to tune HPs. The training set is very small, so calculation of many options should be pretty fast. 

I'm going to scale the data, but I suspect that will not be a massive impact. 

In [111]:
#Create Pipeline with Scaling. 
scaler = StandardScaler()
estimator = RandomForestClassifier(n_jobs=-1, verbose=0)
pipe = Pipeline(steps=[("scaler", scaler), ("forrest", estimator)])

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X_trial, y_trial.ravel(), test_size=.3)

rf_para = {'forrest__min_samples_split':[3,4,5,6,7,8,9,10],
#            'forrest__criterion':["gini","entropy"],
            'forrest__max_depth':[5,6,7,8,9],
            'forrest__n_estimators':[100,150,175],
            'forrest__max_samples':[.4, .5, .6, .7]}
#            'forrest_max_features':[100,120,140,160,180,199]}

#rf_para = {'forrest__max_depth':[3,4,5,6,7,8,9]}
 
clf = GridSearchCV(pipe, param_grid=rf_para, cv=10, n_jobs=-1) 
print("post grid")
clf.fit(X_train, y_train.ravel())
print("post fit")
best = clf.best_estimator_
print("post best")
print(best.score(X_test, y_test))
print("post score")

post grid
post fit
post best
0.6533333333333333
post score


In [102]:
print(best.score(X_test, y_test))

0.72


# Testing

Please leave the stuff below as-is in your file. 

This will take your best model and score it with the test data. 

In [115]:
#Load Test Data
test_df = pd.read_csv("testing.csv")
test_df["id"] = test_df["id"].astype("int32")

In [116]:
#Create tests and score
test_y = np.array(test_df["target"]).reshape(-1,1)
test_X = np.array(test_df.drop(columns={"id","target"}))
print(cross_val_score(best, test_X, test_y.ravel(), cv=5, scoring='roc_auc'))
print(best.score(test_X,test_y))

[0.88521246 0.8699046  0.86946671 0.88331467 0.86870119]
0.6247088607594937
