## P1 Random Forest

In [1]:
# Importing all the necessary packages
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Loading in our dataset
df = pd.read_csv('csv_files/p1_comp2_final_use.csv', index_col=0)
df.head()

Unnamed: 0,X1,X2,X3_Education_Bins,X4_Marital_Bins,X5_Age_Bins,X6_Payment_Bins,X7_Payment_Bins,X8_Payment_Bins,X9_Payment_Bins,X10_Payment_Bins,...,X17_abs,X17_sign,X17_monthlydif,X18_binary,X19_binary,X20_binary,X21_binary,X22_binary,X23_binary,Y
0,0.139347,2,2,1,1,2,2,1,1,2,...,0.0,0,1,0,1,0,0,0,0,1
1,0.46216,2,2,2,1,1,2,2,2,2,...,0.299742,1,1,0,1,1,1,0,1,1
2,0.394132,2,2,2,2,2,2,2,2,2,...,0.504505,1,1,1,1,1,1,1,1,0
3,0.278693,2,2,1,2,2,2,2,2,2,...,0.624888,1,1,1,1,1,1,1,1,0
4,0.278693,1,2,1,3,1,2,1,2,2,...,0.540601,1,1,1,1,1,1,1,1,0


In [3]:
# Splitting up our data into variable and target data
X = df.iloc[:, :-1] # Variable
Y = df.Y # Target

In [4]:
# import the RandomOverSampler package from imblearn 
from imblearn.over_sampling import RandomOverSampler

# define the model
ros = RandomOverSampler(random_state=2019)

# fit the training data only to the RandomOverSampler model
# this will help address the imbalanced nature of the target variable 
X_resample, Y_resample = ros.fit_resample(X, Y)

In [6]:
# tweaked code from class to build this block of code

# save features and targets as the X_resample and Y_resample variables 
features, targets = X_resample, Y_resample

# define an empty list that the Random Forest model will feed into
models = []
models.append(('Random Forest:', RandomForestClassifier(n_estimators=100, max_depth=13, max_features=8, min_samples_split=10,
                                                        random_state=2019)))


# use Cross Validation in the model with a 'stratify' option using the StratifiedKFolds package from sklearn
# specifiy that the scoring method is F1 
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv, scoring='f1')
    print("Model:{0}, F1 Score: mean={1:0.5f}, var={2:0.5f}".format(name, score.mean(), score.var()))
    

# Same as above but the scoring option has beeen changed to AUC
cv1 = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv1, scoring='roc_auc')
    print("Model:{0}, AUC Score: mean={1:0.5f}, var={2:0.5f}".format(name, score.mean(), score.var()))

Model:Random Forest:, F1 Score: mean=0.92813, var=0.00011
Model:Random Forest:, AUC Score: mean=0.97923, var=0.00010
