In [7]:
import pandas as pd
import numpy as np

In [8]:
down = pd.read_csv('DownSampled_Oscars.csv')

# DownSampled Data

In [9]:
down.head()

Unnamed: 0,Drama,History,Horror,Metascore,Action,Musical,War,Thriller,Romance,Runtime,imdbRating,Documentary,Science,Fiction,France,Won
0,1,0,0,0.9,0,0,1,0,0,0.505495,0.8,0,0,0,0,1
1,1,0,0,0.311111,1,0,0,1,0,0.131868,0.433333,0,0,0,0,0
2,1,0,0,0.817,0,0,0,0,0,0.346154,0.45,0,0,0,0,1
3,1,0,0,0.492444,0,0,0,0,0,0.263736,0.533333,0,0,0,0,0
4,0,0,0,0.788889,1,0,0,0,0,0.335165,0.766667,0,0,0,1,0


In [12]:
from sklearn.model_selection import StratifiedShuffleSplit

X = down.drop('Won', axis = 1)
y = down['Won']

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

for train_index, test_index in sss.split(X, y):
     print("TRAIN:", train_index, "TEST:", test_index)
     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

TRAIN: [ 21  45 110  36 172 107  75  37  61  88 180  22   6  58 104 152  42  26
 130 189 162 101 118  85 161  52  84  32  34 146 157  16 131  39  63 159
  23  79 179 132  14  51  77 148 117 171 106  46 137 134 163 122  96  25
 166 128  81 123 181  64  78  11  89 190  50  18 105  87  33 133  41 178
 127  47  93  62 139  82  28   7 111 126  86 108   9   1 149  70  59  60
  27 138  55 191   2  49 140 183  15  54  30   5 114  72  95 129 167 113
 151  90  65 143 160  76 112 165  24 173 120 115 102  94 109  66 175 150
 164 103  80 121  53 188 174 155 135  31 170 187  20  13  71 136  44 154
 185 156   0  35  98 145  17 147  92] TEST: [ 43  73   8  67  57 177  91  38   3  69 125 158  10  68  56 153 184 119
 169  40 116  97  19  74 142  99 186 176  48 100  29 168  12 141   4  83
 144 182 124]
TRAIN: [136  71  85  10 163  22 154  75  73  72 107 151  17 113 155 148  20  84
  13 182   6 122  23 159 187  98  64  27 138 101 114 191  87   1 176  92
  70 162  34 139  76  55 170 117  38 183 171 147 100

# Random Forest Classifier

In [14]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

In [15]:
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [16]:
pred = rfc.predict(X_test)

In [17]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, pred)

0.9223684210526316

Even though this is a great score, let's see if we can make it better with hyperparameter tuning

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 

forest = RandomForestClassifier()

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

In [27]:
gridF = GridSearchCV(forest, hyperF, scoring = 'roc_auc', cv = 3, verbose = 1, n_jobs = -1)

In [29]:
bestF = gridF.fit(X_train, y_train)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


In [32]:
bestF.best_params_

{'max_depth': 30,
 'min_samples_leaf': 5,
 'min_samples_split': 5,
 'n_estimators': 100}

In [36]:
tunedrfc = RandomForestClassifier(n_estimators = 100, max_depth = 30, min_samples_leaf = 5, min_samples_split = 5)

In [37]:
tunedrfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=30, min_samples_leaf=5, min_samples_split=5)

In [38]:
pred = tunedrfc.predict(X_test)

In [39]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, pred)

0.9223684210526316