In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    RocCurveDisplay,
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_score,
    train_test_split,
)

In [None]:
# Read the cancer dataset and recode the 'Class' label column as an integer
# target: 'M' -> 1, 'B' -> 0.
df = pd.read_csv(
    '/content/drive/Shareddrives/CSC373_DMP_Wu_Chenyang/DMP_Classification/Data/Cancer_data.csv'
).assign(Class=lambda frame: frame['Class'].map({'M': 1, 'B': 0}))

# Preview the first 20 rows.
df.head(20)

In [None]:
# Separate the predictors from the target: every column except 'Class' is a
# feature, and 'Class' alone (kept as a one-column frame) is the label.
is_label_column = df.columns == 'Class'
f_space = df.loc[:, ~is_label_column]
f_class = df.loc[:, is_label_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
training_set, test_set, training_class, test_class = train_test_split(
    f_space,
    f_class,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Base random forest; the fixed seed makes tree construction repeatable.
rfc = RandomForestClassifier(
    random_state=42,
)

In [None]:
# Seed NumPy's global RNG before the search.
np.random.seed(42)

# Hyper-parameter grid for the forest. 'auto' is intentionally left out of
# max_features: for classifiers it was only an alias of 'sqrt' (so it doubled
# the work for that setting) and newer scikit-learn releases reject it.
params = {
    'max_depth': [2, 3, 4],
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['gini', 'entropy'],
}

# Exhaustive search over the grid with 10-fold cross-validation on 3 workers.
gsearch = GridSearchCV(rfc, cv=10, param_grid=params, n_jobs=3)

# Pass the labels as a flat 1-D array: a single-column DataFrame makes
# scikit-learn emit a DataConversionWarning on every fit.
gsearch.fit(training_set, np.ravel(training_class))

print("Best parameters: ", gsearch.best_params_)

In [None]:
# Fix the forest's hyper-parameters and switch on warm starts plus
# out-of-bag (OOB) scoring so the forest can be grown step by step.
# NOTE(review): these values are hard-coded rather than read from
# gsearch.best_params_ -- confirm they still match the search result.
rfc.set_params(criterion='entropy', max_features='log2', max_depth=4,
               warm_start=True, oob_score=True)

# Range of forest sizes to evaluate (inclusive on both ends).
min_est = 15
max_est = 800

# Flatten the one-column label frame once, outside the loop; passing the
# column vector directly would raise a DataConversionWarning on every fit.
oob_labels = np.ravel(training_class)

# Maps forest size -> OOB error rate (1 - OOB score) at that size.
error_rate = {}

for n_trees in range(min_est, max_est + 1):
    rfc.set_params(n_estimators=n_trees)
    rfc.fit(training_set, oob_labels)
    error_rate[n_trees] = 1 - rfc.oob_score_



In [None]:
# OOB error indexed by forest size, taken from the sweep results.
oob_series = pd.Series(error_rate)

# `fig_ax` is kept as the (figure, axes) tuple the cell originally exposed.
fig, ax = fig_ax = plt.subplots()
oob_series.plot(kind='line', color='red', ax=ax)

# Dashed reference lines at 3.5% and 4.5% OOB error.
# NOTE(review): the two levels look hand-picked from the curve -- confirm.
for reference_level in (0.035, 0.045):
    ax.axhline(reference_level, color='purple', linestyle='--')

# Take the tree range in the title from the plotted data so it cannot drift
# from the sweep bounds (the old title said 500 while the sweep ran further).
ax.set(
    xlabel='n_estimators',
    ylabel='OOB Error Rate',
    title=(
        'OOB Error Rate Across various Forest sizes \n'
        f'(From {oob_series.index.min()} to {oob_series.index.max()} trees)'
    ),
)
plt.show()

In [None]:
# Final model: a fresh 130-tree forest (warm_start off, so it is rebuilt
# rather than grown from the previous fit).
# oob_score stays enabled so that rfc.oob_score_ is recomputed for THIS
# forest; with it disabled the attribute would keep the stale value left
# behind by an earlier fit and any later report of it would be misleading.
rfc.set_params(n_estimators=130,
               bootstrap=True,
               warm_start=False,
               oob_score=True)

# Labels are flattened to 1-D to avoid scikit-learn's DataConversionWarning.
rfc.fit(training_set, np.ravel(training_class))


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, plot_roc_curve, plot_confusion_matrix

predicted = rfc.predict(test_set)
accuracy = accuracy_score(test_class, predicted)

print(f'Out-of-bag score estimate: {rfc.oob_score_:.3}')
print(f'Mean accuracy score: {accuracy:.3}')

In [None]:
cm = pd.DataFrame(confusion_matrix(test_class, predicted))
sns.heatmap(cm, annot=True)

In [None]:
rfc_disp = plot_roc_curve(rfc,test_set,test_class)
plt.show()