Pros:
    Very accurate and robust
    Does not suffer from overfitting
    Can do feature selection
Cons:
    Computational complexity
    Difficult to interpret?


In [25]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np



In [26]:
df = pd.read_csv('./Data Exploration/wdbc.csv')
df = df.drop(["Area", "AreaSE", "AreaWorst", "Perimeter", "PerimeterSE", "PerimeterWorst"], axis = 1)
encoder = LabelEncoder().fit(df["B/M"])
df['B/M'] = encoder.transform(df["B/M"])
not_data = ["ID", "B/M"]
label = df["B/M"]
feature = df.drop(not_data, axis = 1)
data = pd.concat([label, feature], axis=1)

No need to standardise as not sensitive to magnitude

In [27]:
x_train, x_test, y_train, y_test = train_test_split(feature, label, test_size=0.25, stratify=label)

In [28]:
# ROC ROC CURE, AUC CURVE

rfc = RandomForestClassifier(random_state=0, n_estimators=10)
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)

print(accuracy_score(y_test, y_pred))
feature_scores = pd.Series(rfc.feature_importances_, index=x_train.columns).sort_values(ascending=False)
print(feature_scores)

0.951048951048951
RadiusWorst              0.275733
Concavity                0.251079
ConcavePointsWorst       0.119291
Compactness              0.086320
ConcavityWorst           0.041250
RadiusSE                 0.038521
Radius                   0.027407
CompactnessWorst         0.026663
TextureWorst             0.022253
SymmetryWorst            0.014792
FractalDimensionSE       0.013633
Smoothness               0.011304
TextureSE                0.009483
FractalDimension         0.009468
ConcavitySE              0.008609
ConcavePoints            0.007796
Texture                  0.007223
SmoothnessWorst          0.005561
FractalDimensionWorst    0.005189
CompactnessSE            0.005063
SymmetrySE               0.004776
ConcavePointsSE          0.003966
SmoothnessSE             0.003820
Symmetry                 0.000799
dtype: float64


In [29]:


cm = confusion_matrix(y_test, y_pred)

print('Confusion matrix\n\n', cm)

print(classification_report(y_test, y_pred))

Confusion matrix

 [[89  1]
 [ 6 47]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96        90
           1       0.98      0.89      0.93        53

    accuracy                           0.95       143
   macro avg       0.96      0.94      0.95       143
weighted avg       0.95      0.95      0.95       143



In [30]:
feature_scores = feature_scores.index.tolist()
important_features = feature_scores[:-10]
print(important_features)
feature = df[important_features]
x_train, x_test, y_train, y_test = train_test_split(feature, label, test_size=0.25, stratify=label)



['RadiusWorst', 'Concavity', 'ConcavePointsWorst', 'Compactness', 'ConcavityWorst', 'RadiusSE', 'Radius', 'CompactnessWorst', 'TextureWorst', 'SymmetryWorst', 'FractalDimensionSE', 'Smoothness', 'TextureSE', 'FractalDimension']


In [40]:
rfc = RandomForestClassifier(random_state=0, n_estimators=100)
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)

print(accuracy_score(y_test, y_pred))
feature_scores = pd.Series(rfc.feature_importances_, index=x_train.columns).sort_values(ascending=False)



0.9230769230769231


In [41]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[83  7]
 [ 4 49]]


In [61]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in range(0,100,1)]
max_depth = [int(x) for x in np.linspace(10, 150, num = 11)]
max_depth.append(None)
max_features = ['sqrt']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'max_features': ['sqrt'], 'max_depth': [10, 24, 38, 52, 66, 80, 94, 108, 122, 136, 150, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [62]:
rf_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(x_train, y_train)




Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=220; total time=   0.2s
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=320; total time=   0.3s
[CV] END bootstrap=False, max_depth=110, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=370; total time=   0.4s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=290; total time=   0.3s
[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=20; total time=   0.0s
[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=

In [63]:
from sklearn import metrics

def evaluate(model, test_features, test_labels):
    y_pred = model.predict(test_features)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print (accuracy)

print(rf_random.best_estimator_)
best_random = rf_random.best_estimator_
evaluate(best_random, x_test, y_test)

RandomForestClassifier(bootstrap=False, max_depth=38, n_estimators=66,
                       random_state=0)
0.951048951048951
