In [44]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, classification_report, roc_curve
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")

# Get data & split into train/test/validation sets

In [45]:
# Read the raw table and discard the area/perimeter measurement columns.
df = pd.read_csv('./Data Exploration/wdbc.csv').drop(
    columns=["Area", "AreaSE", "AreaWorst", "Perimeter", "PerimeterSE", "PerimeterWorst"]
)

# Replace the "B/M" column with the integer codes produced by a LabelEncoder.
encoder = LabelEncoder()
df["B/M"] = encoder.fit_transform(df["B/M"])

# "ID" and the target column are excluded from the model inputs.
not_data = ["ID", "B/M"]
label = df["B/M"]
feature = df.drop(columns=not_data)

# Target column first, followed by every feature column.
data = pd.concat([label, feature], axis="columns")



In [46]:
# Hold out 20% of the rows as the final test set, then carve a validation set
# (20% of what remains) out of the training portion. Both splits are stratified
# on the label so each partition keeps the same class balance.
# random_state is fixed so that "Restart & Run All" reproduces the same
# partitions (and therefore the same metrics) on every run.
x_train, x_test, y_train, y_test = train_test_split(
    feature, label, test_size=0.20, stratify=label, random_state=0
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.20, stratify=y_train, random_state=0
)

# Create & Train Initial Model (no tuning)

This is our initial model, trained before any hyperparameter tuning. We want a high recall, because recall accounts for false negatives - a higher recall means a lower proportion of them, which is what we desire. We get very high accuracy, recall and precision - which makes me suspect overfitting, especially because our sample size is so small relative to the number of features.

In [47]:
# Baseline forest: 10 trees, otherwise default hyperparameters, fixed seed.
rfc = RandomForestClassifier(n_estimators=10, random_state=0).fit(x_train, y_train)
# Predictions are made on the validation split.
y_pred = rfc.predict(x_val)

def bench_marks(y_test, y_pred):
    """Print evaluation metrics for a set of predictions.

    Prints, in order: the classification report, the confusion matrix, and
    the accuracy, recall and precision scores.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels of the split being evaluated.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_test``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Each value is computed immediately before it is printed.
    report = classification_report(y_test, y_pred)
    print(report)

    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')

    recall = recall_score(y_test, y_pred)
    print(f'Recall: {recall}')

    precision = precision_score(y_test, y_pred)
    print(f'Precision: {precision}')

# Report the baseline model's metrics on the validation split.
bench_marks(y_val, y_pred)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95        57
           1       0.97      0.85      0.91        34

    accuracy                           0.93        91
   macro avg       0.94      0.92      0.93        91
weighted avg       0.94      0.93      0.93        91

[[56  1]
 [ 5 29]]
Accuracy: 0.9340659340659341
Recall: 0.8529411764705882
Precision: 0.9666666666666667


# Feature Selection
One 'feature' of Random Forest Classification is that we can get the importance of each feature in the dataset - which can help us get rid of features with low utility and also make inferences from our results. We will remove the lowest-importance features using our importance series and rebuild our train, validation and test sets from the features that remain.

In [48]:
# Rank every feature by the importance the fitted forest assigned to it.
feature_scores = pd.Series(rfc.feature_importances_, index=x_train.columns).sort_values(ascending=False)
print(feature_scores)

# From here on feature_scores holds the column names ordered from most to
# least important; the 13 lowest-ranked columns are discarded.
feature_scores = feature_scores.index.tolist()
important_features = feature_scores[:-13]
feature = df[important_features]

# Keep the existing row partitions and only narrow their columns. Drawing a
# fresh random split here would let rows that informed the importance ranking
# (the old training rows) land in the new validation/test sets, leaking
# information into the evaluation. The y_* label splits are unchanged.
x_train = x_train[important_features]
x_val = x_val[important_features]
x_test = x_test[important_features]

print("\n")
print(important_features)

RadiusWorst              0.257859
Concavity                0.247608
ConcavePointsWorst       0.121807
Compactness              0.080661
Radius                   0.039158
CompactnessWorst         0.035117
RadiusSE                 0.032078
ConcavePoints            0.029560
FractalDimensionSE       0.026167
SymmetryWorst            0.016330
CompactnessSE            0.013389
ConcavityWorst           0.012979
Texture                  0.012741
SmoothnessWorst          0.008617
Smoothness               0.008528
SymmetrySE               0.008083
TextureWorst             0.007933
TextureSE                0.007503
FractalDimensionWorst    0.006494
ConcavePointsSE          0.006350
SmoothnessSE             0.005968
FractalDimension         0.005365
ConcavitySE              0.005307
Symmetry                 0.004400
dtype: float64


['RadiusWorst', 'Concavity', 'ConcavePointsWorst', 'Compactness', 'Radius', 'CompactnessWorst', 'RadiusSE', 'ConcavePoints', 'FractalDimensionSE', 'SymmetryWorst', 'Co

We remove the least important features and compute our metrics again. We get a higher precision when we remove the least important features - why is this happening? We also lose some accuracy. This makes me suspect overfitting.

In [49]:
# Test with the validation split, using a larger forest (100 trees).
rfc = RandomForestClassifier(n_estimators=100, random_state=0).fit(x_train, y_train)
y_pred = rfc.predict(x_val)
bench_marks(y_val, y_pred)

# Re-rank the features by importance for the newly fitted forest.
feature_scores = (
    pd.Series(rfc.feature_importances_, index=x_train.columns)
    .sort_values(ascending=False)
)
print(feature_scores)


              precision    recall  f1-score   support

           0       1.00      0.95      0.97        57
           1       0.92      1.00      0.96        34

    accuracy                           0.97        91
   macro avg       0.96      0.97      0.97        91
weighted avg       0.97      0.97      0.97        91

[[54  3]
 [ 0 34]]
Accuracy: 0.967032967032967
Recall: 1.0
Precision: 0.918918918918919
ConcavePointsWorst    0.212548
ConcavePoints         0.202547
RadiusWorst           0.201690
Radius                0.108950
Concavity             0.093138
RadiusSE              0.059088
CompactnessWorst      0.044960
Compactness           0.028964
SymmetryWorst         0.023877
FractalDimensionSE    0.012640
CompactnessSE         0.011598
dtype: float64


# Hyperparameter Tuning

We will use a random grid with a randomized search to try different configurations of the trees and brute-force our way to an optimal solution. We will repeat the code below, adjusting the parameters each iteration.

In [52]:
# Candidate values for each hyperparameter sampled by the randomized search.
n_estimators = list(range(10, 100))  # every integer from 10 to 99
# Five evenly spaced depths between 20 and 150 (truncated to int), plus None.
max_depth = [int(depth) for depth in np.linspace(20, 150, num=5)] + [None]
max_features = ['sqrt']
min_samples_split = [2, 3, 4, 5, 7, 10]
min_samples_leaf = [1, 2, 3, 4]
bootstrap = [True, False]

# Keys are the RandomForestClassifier keyword arguments being searched.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [53]:
def _print_scores(y_true, y_hat):
    """Print accuracy, recall and precision of ``y_hat`` against ``y_true``."""
    print(f'Accuracy: {accuracy_score(y_true, y_hat)}')
    print(f'Recall: {recall_score(y_true, y_hat)}')
    print(f'Precision: {precision_score(y_true, y_hat)}')


# Try 200 parameter combinations drawn from random_grid, each with 5-fold CV
# on the training split; the search itself is seeded for repeatability.
rf_random = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=random_grid,
    n_iter=200,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(x_train, y_train)

# Continue with the best estimator found by the search.
rfc = rf_random.best_estimator_

# validation set
y_pred = rfc.predict(x_val)
print(rf_random.best_estimator_)
_print_scores(y_val, y_pred)

# actual test set
y_real_pred = rfc.predict(x_test)
_print_scores(y_test, y_real_pred)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV] END bootstrap=False, max_depth=52, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=55; total time=   0.1s
[CV] END bootstrap=False, max_depth=52, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=55; total time=   0.1s
[CV] END bootstrap=True, max_depth=85, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=90; total time=   0.1s
[CV] END bootstrap=False, max_depth=117, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=75; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=44; total time=   0.1s
[CV] END bootstrap=True, max_depth=85, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=65; total time=   0.1s
[CV] END bootstrap=True, max_depth=85, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=65; t

# Test for over/ underfitting

In [None]:
# Learning curves for the current model under 5-fold CV, scored once by
# accuracy and once by recall. The "*_error" names hold recall scores.
train_sizes_abs_accuracy, train_accuracy, test_accuracy = learning_curve(
    estimator=rfc, X=x_train, y=y_train, scoring='accuracy', cv=5
)
train_sizes_abs_error, train_error, test_error = learning_curve(
    estimator=rfc, X=x_train, y=y_train, scoring='recall', cv=5
)

fig, ax = plt.subplots(1, 2, figsize=(10, 5))
accuracy_ax, recall_ax = ax

# Average the scores of each row, i.e. one mean per training-set size.
train_avg_accuracy = [np.average(scores) for scores in train_accuracy]
test_avg_accuracy = [np.average(scores) for scores in test_accuracy]
train_avg_error = [np.average(scores) for scores in train_error]
test_avg_error = [np.average(scores) for scores in test_error]

# Accuracy learning curve
accuracy_ax.plot(train_sizes_abs_accuracy, train_avg_accuracy)
accuracy_ax.plot(train_sizes_abs_accuracy, test_avg_accuracy)
accuracy_ax.legend(['Train', 'Test'])
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_xlabel('Num. samples')

# Recall learning curve
recall_ax.plot(train_sizes_abs_error, train_avg_error)
recall_ax.plot(train_sizes_abs_error, test_avg_error)
recall_ax.legend(['Train', 'Test'])
recall_ax.set_xlabel('Num. of training samples')
recall_ax.set_ylabel('Recall')
plt.show()

##### before was
100% accuracy that does not change with the number of samples is a clear sign of overfitting, so we will manually remove the least significant variables we calculated to simplify our model - so that we can make up for our lack of instances. When writing up this report, we can produce these graphs twice - once with many variables and once with few. We get a higher recall, even though we get a lower precision and accuracy, when we use fewer variables. **TODO:** when writing up, explain why for each one.

In [None]:
# After hyperparameter tuning* and feature selection*
y_real_pred = rfc.predict(x_test)

# Class-membership probabilities; column 1 is used as the score for the
# class encoded as 1 (assumes rfc.classes_ is [0, 1] - TODO confirm).
y_pred_prob = rfc.predict_proba(x_test)

# AUC must be computed from continuous scores. Feeding hard 0/1 predictions
# collapses the curve to a single threshold and reports a value that does not
# match the ROC curve plotted below.
auc = roc_auc_score(y_test, y_pred_prob[:, 1])

fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:, 1])
plt.plot(fpr, tpr, label=f'ROC (AUC = {auc:.3f})')
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.title('ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


We want to minimise false positives.

How does this graph help us? I guess I could manually calculate (using the graph) the precision, accuracy and recall stats to determine which point is best.