In [37]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


In [38]:
all_df=pd.read_excel('./TrainDataset2023.xls', index_col=False) #Read from File
all_df.drop('ID', axis=1, inplace=True) # Drop ID - not needed for training

In [39]:
#Impute Missing Values
imputer = SimpleImputer(missing_values = 999, strategy="median") # Test Other Methods
SimpleImputer(missing_values = 999)
s = 0
for i in all_df:
    imputer.fit(all_df)
    array = np.array(all_df[i])
    all_df[i] = imputer.fit_transform(array.reshape(-1, 1))  

In [40]:
#Min Max 
colno = 0
for col in all_df:
    if col == 'RelapseFreeSurvival (outcome)':
        continue
    if colno >= 12: # ONLY NORMALISE MRI SCAN DATA
        colmean = np.median(all_df[col])
        colstd = np.std(all_df[col])
        upper = colmean + (3*colstd)
        lower = colmean - (3*colstd) # USING MIN ALSO SEEMS TO SKEW DATA
        #Comment out to cancel
        #all_df[col] = minmax_scale(all_df[col], feature_range=(lower,upper)) #Minimal Change - Downscales severity of Mean Squared Error
    colno+=1

In [41]:
print('Random Forest \n')
y = all_df['pCR (outcome)']
x = all_df.drop('pCR (outcome)', axis=1)
train_X, validate_X, train_y, validate_y = train_test_split(x, y, test_size=0.15, shuffle = False)

def RFDepthSelection(trainx, trainy, testx, testy):
    feature_limit = 1
    mse_score = 0
    while feature_limit < 50:
        rfregressor = RandomForestClassifier(n_estimators=100, random_state = 0, max_features = feature_limit)
        rfregressor.fit(trainx, trainy)
        mse = rfregressor.score(testx, testy)
        if mse > mse_score:# - (mse_score/20):
            mse_score = mse
            feature_limit+=1
            continue
        else:
            return feature_limit

#Make x the validated feature
#rfregressor = RandomForestRegressor(n_estimators=100, random_state = 0, max_features = feature_limit) #Default Measure = MSE
#K-fold
featureno = RFDepthSelection(train_X, train_y, validate_X, validate_y) #Use Nested Kfold for this?
print(featureno)
rfregressor = RandomForestClassifier(n_estimators=100, random_state = 0, max_features = featureno)

scaler = StandardScaler()
Xs = scaler.fit_transform(x)

mean_recall = []
mean_f1 = []
mean_ba = []

kf = KFold(n_splits=5)
for train, test in kf.split(Xs, y):
    rfregressor.fit(Xs[train], y[train])

    y_pred = rfregressor.predict(Xs[test])

    print("Results for Fold:")

    # Recall
    recall = recall_score(y[test], y_pred)
    mean_recall.append(recall)
    print('Recall: %.3f' % recall)

    # F1
    f1 = f1_score(y[test], y_pred)
    mean_f1.append(f1)
    print('F1 score: %.3f' % f1)

    # Balanced Accuracy
    balanced_accuracy = balanced_accuracy_score(y[test], y_pred)
    print('Balanced Accuracy: %.3f' % balanced_accuracy)
    mean_ba.append(balanced_accuracy)

    print()

print("Mean Recall: %.3f" % np.mean(mean_recall))
print("Mean F1: %.3f" % np.mean(mean_f1))
print("Mean Balanced Accuracy: %.3f" % np.mean(mean_ba))

Random Forest 

2
Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.493

Results for Fold:
Recall: 0.118
F1 score: 0.211
Balanced Accuracy: 0.559

Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.491

Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.500

Results for Fold:
Recall: 0.214
F1 score: 0.316
Balanced Accuracy: 0.592

Mean Recall: 0.066
Mean F1: 0.105
Mean Balanced Accuracy: 0.527
