##### **Imports**

In [18]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [19]:
all_df=pd.read_excel('./TrainDataset2023.xls', index_col=False) #Read from File
all_df.drop('ID', axis=1, inplace=True) # Drop ID - not needed for training

In [20]:
#Impute Missing Values
imputer = SimpleImputer(missing_values = 999, strategy="median") # Test Other Methods
SimpleImputer(missing_values = 999)
s = 0
for i in all_df:
    imputer.fit(all_df)
    array = np.array(all_df[i])
    all_df[i] = imputer.fit_transform(array.reshape(-1, 1)) 

In [21]:
#Min Max 
colno = 0
for col in all_df:
    if col == 'RelapseFreeSurvival (outcome)':
        continue
    if colno >= 12: # ONLY NORMALISE MRI SCAN DATA
        colmean = np.median(all_df[col])
        colstd = np.std(all_df[col])
        upper = colmean + (3*colstd)
        lower = colmean - (3*colstd) # USING MIN ALSO SEEMS TO SKEW DATA
        #Comment out to cancel
        #all_df[col] = minmax_scale(all_df[col], feature_range=(lower,upper)) #Minimal Change - Downscales severity of Mean Squared Error
    colno+=1
    
print('Data Normilisation Complete')

Data Normilisation Complete


##### **Model**

In [22]:
print('Decision Tree \n')
y = all_df['pCR (outcome)']
x = all_df.drop('pCR (outcome)', axis=1)
train_X, validate_X, train_y, validate_y = train_test_split(x, y, test_size=0.15, shuffle = False)
depth = 1
top_score = 0
best_depth = 1
train_X, validate_X, train_y, validate_y = train_test_split(x, y, test_size=0.15, shuffle = False)
while depth <= 100:
    tree_clf = DecisionTreeClassifier(max_depth=depth)
    tree_clf.fit(train_X, train_y) #Train Model
    new_score = tree_clf.score(validate_X, validate_y)
    if new_score > top_score:
        best_depth = depth
        top_score = new_score
        #print("Depth = "+str(depth)+", Score = "+str(new_score))
    depth += 1
    
mean_recall = []
mean_f1 = []
mean_ba = []

print("Best Depth = "+str(best_depth))
scaler = StandardScaler()
Xs = scaler.fit_transform(x)
#K-fold
mse_total = 0
score_total = 0
kf = KFold(n_splits=5)
tree_clf = DecisionTreeClassifier(max_depth=best_depth)
for train, test in kf.split(train_X, train_y):
    tree_clf.fit(Xs[train],y[train])

    y_pred = tree_clf.predict(Xs[test])

    print("Results for Fold:")

    # Recall
    recall = recall_score(y[test], y_pred)
    mean_recall.append(recall)
    print('Recall: %.3f' % recall)

    # F1
    f1 = f1_score(y[test], y_pred)
    mean_f1.append(f1)
    print('F1 score: %.3f' % f1)

    # Balanced Accuracy
    balanced_accuracy = balanced_accuracy_score(y[test], y_pred)
    mean_ba.append(balanced_accuracy)
    print('Balanced Accuracy: %.3f' % balanced_accuracy)

    print()

print("Mean Recall: "+str(np.mean(mean_recall)))
print("Mean F1: "+str(np.mean(mean_f1)))
print("Mean Balanced Accuracy: "+str(np.mean(mean_ba)))


Decision Tree 

Best Depth = 1
Results for Fold:
Recall: 0.143
F1 score: 0.143
Balanced Accuracy: 0.522

Results for Fold:
Recall: 0.188
F1 score: 0.222
Balanced Accuracy: 0.517

Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.500

Results for Fold:
Recall: 0.105
F1 score: 0.160
Balanced Accuracy: 0.512

Results for Fold:
Recall: 0.318
F1 score: 0.368
Balanced Accuracy: 0.561

Mean Recall: 0.15076042378673957
Mean F1: 0.1787000835421888
Mean Balanced Accuracy: 0.5224310482375571
