In [140]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector

In [141]:
np.random.seed(101)

all_df=pd.read_excel('./TrainDataset2023.xls', index_col=False) #Read from File
all_df.drop('ID', axis=1, inplace=True) # Drop ID - not needed for training

In [142]:
#Impute Missing Values
imputer = SimpleImputer(missing_values = 999, strategy="median") # Test Other Methods
SimpleImputer(missing_values = 999)
s = 0
for i in all_df:
    imputer.fit(all_df)
    array = np.array(all_df[i])
    all_df[i] = imputer.fit_transform(array.reshape(-1, 1)) 

In [143]:
#Min Max 
colno = 0
for col in all_df:
    if col == 'RelapseFreeSurvival (outcome)':
        continue
    if colno >= 12: # ONLY NORMALISE MRI SCAN DATA
        colmean = np.median(all_df[col])
        colstd = np.std(all_df[col])
        upper = colmean + (3*colstd)
        lower = colmean - (3*colstd) # USING MIN ALSO SEEMS TO SKEW DATA
        #Comment out to cancel
        #all_df[col] = minmax_scale(all_df[col], feature_range=(lower,upper)) #Minimal Change - Downscales severity of Mean Squared Error
    colno+=1
    
print('Data Normilisation Complete')

Data Normilisation Complete


In [144]:
print('Logistic Regression FFS \n')
y = all_df['pCR (outcome)']
x = all_df.drop('pCR (outcome)', axis=1)
train_X, validate_X, train_y, validate_y = train_test_split(x, y, test_size=0.15, shuffle = False)
lin_reg = LogisticRegression()


def FFS(trainx, trainy, validationx, validationy, maxscore):
    F = trainx.columns.tolist()
    #F.remove('RelapseFreeSurvival (outcome)')
    X = []
    B = maxscore
    while X != F:
        Y = []
        remfeatures = list(set(F)-set(Y))
        for i in remfeatures:
            temp = Y+[i]
            lin_reg.fit(trainx[temp],trainy)
            lin_reg.intercept_,lin_reg.coef_
            score = lin_reg.score(validationx[temp], validationy)
            if score > B:#-(B/40): #Fine Tune the Optimal Increase for a new feature to be worthwhile
                B = score
                Y = temp
                #print("Features Updated: "+str(Y))
                #print("Best Score Updated: "+str(B))
        if X != [] and lin_reg.score(validationx, validationy) > B:
            #print("Best X Features = "+str(X))
            #print("Best Features Accuracy = "+str(B))
            break
        else:
            X = Y
            break
    print("Best Features = "+str(X))
    print('Number of Features Used = '+str(len(X)))
    #y_pred = lin_reg.predict(validationx[X])
    #print("Feature Mean Squared Error = "+str(mean_squared_error(validationy, y_pred)))
    print("")
    return X

X = FFS(train_X, train_y, validate_X, validate_y, 0) #Test FFS
scaler = StandardScaler()
Xs = scaler.fit_transform(train_X[X]) # Change when adding validation set
mse_total = 0
mse2_total = 0
score_total = 0
kf = KFold(n_splits=5)
for train, test in kf.split(Xs, train_y):
    lin_reg.fit(Xs[train],y[train])
    lin_reg.intercept_,lin_reg.coef_

    y_pred = lin_reg.predict(Xs[test])

    print("Results for Fold:")

    # Recall
    recall = recall_score(y[test], y_pred, average='binary')
    print('Recall: %.3f' % recall)

    # F1
    f1 = f1_score(y[test], y_pred, average='binary')
    print('F1 score: %.3f' % f1)

    # Balanced Accuracy
    balanced_accuracy = balanced_accuracy_score(y[test], y_pred) 
    print('Balanced Accuracy: %.3f' % balanced_accuracy)

    print()



Logistic Regression FFS 

Best Features = ['original_shape_Maximum2DDiameterSlice']
Number of Features Used = 1

Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.500

Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.500

Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.500

Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.500

Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.500



In [145]:
print('Logistic Regression SFS \n')
y = all_df['pCR (outcome)']
x = all_df.drop('pCR (outcome)', axis=1)
train_X, validate_X, train_y, validate_y = train_test_split(x, y, test_size=0.15, shuffle = False)
lin_reg = LogisticRegression()

#Create loop to determine best no of features
featureno = 1
score = 0
y = all_df['pCR (outcome)']
x = all_df.drop('pCR (outcome)', axis=1)
while featureno < 50:
    featuretest = []
    sfs = SequentialFeatureSelector(lin_reg, n_features_to_select=featureno)
    sfs.fit(x,y)
    feats = sfs.get_feature_names_out() 
    for i in feats:
        featuretest.append(i)
    x2 = all_df[featuretest] 
    lin_reg.fit(x2,y)
    lin_reg.intercept_,lin_reg.coef_
    acc = lin_reg.score(x2, y)
    if acc > score:#-(score/40):
        featureno+=1
        score = acc
        continue
    else:
        break

print("SFS Best No. of Features = "+str(featureno))

sfs = SequentialFeatureSelector(lin_reg, n_features_to_select=featureno)
sfs.fit(x,y)
feats = sfs.get_feature_names_out() 
print(feats)
feature = []
for i in feats:
    feature.append(i)

scaler = StandardScaler()
x = train_X[feature]
Xs = scaler.fit_transform(x) # Change when adding validation set
mse_total = 0
mse2_total = 0
score_total = 0
kf = KFold(n_splits=5)
for train, test in kf.split(Xs, train_y):
    lin_reg.fit(Xs[train],y[train])
    lin_reg.intercept_,lin_reg.coef_

    y_pred = lin_reg.predict(Xs[test])

    print("Results for Fold:")
    # Recall
    recall = recall_score(y[test], y_pred, average='binary')
    print('Recall: %.3f' % recall)

    # F1
    f1 = f1_score(y[test], y_pred, average='binary')
    print('F1 score: %.3f' % f1)

    # Balanced Accuracy
    balanced_accuracy = balanced_accuracy_score(y[test], y_pred)
    print('Balanced Accuracy: %.3f' % balanced_accuracy)

    print()

Logistic Regression SFS 

SFS Best No. of Features = 2
['original_glrlm_RunVariance' 'original_ngtdm_Busyness']
Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.500

Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.500

Results for Fold:
Recall: 0.067
F1 score: 0.125
Balanced Accuracy: 0.533

Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.500

Results for Fold:
Recall: 0.000
F1 score: 0.000
Balanced Accuracy: 0.500

