In [2]:
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
import random
import seaborn as sns
import scipy
from scipy.stats import pearsonr
import sklearn
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import math
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.preprocessing import minmax_scale
from numpy import std, mean
import statistics
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector

#Import File Containing Train/Test Dataset
global all_df
all_df=pd.read_csv('TrainDataset2023.csv', index_col=False) #Read from File
all_df.drop('ID', axis=1, inplace=True) # Drop ID - not needed for training
all_df.drop('pCR (outcome)', axis=1, inplace=True) # Drop PCR for Export Purposes - Not Needed

#Impute Missing Values
imputer = SimpleImputer(missing_values = 999, strategy="median") 
SimpleImputer(missing_values = 999)
for i in all_df:
    imputer.fit(all_df)
    array = np.array(all_df[i])
    all_df[i] = imputer.fit_transform(array.reshape(-1, 1))       
    
#Min Max Normilisation Using Standard Deviation
colno = 0
for col in all_df:
    if colno >= 11: # ONLY NORMALISE MRI SCAN DATA - Clinical data is not impacted by outliers
        colmed = np.median(all_df[col])
        colstd = np.std(all_df[col])
        upper = colmed + (3*colstd)
        lower = colmed - (3*colstd) 
        all_df[col] = minmax_scale(all_df[col], feature_range=(lower,upper)) 
    colno+=1

print('Data Normilisation Complete')

Data Normilisation Complete


In [6]:
#Perform K-Fold (5) cross validation of training set with current hyperparameter permutations
def kfold(trainx, trainy, iteration, activate, solve, learn, layer):
    scaler = StandardScaler()
    Xs = scaler.fit_transform(trainx)
    #hyperparameterval = hyperparameter(train_X, train_y, validate_X, validate_y) 
    #K-fold
    mae_total = 0
    mae2_total = 0
    kf = KFold(n_splits=5)
    for train, test in kf.split(Xs, trainy):
        mlp_clf = MLPRegressor(random_state=1, max_iter=iteration,
                              activation = activate, solver = solve, learning_rate = learn,
                              hidden_layer_sizes = layer).fit(Xs[train], y[train])
        y_pred = mlp_clf.predict(Xs[test])
        mae = mean_absolute_error(y[test], y_pred)
        y_pred2 = mlp_clf.predict(Xs[train])
        mae2 = mean_absolute_error(y[train], y_pred2)
        mae_total += mae
        mae2_total += mae2
    return [mae_total/5, mae2_total/5]

#Iterate through possible hidden layer values to find the best value
def hiddenlayervalidation(trainx, trainy, validatex, validatey, iterations, activate, solve, learnrate):
    currenthiddenlayer = 5
    besthiddenlayer = 5
    best_MAE = 1000
    best_training_MAE = 1000
    counter = 0
    while currenthiddenlayer <= 140:
        mlp_clf = MLPRegressor(random_state=1, max_iter=iterations,
                              activation = activate, solver = solve, learning_rate = learnrate,
                              hidden_layer_sizes = currenthiddenlayer).fit(trainx, trainy)
        y_pred = mlp_clf.predict(validatex)
        mae =  mean_absolute_error(validatey, y_pred)
        if mae < best_MAE:
            best_MAE = mae
            besthiddenlayer = currenthiddenlayer
        currenthiddenlayer+=5
    return [besthiddenlayer, best_MAE]

#Iterate through possible max iterations to find the best value
def iterationvalidation(trainx, trainy, validatex, validatey, itermultiplier, itermax, activate, solve, learnrate):
    currentiter = itermultiplier
    bestiter = 1
    besthiddenlayer = 5
    best_MAE = 1000
    best_training_MAE = 1000
    counter = 0
    while currentiter <= itermax:
        print('\nCurrent Iteration = '+str(currentiter)+"/"+str(itermax))
        hiddenlayer = hiddenlayervalidation(trainx, trainy, validatex, validatey, currentiter, activate, solve, learnrate)
        print('Best Hidden Layer Size = '+str(hiddenlayer[0]))
        kfoldresult = kfold(trainx, trainy, currentiter, activate, solve, learnrate, hiddenlayer[0])
        mae = kfoldresult[0]
        mae2 = kfoldresult[1]
        print("Test MAE = "+str(mae))
        print("Training MAE = "+str(mae2))
        if mae < best_MAE:
            best_MAE = mae
            best_training_MAE = kfoldresult[1]
            bestiter = currentiter
            besthiddenlayer = hiddenlayer[0]
        currentiter += itermultiplier
    return [bestiter, besthiddenlayer, best_MAE, best_training_MAE]

print('Functions Loaded')

Functions Loaded


In [7]:
x = all_df.drop('RelapseFreeSurvival (outcome)', axis=1)
y = all_df['RelapseFreeSurvival (outcome)']
train_X, validate_X, train_y, validate_y = train_test_split(x, y, test_size=0.15, shuffle = False)

# Perform validation checks and K-fold cross validation on the following system permutation
bestiter = iterationvalidation(train_X, train_y, validate_X, validate_y, 25, 1200, 
                               'logistic', 'sgd', 'invscaling')
print('\nFINAL OUTPUT')
print('\nBest Iteration Value = '+str(bestiter[0]))
print('Best Quantity of Hidden Layer Neurons= '+str(bestiter[1]))
print('Best Validation MAE = '+str(bestiter[2]))
print('Best Validation MAE = '+str(bestiter[3]))


Current Iteration = 25/1200
Best Hidden Layer Size = 105
Test MAE = 26.327358726167354
Training MAE = 25.446965138767318

Current Iteration = 50/1200
Best Hidden Layer Size = 105
Test MAE = 25.532674725846793
Training MAE = 24.04350018155109

Current Iteration = 75/1200
Best Hidden Layer Size = 105
Test MAE = 25.21750894690668
Training MAE = 23.39599514252359

Current Iteration = 100/1200
Best Hidden Layer Size = 105
Test MAE = 25.05120909006593
Training MAE = 23.010071142448744

Current Iteration = 125/1200
Best Hidden Layer Size = 105
Test MAE = 24.977660016985876
Training MAE = 22.729062008921858

Current Iteration = 150/1200
Best Hidden Layer Size = 105
Test MAE = 24.947029334617476
Training MAE = 22.524476230943556

Current Iteration = 175/1200
Best Hidden Layer Size = 105
Test MAE = 24.931633734890323
Training MAE = 22.36833641518727

Current Iteration = 200/1200
Best Hidden Layer Size = 105
Test MAE = 24.9209043773182
Training MAE = 22.246936027635364

Current Iteration = 225/1

In [12]:
#K-Fold Iteration - Repeated to demonstrate accuracy gain through iterations on the best model permutation
x = all_df.drop('RelapseFreeSurvival (outcome)', axis=1)
y = all_df['RelapseFreeSurvival (outcome)']
train_X, validate_X, train_y, validate_y = train_test_split(x, y, test_size=0.15, shuffle = False)
mlp_clf = MLPRegressor(random_state=1, max_iter=bestiter[0], 
                           activation = 'logistic', solver = 'sgd', learning_rate = 'invscaling',
                           hidden_layer_sizes = bestiter[1])
scaler = StandardScaler()
Xs = scaler.fit_transform(train_X)
mae_total = 0
mae2_total = 0
kf = KFold(n_splits=5)
for train, test in kf.split(Xs, train_y):
    mlp_clf.fit(Xs[train], y[train])
    y_pred = mlp_clf.predict(Xs[test])
    mae = mean_absolute_error(y[test], y_pred)
    y_pred2 = mlp_clf.predict(Xs[train])
    mae2 = mean_absolute_error(y[train], y_pred2)
    print('\nTest MAE = '+str(mae))
    print('Training MAE = '+str(mae2))
    mae_total += mae
    mae2_total += mae2
    
print('\nAverage K-Fold Test MAE = '+str(mae_total/5))
print('Average K-Fold Training MAE = '+str(mae2_total/5))


Test MAE = 50.85134587910554
Training MAE = 17.06572689544502

Test MAE = 28.069305818687898
Training MAE = 21.66457227991447

Test MAE = 17.34732008455835
Training MAE = 23.86916335404775

Test MAE = 12.332850812785173
Training MAE = 24.821441956747556

Test MAE = 16.00369929145406
Training MAE = 23.813775652022034

Average K-Fold Test MAE = 24.9209043773182
Average K-Fold Training MAE = 22.246936027635364


In [17]:
# Import Test File
# PLEASE CHANGE FILE WHEN TESTING WITH NEW DATA
new_df=pd.read_csv('TestDatasetExample.csv', index_col=False) 
new_df_id = new_df['ID']
new_df.drop('ID', axis=1, inplace=True)

#Normilise the test file in the same way as the training file to gain a consistent result
#Impute Missing Values in Test File
imputer = SimpleImputer(missing_values = 999, strategy="median") 
SimpleImputer(missing_values = 999)
s = 0
for i in new_df:
    imputer.fit(new_df)
    array = np.array(new_df[i])
    new_df[i] = imputer.fit_transform(array.reshape(-1, 1))  

#Min Max Normilisation
colno = 0
for col in new_df:
    if colno >= 9: # ONLY NORMALISE MRI SCAN DATA
        colmed = np.median(new_df[col])
        colstd = np.std(new_df[col])
        upper = colmed + (3*colstd)
        lower = colmed - (3*colstd) # USING MIN ALSO SEEMS TO SKEW DATA
        #Comment out to cancel
        new_df[col] = minmax_scale(new_df[col], feature_range=(lower,upper)) #Minimal Change - Downscales severity of Mean Squared Error
    colno+=1

#Predict the RFS value from the test file paramters
testfile_x =  new_df
scaler = StandardScaler()
Xs = scaler.fit_transform(testfile_x)
y_pred = mlp_clf.predict(Xs)

#Cast ID and predicted Y value into new list to export to an output csv
idlist = []
for i in new_df_id:
    idlist.append(i)   
ylist = []
for i in y_pred:
    ylist.append(i)

print(idlist)
print(ylist)

#Export Patient ID and Predicted RFS to Specified Sheet (MUST BE CSV)
write = pd.DataFrame({
    'ID': idlist, 
    'RelapseFreeSurvival (outcome)': ylist
})

write.to_csv('COMP4139 Regression Test Result Export.csv', index = False)
print('\nData Exported into Output File')

['TRG002728', 'TRG002649', 'TRG002628']
[51.847941650348545, 59.65130608976672, 38.93215927240072]


PermissionError: [Errno 13] Permission denied: 'COMP4139 Regression Test Result Export.csv'