In [None]:
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
import random
import seaborn as sns
import scipy
from scipy.stats import pearsonr
import sklearn
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import math
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.preprocessing import minmax_scale
from numpy import std, mean
import statistics
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector

#Import File Containing Train/Test Dataset
# 'ID' is dropped because it is not needed for training; 'pCR (outcome)' is
# dropped because it is not needed for this export.
all_df = (
    pd.read_csv('TrainDataset2023.csv', index_col=False)  # Read from File
    .drop(columns=['ID', 'pCR (outcome)'])
)

#Impute Missing Values
# Cells equal to 999 are treated as missing and replaced by the median of the
# remaining values in the same column. The imputer is refitted per column.
imputer = SimpleImputer(missing_values=999, strategy="median")
for col in all_df:
    column_values = all_df[col].to_numpy().reshape(-1, 1)
    all_df[col] = imputer.fit_transform(column_values).ravel()

#Min Max Normalisation Using Standard Deviation
# ONLY NORMALISE MRI SCAN DATA - columns before this position are left as-is
# (original author's note: clinical data is not impacted by outliers).
MRI_FIRST_COLUMN = 11
for col in all_df.columns[MRI_FIRST_COLUMN:]:
    colmed = np.median(all_df[col])
    colstd = np.std(all_df[col])
    if colstd == 0:
        # A constant column would give lower == upper, which minmax_scale
        # rejects; there is nothing to rescale, so leave it untouched.
        continue
    lower = colmed - (3 * colstd)
    upper = colmed + (3 * colstd)
    # Linear rescale: the column minimum maps to `lower` and the maximum to
    # `upper`. Values are stretched/squeezed into the band, not clipped.
    all_df[col] = minmax_scale(all_df[col], feature_range=(lower, upper))

print('Data Normilisation Complete')

In [None]:
#Perform K-Fold (5) cross validation of training set with current hyperparameter permutations
def kfold(trainx, trainy, iteration, activate, solve, learn, layer):
    """Score one MLP hyperparameter setting with 5-fold cross-validation.

    The features are standardised with a StandardScaler fitted on all of
    ``trainx``, then split into 5 folds with ``KFold`` (default settings).
    For each fold an ``MLPRegressor`` (``random_state=1``) is trained on the
    training part and its mean absolute error is measured on both the
    held-out part and the training part.

    Args:
        trainx: Feature matrix (array-like or DataFrame), one row per sample.
        trainy: Target values aligned row-for-row with ``trainx``.
        iteration: Passed to ``MLPRegressor`` as ``max_iter``.
        activate: Passed as ``activation``.
        solve: Passed as ``solver``.
        learn: Passed as ``learning_rate``.
        layer: Passed as ``hidden_layer_sizes``.

    Returns:
        ``[mean held-out MAE, mean training MAE]`` averaged over the folds.
    """
    n_splits = 5
    # NOTE(review): the scaler is fitted before splitting, so each held-out
    # fold contributes to the scaling statistics; hiddenlayervalidation, by
    # contrast, trains on unscaled inputs. Kept as-is to preserve results.
    features = StandardScaler().fit_transform(trainx)
    # KFold.split yields positional indices. Converting the targets to an
    # array makes the indexing positional too, and uses the targets that were
    # actually passed in rather than a notebook-level global.
    targets = np.asarray(trainy)

    test_mae_total = 0.0
    train_mae_total = 0.0
    for train_idx, test_idx in KFold(n_splits=n_splits).split(features, targets):
        model = MLPRegressor(random_state=1, max_iter=iteration,
                             activation=activate, solver=solve, learning_rate=learn,
                             hidden_layer_sizes=layer).fit(features[train_idx], targets[train_idx])
        test_mae_total += mean_absolute_error(targets[test_idx], model.predict(features[test_idx]))
        train_mae_total += mean_absolute_error(targets[train_idx], model.predict(features[train_idx]))
    return [test_mae_total / n_splits, train_mae_total / n_splits]

#Iterate through possible hidden layer values to find the best value
def hiddenlayervalidation(trainx, trainy, validatex, validatey, iterations, activate, solve, learnrate):
    """Pick the hidden-layer width with the lowest validation MAE.

    Trains one single-hidden-layer ``MLPRegressor`` (``random_state=1``) per
    candidate width 5, 10, ..., 140 on ``(trainx, trainy)`` and scores it on
    ``(validatex, validatey)`` with mean absolute error.

    Args:
        trainx, trainy: Data the candidate models are fitted on.
        validatex, validatey: Hold-out data used to score each candidate.
        iterations: Passed to ``MLPRegressor`` as ``max_iter``.
        activate: Passed as ``activation``.
        solve: Passed as ``solver``.
        learnrate: Passed as ``learning_rate``.

    Returns:
        ``[best width, its validation MAE]``. On ties the smallest width wins.
    """
    besthiddenlayer = 5
    # Start from +inf rather than an arbitrary cap so the real best candidate
    # is reported even when every validation MAE is large.
    best_MAE = float('inf')
    for currenthiddenlayer in range(5, 141, 5):
        mlp_clf = MLPRegressor(random_state=1, max_iter=iterations,
                               activation=activate, solver=solve, learning_rate=learnrate,
                               hidden_layer_sizes=currenthiddenlayer).fit(trainx, trainy)
        mae = mean_absolute_error(validatey, mlp_clf.predict(validatex))
        if mae < best_MAE:
            best_MAE = mae
            besthiddenlayer = currenthiddenlayer
    return [besthiddenlayer, best_MAE]

#Iterate through possible max iterations to find the best value
def iterationvalidation(trainx, trainy, validatex, validatey, itermultiplier, itermax, activate, solve, learnrate):
    """Search over ``max_iter`` values and report the best configuration.

    Candidate iteration counts are ``itermultiplier, 2*itermultiplier, ...``
    up to and including ``itermax``. For each candidate the best hidden-layer
    width is chosen with ``hiddenlayervalidation`` (using the validation
    split), then that configuration is scored with ``kfold`` on the training
    data. The candidate with the lowest k-fold held-out MAE wins. Progress is
    printed for every candidate.

    Args:
        trainx, trainy: Training data.
        validatex, validatey: Validation data used for the width search.
        itermultiplier: First candidate and step between candidates.
        itermax: Largest candidate value considered.
        activate, solve, learnrate: Forwarded to both helper functions.

    Returns:
        ``[best max_iter, best hidden-layer width, best k-fold held-out MAE,
        matching k-fold training MAE]``. If no candidate is evaluated, the
        defaults ``[1, 5, inf, inf]`` are returned.
    """
    bestiter = 1
    besthiddenlayer = 5
    # +inf sentinels: an arbitrary cap would hide the real result whenever
    # every candidate's MAE exceeded it.
    best_MAE = float('inf')
    best_training_MAE = float('inf')

    currentiter = itermultiplier
    while currentiter <= itermax:
        print('\nCurrent Iteration = '+str(currentiter)+"/"+str(itermax))
        layer_size = hiddenlayervalidation(trainx, trainy, validatex, validatey,
                                           currentiter, activate, solve, learnrate)[0]
        print('Best Hidden Layer Size = '+str(layer_size))
        test_mae, train_mae = kfold(trainx, trainy, currentiter, activate, solve, learnrate, layer_size)
        print("Test MAE = "+str(test_mae))
        print("Training MAE = "+str(train_mae))
        if test_mae < best_MAE:
            best_MAE = test_mae
            best_training_MAE = train_mae
            bestiter = currentiter
            besthiddenlayer = layer_size
        currentiter += itermultiplier
    return [bestiter, besthiddenlayer, best_MAE, best_training_MAE]

# Separate the regression target from the predictor columns.
y = all_df['RelapseFreeSurvival (outcome)']
x = all_df.drop(columns=y.name)

# Hold out 15% of the rows for validation; shuffle=False keeps the original
# row order, so the split is the same on every run.
train_X, validate_X, train_y, validate_y = train_test_split(
    x,
    y,
    test_size=0.15,
    shuffle=False,
)

print('Functions Loaded')

In [None]:
# RELU + ADAM -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'relu', 'adam', 'constant',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# RELU + LBFGS -- max_iter candidates run from 1 to 50 in steps of 1
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    1, 50,
    'relu', 'lbfgs', 'constant',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# RELU + SGD + CONSTANT -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'relu', 'sgd', 'constant',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# RELU + SGD + INVSCALING -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'relu', 'sgd', 'invscaling',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# RELU + SGD + ADAPTIVE -- max_iter candidates run from 1 to 50 in steps of 1
# NOTE(review): the other SGD runs in this notebook sweep 25..1200; confirm
# that the much smaller 1..50 range here is intentional.
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    1, 50,
    'relu', 'sgd', 'adaptive',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# IDENTITY + ADAM -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'identity', 'adam', 'constant',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# IDENTITY + LBFGS -- max_iter candidates run from 1 to 50 in steps of 1
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    1, 50,
    'identity', 'lbfgs', 'constant',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# IDENTITY + SGD: not run -- this combination raised an error

In [None]:
# LOGISTIC + ADAM -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'logistic', 'adam', 'constant',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# LOGISTIC + LBFGS -- max_iter candidates run from 1 to 50 in steps of 1
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    1, 50,
    'logistic', 'lbfgs', 'constant',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# LOGISTIC + SGD + CONSTANT -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'logistic', 'sgd', 'constant',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# LOGISTIC + SGD + INVSCALING -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'logistic', 'sgd', 'invscaling',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# LOGISTIC + SGD + ADAPTIVE -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'logistic', 'sgd', 'adaptive',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# TANH + ADAM -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'tanh', 'adam', 'constant',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# TANH + LBFGS -- max_iter candidates run from 1 to 50 in steps of 1
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    1, 50,
    'tanh', 'lbfgs', 'constant',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# TANH + SGD + CONSTANT -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'tanh', 'sgd', 'constant',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# TANH + SGD + INVSCALING -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'tanh', 'sgd', 'invscaling',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)

In [None]:
# TANH + SGD + ADAPTIVE -- max_iter candidates run from 25 to 1200 in steps of 25
bestiter = iterationvalidation(
    train_X, train_y, validate_X, validate_y,
    25, 1200,
    'tanh', 'sgd', 'adaptive',
)
# bestiter = [max_iter, hidden-layer size, k-fold held-out MAE, k-fold training MAE]
print(
    '\nFINAL OUTPUT',
    f'\nBest Iteration Value = {bestiter[0]}',
    f'Best Quantity of Hidden Layer Neurons= {bestiter[1]}',
    f'Best Validation MAE = {bestiter[2]}',
    f'Best Training MAE = {bestiter[3]}',
    sep='\n',
)