In [1]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
#read the training workbook into a DataFrame
df = pd.read_excel(io='trainDataset.xls')

In [3]:
#remove the leading column (the ID) from the dataset
#NOTE: this rebinds df, so running the cell a second time would drop one more column
df = df.drop(columns=df.columns[0])

# **REPLACING MISSING VALUES**

In [4]:
#replace missing values (999) of each column with the column's median
#the median is taken over the observed values only, so the 999 placeholders
#themselves do not pull the imputed value upwards
MISSING_VALUE = 999
for column in df:
    observed = df.loc[df[column] != MISSING_VALUE, column]
    df[column] = df[column].replace(MISSING_VALUE, observed.median())

In [5]:
#new_df will be without outliers
#take an explicit copy: the outlier replacement writes into new_df in place,
#and a copy guarantees those writes can never reach the frame used for statistics
new_df = df.copy()

#drop the columns with booleans/specific grades as they won't have outliers
#NOTE: df is rebound here, so re-running this cell on its own raises KeyError
#(the listed columns are already gone from df)
df = df.drop(columns=['pCR (outcome)', 'ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade', 'Proliferation', 'HistologyType', 'LNStatus', 'TumourStage'])

# **REPLACING OUTLIERS**

In [6]:
#use IQR method to find outliers
def IQR_outliers(column, data=None):
    """Return the rows whose value in `column` lies outside the 1.5*IQR fences.

    Parameters
    ----------
    column : label
        Column to inspect.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level `df` is used,
        which is what the one-argument call always did.

    Returns
    -------
    pandas.DataFrame
        The rows of the inspected frame whose value is below Q1 - 1.5*IQR or
        above Q3 + 1.5*IQR, with their original index labels.
    """
    if data is None:
        data = df  # fall back to the notebook-level frame
    values = data[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    df_outliers = data[(values < lower_fence) | (values > upper_fence)]
    return df_outliers

#replace outliers with median of the column
#df is only read here (new_df receives the writes), so the median is computed
#once per column and assigned to all outlier rows in a single .loc write
for column in df:
    outlier_rows = IQR_outliers(column).index
    if len(outlier_rows) > 0:
        new_df.loc[outlier_rows, column] = df[column].median()
display(new_df)

Unnamed: 0,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1,55.041667,41.0,0,0,0,1,3,3,1,...,0.517172,0.375126,3.325332,0.002314,3.880772e+06,66.507214,0.000768,0.182615,0.002301,0.000758
1,0,55.041667,39.0,1,1,0,0,3,3,1,...,0.444391,0.444391,3.032144,0.005612,2.372010e+06,59.459710,0.004383,0.032012,0.001006,0.003685
2,1,55.041667,31.0,0,0,0,1,2,1,1,...,0.534549,0.534549,2.485848,0.006752,1.540027e+06,33.935384,0.007584,0.024062,0.000529,0.006447
3,0,12.000000,35.0,0,0,0,1,3,3,1,...,0.506185,0.506185,2.606255,0.003755,6.936741e+06,46.859265,0.005424,0.013707,0.000178,0.004543
4,0,109.000000,61.0,1,0,0,0,2,1,1,...,0.462282,0.462282,2.809279,0.006521,1.265399e+06,39.621023,0.006585,0.034148,0.001083,0.005626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1,49.250000,46.1,0,0,0,1,3,3,1,...,0.439568,0.439568,3.056046,0.001339,1.671271e+07,79.989003,0.003282,0.024716,0.000812,0.003078
396,0,48.500000,53.3,0,0,0,1,2,1,1,...,0.527779,0.527778,1.500000,0.003728,2.132007e+05,0.996746,0.004399,0.007380,0.000037,0.003384
397,0,47.500000,68.8,1,0,0,0,3,3,1,...,0.313693,0.313693,3.573557,0.001112,2.008034e+07,204.864200,0.001372,0.054063,0.003697,0.001368
398,0,46.916667,46.0,1,0,0,0,2,1,1,...,0.670229,0.670229,1.857045,0.006706,5.609262e+05,9.609163,0.026591,0.018682,0.000311,0.003384


# **NORMALISATION**

In [7]:
from sklearn.preprocessing import MinMaxScaler

#namelist of columns that do not need normalisation for classification
drop_classif = ['pCR (outcome)', 'ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade', 'Proliferation', 'HistologyType', 'LNStatus', 'TumourStage']

#namelist of columns that do not need normalisation for regression
drop_reg = drop_classif + ['RelapseFreeSurvival (outcome)']

def normalize(df, x):
    """Return a min-max scaled copy of `df`; the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale.
    x : str
        'classification' leaves the columns in `drop_classif` untouched,
        'regression' leaves the columns in `drop_reg` untouched.
        Any other value returns an unscaled copy.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` where every scaled column is (value - min) / (max - min).
        A column with a single distinct value (max == min) is set to 0.0
        instead of the NaN that 0/0 would produce.
    """
    result = df.copy()
    if x == 'classification':
        untouched = drop_classif
    elif x == 'regression':
        untouched = drop_reg
    else:
        #unknown mode: nothing is scaled
        return result

    for feature_name in df.columns:
        if feature_name in untouched:
            continue
        min_value = df[feature_name].min()
        value_range = df[feature_name].max() - min_value
        if value_range == 0:
            #constant column: avoid 0/0 -> NaN, which later estimators reject
            result[feature_name] = 0.0
        else:
            result[feature_name] = (df[feature_name] - min_value) / value_range
    return result

#one scaled frame per task, both derived from the outlier-free data
df_classif = normalize(df=new_df, x='classification')
df_reg = normalize(df=new_df, x='regression')

# FEATURE SELECTION

### Classification 

In [8]:
#separate target variable for classification (PCR)
#both columns labelled "(outcome)" are removed from the predictors: keeping
#RelapseFreeSurvival in X would let one outcome leak into the prediction of the other
OUTCOME_COLUMNS = ['pCR (outcome)', 'RelapseFreeSurvival (outcome)']
x_classif = df_classif.drop(columns=OUTCOME_COLUMNS)
y_classif = df_classif['pCR (outcome)']

In [9]:
#apply ANOVA for classification
from sklearn.feature_selection import f_classif, SelectKBest
import math

#build the ANOVA F-test selector and fit it in one step
#(k value for when P_value < 0.05)
fs = SelectKBest(f_classif, k=53).fit(x_classif, y_classif)

#empty frame that will hold the selected features for classification
df_classif_final = pd.DataFrame()

def list_ceil(x):
    """Return a list with math.ceil applied to every element of the iterable `x`."""
    return [math.ceil(i) for i in x]

#one-column frames holding the F-score, the rounded p-value and the feature name
features_score = pd.DataFrame(fs.scores_)
features_pvalue = pd.DataFrame(np.round(fs.pvalues_,4))
features = pd.DataFrame(x_classif.columns)
feature_score = pd.concat([features,features_score,features_pvalue],axis=1)

# Assign column names
feature_score.columns = ['Input_Features','Score','P_Value']
#reuse the k the selector was built with, so the two counts cannot drift apart
chosen_features = feature_score.nlargest(fs.k,columns='Score')

# Add selected features into a new dataframe
#a single column selection (ordered by descending score) replaces growing the
#frame one pd.concat call per feature
df_classif_final = df_classif[chosen_features['Input_Features'].tolist()].copy()

### Regression

In [10]:
#separate target value for regression (RFS)
#both columns labelled "(outcome)" are removed from the predictors: keeping
#pCR in X would let one outcome leak into the prediction of the other
OUTCOME_COLUMNS = ['pCR (outcome)', 'RelapseFreeSurvival (outcome)']
x_reg = df_reg.drop(columns=OUTCOME_COLUMNS)
y_reg = df_reg['RelapseFreeSurvival (outcome)']

In [11]:
#apply univariate F-test (f_regression) for regression
from sklearn.feature_selection import f_regression, SelectKBest
import math

#build the univariate F-test selector and fit it in one step
#(k value for when P_value < 0.05)
fs = SelectKBest(f_regression, k=20).fit(x_reg, y_reg)

#empty frame that will hold the selected features for regression
df_reg_final = pd.DataFrame()

def list_ceil(x):
    """Return a list with math.ceil applied to every element of the iterable `x`."""
    return [math.ceil(i) for i in x]

#one-column frames holding the F-score, the rounded p-value and the feature name
features_score = pd.DataFrame(fs.scores_)
features_pvalue = pd.DataFrame(np.round(fs.pvalues_,4))
features = pd.DataFrame(x_reg.columns)
feature_score = pd.concat([features,features_score,features_pvalue],axis=1)

# Assign column names
feature_score.columns = ['Input_Features','Score','P_Value']
#reuse the k the selector was built with, so the two counts cannot drift apart
chosen_features = feature_score.nlargest(fs.k,columns='Score')

# Add selected features into a new dataframe
#a single column selection (ordered by descending score) replaces growing the
#frame one pd.concat call per feature
df_reg_final = df_reg[chosen_features['Input_Features'].tolist()].copy()