In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
#import dataset
df = pd.read_excel('trainDataset.xls')

In [3]:
#drop ID column from dataset
df = df.iloc[:, 1:]

# **DELETE MISSING VALUES**

In [4]:
#delete missing values (999) of each column 
for column in df:
    df[column]=df[column].replace(999, None)

df = df.dropna()

target_classif = pd.DataFrame(df['pCR (outcome)'], columns = ['pCR (outcome)'])

In [5]:
final_df_classif = df

#namelist of columns that has categorical data or do not need normalisation for classification
drop_classif = ['pCR (outcome)', 'ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade', 'Proliferation', 'HistologyType', 'LNStatus', 'TumourStage']

#drop the columns with booleans/specific grades as they wont have outliers
df_classif = df.drop(drop_classif, axis=1)

# **REPLACING OUTLIERS**

In [6]:
#use IQR method to find outliers
def IQR_outliers(column):
    Q1=df[column].quantile(0.25)
    Q3=df[column].quantile(0.75)
    IQR = Q3-Q1
    df_outliers=df[((df[column]<(Q1-1.5*IQR)) | (df[column]>(Q3+1.5*IQR)))]
    return df_outliers

#delete outliers
def replace_outliers(df1):
    for column in df1:
        df_outliers = IQR_outliers(column)
        for i in range((df_outliers.shape[0])):
            label_index = df_outliers.index[i]
            df1.loc[label_index, column] = df1[column].median()
    return df1

no_outliers_df = replace_outliers(df_classif)

# **NORMALISATION**

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

#get names of columns
names = []
for column in no_outliers_df:
    names.append(column)

normalised_df_classif = pd.DataFrame(scaler.fit_transform(no_outliers_df), columns = names)


# FEATURE SELECTION

### Regression

In [8]:
target_classif.reset_index(drop=True, inplace = True)

In [9]:
#get categorical input data
target_name = 'pCR (outcome)'

categorical_classif = pd.DataFrame()
for column in drop_classif:
    if column == target_name:
        continue
    else:
        categorical_classif = pd.concat([categorical_classif, df[column]], axis = 1)
categorical_classif.reset_index(inplace= True, drop = True)

In [10]:
# Converting categorical data with more than 2 types of data using one hot encoder
from sklearn.preprocessing import OneHotEncoder

categorical_classif['ChemoGrade']=categorical_classif['ChemoGrade'].astype('category')
categorical_classif['Proliferation']=categorical_classif['Proliferation'].astype('category')
categorical_classif['TumourStage']=categorical_classif['TumourStage'].astype('category')

#Create an instance of One-hot-encoder
enc=OneHotEncoder()

#Rename columns
one_hot_encoded_data = pd.get_dummies(categorical_classif, columns = ['ChemoGrade','Proliferation','TumourStage'])

In [11]:
#concat the one hot encoded categorical features + normalised numerical features
finalised = pd.concat([normalised_df_classif, one_hot_encoded_data], axis=1)

In [12]:
#apply ANOVA for regression
from sklearn.feature_selection import f_classif, SelectKBest
import math

fs = SelectKBest(score_func=f_classif,k='all') #k value for when P_value < 0.05
# Apply feature selection
fs.fit(finalised,target_classif)

#declare variable to put selected features for classification
df_classif_final = pd.DataFrame()

#get Anova F scores
features_score = pd.DataFrame(fs.scores_)

#get Anova F P values
features_pvalue = pd.DataFrame(np.round(fs.pvalues_,4))

#get feature names
features = pd.DataFrame(finalised.columns)

#concat into a dataframe
feature_score = pd.concat([features,features_score,features_pvalue],axis=1)

# Assign column names
feature_score.columns = ['Input_Features','Score','P_Value']

#select features where p < 0.05
selected = feature_score[feature_score['P_Value'] < 0.05]

  y = column_or_1d(y, warn=True)


In [13]:
#get the names of the selected data
selected_names = []

for i in (range(selected.shape[0])):
    selected_names.append(selected.iloc[i,0])

In [14]:
#concat the selected features from both fs methods
selected = finalised[selected_names]

In [15]:
x_classif = selected.iloc[:,:]
y_classif = target_classif.iloc[:,:]