In [None]:
import pandas as pd
import numpy as np
from fancyimpute import KNN
import matplotlib
import seaborn as se
from matplotlib import pyplot as plt
from datetime import datetime as dt
from sklearn.preprocessing import MinMaxScaler
matplotlib.style.use('ggplot')
%matplotlib inline

#load the training data and preview its first rows
trainData = pd.read_csv("C:/Users/Arko/Downloads/Studies/PersonalResearch/EnovaDataChallenge/participant_files/training_data.csv")
trainData.head()

#load the test data (its preview below is left disabled)
testData = pd.read_csv("C:/Users/Arko/Downloads/Studies/PersonalResearch/EnovaDataChallenge/participant_files/testdata.csv")
#testData.head()

## explore the relation between some of the categorical variables and the target variable using the trainData


def _plot_survival_rate(column, palette=None):
    """Draw a bar plot of survival_7_years against one column of trainData.

    seaborn renamed factorplot to catplot in release 0.9 and later removed the
    old name, so catplot is used when the installed seaborn provides it and
    factorplot is only the fallback. x and y are passed by keyword because
    recent seaborn releases no longer accept them positionally.

    column  : name of the trainData column shown on the x axis
    palette : optional seaborn palette name; None keeps the seaborn default
    returns : the grid object returned by seaborn
    """
    draw = getattr(se, 'catplot', None) or se.factorplot
    return draw(x=column, y='survival_7_years', data=trainData, kind='bar', palette=palette)


#t_score vs survival_7_years
plot = _plot_survival_rate('t_score')
#Observation: Since no definite relation comes out of the categories, it might make sense to club the levels.
#We do the same during the data-processing stage

#n_score vs survival_7_years
plot = _plot_survival_rate('n_score', palette='BuGn')
#Observation: evidently if the cancer has not spread to lymph nodes, the patient has a higher chance of survival.
#We cannot draw any inference about the unknown (NX) state

#m_score vs survival_7_years
plot = _plot_survival_rate('m_score', palette='BuGn')
#Observation: evidently if the cancer has not spread to distant parts, the patient has a higher chance of survival.
#Since the survival shows less variation among the a,b,c categories of M1 (that is, cancer has spread to distant parts),
#we may club the 3 levels

#stage vs survival_7_years
plot = _plot_survival_rate('stage', palette='BuGn')
#Observation: patients with stage IV cancer have the lowest survival chance.
#Stages II(A,B) have less variation in count and can be combined

#family_history vs survival_7_years
plot = _plot_survival_rate('family_history', palette='BuGn')
#Observation: One peculiar finding is that patients with 4 family members affected by cancer have a distinctly
#higher chance of survival compared to others. However, as indicated by the plot, the variation in this level is
#also high and hence the associated uncertainty is also larger. We may consider this to be a random pattern

#previous_cancer vs survival_7_years
plot = _plot_survival_rate('previous_cancer', palette='BuGn')
#Observation: Not much difference in survival chance based on whether or not the patient had cancer before

#smoker vs survival_7_years
plot = _plot_survival_rate('smoker', palette='BuGn')
#Observation: Not much difference in survival chance based on whether or not the patient is a smoker.
#This may suggest that prostate cancer is not affected significantly by smoking, but this is only an assumption
#since other types of cancer may also be present here

#tea vs survival_7_years
plot = _plot_survival_rate('tea')
#Observation: with the exception of 9 cups of tea, the general pattern indicates that patients drinking
#more tea have a higher chance of survival. Maybe having some kind of medicated tea improves the survival chance

## combine Train and Test Data and perform feature engineering

#combine train and test data
def combineData(train_path=None, test_path=None):
    """Read the train and test CSV files and stack them into one DataFrame.

    train_path : path of the training CSV; None selects the project's
                 training_data.csv location
    test_path  : path of the test CSV; None selects the project's
                 testdata.csv location
    returns    : DataFrame with the rows of train followed by the rows of
                 test. It carries an 'Identifier' column ("Train"/"Test"),
                 an 'index' column holding each row's position inside its
                 own file, and neither the 'id' nor the 'survival_7_years'
                 column.
    """
    if train_path is None:
        train_path = "C:/Users/Arko/Downloads/Studies/PersonalResearch/EnovaDataChallenge/participant_files/training_data.csv"
    if test_path is None:
        test_path = "C:/Users/Arko/Downloads/Studies/PersonalResearch/EnovaDataChallenge/participant_files/testdata.csv"
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    #add an identifier column so the two parts can be told apart after stacking
    train['Identifier'] = "Train"
    test['Identifier'] = "Test"
    #remove the target column so that train and test share the same columns;
    #a test file that ships without the target is accepted as well
    train = train.drop('survival_7_years', axis=1)
    test = test.drop('survival_7_years', axis=1, errors='ignore')
    combined = pd.concat([train, test], axis=0)
    #renumber the rows 0..n-1; the previous per-file row numbers are kept in the 'index' column
    combined.reset_index(inplace=True)
    #removing unnecessary id column
    combined.drop('id', axis=1, inplace=True)
    return combined

#build the combined train+test frame that all the feature engineering below works on
data = combineData()
#report its dimensions as (rows, columns)
print(data.shape)
#data.head()
#summary statistics of the numeric variables
data.describe()

#correlation heatmap of the numeric columns;
#it is taken into account later when imputing missing values of continuous variables
corr = data._get_numeric_data().corr()
axis_labels = corr.columns.values
se.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)

#variable to be treated : gleason_score
#There are only 320 missing values in gleason_score (2% of the data), hence replacing them with 0.
#The filled column is assigned back instead of calling fillna(inplace=True) on the selected column:
#that form works on an intermediate object, and with pandas copy-on-write it never updates `data`
data['gleason_score'] = data['gleason_score'].fillna(0)
#check if the missing values have been replaced
#data.describe()

#Derive a month-based feature from the diagnosis date.
#84 is the number of months in the 7-year window (7 * 12) that the target column refers to.
#NOTE(review): .dt.month is the calendar month (1-12) of the diagnosis date, so this stores
#84 minus the calendar month (values 72..83), not a number of months elapsed since the
#diagnosis - confirm that this is the intended feature.
data['mnths_from_diag']= 84 - pd.to_datetime(data['diagnosis_date']).dt.month.astype(int)
#trainData['mnths_from_diag'].head(40)
#drop the diagnosis date column since it is no longer required
data.drop('diagnosis_date',axis = 1,inplace = True)

#t_score: keep only the second character of the code (the numeric level) as an integer,
#which clubs the lettered sub-levels together with their parent level
data['t_score'] = (
    data['t_score']
    .map(lambda code: code[1])
    .astype(int)
)
#data['t_score'].head()

#n_score: same extraction; an 'X' in that position (NX, assumed to mean unknown) is replaced by -1
data['n_score'] = (
    data['n_score']
    .map(lambda code: code[1])
    .replace({'X':-1},regex = True)
    .astype(int)
)
#data['n_score'].head(74)

#m_score: same extraction, again clubbing the lettered sub-levels together
data['m_score'] = (
    data['m_score']
    .map(lambda code: code[1])
    .astype(int)
)
#data['m_score'].head()

#stage: roman-numeral levels to their numeric equivalents, with IIA and IIB sharing the value 2
stageDict = {'I' : '1', 'IIA' : '2', 'IIB' : '2', 'III': '3', 'IV' : '4'}
stage_codes = data['stage'].map(stageDict)
data['stage'] = stage_codes.astype(int)
#data['stage'].head()

#We are not sure of the most probable value for replacing missing values in race, hence replacing by 0 (unknown).
#The filled column is assigned back instead of calling fillna(inplace=True) on the selected column:
#that form works on an intermediate object, and with pandas copy-on-write it never updates `data`
data['race'] = data['race'].fillna(0)
#we then create dummy variables and append them to the data
raceEncoded = pd.get_dummies(data['race'],prefix = "race_")
data = pd.concat([data,raceEncoded],axis = 1)
#data.head()

#for following pairs of columns, rather than imputing by mean,median or mode, we use knn imputation(nearest neighbor)
#this is based on the intuition that these sets of attributes are related and can be good estimators of each other
#we have also considered the correlation between the attributes as depicted above in the corrplot
#we first create a function for performing knn imputation
#parameters are the data, specified columns and the number of neighbors to be considered
def imputeKNN(dat,cols,n):
    """Fill the missing values of selected columns by k-nearest-neighbour imputation.

    The columns in `cols` are imputed together with fancyimpute's KNN solver,
    which only sees these columns.

    dat     : DataFrame holding the columns to impute; it is left unmodified
    cols    : column names to impute together
    n       : number of neighbours (k) handed to the KNN solver
    returns : new DataFrame with the remaining columns of `dat` in their
              original order, followed by the imputed `cols`
    """
    cols = list(cols)
    #DataFrame.as_matrix was removed in pandas 1.0; .values gives the same array
    dat_mat = dat[cols].values
    imputer = KNN(k=n)
    #newer fancyimpute releases expose fit_transform, older ones only complete
    impute = getattr(imputer, 'fit_transform', None) or imputer.complete
    #reuse dat's own index so the imputed rows line up with the other columns in the concat
    dat_imputed = pd.DataFrame(impute(dat_mat), columns=cols, index=dat.index)
    #swap the original columns for the imputed ones without touching the caller's frame
    return pd.concat([dat.drop(cols, axis=1), dat_imputed], axis=1)

#Each group below bundles attributes that are expected to be good estimators of one another
#(for example age, height, weight and race), so the missing values inside a group are imputed
#together by knn imputation. The first two groups use 3 neighbours, the last two use 5.
pair1 = ['age', 'height','weight','race']
pair2 = ['family_history', 'first_degree_history','previous_cancer']
pair3 = ['tumor_diagnosis', 'tumor_6_months','tumor_1_year']
pair4 = ['psa_diagnosis', 'psa_6_months','psa_1_year']
for group_cols, neighbours in ((pair1, 3), (pair2, 3), (pair3, 5), (pair4, 5)):
    data = imputeKNN(data, group_cols, neighbours)
#race itself is no longer needed: its dummy variables have already been created
data = data.drop('race', axis=1)

#tea does not have any prominent correlation with any other column, hence replacing missing values with the median.
#The filled columns are assigned back instead of calling fillna(inplace=True) on the selected column:
#that form works on an intermediate object, and with pandas copy-on-write it never updates `data`
data['tea'] = data['tea'].fillna(data['tea'].median())
#for smoker column, replace missing values with -1
data['smoker'] = data['smoker'].fillna(-1)

#one dummy column per level of the side variable
#(with the default separator the "side_" prefix yields names of the form side__<level>)
sideEncoded = pd.get_dummies(data['side'],prefix = "side_")
#replace the side variable by its dummy columns, appended at the end
data = pd.concat([data.drop('side',axis = 1),sideEncoded],axis = 1)
#data.head()

#change of tumor size and of psa level over the first 6 months and over the following 6 months,
#listed as (new column, later measurement, earlier measurement)
change_specs = [
    ('tumor_change_6mnths', 'tumor_6_months', 'tumor_diagnosis'),
    ('tumor_change_1yr', 'tumor_1_year', 'tumor_6_months'),
    ('psa_change_6mnths', 'psa_6_months', 'psa_diagnosis'),
    ('psa_change_1yr', 'psa_1_year', 'psa_6_months'),
]
for change_col, later_col, earlier_col in change_specs:
    data[change_col] = data[later_col] - data[earlier_col]
#these columns are standardized later to bring them into the same scale
#the parent measurements are dropped since they are no longer considered
parent_cols = ['tumor_diagnosis','tumor_6_months','tumor_1_year','psa_diagnosis','psa_6_months','psa_1_year']
data.drop(parent_cols,axis = 1,inplace = True)

#the meaning of the symptoms is not defined. Finding the count of symptoms with the assumption that more symptoms
#means greater chance the patient has cancer.
#A missing symptoms value counts as 0 symptoms: str() of a missing value is the text 'nan', which would
#otherwise be counted as one symptom and disagree with the all-zero dummy row created below
data['symptoms_count'] = data['symptoms'].apply(
    lambda x : 0 if pd.isnull(x) else len(str(x).split(','))
)

#the symptoms are not defined but it is mentioned that they are predictive,
#hence we create a dummy variable for each of the symptoms and include them in the dataset
subset_data = data['symptoms'].str.get_dummies(sep=',')
#subset_data.columns
data = pd.concat([data,subset_data],axis = 1)
#drop the symptoms column since it is no longer required
data.drop('symptoms',axis = 1,inplace = True)

#check the correlation between survival_1_year and survival_7_years on trainData
trainData[['survival_1_year','survival_7_years']].corr()
#Since these two columns don't have a high correlation let us keep survival_1_year for now and later analyze
#the significance of the survival_1_year attribute in predicting the target column.
#We replace the missing values with -1. The filled column is assigned back instead of calling
#fillna(inplace=True) on the selected column: that form works on an intermediate object, and with
#pandas copy-on-write it never updates `data`
data['survival_1_year'] = data['survival_1_year'].fillna(-1)

#every column except the bookkeeping columns Identifier and index is rescaled;
#this is possible because all remaining variables have been converted to numeric values
excluded = ['Identifier','index']
features = data[data.columns.difference(excluded)].columns.tolist()
#scikit-learn MinMaxScaler, fitted on the combined train+test rows
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[features])
data[features] = scaled_values
data.head()

#names of all the columns at the end of feature engineering
data.columns

#write the processed data to a file (to_csv also writes the row index as the first, unnamed column)
data.to_csv("C:/Users/Arko/Downloads/Studies/PersonalResearch/EnovaDataChallenge/participant_files/processedData.csv")