In [None]:
#import necessary modules
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score
from matplotlib import pyplot as plt
%matplotlib inline

#Take the model-ready data and separate it into the train and test data
#Also read the survival_7_years column from the original training data, since it was removed during pre-processing
def getdata(data, response_path="C:/Users/Arko/Downloads/Studies/PersonalResearch/EnovaDataChallenge/participant_files/training_data.csv"):
    """Load the training labels and split ``data`` into train and test frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an 'Identifier' column; rows marked "Train" and "Test"
        are selected into the two returned frames.
    response_path : str, optional
        CSV file containing the 'survival_7_years' column. Defaults to the
        original training data file.

    Returns
    -------
    tuple
        ``(response, traindata, testdata)`` where ``response`` is a 1d numpy
        array of the 'survival_7_years' values and the two frames are the
        "Train" / "Test" rows of ``data`` without the 'Identifier' column.
    """
    response = pd.read_csv(response_path, usecols=['survival_7_years'])
    #Flatten the single-column frame into a 1d array
    response = np.array(response).ravel()
    #A non-inplace drop returns fresh frames; dropping in place on a filtered
    #frame is what triggers pandas' SettingWithCopyWarning
    traindata = data[data['Identifier'] == "Train"].drop("Identifier", axis=1)
    testdata = data[data['Identifier'] == "Test"].drop("Identifier", axis=1)
    return response, traindata, testdata

#Load the pre-processed, model-ready dataset
processedData = pd.read_csv("C:/Users/Arko/Downloads/Studies/PersonalResearch/EnovaDataChallenge/participant_files/processedData.csv")
#Remove the unnecessary 'Unnamed: 0' column
del processedData['Unnamed: 0']
#Split into the training labels, the training rows and the test rows
response, traindata, testdata = getdata(processedData)

#Rank the features using a RandomForest classifier
#'index' is left out since it is an index, not a feature
feature_columns = traindata.columns.difference(['index'])
clf = RandomForestClassifier(n_estimators=500, max_features='sqrt')
clf = clf.fit(traindata[feature_columns], response)
#The same column selection is used for fitting and for naming, so each
#importance lines up with its predictor
features = pd.DataFrame()
features['predictors'] = feature_columns
features['importances'] = clf.feature_importances_
features.sort_values(by=['importances'], ascending=False, inplace=True)
#Print the features with their importances
print(features)
#Plot the importances labelled by predictor name; without x= the bars are
#labelled only by the integer row index
features.plot(x='predictors', y='importances', kind='barh', figsize=(20, 20))
#Observations: the various levels of race and whether the person previously had cancer are of low significance.
#The most significant features are the rate of change of tumor size and the PSA levels.
#Some of the symptom levels have higher significance than the rest.

#Keep the top 80% of the features, ranked by importance
#(i.e. all features up to chemo therapy, leaving out the insignificant ones)
predictors = features['predictors'].head(round(0.8 * len(features)))
predictors

#To evaluate a model we use k-fold cross validation (5 folds by default) with the accuracy metric
def compute_score(clf, X, y, scoring='accuracy', cv=5):
    """Return the mean cross-validated score of ``clf`` on ``(X, y)``.

    Parameters
    ----------
    clf : estimator
        Estimator passed to ``cross_val_score``.
    X, y :
        Features and target passed through to ``cross_val_score``.
    scoring : str, optional
        Scoring metric name; defaults to 'accuracy'.
    cv : optional
        Cross-validation setting forwarded to ``cross_val_score``;
        defaults to 5 (the previously hard-coded fold count).

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    fold_scores = cross_val_score(clf, X, y, cv=cv, scoring=scoring)
    return np.mean(fold_scores)

#Hyperparameter tuning using grid-search
#For a classifier 'auto' means sqrt(n_features), the same setting as 'sqrt',
#so listing both only evaluated every configuration twice
parameter_grid = {
    'max_depth': [4, 6, 8],
    'n_estimators': [50],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 3, 10],
    'bootstrap': [True, False],
}
forest = RandomForestClassifier()
#5 stratified folds built from the training labels
cross_validation = StratifiedKFold(response, n_folds=5)
grid_search = GridSearchCV(forest,
                           scoring='accuracy',
                           param_grid=parameter_grid,
                           cv=cross_validation)
grid_search.fit(traindata[predictors], response)
#Best parameter combination found by the search
parameters = grid_search.best_params_
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(parameters))

#Train the final RandomForest using the best parameters found by the grid search
#fit() returns the fitted estimator itself, so it can be chained onto the constructor
model = RandomForestClassifier(**parameters).fit(traindata[predictors], response)

#Predict survival_7_years for the test data using the selected predictors
output = model.predict(testdata[predictors]).astype(int)
#testdata comes from a boolean-filtered frame; assign() builds a new frame
#instead of writing into it, which avoids pandas' SettingWithCopyWarning
testdata = testdata.assign(survival_7_years=output)
#Write the test data together with the predictions to a file
testdata.to_csv("C:/Users/Arko/Downloads/Studies/PersonalResearch/EnovaDataChallenge/participant_files/Arkojyoti_score.csv")