## Importing packages

In [1]:
import pandas as pd
import numpy as np


## Reading the data

In [2]:
pharma_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Training_set_begs.csv')

In [9]:

pharma_data.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,22374,8,3333,DX6,56,18.479385,YES,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,18164,5,5740,DX2,36,22.945566,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,6283,23,10446,DX6,48,27.510027,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,5339,51,12011,DX1,5,19.130976,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,33012,0,12513,,128,1.3484,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1


### Filling NAs  with median for numerical variables and mode for categorical variables

In [6]:
for column in ['A', 'B', 'C', 'D', 'E','F','Z','Number_of_prev_cond']:
    pharma_data[column].fillna(pharma_data[column].median(), inplace=True)

In [7]:
pharma_data['Treated_with_drugs'].fillna(pharma_data['Treated_with_drugs'].mode()[0], inplace=True)

### Hot encoding categorical variables

In [8]:
for column in ['Treated_with_drugs', 'Patient_Smoker', 'Patient_Rural_Urban', 'Patient_mental_condition']:
     pharma_data = pd.get_dummies(pharma_data, columns=[column])

In [22]:
pharma_data.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,A,B,C,D,E,...,Treated_with_drugs_DX4,Treated_with_drugs_DX4 DX5,Treated_with_drugs_DX5,Treated_with_drugs_DX6,Patient_Smoker_Cannot say,Patient_Smoker_NO,Patient_Smoker_YES,Patient_Rural_Urban_RURAL,Patient_Rural_Urban_URBAN,Patient_mental_condition_Stable
0,22374,8,3333,56,18.479385,1.0,0.0,0.0,0.0,1.0,...,0,0,0,1,0,0,1,0,1,1
1,18164,5,5740,36,22.945566,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,1,0,1
2,6283,23,10446,48,27.510027,1.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,1,1,0,1
3,5339,51,12011,5,19.130976,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,1,1
4,33012,0,12513,128,1.3484,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,1,0,0,1,0,1


In [9]:
pharma_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23097 entries, 0 to 23096
Data columns (total 52 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   ID_Patient_Care_Situation                23097 non-null  int64  
 1   Diagnosed_Condition                      23097 non-null  int64  
 2   Patient_ID                               23097 non-null  int64  
 3   Patient_Age                              23097 non-null  int64  
 4   Patient_Body_Mass_Index                  23097 non-null  float64
 5   A                                        23097 non-null  float64
 6   B                                        23097 non-null  float64
 7   C                                        23097 non-null  float64
 8   D                                        23097 non-null  float64
 9   E                                        23097 non-null  float64
 10  F                                        23097

## Dropping columns that won't be required for modelling and setting my target variable as y

In [10]:

x= pharma_data.drop(['ID_Patient_Care_Situation','Patient_ID','Survived_1_year'], axis = 1)
y = pharma_data['Survived_1_year']

## Splitting test and train data

In [11]:

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

## Decision Tree 

In [None]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

In [28]:

from sklearn.metrics import accuracy_score
print('Training accuracy...', accuracy_score(y_train, model.predict(x_train)))
print('Test accuracy', accuracy_score(y_test, model.predict(x_test)))

Training accuracy... 0.999547628279695
Test accuracy 0.7625606716515807


## Improving the decision tree model by optimizing the depth of the tree

In [29]:

model_improved = DecisionTreeClassifier(max_depth = 3)
model_improved.fit(x_train, y_train)
print('train score...' , accuracy_score(y_train, model_improved.predict(x_train)))
print('test score...', accuracy_score(y_test, model_improved.predict(x_test)))
y_pred =model_improved.predict(x_test)

train score... 0.7494506914824868
test score... 0.7510166601075692


In [29]:
from sklearn.metrics import f1_score
print("F1 Score: ",f1_score(y_test, y_pred))

F1 Score:  0.8174297806848788


In [21]:
test_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Testing_set_begs.csv')

In [23]:
for column in ['Treated_with_drugs', 'Patient_Smoker', 'Patient_Rural_Urban', 'Patient_mental_condition']:
     test_data = pd.get_dummies(test_data, columns=[column])

In [32]:
test_data['Patient_Smoker_Cannot say'] = 0

In [35]:
test_data= test_data.drop(['ID_Patient_Care_Situation','Patient_ID'], axis = 1)

In [36]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9303 entries, 0 to 9302
Data columns (total 49 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Diagnosed_Condition                      9303 non-null   int64  
 1   Patient_Age                              9303 non-null   int64  
 2   Patient_Body_Mass_Index                  9303 non-null   float64
 3   A                                        9303 non-null   float64
 4   B                                        9303 non-null   float64
 5   C                                        9303 non-null   float64
 6   D                                        9303 non-null   float64
 7   E                                        9303 non-null   float64
 8   F                                        9303 non-null   float64
 9   Z                                        9303 non-null   float64
 10  Number_of_prev_cond                      9303 no

In [37]:
y_pred_2 =model_improved.predict(test_data)

In [101]:
res = pd.DataFrame(y_pred_2)
res.index = test.index # its important for comparison
res.columns = ["prediction"]
res.to_csv("y_pred.csv")

## Logistic Regression

In [44]:
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()
logreg.fit(x_train,y_train)
y_pred_log=logreg.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Logistic Regression evaluation using F1 Score

In [45]:
from sklearn.metrics import f1_score
print("F1 Score: ",f1_score(y_test, y_pred_log))

F1 Score:  0.8008523827973654


## Random Forest Classifier

In [41]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

## Random Forest evaluation using F1 score

In [42]:

rfc.fit(x_train, y_train)
predictions = rfc.predict(x_test)
print("F1 Score: ",f1_score(y_test,predictions ))

F1 Score:  0.8560073372057474


#### from the evaluation metric results of the ML algorithms used,  Random forest classifier proved to be the algorithm with the highest F1 score of 85%  to predict patients likelihood of survival after one year of treatment 

## Using Random Forest model to predict test data and export a csv file with the results

In [43]:
predictions =rfc.predict(test_data)

In [109]:
res = pd.DataFrame(predictions)
res.index = test.index # its important for comparison
res.columns = ["prediction"]
res.to_csv("y_pred.csv")

##### Project Overview 
A hospital in the province of Greenland has been trying to improve its care conditions by looking at historic survival of the patients. They tried looking at their data but could not identify the main factors leading to high survivals.Develop a model that will predict the chances of survival of a patient after 1 year of treatment (Survived_1_year).