In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# to ignore warnings
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder    # for converting categorical to numerical

from sklearn.metrics import f1_score    # for model evaluation



data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Training_set_begs.csv')

In [2]:
#Looking at the dataset
data.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,22374,8,3333,DX6,56,18.479385,YES,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,18164,5,5740,DX2,36,22.945566,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,6283,23,10446,DX6,48,27.510027,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,5339,51,12011,DX1,5,19.130976,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,33012,0,12513,,128,1.3484,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23097 entries, 0 to 23096
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  23097 non-null  int64  
 1   Diagnosed_Condition        23097 non-null  int64  
 2   Patient_ID                 23097 non-null  int64  
 3   Treated_with_drugs         23084 non-null  object 
 4   Patient_Age                23097 non-null  int64  
 5   Patient_Body_Mass_Index    23097 non-null  float64
 6   Patient_Smoker             23097 non-null  object 
 7   Patient_Rural_Urban        23097 non-null  object 
 8   Patient_mental_condition   23097 non-null  object 
 9   A                          21862 non-null  float64
 10  B                          21862 non-null  float64
 11  C                          21862 non-null  float64
 12  D                          21862 non-null  float64
 13  E                          21862 non-null  flo

In [4]:
data.describe()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
count,23097.0,23097.0,23097.0,23097.0,23097.0,21862.0,21862.0,21862.0,21862.0,21862.0,21862.0,21862.0,21862.0,23097.0
mean,16545.712041,26.413127,6261.280772,33.209768,23.45482,0.897905,0.136355,0.18507,0.083615,0.393239,0.0537,0.000595,1.75048,0.632247
std,9532.263503,15.030865,3595.99062,19.549882,3.807661,0.30278,0.343173,0.388363,0.276817,0.48848,0.225431,0.024379,0.770311,0.482204
min,2.0,0.0,1.0,0.0,1.0893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,8280.0,13.0,3181.0,16.0,20.20555,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,16597.0,26.0,6242.0,33.0,23.386199,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
75%,24825.0,39.0,9363.0,50.0,26.788154,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0
max,33014.0,52.0,12515.0,149.0,29.999579,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0


In [5]:
#Checking percentage of missing values in the dataset
missing = data.isna().sum()
missing

ID_Patient_Care_Situation       0
Diagnosed_Condition             0
Patient_ID                      0
Treated_with_drugs             13
Patient_Age                     0
Patient_Body_Mass_Index         0
Patient_Smoker                  0
Patient_Rural_Urban             0
Patient_mental_condition        0
A                            1235
B                            1235
C                            1235
D                            1235
E                            1235
F                            1235
Z                            1235
Number_of_prev_cond          1235
Survived_1_year                 0
dtype: int64

In [6]:
percent = (missing.sum() / len(data)) * 100
percent

42.8324024765121

# Removing Outliers and Filling Missing Values in the dataset with mode values

In [7]:
#Filtering data by removing outliers. In this case removing people of age above 100
new_data=data[data["Patient_Age"] < 100]
len(new_data[data["Patient_Age"] > 100])

0

In [8]:
#Filling missing values with mode
new_data['Treated_with_drugs']=new_data['Treated_with_drugs'].fillna(new_data['Treated_with_drugs'].mode()[0])
new_data['Number_of_prev_cond']=new_data['Number_of_prev_cond'].fillna(new_data['Number_of_prev_cond'].mode()[0])
new_data['A'].fillna(new_data['A'].mode()[0], inplace=True)
new_data['B'].fillna(new_data['B'].mode()[0], inplace=True)
new_data['C'].fillna(new_data['C'].mode()[0], inplace=True)
new_data['D'].fillna(new_data['D'].mode()[0], inplace=True)
new_data['E'].fillna(new_data['E'].mode()[0], inplace=True)
new_data['F'].fillna(new_data['F'].mode()[0], inplace=True)
new_data['Z'].fillna(new_data['Z'].mode()[0], inplace=True)


new_data.isna().sum()

ID_Patient_Care_Situation    0
Diagnosed_Condition          0
Patient_ID                   0
Treated_with_drugs           0
Patient_Age                  0
Patient_Body_Mass_Index      0
Patient_Smoker               0
Patient_Rural_Urban          0
Patient_mental_condition     0
A                            0
B                            0
C                            0
D                            0
E                            0
F                            0
Z                            0
Number_of_prev_cond          0
Survived_1_year              0
dtype: int64

# Dropping colums which are not of use in the study as of now

In [9]:
#Droping Mental condition as all patients in the database are stable
new_data.drop('Patient_mental_condition', axis = 1, inplace=True)

#Looks like ID_patient_care can be duplicated so removing in from dataset
new_data.drop(['ID_Patient_Care_Situation'], axis =1, inplace=True)
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23084 entries, 0 to 23096
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Diagnosed_Condition      23084 non-null  int64  
 1   Patient_ID               23084 non-null  int64  
 2   Treated_with_drugs       23084 non-null  object 
 3   Patient_Age              23084 non-null  int64  
 4   Patient_Body_Mass_Index  23084 non-null  float64
 5   Patient_Smoker           23084 non-null  object 
 6   Patient_Rural_Urban      23084 non-null  object 
 7   A                        23084 non-null  float64
 8   B                        23084 non-null  float64
 9   C                        23084 non-null  float64
 10  D                        23084 non-null  float64
 11  E                        23084 non-null  float64
 12  F                        23084 non-null  float64
 13  Z                        23084 non-null  float64
 14  Number_of_prev_cond   

# Creating dummies for "Treated_with_drugs"

In [10]:
new_data.Treated_with_drugs.value_counts()

DX6                     8606
DX5                     1909
DX2                     1904
DX1                     1835
DX3                     1830
DX4                     1792
DX1 DX2                  448
DX3 DX4                  448
DX1 DX3                  424
DX4 DX5                  423
DX2 DX4                  419
DX1 DX4                  408
DX3 DX5                  407
DX1 DX5                  402
DX2 DX5                  400
DX2 DX3                  398
DX1 DX2 DX5              103
DX1 DX3 DX5              101
DX1 DX2 DX4               99
DX3 DX4 DX5               96
DX1 DX2 DX3               95
DX2 DX3 DX5               91
DX1 DX3 DX4               90
DX2 DX3 DX4               87
DX2 DX4 DX5               84
DX1 DX4 DX5               80
DX1 DX3 DX4 DX5           24
DX1 DX2 DX3 DX4           24
DX2 DX3 DX4 DX5           22
DX1 DX2 DX4 DX5           18
DX1 DX2 DX3 DX5           14
DX1 DX2 DX3 DX4 DX5        3
Name: Treated_with_drugs, dtype: int64

In [11]:
# spliting all the entries separated by space and creating a dummy variable
drugs = new_data['Treated_with_drugs'].str.get_dummies(sep=' ')
drugs.head()

Unnamed: 0,DX1,DX2,DX3,DX4,DX5,DX6
0,0,0,0,0,0,1
1,0,1,0,0,0,0
2,0,0,0,0,0,1
3,1,0,0,0,0,0
5,0,0,0,0,0,1


In [12]:
# Concat two data sets
new_data= pd.concat([new_data, drugs], axis=1)

#Droppping Treated_with_drugs column
new_data = new_data.drop("Treated_with_drugs", axis=1)

new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23084 entries, 0 to 23096
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Diagnosed_Condition      23084 non-null  int64  
 1   Patient_ID               23084 non-null  int64  
 2   Patient_Age              23084 non-null  int64  
 3   Patient_Body_Mass_Index  23084 non-null  float64
 4   Patient_Smoker           23084 non-null  object 
 5   Patient_Rural_Urban      23084 non-null  object 
 6   A                        23084 non-null  float64
 7   B                        23084 non-null  float64
 8   C                        23084 non-null  float64
 9   D                        23084 non-null  float64
 10  E                        23084 non-null  float64
 11  F                        23084 non-null  float64
 12  Z                        23084 non-null  float64
 13  Number_of_prev_cond      23084 non-null  float64
 14  Survived_1_year       

Seperating values for 'Patient_Smoker', 'Patient_Rural_Urban'

In [13]:
new_data.Patient_Smoker.value_counts()

NO     13246
YES     9838
Name: Patient_Smoker, dtype: int64

In [14]:
new_data = pd.get_dummies(new_data, columns=['Patient_Smoker', 'Patient_Rural_Urban'])
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23084 entries, 0 to 23096
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Diagnosed_Condition        23084 non-null  int64  
 1   Patient_ID                 23084 non-null  int64  
 2   Patient_Age                23084 non-null  int64  
 3   Patient_Body_Mass_Index    23084 non-null  float64
 4   A                          23084 non-null  float64
 5   B                          23084 non-null  float64
 6   C                          23084 non-null  float64
 7   D                          23084 non-null  float64
 8   E                          23084 non-null  float64
 9   F                          23084 non-null  float64
 10  Z                          23084 non-null  float64
 11  Number_of_prev_cond        23084 non-null  float64
 12  Survived_1_year            23084 non-null  int64  
 13  DX1                        23084 non-null  int

In [15]:
#Taking out the output varables
y = new_data['Survived_1_year']
y.head()

new_data.drop('Survived_1_year', axis = 1, inplace=True)

In [16]:
new_data.head()

Unnamed: 0,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,A,B,C,D,E,F,...,DX1,DX2,DX3,DX4,DX5,DX6,Patient_Smoker_NO,Patient_Smoker_YES,Patient_Rural_Urban_RURAL,Patient_Rural_Urban_URBAN
0,8,3333,56,18.479385,1.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,1,0,1,0,1
1,5,5740,36,22.945566,1.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,1,1,0
2,23,10446,48,27.510027,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,1,0
3,51,12011,5,19.130976,1.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
5,45,7977,47,26.15512,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,1


In [17]:
#Splitting the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(new_data, y, test_size=0.2, random_state=1, stratify=y)

# Applying MACHINE LEARNING Algorithms


# 1. LOGISTIC REGRESSION

In [18]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [19]:
f1_score(y_test, model.predict(X_test))

0.7967244701348748

# 2. Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_selection import SelectFromModel

Rforest = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
Rforest.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, n_estimators=1000, random_state=1)

In [21]:
f1_score(y_test, Rforest.predict(X_test))

0.8309636650868878

Applying Boruta Technique on Random Forest to get the most important columns

In [22]:
!pip install Boruta



In [23]:
from boruta import BorutaPy

boruta_selector = BorutaPy(Rforest, n_estimators='auto', verbose=2, random_state=1)
boruta_selector.fit(np.array(X_train), np.array(y_train))

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	22
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	22
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	22
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	22
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	22
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	22
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	22
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	16
Tentative: 	3
Rejected: 	3
Iteration: 	9 / 100
Confirmed: 	16
Tentative: 	3
Rejected: 	3
Iteration: 	10 / 100
Confirmed: 	16
Tentative: 	3
Rejected: 	3
Iteration: 	11 / 100
Confirmed: 	16
Tentative: 	3
Rejected: 	3
Iteration: 	12 / 100
Confirmed: 	16
Tentative: 	3
Rejected: 	3
Iteration: 	13 / 100
Confirmed: 	16
Tentative: 	3
Rejected: 	3
Iteration: 	14 / 100
Confirmed: 	16
Tentative: 	3
Rejected: 	3
Iteration: 	15 / 100
Confirmed: 	16
Tentative: 	3
Rejected: 	3
Iteration: 	16 / 100
Confirmed: 	16
Tentative: 	3
Rejected: 	3
I

BorutaPy(estimator=RandomForestClassifier(max_depth=5, n_estimators=123,
                                          random_state=RandomState(MT19937) at 0x7FBE8015EB40),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x7FBE8015EB40, verbose=2)

In [25]:
boruta_selector.support_

array([ True, False,  True,  True,  True,  True, False,  True, False,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [28]:

x = new_data.iloc[:, 0:22]
df1= pd.DataFrame({"Column_names" : x.columns, "Imp": list(boruta_selector.support_)})
df1.sort_values(["Imp"], ascending=False)


Unnamed: 0,Column_names,Imp
0,Diagnosed_Condition,True
12,DX1,True
20,Patient_Rural_Urban_RURAL,True
19,Patient_Smoker_YES,True
18,Patient_Smoker_NO,True
17,DX6,True
16,DX5,True
15,DX4,True
14,DX3,True
13,DX2,True


In [29]:
print("Selected Features: ", boruta_selector.support_)    # check selected features
 

print("Ranking: ",boruta_selector.ranking_)               # check ranking of features

print("No. of significant features: ", boruta_selector.n_features_)

Selected Features:  [ True False  True  True  True  True False  True False False False  True
  True  True  True  True  True  True  True  True  True  True]
Ranking:  [1 2 1 1 1 1 3 1 4 5 6 1 1 1 1 1 1 1 1 1 1 1]
No. of significant features:  17


After applying Boruta technique we can see that there are 17 relevant features.

# Now creating a model using Boruta selected feature

In [41]:
X_important_train= boruta_selector.transform(np.array(X_train))
X_important_test= boruta_selector.transform(np.array(X_test))

In [42]:
rf_important=RandomForestClassifier(random_state=1, n_estimators=1000, n_jobs=-1)

rf_important.fit(X_important_train, y_train)


RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=1)

Predicting F1 score of the new model with Randome forest using Boruta Selector

In [43]:
f1_score(y_test, rf_important.predict(X_important_test))

0.861336032388664

# Let's apply Hyper Parameter Tuning model to improve the accurcy of F1 score

In [46]:
from sklearn.model_selection import GridSearchCV
grid= {'bootstrap': [True, False],
      'max_depth': [5, 10, 15],
      'n_estimators': [500, 1000]}

In [49]:
random = RandomForestClassifier(random_state=1)

grid_search = GridSearchCV(estimator= random, param_grid = grid, cv=2, n_jobs=-1, verbose=2)

In [50]:
grid_search.fit(X_important_train, y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.5min finished


GridSearchCV(cv=2, estimator=RandomForestClassifier(random_state=1), n_jobs=-1,
             param_grid={'bootstrap': [True, False], 'max_depth': [5, 10, 15],
                         'n_estimators': [500, 1000]},
             verbose=2)

Evaluating the model

In [56]:
f1_score(y_test, grid_search.predict(X_important_test))

0.864185110663984

# Comparing Model's Improvance, Conclusion

# We can see that by applying various ML algorithms we increased the F1 score of the model from 79.67% to 86.41%


1.	 Logistic Regression: 79.67%

2.	 Random Forest Classifier: 83.09%

3.	 Random Forest with Boruta Selector: 86.13%

4.	 Hyperparameter Tunning | Random Forest | Boruta: 86.41%