# Data Science Datathon  

## A look at the dataset

In [30]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm

np.random.seed(2022) # set the seed to get same results

pharma_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Training_set_begs.csv')
test_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Testing_set_begs.csv')
pharma_data.sample(5) # a look at the dataset

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
11158,6910,5,12144,DX4,66,19.085981,YES,URBAN,Stable,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0
12417,19883,19,2374,DX6,28,27.71671,NO,URBAN,Stable,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,1
16920,9342,2,6577,DX6,45,29.883526,YES,RURAL,Stable,,,,,,,,,1
14069,19222,38,12010,DX6,14,23.379967,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
22289,22643,46,10386,DX6,66,27.741979,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [2]:
pharma_data.nunique(dropna=True) # a look at the unique values

ID_Patient_Care_Situation    23097
Diagnosed_Condition             53
Patient_ID                   10570
Treated_with_drugs              32
Patient_Age                     78
Patient_Body_Mass_Index      10570
Patient_Smoker                   3
Patient_Rural_Urban              2
Patient_mental_condition         1
A                                2
B                                2
C                                2
D                                2
E                                2
F                                2
Z                                2
Number_of_prev_cond              5
Survived_1_year                  2
dtype: int64

Observe that ID_patient_Care_Situation has same number of unique values as number of rows. As it's a variable just to identify the patient's situation, we can ignore this during our modelling.  
Patient_ID is also another variable which can be considered an unnecessary feature so we can drop it as well.  
Finally, Patient_mental_condition has only one value in all the records so we can ignore this feature too as it won't provide any insight.

In [3]:
pharma_data.drop(['ID_Patient_Care_Situation', 'Patient_ID', 'Patient_mental_condition'], axis=1, inplace=True)

We also see that we have a variable Treated_with_drugs which is categorical but if we were to encode it, it may give us way too many codes unnecessarily. When we will treate the null values, we will try ways to make this easy to encode.

In [4]:
pharma_data['Treated_with_drugs'].value_counts()

DX6                     8606
DX5                     1909
DX2                     1904
DX1                     1835
DX3                     1830
DX4                     1792
DX3 DX4                  448
DX1 DX2                  448
DX1 DX3                  424
DX4 DX5                  423
DX2 DX4                  419
DX1 DX4                  408
DX3 DX5                  407
DX1 DX5                  402
DX2 DX5                  400
DX2 DX3                  398
DX1 DX2 DX5              103
DX1 DX3 DX5              101
DX1 DX2 DX4               99
DX3 DX4 DX5               96
DX1 DX2 DX3               95
DX2 DX3 DX5               91
DX1 DX3 DX4               90
DX2 DX3 DX4               87
DX2 DX4 DX5               84
DX1 DX4 DX5               80
DX1 DX3 DX4 DX5           24
DX1 DX2 DX3 DX4           24
DX2 DX3 DX4 DX5           22
DX1 DX2 DX4 DX5           18
DX1 DX2 DX3 DX5           14
DX1 DX2 DX3 DX4 DX5        3
Name: Treated_with_drugs, dtype: int64

To avoid this problem, we will ensure this column only has values DX1,...,DX6 but not combinations. Furthermore, we can create another column that can tell us how many drugs a patient was taking.

## Treating Null values

In [5]:
pharma_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23097 entries, 0 to 23096
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Diagnosed_Condition      23097 non-null  int64  
 1   Treated_with_drugs       23084 non-null  object 
 2   Patient_Age              23097 non-null  int64  
 3   Patient_Body_Mass_Index  23097 non-null  float64
 4   Patient_Smoker           23097 non-null  object 
 5   Patient_Rural_Urban      23097 non-null  object 
 6   A                        21862 non-null  float64
 7   B                        21862 non-null  float64
 8   C                        21862 non-null  float64
 9   D                        21862 non-null  float64
 10  E                        21862 non-null  float64
 11  F                        21862 non-null  float64
 12  Z                        21862 non-null  float64
 13  Number_of_prev_cond      21862 non-null  float64
 14  Survived_1_year       

In [6]:
(pharma_data.isnull().sum()/23097)*100 # Observe that at most 5.35% data has missing values

Diagnosed_Condition        0.000000
Treated_with_drugs         0.056284
Patient_Age                0.000000
Patient_Body_Mass_Index    0.000000
Patient_Smoker             0.000000
Patient_Rural_Urban        0.000000
A                          5.347015
B                          5.347015
C                          5.347015
D                          5.347015
E                          5.347015
F                          5.347015
Z                          5.347015
Number_of_prev_cond        5.347015
Survived_1_year            0.000000
dtype: float64

We observe that a column can has at most 5.35% of its values missing.  
This can be considered a proportion small enough to have those rows dropped.  
We will see how different methods of treating the null values give varying results.

In [7]:
pharma_dropped = pharma_data.dropna() # direct dropping the null values
temp = pharma_dropped['Treated_with_drugs'].str.split() # get the drugs a patient too
temp = temp.apply(lambda x: len(x)) # get the number of drugs a patient took
pharma_dropped.insert(2, 'Number_of_drugs', temp)
# pharma_dropped = pharma_dropped.explode('Treated_with_drugs')

for col in pharma_dropped:
    if pharma_dropped[col].dtype == "object":
        pharma_dropped[col] = pharma_dropped[col].astype('category').cat.codes # encode the categorical values
        pharma_dropped[col] = pharma_dropped[col].astype('int64') # not required but helps in keeping things consistent

X = pharma_dropped.drop(columns=['Survived_1_year'])
y = pharma_dropped['Survived_1_year'].astype('category')
pharma_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21849 entries, 0 to 23096
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Diagnosed_Condition      21849 non-null  int64  
 1   Treated_with_drugs       21849 non-null  int64  
 2   Number_of_drugs          21849 non-null  int64  
 3   Patient_Age              21849 non-null  int64  
 4   Patient_Body_Mass_Index  21849 non-null  float64
 5   Patient_Smoker           21849 non-null  int64  
 6   Patient_Rural_Urban      21849 non-null  int64  
 7   A                        21849 non-null  float64
 8   B                        21849 non-null  float64
 9   C                        21849 non-null  float64
 10  D                        21849 non-null  float64
 11  E                        21849 non-null  float64
 12  F                        21849 non-null  float64
 13  Z                        21849 non-null  float64
 14  Number_of_prev_cond   

## Model Selection  

we will first see if we have any multicollinearity issues as that can cause problems for our model later.

In [8]:
X.corr()

Unnamed: 0,Diagnosed_Condition,Treated_with_drugs,Number_of_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,C,D,E,F,Z,Number_of_prev_cond
Diagnosed_Condition,1.0,0.011496,0.010127,-0.003931,0.003828,-0.001528,0.006965,0.00185,0.003525,-0.003327,-0.008166,0.00707,0.005688,,0.003832
Treated_with_drugs,0.011496,1.0,-0.347876,-0.009652,0.002011,-0.012255,0.002588,-0.012154,-0.009231,-0.003353,-0.005017,-0.000433,0.009481,,-0.009873
Number_of_drugs,0.010127,-0.347876,1.0,-0.002222,-0.010216,0.004456,0.006072,0.005749,0.004215,0.011307,-0.006284,0.007305,-0.004087,,0.011012
Patient_Age,-0.003931,-0.009652,-0.002222,1.0,0.013037,0.287905,-0.006134,-0.019266,-0.012507,-0.011709,0.004365,-0.002238,0.00743,,-0.016708
Patient_Body_Mass_Index,0.003828,0.002011,-0.010216,0.013037,1.0,0.015737,0.011527,0.009652,0.001034,-0.018666,-0.009511,-0.006482,-0.016578,,-0.017549
Patient_Smoker,-0.001528,-0.012255,0.004456,0.287905,0.015737,1.0,-0.002933,-0.006261,0.005943,-0.010178,0.010494,0.000359,-0.004236,,-0.00218
Patient_Rural_Urban,0.006965,0.002588,0.006072,-0.006134,0.011527,-0.002933,1.0,0.001075,-0.003311,0.00806,0.002639,-0.012283,0.005744,,-0.002149
A,0.00185,-0.012154,0.005749,-0.019266,0.009652,-0.006261,0.001075,1.0,-0.064556,-0.090918,-0.076801,-0.164966,-0.054944,,0.16922
B,0.003525,-0.009231,0.004215,-0.012507,0.001034,0.005943,-0.003311,-0.064556,1.0,-0.02129,-0.002122,-0.017999,-0.01607,,0.392672
C,-0.003327,-0.003353,0.011307,-0.011709,-0.018666,-0.010178,0.00806,-0.090918,-0.02129,1.0,0.007443,-0.026529,0.000835,,0.445228


We observe from the correlation matrix that column Z has null values for every column including itself. Upon closer inspection, we observe that Z has only 1 unique values after dropping the null values. Thus, it does not provide any information for our analysis.  

We also observe that Number_of_prev_cond has considerable correlation with other variables.

In [9]:
X.nunique()

Diagnosed_Condition          52
Treated_with_drugs           32
Number_of_drugs               5
Patient_Age                  67
Patient_Body_Mass_Index    9960
Patient_Smoker                2
Patient_Rural_Urban           2
A                             2
B                             2
C                             2
D                             2
E                             2
F                             2
Z                             1
Number_of_prev_cond           5
dtype: int64

In [10]:
X.drop(['Z'], axis=1, inplace=True)

After removing the Z column, we will see if there are any insignificant variables.

In [11]:
results = sm.OLS(y, X).fit() # useful in determining insignificant variables
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:        Survived_1_year   R-squared (uncentered):                   0.646
Model:                            OLS   Adj. R-squared (uncentered):              0.646
Method:                 Least Squares   F-statistic:                              3063.
Date:                Sun, 17 Apr 2022   Prob (F-statistic):                        0.00
Time:                        11:07:57   Log-Likelihood:                         -14518.
No. Observations:               21849   AIC:                                  2.906e+04
Df Residuals:                   21836   BIC:                                  2.917e+04
Df Model:                          13                                                  
Covariance Type:            nonrobust                                                  
                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------

It does seem like from $P>|t|$ column that C, F and Number_of_prev_cond are insignificant variables as their p-values are greater than 0.05.

We have now removed every variable that was either insignificant or could cause multicollinearity issues.

We observe from the notes that we don't have the issue of multicollinearity anymore. Also, now we have Number_of_prev_cond as an insignificant variable and that does make sense as it is basically sum of values in columns A-F and Z.

In [12]:
X.drop(['C', 'Number_of_prev_cond'], axis=1, inplace=True)
results = sm.OLS(y, X).fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:        Survived_1_year   R-squared (uncentered):                   0.646
Model:                            OLS   Adj. R-squared (uncentered):              0.646
Method:                 Least Squares   F-statistic:                              3317.
Date:                Sun, 17 Apr 2022   Prob (F-statistic):                        0.00
Time:                        11:07:57   Log-Likelihood:                         -14519.
No. Observations:               21849   AIC:                                  2.906e+04
Df Residuals:                   21837   BIC:                                  2.916e+04
Df Model:                          12                                                  
Covariance Type:            nonrobust                                                  
                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------

We observe that we now have significant variables only.  

## Logistic Regression  

we will now divide the dataset into training and testing set and run a Logistic Model on it.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75)

In [14]:
logreg = LogisticRegression(max_iter=1000, solver='saga', tol=1e-3)
params = {'penalty':['l1','l2','none','elasticnet'], 'C':[0.25, 0.5, 0.75, 1]}
rcv = RandomizedSearchCV(logreg, params, scoring='f1')
rcv.fit(X_train, y_train)
print(rcv.best_estimator_, rcv.best_score_, rcv.best_params_, sep='\n')

LogisticRegression(C=0.25, max_iter=1000, solver='saga', tol=0.001)
0.7746307160483998
{'penalty': 'l2', 'C': 0.25}


In [15]:
y_pred = rcv.predict(X_test)
print(f1_score(y_test, y_pred, average='binary'))

0.7793401838831802


We observe that we are getting a training score of about 77\% and test score of about 78\% which are really very good scores. We can use these scores as baseline when trying to run different models.  

## Treating null values with imputation

In [19]:
cols= pharma_data.columns
# num_cols = pharma_data.select_dtypes(include=np.number).columns # the continuous variables
# cat_cols = set(cols) - set(num_cols) # the categorical variables
pharma_imputed = pharma_data.copy()
# pharma_imputed.loc[:,num_cols] = pharma_data.fillna(pharma_data.loc[:,num_cols].mode().iloc[0])
# pharma_imputed.loc[:,cat_cols] = pharma_data.fillna(pharma_data.loc[:,cat_cols].mode().iloc[0])
pharma_imputed.loc[:,:] = pharma_data.fillna(pharma_data.loc[:,:].mode().iloc[0])
pharma_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23097 entries, 0 to 23096
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Diagnosed_Condition      23097 non-null  int64  
 1   Treated_with_drugs       23097 non-null  object 
 2   Patient_Age              23097 non-null  int64  
 3   Patient_Body_Mass_Index  23097 non-null  float64
 4   Patient_Smoker           23097 non-null  object 
 5   Patient_Rural_Urban      23097 non-null  object 
 6   A                        23097 non-null  float64
 7   B                        23097 non-null  float64
 8   C                        23097 non-null  float64
 9   D                        23097 non-null  float64
 10  E                        23097 non-null  float64
 11  F                        23097 non-null  float64
 12  Z                        23097 non-null  float64
 13  Number_of_prev_cond      23097 non-null  float64
 14  Survived_1_year       

In [20]:
temp = pharma_imputed['Treated_with_drugs'].str.split() # get the drugs a patient too
temp = temp.apply(lambda x: len(x)) # get the number of drugs a patient took
pharma_imputed.insert(2, 'Number_of_drugs', temp)
# pharma_imputed = pharma_imputed.explode('Treated_with_drugs')

for col in pharma_imputed.columns:
    if pharma_imputed[col].dtype == "object":
        pharma_imputed[col] = pharma_imputed[col].astype('category').cat.codes
        pharma_imputed[col] = pharma_imputed[col].astype('int64')
        
X = pharma_imputed.drop(columns=['Survived_1_year'])
y = pharma_imputed['Survived_1_year'].astype('category')

pharma_imputed.head()

Unnamed: 0,Diagnosed_Condition,Treated_with_drugs,Number_of_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,8,31,1,56,18.479385,2,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,5,16,1,36,22.945566,2,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,23,31,1,48,27.510027,2,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,51,0,1,5,19.130976,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,0,31,1,128,1.3484,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1


In [21]:
X.corr()

Unnamed: 0,Diagnosed_Condition,Treated_with_drugs,Number_of_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,C,D,E,F,Z,Number_of_prev_cond
Diagnosed_Condition,1.0,0.009113,0.012035,-0.008971,0.008305,0.000314,0.010335,0.005431,0.003063,-0.003634,-0.008186,0.006056,0.005311,-0.041702,0.002793
Treated_with_drugs,0.009113,1.0,-0.349431,-0.007248,-0.001935,-0.010973,0.004969,-0.013095,-0.00911,-0.00344,-0.004996,-0.000743,0.009114,0.018332,-0.009766
Number_of_drugs,0.012035,-0.349431,1.0,-0.00469,-0.006925,0.002224,0.007926,0.007227,0.00322,0.009894,-0.006747,0.00526,-0.004483,-0.01176,0.008374
Patient_Age,-0.008971,-0.007248,-0.00469,1.0,-0.00656,0.274867,-0.005514,-0.028025,-0.011582,-0.010707,0.004541,-0.001239,0.007428,0.114038,-0.01467
Patient_Body_Mass_Index,0.008305,-0.001935,-0.006925,-0.00656,1.0,0.025686,0.014156,0.020349,0.000848,-0.01806,-0.009257,-0.006431,-0.016045,-0.136708,-0.016848
Patient_Smoker,0.000314,-0.010973,0.002224,0.274867,0.025686,1.0,-0.006458,-0.001642,0.007021,-0.008295,0.011135,0.002894,-0.00334,-0.068261,0.001
Patient_Rural_Urban,0.010335,0.004969,0.007926,-0.005514,0.014156,-0.006458,1.0,0.002834,-0.003848,0.007015,0.002068,-0.013022,0.005187,-0.01559,-0.003584
A,0.005431,-0.013095,0.007227,-0.028025,0.020349,-0.001642,0.002834,1.0,-0.070336,-0.097536,-0.081067,-0.174069,-0.058407,-0.072557,0.148692
B,0.003063,-0.00911,0.00322,-0.011582,0.000848,0.007021,-0.003848,-0.070336,1.0,-0.010937,0.004339,-0.000686,-0.010889,-0.009135,0.401671
C,-0.003634,-0.00344,0.009894,-0.010707,-0.01806,-0.008295,0.007015,-0.097536,-0.010937,1.0,0.015097,-0.005649,0.00692,-0.010936,0.455918


We again observe that Number_of_prev_cond has storng correlations so we will drop it immediately this time.

In [22]:
X.drop(['Number_of_prev_cond'], axis=1, inplace=True)

In [23]:
results = sm.OLS(y, X).fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:        Survived_1_year   R-squared (uncentered):                   0.641
Model:                            OLS   Adj. R-squared (uncentered):              0.641
Method:                 Least Squares   F-statistic:                              2941.
Date:                Sun, 17 Apr 2022   Prob (F-statistic):                        0.00
Time:                        11:08:36   Log-Likelihood:                         -15654.
No. Observations:               23097   AIC:                                  3.134e+04
Df Residuals:                   23083   BIC:                                  3.145e+04
Df Model:                          14                                                  
Covariance Type:            nonrobust                                                  
                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------

In [24]:
# C and F are highly insignifiacnt
X.drop(['C', 'F'], axis=1, inplace=True)
results = sm.OLS(y, X).fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:        Survived_1_year   R-squared (uncentered):                   0.641
Model:                            OLS   Adj. R-squared (uncentered):              0.640
Method:                 Least Squares   F-statistic:                              3430.
Date:                Sun, 17 Apr 2022   Prob (F-statistic):                        0.00
Time:                        11:08:36   Log-Likelihood:                         -15659.
No. Observations:               23097   AIC:                                  3.134e+04
Df Residuals:                   23085   BIC:                                  3.144e+04
Df Model:                          12                                                  
Covariance Type:            nonrobust                                                  
                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------

We observe that we are still getting some multicollineairty issues. Since Z has some huge standard error relatively, we will consider dropping it.

In [25]:
X.drop(['Z', 'Treated_with_drugs'], axis=1, inplace=True)
results = sm.OLS(y, X).fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:        Survived_1_year   R-squared (uncentered):                   0.640
Model:                            OLS   Adj. R-squared (uncentered):              0.640
Method:                 Least Squares   F-statistic:                              4109.
Date:                Sun, 17 Apr 2022   Prob (F-statistic):                        0.00
Time:                        11:08:36   Log-Likelihood:                         -15672.
No. Observations:               23097   AIC:                                  3.136e+04
Df Residuals:                   23087   BIC:                                  3.144e+04
Df Model:                          10                                                  
Covariance Type:            nonrobust                                                  
                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------

We now don't have any insignificant variables and can proceed to modelling.  

## Logistic with imputed dataset

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75)
logreg3 = LogisticRegression(max_iter=1000, solver='saga', tol=1e-3)
params = {'penalty':['l1','l2','none','elasticnet'], 'C':[0.25, 0.5, 0.75, 1]}
rcv2 = RandomizedSearchCV(logreg3, params, scoring='f1')
rcv2.fit(X_train, y_train)
print(rcv2.best_estimator_, rcv2.best_score_, rcv2.best_params_)
y_pred = rcv2.predict(X_test)
print(f1_score(y_test, y_pred, average='binary'))

LogisticRegression(C=0.25, max_iter=1000, penalty='l1', solver='saga',
                   tol=0.001) 0.78047071509041 {'penalty': 'l1', 'C': 0.25}
0.7825543614096476


Our training score is about 78.7% and test score is about 77.9% which is slightly lower than what we got when we used a dataset with null values dropped. 

## Random Forest Classifier

In [27]:
# we will consider the first datatset we got from dropping null values.

X = pharma_dropped.drop(columns=['Survived_1_year'])
y = pharma_dropped['Survived_1_year'].astype('category')
X.drop(['C','Z','Number_of_prev_cond'], inplace=True, axis=1)
rfc = RandomForestClassifier(criterion='entropy')
params = {'n_estimators':[50, 75, 100, 125, 150], 'max_depth':[5, 10, 15, 20]}
rcv3 = RandomizedSearchCV(rfc, params, random_state=0, scoring='f1')
rcv3.fit(X_train, y_train)
print(rcv3.best_estimator_, rcv3.best_score_, rcv3.best_params_)
y_pred = rcv3.predict(X_test)
print(f1_score(y_test, y_pred, average='binary'))

RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=125) 0.8374131839982185 {'n_estimators': 125, 'max_depth': 10}
0.8392697561598366


We observe that we have got phenomenal results from Random Forest Classifier as both the training and test scores are about 83-84%.  

## Making Predictions

In [28]:
submission = pd.DataFrame()
test_data.info() # No null vallues obsrved

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9303 entries, 0 to 9302
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  9303 non-null   int64  
 1   Diagnosed_Condition        9303 non-null   int64  
 2   Patient_ID                 9303 non-null   int64  
 3   Treated_with_drugs         9303 non-null   object 
 4   Patient_Age                9303 non-null   int64  
 5   Patient_Body_Mass_Index    9303 non-null   float64
 6   Patient_Smoker             9303 non-null   object 
 7   Patient_Rural_Urban        9303 non-null   object 
 8   Patient_mental_condition   9303 non-null   object 
 9   A                          9303 non-null   float64
 10  B                          9303 non-null   float64
 11  C                          9303 non-null   float64
 12  D                          9303 non-null   float64
 13  E                          9303 non-null   float

In [31]:
temp = test_data['Treated_with_drugs'].str.split() # get the drugs a patient too
temp = temp.apply(lambda x: len(x)) # get the number of drugs a patient took
test_data.insert(2, 'Number_of_drugs', temp)
test_data = test_data.explode('Treated_with_drugs')

test_X = test_data.loc[:, X.columns] # setting up the dataset 

for col in test_X.columns:
    if test_X[col].dtype == "object":
        test_X[col] = test_X[col].astype('category').cat.codes
        test_X[col] = test_X[col].astype('int64')
test_X.head(5) # preparing the dataframe before fitting the model and making predictions

Unnamed: 0,Diagnosed_Condition,Treated_with_drugs,Number_of_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,D,E,F
0,40,24,1,16,29.443894,0,0,1.0,0.0,0.0,1.0,0.0
1,52,31,1,24,26.836321,0,1,1.0,1.0,0.0,0.0,0.0
2,50,29,2,63,25.52328,0,0,1.0,0.0,0.0,1.0,0.0
3,32,31,1,42,27.171155,0,1,1.0,0.0,0.0,1.0,0.0
4,20,24,1,50,25.556192,0,0,1.0,0.0,0.0,0.0,0.0


In [32]:
rfc = RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=125)
rfc.fit(X, y) # fit the model
print(f'{rfc.score(X,y):%}')
preds = rfc.predict(test_X) # make the predictions
submission['prediction'] = preds # store them in dataframe

85.212138%


We observed that we got 85% as our training score, which is really a good score.

In [34]:
submission.to_csv('DPhi_Data_Science_predictions.csv', index=False)
temp = pd.read_csv('DPhi_Data_Science_predictions.csv')