In [23]:
import numpy as np
import pandas as pd
import os

# Seed NumPy's global random number generator so repeated runs start from the same state.
np.random.seed(10)
# Show the current working directory; the CSV files in this notebook are read via relative paths.
os.getcwd()

'C:\\Users\\adity\\Downloads\\ML'

In [36]:
# Load the training set; a literal '?' in any field is parsed as a missing value (NaN).
data = pd.read_csv('training_data.csv', na_values=['?'])
# Preview the first five rows.
data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,N
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,N
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,N
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,N
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,N


In [7]:
# Summarise column dtypes and non-null counts of the training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81414 entries, 0 to 81413
Data columns (total 50 columns):
encounter_id                81414 non-null int64
patient_nbr                 81414 non-null int64
race                        79601 non-null object
gender                      81414 non-null object
age                         81414 non-null object
weight                      2570 non-null object
admission_type_id           81414 non-null int64
discharge_disposition_id    81414 non-null int64
admission_source_id         81414 non-null int64
time_in_hospital            81414 non-null int64
payer_code                  49183 non-null object
medical_specialty           41479 non-null object
num_lab_procedures          81414 non-null int64
num_procedures              81414 non-null int64
num_medications             81414 non-null int64
number_outpatient           81414 non-null int64
number_emergency            81414 non-null int64
number_inpatient            81414 non-null int64
diag

In [8]:
# Share of missing entries per column, expressed as a percentage of all rows.
row_count = len(data)
missing_per_column = data.isnull().sum()
percentage_missing = (missing_per_column * 100) / row_count
print(percentage_missing)

encounter_id                 0.000000
patient_nbr                  0.000000
race                         2.226890
gender                       0.000000
age                          0.000000
weight                      96.843295
admission_type_id            0.000000
discharge_disposition_id     0.000000
admission_source_id          0.000000
time_in_hospital             0.000000
payer_code                  39.589014
medical_specialty           49.051760
num_lab_procedures           0.000000
num_procedures               0.000000
num_medications              0.000000
number_outpatient            0.000000
number_emergency             0.000000
number_inpatient             0.000000
diag_1                       0.022109
diag_2                       0.353748
diag_3                       1.381826
number_diagnoses             0.000000
max_glu_serum                0.000000
A1Cresult                    0.000000
metformin                    0.000000
repaglinide                  0.000000
nateglinide 

### Because of the high proportion of missing values, the columns weight, payer_code and medical_specialty are removed. Since the dataset has an adequate number of observations, the remaining missing values need not be imputed; the affected observations can simply be deleted. 
### Except for insulin, all medication features are removed as they show little to no variation. After reading through research papers, it was identified that the primary diagnosis is a sufficient indicator of diabetes, and hence the secondary/additional diagnoses can be removed.

In [37]:
# Columns excluded from modelling, grouped by the reason given in the notes above.
mostly_missing_columns = ['weight', 'payer_code', 'medical_specialty']
low_variation_medication_columns = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
    'examide', 'citoglipton', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone',
]
secondary_diagnosis_columns = ['diag_2', 'diag_3']
# Identifier-like columns (by name) that are not used as features.
identifier_columns = ['encounter_id', 'patient_nbr']

columns_to_drop = (mostly_missing_columns + low_variation_medication_columns
                   + secondary_diagnosis_columns + identifier_columns)
data = data.drop(columns_to_drop, axis=1)

In [38]:
# Keep only rows where both race and diag_1 are present, discard the records
# whose gender is 'Unknown/Invalid', then renumber the remaining rows from 0.
rows_with_required_fields = data.dropna(subset=["race", "diag_1"])
has_valid_gender = rows_with_required_fields["gender"] != "Unknown/Invalid"
data = rows_with_required_fields[has_valid_gender].reset_index(drop=True)

### According to a research paper, primary diagnosis (diag_1) values that start with 250 are indicators of diabetes

In [39]:
# Binary target: the label 'N' becomes 0, every other label becomes 1.
# The comparison is done on the column itself, so the result always carries
# data's own index; building a fresh pd.Series without index= only lines up
# when the frame happens to have a 0..n-1 index.
data["readmitted"] = (data["readmitted"] != 'N').astype('int64')

# Primary diagnosis flag: 1 when the diag_1 value (as text) starts with '250', else 0.
data['diag_1'] = data['diag_1'].astype(str).str.startswith('250').astype('int64')

### Young people are expected to be much less prone to diabetes, and the data also indicates that most people are in the 60-80 age groups. So it made sense to recategorize the age groups.

In [40]:
# Collapse the five youngest bands into '[0-50)' and the two oldest into
# '[80-100)'; any value not listed in the table is left unchanged.
age_regrouping = {
    '[0-10)': '[0-50)',
    '[10-20)': '[0-50)',
    '[20-30)': '[0-50)',
    '[30-40)': '[0-50)',
    '[40-50)': '[0-50)',
    '[80-90)': '[80-100)',
    '[90-100)': '[80-100)',
}
data['age'] = data['age'].map(lambda band: age_regrouping.get(band, band))

### Discharge disposition, admission source and admission type are each converted into 2 levels for ease of analysis. I do not expect them to have much impact on the readmission rate

In [41]:
# Two-level recode: code 1 is labelled 'Home', every other code 'Other'.
discharged_home = data['discharge_disposition_id'] == 1
data['discharge_disposition_id'] = discharged_home.map({True: 'Home', False: 'Other'})

In [42]:
# Two-level recode: code 7 is labelled 'ER', every other code 'Other'.
admitted_via_er = data['admission_source_id'] == 7
data['admission_source_id'] = admitted_via_er.map({True: 'ER', False: 'Other'})

In [43]:
# Two-level recode: code 1 is labelled 'Emergency', every other code 'Other'.
is_emergency_admission = data['admission_type_id'] == 1
data['admission_type_id'] = is_emergency_admission.map({True: 'Emergency', False: 'Other'})

### Getting dummy values for all nominal variables in the dataset

In [44]:
# Nominal columns to expand, listed in the order their indicator columns are
# appended to the frame.
nominal_columns = ['age', 'race', 'gender', 'max_glu_serum', 'A1Cresult', 'insulin', 'change',
                   'diabetesMed', 'discharge_disposition_id', 'admission_source_id',
                   'admission_type_id']

# One block of indicator columns per nominal column.
# NOTE(review): no prefix is passed, so categories that share a label across
# different columns (e.g. the 'Other' level created for each of the three
# recoded id columns) end up as identically named columns.
indicator_blocks = [pd.get_dummies(data[column]) for column in nominal_columns]

# Append the indicators on the right, then remove the original nominal columns.
data = pd.concat([data] + indicator_blocks, axis=1)
data = data.drop(nominal_columns, axis=1)

In [45]:
# Feature matrix: every column except the target.
x = data.drop(["readmitted"], axis=1)
# Target vector: the 0/1 readmitted label built earlier in the notebook.
y = data["readmitted"]

In [46]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

### Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression

# Candidate values for C and for the solver's iteration cap.
param_grid = dict(
    C=[0.05, 0.1, 0.15, 0.20, 0.25, 0.5],
    max_iter=[100, 500, 1000, 5000],
)

# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy.
log = LogisticRegression()
grid_search1 = GridSearchCV(estimator=log, param_grid=param_grid,
                            scoring='accuracy', cv=5, n_jobs=1)
grid_search1.fit(x, y)

print("Best Parameters : {}".format(grid_search1.best_params_))
print("Best cross_val score : {}".format(grid_search1.best_score_))

Best Parameters : {'C': 0.05, 'max_iter': 100}
Best cross_val score : 0.8876909931644552


In [48]:
# Refit on the whole training set with the settings the grid search reported as best.
logreg = LogisticRegression(max_iter=100, C=0.05)
logreg.fit(x, y)

# Accuracy measured on the same rows the model was fitted on.
logreg_train_accuracy = logreg.score(x, y)
print('Accuracy score (training): {:.3f}'.format(logreg_train_accuracy))

Accuracy score (training): 0.888


### Random Forest

In [49]:
from sklearn.ensemble import RandomForestClassifier

# Tree depths 1..10 plus 15 and 20; 1..10 features considered per split.
param_grid = dict(
    max_depth=list(range(1, 11)) + [15, 20],
    max_features=list(range(1, 11)),
)

# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy.
ranf = RandomForestClassifier(random_state=100)
grid_search3 = GridSearchCV(estimator=ranf, param_grid=param_grid,
                            scoring='accuracy', cv=5, n_jobs=1)
grid_search3.fit(x, y)

print("Best Parameters : {}".format(grid_search3.best_params_))
print("Best cross_val score : {}".format(grid_search3.best_score_))

Best Parameters : {'max_depth': 10, 'max_features': 8}
Best cross_val score : 0.8879171692802573


In [50]:
# Refit on the whole training set with the settings the grid search reported as best.
ranfor = RandomForestClassifier(max_depth=10, max_features=8, random_state=100)
ranfor.fit(x, y)

# Accuracy measured on the same rows the model was fitted on.
ranfor_train_accuracy = ranfor.score(x, y)
print('Accuracy score (training): {:.3f}'.format(ranfor_train_accuracy))

Accuracy score (training): 0.890


### Bagging and Boosting can improve model performance

In [52]:
from sklearn.ensemble import GradientBoostingClassifier

# Tree depths 1..10 crossed with four learning rates.
param_grid = dict(
    max_depth=list(range(1, 11)),
    learning_rate=[0.1, 0.05, 0.01, 0.001],
)

# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy.
# NOTE: the saved output of this cell is a KeyboardInterrupt, i.e. the search
# was stopped before it finished.
boost = GradientBoostingClassifier()
grid_search_boost = GridSearchCV(estimator=boost, param_grid=param_grid,
                                 scoring='accuracy', cv=5, n_jobs=1)
grid_search_boost.fit(x, y)

print("Best Parameters : {}".format(grid_search_boost.best_params_))
print("Best cross_val score : {}".format(grid_search_boost.best_score_))

KeyboardInterrupt: 

In [53]:
# Single gradient-boosting fit with fixed settings; the grid search above was
# interrupted, so these values do not come from a completed search.
boost1 = GradientBoostingClassifier(max_depth=1, learning_rate=0.01)
boost1.fit(x, y)

# Accuracy measured on the same rows the model was fitted on.
boost_train_accuracy = boost1.score(x, y)
print('Accuracy score (training): {:.3f}'.format(boost_train_accuracy))

Accuracy score (training): 0.888


In [54]:
from sklearn.ensemble import BaggingClassifier

# Rows drawn per estimator crossed with the number of estimators in the ensemble.
param_grid = dict(
    max_samples=[10, 50, 100],
    n_estimators=[1, 10, 50, 100, 500],
)

# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy.
# NOTE: the saved output of this cell is a KeyboardInterrupt, i.e. the search
# was stopped before it finished.
bagging = BaggingClassifier()
grid_search_bagging = GridSearchCV(estimator=bagging, param_grid=param_grid,
                                   scoring='accuracy', cv=5, n_jobs=1)
grid_search_bagging.fit(x, y)

print("Best Parameters : {}".format(grid_search_bagging.best_params_))
print("Best cross_val score : {}".format(grid_search_bagging.best_score_))

KeyboardInterrupt: 

In [55]:
# Bagged ensemble built around the logistic regression fitted earlier:
# 100 estimators, each trained on 100 sampled rows, with a fixed random state.
bagging_options = dict(max_samples=100, n_estimators=100, random_state=10)
bag_logreg = BaggingClassifier(logreg, **bagging_options)
bag_logreg.fit(x, y)

# Accuracy measured on the same rows the ensemble was fitted on.
bagging_train_accuracy = bag_logreg.score(x, y)
print('Accuracy score of bagging classifier on training set: {:.2f}'.format(bagging_train_accuracy))

Accuracy score of bagging classifier on training set: 0.89


### Importing test data set

In [56]:
# Load the test set with the same missing-value convention as the training set ('?' -> NaN).
test = pd.read_csv('test_data.csv', na_values=['?'])
# Preview the first five rows.
test.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,15738,63555939,Caucasian,Female,[90-100),,3,3,4,12,...,No,No,Steady,No,No,No,No,No,Ch,Yes
1,62256,49726791,AfricanAmerican,Female,[60-70),,3,1,2,1,...,No,No,Steady,No,No,No,No,No,No,Yes
2,150006,22864131,,Female,[50-60),,2,1,4,2,...,No,No,Down,No,No,No,No,No,Ch,Yes
3,183930,107400762,Caucasian,Female,[80-90),,2,6,1,11,...,No,No,No,No,No,No,No,No,No,No
4,248916,115196778,Caucasian,Female,[50-60),,1,1,1,2,...,No,No,Steady,No,No,No,No,No,No,Yes


In [57]:
# Summarise column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20352 entries, 0 to 20351
Data columns (total 49 columns):
encounter_id                20352 non-null int64
patient_nbr                 20352 non-null int64
race                        19892 non-null object
gender                      20352 non-null object
age                         20352 non-null object
weight                      627 non-null object
admission_type_id           20352 non-null int64
discharge_disposition_id    20352 non-null int64
admission_source_id         20352 non-null int64
time_in_hospital            20352 non-null int64
payer_code                  12327 non-null object
medical_specialty           10338 non-null object
num_lab_procedures          20352 non-null int64
num_procedures              20352 non-null int64
num_medications             20352 non-null int64
number_outpatient           20352 non-null int64
number_emergency            20352 non-null int64
number_inpatient            20352 non-null int64
diag_

In [58]:
# Share of missing entries per test column, expressed as a percentage of all rows.
test_row_count = len(test)
test_missing_per_column = test.isnull().sum()
percentage_missing_test = (test_missing_per_column * 100) / test_row_count
print(percentage_missing_test)

encounter_id                 0.000000
patient_nbr                  0.000000
race                         2.260220
gender                       0.000000
age                          0.000000
weight                      96.919222
admission_type_id            0.000000
discharge_disposition_id     0.000000
admission_source_id          0.000000
time_in_hospital             0.000000
payer_code                  39.431014
medical_specialty           49.204009
num_lab_procedures           0.000000
num_procedures               0.000000
num_medications              0.000000
number_outpatient            0.000000
number_emergency             0.000000
number_inpatient             0.000000
diag_1                       0.014741
diag_2                       0.343947
diag_3                       1.464230
number_diagnoses             0.000000
max_glu_serum                0.000000
A1Cresult                    0.000000
metformin                    0.000000
repaglinide                  0.000000
nateglinide 

In [59]:
# Remove the same columns from the test frame that were removed from the training frame.
mostly_missing_columns = ['weight', 'payer_code', 'medical_specialty']
low_variation_medication_columns = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
    'examide', 'citoglipton', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone',
]
secondary_diagnosis_columns = ['diag_2', 'diag_3']
# Identifier-like columns (by name) that are not used as features.
identifier_columns = ['encounter_id', 'patient_nbr']

test_columns_to_drop = (mostly_missing_columns + low_variation_medication_columns
                        + secondary_diagnosis_columns + identifier_columns)
test = test.drop(test_columns_to_drop, axis=1)

In [61]:
# Keep only test rows where both race and diag_1 are present, discard records
# whose gender is 'Unknown/Invalid', then renumber the remaining rows from 0.
test_rows_with_required_fields = test.dropna(subset=["race", "diag_1"])
test_has_valid_gender = test_rows_with_required_fields["gender"] != "Unknown/Invalid"
test = test_rows_with_required_fields[test_has_valid_gender].reset_index(drop=True)

In [80]:
# NOTE(review): this cell carries execution count 80, later than the encoding
# cells that follow it, and its saved output lists the already-encoded columns.
# On a fresh top-to-bottom run it would show the pre-encoding frame instead.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19889 entries, 0 to 19888
Data columns (total 43 columns):
time_in_hospital      19889 non-null int64
num_lab_procedures    19889 non-null int64
num_procedures        19889 non-null int64
num_medications       19889 non-null int64
number_outpatient     19889 non-null int64
number_emergency      19889 non-null int64
number_inpatient      19889 non-null int64
diag_1                19889 non-null int64
number_diagnoses      19889 non-null int64
[0-50)                19889 non-null uint8
[50-60)               19889 non-null uint8
[60-70)               19889 non-null uint8
[70-80)               19889 non-null uint8
[80-100)              19889 non-null uint8
AfricanAmerican       19889 non-null uint8
Asian                 19889 non-null uint8
Caucasian             19889 non-null uint8
Hispanic              19889 non-null uint8
Other                 19889 non-null uint8
Female                19889 non-null uint8
Male                  19889 non

In [62]:
# Primary diagnosis flag: 1 when the diag_1 value (as text) starts with '250', else 0.
test['diag_1'] = test['diag_1'].astype(str)
starts_with_250 = test['diag_1'].str.startswith('250')
test['diag_1'] = starts_with_250.astype('int64')

In [63]:
# Collapse the five youngest bands into '[0-50)' and the two oldest into
# '[80-100)'; any value not listed in the table is left unchanged.
test_age_regrouping = {
    '[0-10)': '[0-50)',
    '[10-20)': '[0-50)',
    '[20-30)': '[0-50)',
    '[30-40)': '[0-50)',
    '[40-50)': '[0-50)',
    '[80-90)': '[80-100)',
    '[90-100)': '[80-100)',
}
test['age'] = test['age'].map(lambda band: test_age_regrouping.get(band, band))

In [64]:
# Two-level recode: code 1 is labelled 'Home', every other code 'Other'.
test_discharged_home = test['discharge_disposition_id'] == 1
test['discharge_disposition_id'] = test_discharged_home.map({True: 'Home', False: 'Other'})

In [65]:
# Two-level recode: code 7 is labelled 'ER', every other code 'Other'.
test_admitted_via_er = test['admission_source_id'] == 7
test['admission_source_id'] = test_admitted_via_er.map({True: 'ER', False: 'Other'})

In [66]:
# Two-level recode: code 1 is labelled 'Emergency', every other code 'Other'.
test_is_emergency_admission = test['admission_type_id'] == 1
test['admission_type_id'] = test_is_emergency_admission.map({True: 'Emergency', False: 'Other'})

In [67]:
# Nominal columns to expand, listed in the order their indicator columns are
# appended to the frame (same order as used for the training frame).
test_nominal_columns = ['age', 'race', 'gender', 'max_glu_serum', 'A1Cresult', 'insulin', 'change',
                        'diabetesMed', 'discharge_disposition_id', 'admission_source_id',
                        'admission_type_id']

# One block of indicator columns per nominal column, derived from the
# categories present in the test frame itself.
test_indicator_blocks = [pd.get_dummies(test[column]) for column in test_nominal_columns]

# Append the indicators on the right, then remove the original nominal columns.
test = pd.concat([test] + test_indicator_blocks, axis=1)
test = test.drop(test_nominal_columns, axis=1)

### Predictions

In [79]:
# The test frame was one-hot encoded independently of the training frame, so a
# category present in only one of them would add, remove or shift a column.
# Check that the test features are exactly the training features, in the same
# order, and fail with a readable message instead of predicting on misaligned
# columns.
train_feature_names = list(x.columns)
test_feature_names = list(test.columns)
if test_feature_names != train_feature_names:
    raise ValueError(
        "Test features do not match training features. "
        "Only in train: {}; only in test: {}; same names in a different order: {}".format(
            sorted(set(train_feature_names) - set(test_feature_names)),
            sorted(set(test_feature_names) - set(train_feature_names)),
            set(train_feature_names) == set(test_feature_names)))

# Predict the 0/1 readmitted label for every remaining test row.
y_pred = logreg.predict(test)
y_pred_df = pd.DataFrame(y_pred)

# NOTE(review): rows with missing race/diag_1 were dropped from the test frame
# and encounter_id was removed earlier, so output.csv is keyed only by the
# renumbered row position; test_final.csv holds the matching feature rows.
y_pred_df.to_csv('output.csv')
test.to_csv('test_final.csv')

### Conclusion

#### The logistic regression classifier achieves a training accuracy of 0.888 and random forest achieves 0.890 (the cross-validated accuracy is about 0.888 for both). I noticed that there is class imbalance in the dependent variable, so other scoring methods such as recall or F1 could be used instead of accuracy.
#### The metrics of the model can be altered by choosing a different classification threshold.
#### The performance and cost of the project can be improved by using recursive feature elimination or similar methods to identify the most important factors.

### References

#### https://www.hindawi.com/journals/bmri/2014/781670/
#### https://www.kaggle.com/c/diabetes-hospital-readmission/data

Start Time - 6/5/2018 - 10PM
End Time - 6/6/2018 - 2.30AM
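I could not complete the grid searches for bagging and boosting as my system took too long to run them (both were interrupted), so those two models were fitted with fixed settings instead. However, the code is correct and would run faster on a better system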