##Data Preparation

###Unzipping Data

In [None]:
!unzip "/content/diabetes-readmission-prediction-i43.zip"

Archive:  /content/diabetes-readmission-prediction-i43.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

###Importing Libraries and Data 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

In [None]:
df_train = pd.read_csv("/content/train.csv",encoding='ascii')
df_test = pd.read_csv("/content/test.csv",encoding='ascii')

###Handleing Duplicates

In [None]:
df_train = df_train.drop_duplicates(subset=['patient_nbr'])

###Handleing incorrect Values and Dropping unuseful Data 

In [None]:
df_train = df_train.replace("?", np.nan)
df_test = df_test.replace("?", np.nan)

In [None]:
df_train = df_train.drop(columns = ["encounter_id","patient_nbr","admission_type_id","discharge_disposition_id","admission_source_id","weight","payer_code","medical_specialty","metformin-pioglitazone","metformin-rosiglitazone","glimepiride-pioglitazone","glipizide-metformin","glyburide-metformin","citoglipton","examide","tolazamide","troglitazone","miglitol","acarbose","tolbutamide","acetohexamide","chlorpropamide","nateglinide"])

In [None]:
df_test = df_test.drop(columns = ["patient_nbr","admission_type_id","discharge_disposition_id","admission_source_id","weight","payer_code","medical_specialty","metformin-pioglitazone","metformin-rosiglitazone","glimepiride-pioglitazone","glipizide-metformin","glyburide-metformin","citoglipton","examide","tolazamide","troglitazone","miglitol","acarbose","tolbutamide","acetohexamide","chlorpropamide","nateglinide"])

In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54085 entries, 0 to 71235
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   race                52713 non-null  object
 1   gender              54085 non-null  object
 2   age                 54085 non-null  object
 3   time_in_hospital    54085 non-null  int64 
 4   num_lab_procedures  54085 non-null  int64 
 5   num_procedures      54085 non-null  int64 
 6   num_medications     54085 non-null  int64 
 7   number_outpatient   54085 non-null  int64 
 8   number_emergency    54085 non-null  int64 
 9   number_inpatient    54085 non-null  int64 
 10  diag_1              54074 non-null  object
 11  diag_2              53879 non-null  object
 12  diag_3              53213 non-null  object
 13  number_diagnoses    54085 non-null  int64 
 14  max_glu_serum       54085 non-null  object
 15  A1Cresult           54085 non-null  object
 16  metformin           54

In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30530 entries, 0 to 30529
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   encounter_id        30530 non-null  int64 
 1   race                29849 non-null  object
 2   gender              30530 non-null  object
 3   age                 30530 non-null  object
 4   time_in_hospital    30530 non-null  int64 
 5   num_lab_procedures  30530 non-null  int64 
 6   num_procedures      30530 non-null  int64 
 7   num_medications     30530 non-null  int64 
 8   number_outpatient   30530 non-null  int64 
 9   number_emergency    30530 non-null  int64 
 10  number_inpatient    30530 non-null  int64 
 11  diag_1              30521 non-null  object
 12  diag_2              30417 non-null  object
 13  diag_3              30116 non-null  object
 14  number_diagnoses    30530 non-null  int64 
 15  max_glu_serum       30530 non-null  object
 16  A1Cresult           30

##Handling Categorical Data

###Label Encoding 

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df_train['diag_1'] = encoder.fit_transform(df_train['diag_1'])
df_train['diag_2'] = encoder.fit_transform(df_train['diag_2'])
df_train['diag_3'] = encoder.fit_transform(df_train['diag_3'])
df_train['diag_1'] = df_train['diag_1'].fillna(df_train['diag_1'].mode())
df_train['diag_2'] = df_train['diag_2'].fillna(df_train['diag_2'].mode())
df_train['diag_3'] = df_train['diag_3'].fillna(df_train['diag_3'].mode())

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df_test['diag_1'] = encoder.fit_transform(df_test['diag_1'])
df_test['diag_2'] = encoder.fit_transform(df_test['diag_2'])
df_test['diag_3'] = encoder.fit_transform(df_test['diag_3'])
df_test['diag_1'] = df_test['diag_1'].fillna(df_test['diag_1'].mode())
df_test['diag_2'] = df_test['diag_2'].fillna(df_test['diag_2'].mode())
df_test['diag_3'] = df_test['diag_3'].fillna(df_test['diag_3'].mode())

In [None]:
df_test['diag_3'].mode()

0    61
dtype: int64

In [None]:
cols = ["A1Cresult","metformin","repaglinide","glimepiride","glipizide","glyburide","pioglitazone","rosiglitazone","insulin"]
for i in cols:
  df_test[i] = encoder.fit_transform(df_test[i])
  df_train[i] = encoder.fit_transform(df_train[i])

In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54085 entries, 0 to 71235
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   race                52713 non-null  object
 1   gender              54085 non-null  object
 2   age                 54085 non-null  object
 3   time_in_hospital    54085 non-null  int64 
 4   num_lab_procedures  54085 non-null  int64 
 5   num_procedures      54085 non-null  int64 
 6   num_medications     54085 non-null  int64 
 7   number_outpatient   54085 non-null  int64 
 8   number_emergency    54085 non-null  int64 
 9   number_inpatient    54085 non-null  int64 
 10  diag_1              54085 non-null  int64 
 11  diag_2              54085 non-null  int64 
 12  diag_3              54085 non-null  int64 
 13  number_diagnoses    54085 non-null  int64 
 14  max_glu_serum       54085 non-null  object
 15  A1Cresult           54085 non-null  int64 
 16  metformin           54

In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30530 entries, 0 to 30529
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   encounter_id        30530 non-null  int64 
 1   race                29849 non-null  object
 2   gender              30530 non-null  object
 3   age                 30530 non-null  object
 4   time_in_hospital    30530 non-null  int64 
 5   num_lab_procedures  30530 non-null  int64 
 6   num_procedures      30530 non-null  int64 
 7   num_medications     30530 non-null  int64 
 8   number_outpatient   30530 non-null  int64 
 9   number_emergency    30530 non-null  int64 
 10  number_inpatient    30530 non-null  int64 
 11  diag_1              30530 non-null  int64 
 12  diag_2              30530 non-null  int64 
 13  diag_3              30530 non-null  int64 
 14  number_diagnoses    30530 non-null  int64 
 15  max_glu_serum       30530 non-null  object
 16  A1Cresult           30

###Handling Missing Values and inapproprite Values

In [None]:
df_train.isna().sum()

race                  1372
gender                   0
age                      0
time_in_hospital         0
num_lab_procedures       0
num_procedures           0
num_medications          0
number_outpatient        0
number_emergency         0
number_inpatient         0
diag_1                   0
diag_2                   0
diag_3                   0
number_diagnoses         0
max_glu_serum            0
A1Cresult                0
metformin                0
repaglinide              0
glimepiride              0
glipizide                0
glyburide                0
pioglitazone             0
rosiglitazone            0
insulin                  0
change                   0
diabetesMed              0
readmitted               0
dtype: int64

In [None]:
df_train['race'].mode()

0    Caucasian
dtype: object

In [None]:
df_train['race'] = df_train['race'].fillna('Caucasian')
df_train.isna().sum()

race                  0
gender                0
age                   0
time_in_hospital      0
num_lab_procedures    0
num_procedures        0
num_medications       0
number_outpatient     0
number_emergency      0
number_inpatient      0
diag_1                0
diag_2                0
diag_3                0
number_diagnoses      0
max_glu_serum         0
A1Cresult             0
metformin             0
repaglinide           0
glimepiride           0
glipizide             0
glyburide             0
pioglitazone          0
rosiglitazone         0
insulin               0
change                0
diabetesMed           0
readmitted            0
dtype: int64

In [None]:

df_test['race'] = df_test['race'].fillna('Caucasian')
df_test.isna().sum()

encounter_id          0
race                  0
gender                0
age                   0
time_in_hospital      0
num_lab_procedures    0
num_procedures        0
num_medications       0
number_outpatient     0
number_emergency      0
number_inpatient      0
diag_1                0
diag_2                0
diag_3                0
number_diagnoses      0
max_glu_serum         0
A1Cresult             0
metformin             0
repaglinide           0
glimepiride           0
glipizide             0
glyburide             0
pioglitazone          0
rosiglitazone         0
insulin               0
change                0
diabetesMed           0
dtype: int64

In [None]:
df_train["gender"] = df_train["gender"].replace("Unknown/Invalid","Female")
df_test["gender"] = df_test["gender"].replace("Unknown/Invalid","Female")

### One Hot Encoding

In [None]:
df_train = pd.get_dummies(data = df_train, columns = ["race","gender","max_glu_serum","change","diabetesMed"])
df_test = pd.get_dummies(data = df_test, columns = ["race","gender","max_glu_serum","change","diabetesMed"])

In [None]:
df_train = pd.get_dummies(data = df_train,columns = ["age"])
df_train = df_train.rename(columns = {"age_[0-10)":"age_(0-10)",
                           "age_[10-20)":"age_(10-20)",
                           "age_[20-30)":"age_(20-30)",
                           "age_[30-40)":"age_(30-40)",
                           "age_[40-50)":"age_(40-50)",
                           "age_[50-60)":"age_(50-60)",
                           "age_[60-70)":"age_(60-70)",
                           "age_[70-80)":"age_(70-80)",
                           "age_[80-90)":"age_(80-90)",
                           "age_[90-100)":"age_(90-100)"})

In [None]:
df_test = pd.get_dummies(data = df_test,columns = ["age"])
df_test = df_test.rename(columns = {"age_[0-10)":"age_(0-10)",
                           "age_[10-20)":"age_(10-20)",
                           "age_[20-30)":"age_(20-30)",
                           "age_[30-40)":"age_(30-40)",
                           "age_[40-50)":"age_(40-50)",
                           "age_[50-60)":"age_(50-60)",
                           "age_[60-70)":"age_(60-70)",
                           "age_[70-80)":"age_(70-80)",
                           "age_[80-90)":"age_(80-90)",
                           "age_[90-100)":"age_(90-100)"})

In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30530 entries, 0 to 30529
Data columns (total 46 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   encounter_id          30530 non-null  int64
 1   time_in_hospital      30530 non-null  int64
 2   num_lab_procedures    30530 non-null  int64
 3   num_procedures        30530 non-null  int64
 4   num_medications       30530 non-null  int64
 5   number_outpatient     30530 non-null  int64
 6   number_emergency      30530 non-null  int64
 7   number_inpatient      30530 non-null  int64
 8   diag_1                30530 non-null  int64
 9   diag_2                30530 non-null  int64
 10  diag_3                30530 non-null  int64
 11  number_diagnoses      30530 non-null  int64
 12  A1Cresult             30530 non-null  int64
 13  metformin             30530 non-null  int64
 14  repaglinide           30530 non-null  int64
 15  glimepiride           30530 non-null  int64
 16  glip

In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54085 entries, 0 to 71235
Data columns (total 46 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   time_in_hospital      54085 non-null  int64 
 1   num_lab_procedures    54085 non-null  int64 
 2   num_procedures        54085 non-null  int64 
 3   num_medications       54085 non-null  int64 
 4   number_outpatient     54085 non-null  int64 
 5   number_emergency      54085 non-null  int64 
 6   number_inpatient      54085 non-null  int64 
 7   diag_1                54085 non-null  int64 
 8   diag_2                54085 non-null  int64 
 9   diag_3                54085 non-null  int64 
 10  number_diagnoses      54085 non-null  int64 
 11  A1Cresult             54085 non-null  int64 
 12  metformin             54085 non-null  int64 
 13  repaglinide           54085 non-null  int64 
 14  glimepiride           54085 non-null  int64 
 15  glipizide             54085 non-null

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cols = ["time_in_hospital","num_lab_procedures","num_procedures","num_medications","number_outpatient","number_emergency","number_inpatient"]
df_train[cols] = scaler.fit_transform(df_train[cols])
df_test[cols] = scaler.fit_transform(df_test[cols])

In [None]:
df_train.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,...,age_(0-10),age_(10-20),age_(20-30),age_(30-40),age_(40-50),age_(50-60),age_(60-70),age_(70-80),age_(80-90),age_(90-100)
0,0.225542,-0.162572,0.344421,1.110921,-0.281665,-0.186603,-0.430581,453,118,227,...,0,0,0,0,0,0,0,0,1,0
1,-0.449002,-1.070184,-0.229862,0.747059,-0.281665,-0.186603,-0.430581,475,296,235,...,0,0,0,0,0,0,0,1,0,0
2,1.237359,-0.515532,2.067268,-1.314825,-0.281665,-0.186603,-0.430581,24,34,360,...,0,0,0,0,0,0,0,1,0,0
3,0.900087,0.442503,-0.229862,-0.101952,-0.281665,-0.186603,4.121186,135,220,369,...,0,0,0,0,0,1,0,0,0,0
4,-0.786275,0.190389,0.344421,-0.708389,-0.281665,-0.186603,0.707361,365,118,77,...,0,0,0,0,1,0,0,0,0,0


In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30530 entries, 0 to 30529
Data columns (total 46 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   encounter_id          30530 non-null  int64  
 1   time_in_hospital      30530 non-null  float64
 2   num_lab_procedures    30530 non-null  float64
 3   num_procedures        30530 non-null  float64
 4   num_medications       30530 non-null  float64
 5   number_outpatient     30530 non-null  float64
 6   number_emergency      30530 non-null  float64
 7   number_inpatient      30530 non-null  float64
 8   diag_1                30530 non-null  int64  
 9   diag_2                30530 non-null  int64  
 10  diag_3                30530 non-null  int64  
 11  number_diagnoses      30530 non-null  int64  
 12  A1Cresult             30530 non-null  int64  
 13  metformin             30530 non-null  int64  
 14  repaglinide           30530 non-null  int64  
 15  glimepiride        

## Training Model

###Splitting Data

In [None]:
features = df_train.drop(columns=["readmitted"])
labels = df_train["readmitted"]
test = df_test.drop(columns = ["encounter_id"])

In [None]:
from sklearn.model_selection import train_test_split
features_train,features_valid,labels_train,labels_valid = train_test_split(features,labels,test_size = 0.2,train_size=0.8)

###Model Selection and Training

In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(learning_rate=0.1,colsample_bytree=0.6, objective='multi:softprob', subsample=0.8,gamma =1,)
xgb_model.fit(features_train,labels_train)
f1_score(xgb_model.predict(features_valid),labels_valid,average='micro')

0.6501802717943977

In [None]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(X=features_train,y =labels_train)
f1_score(lr_model.predict(features_valid),labels_valid,average='micro')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6453730239437921

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(features_train,labels_train)
rf_model.score(features_valid,labels_valid)

0.6474068595728946

In [None]:
from sklearn.ensemble import AdaBoostClassifier

ab_model = AdaBoostClassifier(n_estimators=100)

ab_model.fit(features_train,labels_train)
ab_model.score(features_valid,labels_valid)

0.6468521771285939

In [None]:
# print accuracy of each model on validation data
print("Scores of the models")
print("Logistic regression:", lr_model.score(features_valid,labels_valid))
#print("Decision tree:", dt_model.score(features_valid,labels_valid))
#print("SVM:", svm_model.score(features_valid,labels_valid))
print("Random forest:", rf_model.score(features_valid,labels_valid))
print("Gradient boosting:", xgb_model.score(features_valid,labels_valid))
print("AdaBoost:", ab_model.score(features_valid,labels_valid))

Scores of the models
Logistic regression: 0.6453730239437921
Random forest: 0.6474068595728946
Gradient boosting: 0.6501802717943977
AdaBoost: 0.6468521771285939


In [193]:
from sklearn.model_selection import GridSearchCV
params = {
        #'min_child_weight': [1, 5, 10],
        #'gamma': [0.5, 1, 1.5, 2, 5],
        #'subsample': [0.6, 0.8, 1.0],
        #'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        }
#parameters = {'max_depth': np.arange(1,10,2),'learning_rate': np.arange(0.1,1,0.2)}
# grid search pitch
gs_model = XGBClassifier(colsample_bytree=0.6, gamma=2, min_child_weight=10,objective='multi:softprob', subsample=0.8)
gs = GridSearchCV(estimator = gs_model,param_grid = params)
gs.fit(features_train, labels_train)
pitch = gs.best_estimator_

In [194]:
pitch

XGBClassifier(colsample_bytree=0.6, gamma=2, min_child_weight=10,
              objective='multi:softprob', subsample=0.8)

In [195]:
pitch.score(features_valid,labels_valid)

0.6511047425348988

##Generating Output

In [None]:
test.info()

In [196]:
pred = pitch.predict(test)

In [197]:
df_test['readmitted'] = pred

In [198]:
df_test[['encounter_id', 'readmitted']].to_csv('submission.csv', index=False)