In [3]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

In [65]:
import warnings
warnings.filterwarnings("ignore")

# Data Set 3: Diabetes

## Content
Data Set Information:

The dataset represents 10 years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks. It includes over 50 features representing patient and hospital outcomes. Information was extracted from the database for encounters that satisfied the following criteria.

(1) It is an inpatient encounter (a hospital admission).

(2) It is a diabetic encounter, that is, one during which any kind of diabetes was entered to the system as a diagnosis.

(3) The length of stay was at least 1 day and at most 14 days.

(4) Laboratory tests were performed during the encounter.

(5) Medications were administered during the encounter.

The data contains such attributes as patient number, race, gender, age, admission type, time in hospital, medical specialty of admitting physician, number of lab test performed, HbA1c test result, diagnosis, number of medication, diabetic medications, number of outpatient, inpatient, and emergency visits in the year before the hospitalization, etc.

## Task
This data has been prepared to analyze factors related to readmission as well as other outcomes pertaining to patients with diabetes.

In [126]:
ds3 = pd.read_csv('diabetic_data.csv')
ID = pd.read_csv('IDs_mapping.csv')
ds3

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [127]:
ID.head(8).T

Unnamed: 0,0,1,2,3,4,5,6,7
admission_type_id,1,2,3,4,5,6.0,7,8
description,Emergency,Urgent,Elective,Newborn,Not Available,,Trauma Center,Not Mapped


In [128]:
ds3.groupby('admission_type_id').readmitted.count()

admission_type_id
1    53990
2    18480
3    18869
4       10
5     4785
6     5291
7       21
8      320
Name: readmitted, dtype: int64

With the information above, we see some of the admission type id column could be important for us. First, three type is quite important for me so i will group the rest.

In [129]:
ds3.admission_type_id[ds3.admission_type_id > 3] = 4

In [130]:
ds3.groupby('admission_type_id').readmitted.count()

admission_type_id
1    53990
2    18480
3    18869
4    10427
Name: readmitted, dtype: int64

In [131]:
ID.head(41).tail(32)

Unnamed: 0,admission_type_id,description
9,discharge_disposition_id,description
10,1,Discharged to home
11,2,Discharged/transferred to another short term h...
12,3,Discharged/transferred to SNF
13,4,Discharged/transferred to ICF
14,5,Discharged/transferred to another type of inpa...
15,6,Discharged/transferred to home with home healt...
16,7,Left AMA
17,8,Discharged/transferred to home under care of H...
18,9,Admitted as an inpatient to this hospital


In [132]:
ds3.groupby('discharge_disposition_id').readmitted.count().sort_values(ascending = False)

discharge_disposition_id
1     60234
3     13954
6     12902
18     3691
2      2128
22     1993
11     1642
5      1184
25      989
4       815
7       623
23      412
13      399
14      372
28      139
8       108
15       63
24       48
9        21
17       14
16       11
19        8
10        6
27        5
12        3
20        2
Name: readmitted, dtype: int64

In [133]:
ds3.discharge_disposition_id.unique()

array([25,  1,  3,  6,  2,  5, 11,  7, 10,  4, 14, 18,  8, 13, 12, 16, 17,
       22, 23,  9, 20, 15, 24, 28, 19, 27], dtype=int64)

With the pareto rule, I chose the ones which makes more than %80 and group others as id:0. 

In [134]:
ddi_list = [25,  2,  5, 11,  7, 10,  4, 14, 8, 13, 12, 16, 17, 22, 23,  9, 20, 15, 24, 28, 19, 27]

ds3.discharge_disposition_id[ds3.discharge_disposition_id.isin(ddi_list)] = 0


In [135]:
ds3.groupby('discharge_disposition_id').readmitted.count().sort_values(ascending = False)

discharge_disposition_id
1     60234
3     13954
6     12902
0     10985
18     3691
Name: readmitted, dtype: int64

In [136]:
ID.tail(26)

Unnamed: 0,admission_type_id,description
41,admission_source_id,description
42,1,Physician Referral
43,2,Clinic Referral
44,3,HMO Referral
45,4,Transfer from a hospital
46,5,Transfer from a Skilled Nursing Facility (SNF)
47,6,Transfer from another health care facility
48,7,Emergency Room
49,8,Court/Law Enforcement
50,9,Not Available


In [137]:
ds3.groupby('admission_source_id').readmitted.count().sort_values(ascending = False)

admission_source_id
7     57494
1     29565
17     6781
4      3187
6      2264
2      1104
5       855
3       187
20      161
9       125
8        16
22       12
10        8
11        2
14        2
25        2
13        1
Name: readmitted, dtype: int64

In [138]:
ds3.admission_source_id.unique()

array([ 1,  7,  2,  4,  5,  6, 20,  3, 17,  8,  9, 14, 10, 22, 11, 25, 13],
      dtype=int64)

In [139]:
asi_list = [ 2,  4,  5,  6, 20,  3, 8,  9, 14, 10, 22, 11, 25, 13]

ds3.admission_source_id[ds3.admission_source_id.isin(asi_list)] = 0

In [140]:
ds3.groupby('admission_source_id').readmitted.count().sort_values(ascending = False)

admission_source_id
7     57494
1     29565
0      7926
17     6781
Name: readmitted, dtype: int64

In [141]:
ds3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

All of the data is non-null and most of it is object. There is just a few integer data in the data set. I need integer encoding for the categorical data and dummy encoding if the object that cannot be ordered.

In [142]:
ds3_unique = ds3.nunique().to_frame().reset_index()
ds3_unique.columns = ['Variable','DistinctCount']
ds3_unique.loc[ds3_unique['DistinctCount'] > 2].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,33,34,35,36,38,41,42,49
Variable,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,miglitol,tolazamide,insulin,glyburide-metformin,readmitted
DistinctCount,101766,71518,6,3,10,10,4,5,4,14,...,4,4,4,4,4,4,3,4,4,3


In [143]:
ds3_unique.loc[ds3_unique['DistinctCount'] == 2].T

Unnamed: 0,29,32,37,43,44,45,46,47,48
Variable,acetohexamide,tolbutamide,troglitazone,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
DistinctCount,2,2,2,2,2,2,2,2,2


### Manipulating the Data (?)

#### Null Data

With the information above, I filtered the objects which includes more than three different values and examined them.
At first, I realized there are letters in some 

In [144]:
ds3.loc[ds3['diag_1'].str.startswith("V"), 'diag_1'] = "?"
ds3.loc[ds3['diag_2'].str.startswith("V"), 'diag_2'] = "?"
ds3.loc[ds3['diag_3'].str.startswith("V"), 'diag_3'] = "?"
ds3.loc[ds3['diag_1'].str.startswith("E"), 'diag_1'] = "?"
ds3.loc[ds3['diag_2'].str.startswith("E"), 'diag_2'] = "?"
ds3.loc[ds3['diag_3'].str.startswith("E"), 'diag_3'] = "?"

In [145]:
#My trials while i manipulate the data
#ds3['diag_1'] = ds3['diag_1'].str.replace('V','')
#ds3['diag_2'] = ds3['diag_2'].str.replace('V','')
#ds3['diag_3'] = ds3['diag_3'].str.replace('V','')
#ds3['diag_1'] = ds3['diag_1'].str.replace('E','')
#ds3['diag_2'] = ds3['diag_2'].str.replace('E','')
#ds3['diag_3'] = ds3['diag_3'].str.replace('E','')

In [146]:
ds3['diag_1'] = ds3['diag_1'].replace('?',np.NaN)
ds3['diag_2'] = ds3['diag_2'].replace('?',np.NaN)
ds3['diag_3'] = ds3['diag_3'].replace('?',np.NaN)

In [147]:
ds3.diag_1 = ds3.diag_1.astype("float64")
ds3.diag_2 = ds3.diag_2.astype("float64")
ds3.diag_3 = ds3.diag_3.astype("float64")

In [148]:
ds3.diag_1 = ds3.diag_1.fillna(ds3.diag_1.median())
ds3.diag_2 = ds3.diag_2.fillna(ds3.diag_2.median())
ds3.diag_3 = ds3.diag_3.fillna(ds3.diag_3.median())

In [149]:
#Erasing null data
#ds3 = ds3[ds3 != '?']

In [150]:
str_cols = ds3.select_dtypes(['object']).columns
str_cols

Index(['race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [151]:
for column_name in str_cols:
    print(column_name)
    s = ds3[column_name].unique()
    for i in s:
        print(str(i))
    print(" ")

race
Caucasian
AfricanAmerican
?
Other
Asian
Hispanic
 
gender
Female
Male
Unknown/Invalid
 
age
[0-10)
[10-20)
[20-30)
[30-40)
[40-50)
[50-60)
[60-70)
[70-80)
[80-90)
[90-100)
 
weight
?
[75-100)
[50-75)
[0-25)
[100-125)
[25-50)
[125-150)
[175-200)
[150-175)
>200
 
payer_code
?
MC
MD
HM
UN
BC
SP
CP
SI
DM
CM
CH
PO
WC
OT
OG
MP
FR
 
medical_specialty
Pediatrics-Endocrinology
?
InternalMedicine
Family/GeneralPractice
Cardiology
Surgery-General
Orthopedics
Gastroenterology
Surgery-Cardiovascular/Thoracic
Nephrology
Orthopedics-Reconstructive
Psychiatry
Emergency/Trauma
Pulmonology
Surgery-Neuro
Obsterics&Gynecology-GynecologicOnco
ObstetricsandGynecology
Pediatrics
Hematology/Oncology
Otolaryngology
Surgery-Colon&Rectal
Pediatrics-CriticalCare
Endocrinology
Urology
Psychiatry-Child/Adolescent
Pediatrics-Pulmonology
Neurology
Anesthesiology-Pediatric
Radiology
Pediatrics-Hematology-Oncology
Psychology
Podiatry
Gynecology
Oncology
Pediatrics-Neurology
Surgery-Plastic
Surgery-Thoracic
Surgery

In [152]:
ds3.groupby('readmitted').readmitted.count()

readmitted
<30    11357
>30    35545
NO     54864
Name: readmitted, dtype: int64

In [153]:
ds3.groupby('gender').gender.count()

gender
Female             54708
Male               47055
Unknown/Invalid        3
Name: gender, dtype: int64

In [154]:
ds3.groupby('race').race.count()

race
?                   2273
AfricanAmerican    19210
Asian                641
Caucasian          76099
Hispanic            2037
Other               1506
Name: race, dtype: int64

In [155]:
ds3.groupby('weight').weight.count()

weight
>200             3
?            98569
[0-25)          48
[100-125)      625
[125-150)      145
[150-175)       35
[175-200)       11
[25-50)         97
[50-75)        897
[75-100)      1336
Name: weight, dtype: int64

In [156]:
ds3.groupby('payer_code').payer_code.count()

payer_code
?     40256
BC     4655
CH      146
CM     1937
CP     2533
DM      549
FR        1
HM     6274
MC    32439
MD     3532
MP       79
OG     1033
OT       95
PO      592
SI       55
SP     5007
UN     2448
WC      135
Name: payer_code, dtype: int64

In [157]:
ds3.groupby('medical_specialty').medical_specialty.count()

medical_specialty
?                                   49949
AllergyandImmunology                    7
Anesthesiology                         12
Anesthesiology-Pediatric               19
Cardiology                           5352
                                    ...  
Surgery-PlasticwithinHeadandNeck        1
Surgery-Thoracic                      109
Surgery-Vascular                      533
SurgicalSpecialty                      33
Urology                               685
Name: medical_specialty, Length: 73, dtype: int64

In [158]:
ds3.groupby('max_glu_serum').max_glu_serum.count()

max_glu_serum
>200     1485
>300     1264
None    96420
Norm     2597
Name: max_glu_serum, dtype: int64

Since there is just 3 unknowns for the gender, we can erase it from the data. There is also similar situation for the race column where the number of '?' is quite small and i will transform it to Caucasian as its frequency is more than the others.  

Moreover, there is %97 missing in the weight data. Even though I think this data is related with diabetes, I will erase the related column.  

However, payer_code and medical_specialty is almost half of the data.Thus, I won't erase them and I will transform it to dummy encoding. 

In [159]:
#Column manipulation
ds3 = ds3[ds3.gender != 'Unknown/Invalid']
ds3['race'] = ds3['race'].replace('?','Other')

In [160]:
s = ds3['age'].unique()
for i in s:
    print(str(i))
print(" ")

[0-10)
[10-20)
[20-30)
[30-40)
[40-50)
[50-60)
[60-70)
[70-80)
[80-90)
[90-100)
 


### Binary Encoding
gender : Female and Male

diabetesMed:  Yes and No

change: Change and No Change

In [161]:
ds3['gender'] = pd.Series(np.where(ds3.gender == 'Female', 1, 0), ds3.index)
ds3['diabetesMed'] = pd.Series(np.where(ds3.diabetesMed == 'Yes', 1, 0), ds3.index)
ds3['change'] = pd.Series(np.where(ds3.change == 'Ch', 1, 0), ds3.index)

### Ordinal Encoding
age: [0-10):1, [10-20):2, [20-30):3,[30-40):4, [40-50):5, [50-60):6,[60-70):7, [70-80):8, [80-90):9,[90-100):10

In [162]:
age_scale_mapper = {"[0-10)":1, "[10-20)":2, "[20-30)":3,"[30-40)":4, "[40-50)":5, "[50-60)":6,"[60-70)":7, "[70-80)":8, "[80-90)":9,"[90-100)":10}
ds3["age"] = ds3["age"].replace(age_scale_mapper)

readmitted_scale_mapper = {"NO":0, ">30":2, "<30":1}
ds3["readmitted"] = ds3["readmitted"].replace(readmitted_scale_mapper)

### Dummy Encoding
Columns below will be processed as dummy encoding.

In [163]:
categoricColumns = ['gender', 'Partner', 'Dependents', 'PhoneService','MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup','DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
categoricColumns

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [164]:
str_cols

Index(['race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [165]:
categoricColumns = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id',
                    'race', 'payer_code', 'medical_specialty', 
                    'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
                    'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
                    'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
                    'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 
                    'insulin', 'glyburide-metformin', 'glipizide-metformin',
                    'glimepiride-pioglitazone', 'metformin-rosiglitazone','metformin-pioglitazone']
categoricColumns

['admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'race',
 'payer_code',
 'medical_specialty',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone']

I need dummy encoding for all of the columns as they cannot be ordered.

In [166]:
#Creating a new dataframe to concat new numerical columns on. 
dummy_ds3 = pd.DataFrame()
#By using a loop concating all columns in a df
for var in categoricColumns:
    dummy_ds3 = pd.concat([dummy_ds3, pd.get_dummies(ds3[var], prefix=var)], axis=1)
dummy_ds3

Unnamed: 0,admission_type_id_1,admission_type_id_2,admission_type_id_3,admission_type_id_4,discharge_disposition_id_0,discharge_disposition_id_1,discharge_disposition_id_3,discharge_disposition_id_6,discharge_disposition_id_18,admission_source_id_0,...,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_No,glipizide-metformin_Steady,glimepiride-pioglitazone_No,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady
0,0,0,0,1,1,0,0,0,0,0,...,0,0,1,0,1,0,1,0,1,0
1,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,1,0
2,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,1,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,1,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,1,0,1,0,1,0
101762,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,1,0,1,0,1,0
101763,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,1,0
101764,0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,1,0,1,0,1,0


In [167]:
ds3.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [168]:
dummy_compliment = ['gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
                    'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
                    'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
                    'diag_2', 'diag_3', 'number_diagnoses', 'change', 'diabetesMed', 'readmitted']

In [169]:
new_ds3 = pd.concat([dummy_ds3, ds3[dummy_compliment]], axis=1)
new_ds3

Unnamed: 0,admission_type_id_1,admission_type_id_2,admission_type_id_3,admission_type_id_4,discharge_disposition_id_0,discharge_disposition_id_1,discharge_disposition_id_3,discharge_disposition_id_6,discharge_disposition_id_18,admission_source_id_0,...,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,change,diabetesMed,readmitted
0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,250.83,425.00,403.0,1,0,0,0
1,1,0,0,0,0,1,0,0,0,0,...,0,0,0,276.00,250.01,255.0,9,1,1,2
2,1,0,0,0,0,1,0,0,0,0,...,2,0,1,648.00,250.00,403.0,6,0,1,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,8.00,250.43,403.0,7,1,1,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,197.00,157.00,250.0,5,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1,0,0,0,0,0,1,0,0,0,...,0,0,0,250.13,291.00,458.0,9,1,1,2
101762,1,0,0,0,1,0,0,0,0,1,...,0,0,1,560.00,276.00,787.0,9,0,1,0
101763,1,0,0,0,0,1,0,0,0,0,...,1,0,0,38.00,590.00,296.0,13,1,1,0
101764,0,1,0,0,0,0,1,0,0,0,...,0,0,1,996.00,285.00,998.0,9,1,1,0


In [170]:
new_ds3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101763 entries, 0 to 101765
Columns: 207 entries, admission_type_id_1 to readmitted
dtypes: float64(3), int32(3), int64(13), uint8(188)
memory usage: 32.6 MB


In [171]:
new_ds3.groupby('readmitted').readmitted.count()

readmitted
0    54861
1    11357
2    35545
Name: readmitted, dtype: int64

## Train and Test Data

In [172]:
new_ds3_1, new_ds3_2 = train_test_split(new_ds3, test_size=0.1, random_state=42, stratify=new_ds3.readmitted)

In [173]:
#new_ds3 = pd.DataFrame(new_ds3)
new_ds3_2

Unnamed: 0,admission_type_id_1,admission_type_id_2,admission_type_id_3,admission_type_id_4,discharge_disposition_id_0,discharge_disposition_id_1,discharge_disposition_id_3,discharge_disposition_id_6,discharge_disposition_id_18,admission_source_id_0,...,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,change,diabetesMed,readmitted
11660,1,0,0,0,0,0,0,0,1,1,...,0,1,2,482.00,496.0,780.0,8,0,0,0
85055,1,0,0,0,0,1,0,0,0,0,...,0,0,3,491.00,428.0,244.0,9,1,1,0
29716,1,0,0,0,0,1,0,0,0,1,...,0,0,0,786.00,496.0,250.0,4,0,1,0
63871,0,1,0,0,0,0,0,1,0,1,...,0,0,1,428.00,427.0,285.0,9,0,1,0
4635,0,0,1,0,0,1,0,0,0,1,...,0,0,0,414.00,411.0,414.0,8,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100663,1,0,0,0,0,1,0,0,0,0,...,1,0,1,486.00,491.0,799.0,9,1,1,2
22148,0,0,1,0,0,1,0,0,0,0,...,0,0,1,414.00,250.6,250.5,9,1,1,2
83424,1,0,0,0,0,1,0,0,0,0,...,0,0,0,599.00,276.0,250.0,4,0,0,2
45378,1,0,0,0,0,1,0,0,0,0,...,0,0,0,250.13,276.0,424.0,5,1,1,0


In [174]:
X_train, X_test, y_train, y_test = train_test_split(new_ds3_2.drop('readmitted', axis=1), new_ds3_2['readmitted'], test_size=0.2, random_state=42, stratify=new_ds3_2.readmitted)

## Selected Parameters

In [186]:
param_grid_sgb = dict(max_depth = [2,3,5,7],
                  learning_rate = [0.1, 0.2, 0.5],
                  n_estimators = [10,30,50,100])
param_grid_tree = dict(ccp_alpha = [0.001, 0.002, 0.003, 0.004], 
                       min_samples_leaf = [2,3,5,7,10])
param_grid_rf = dict(n_estimators = [500],
                     min_samples_leaf = [5],
                     max_features = [2,3,4,5])

In [176]:
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)

# Classifiers and Regressors

#### Lasso

In [177]:
#Lasso
lasso_reg = LassoCV(cv = kfold)
lasso_reg = lasso_reg.fit(np.array(X_train), np.array(y_train))
lasso_reg.alpha_

0.0026567506265212882

In [197]:
p_lasso_val = lasso_reg.predict(np.array(X_test))
#p_lasso_val = np.where(p_lasso_val > 0.5, 1, 0).tolist()
print(r2_score(p_lasso_val, y_test) / len(y_test))

-0.008272835166803433


In [198]:
def lasso_correction(prediction_val):
    if prediction_val >= 2.5 : 
        prediction_val = 3
    elif prediction_val >= 1.5 : 
        prediction_val = 2
    elif prediction_val >= 0.5 : 
        prediction_val = 1
    else :
        prediction_val=0
    return prediction_val

In [199]:
lasso_df = pd.DataFrame(p_lasso_val)
lasso_df

Unnamed: 0,0
0,0.796079
1,0.747773
2,0.645135
3,0.583185
4,0.316196
...,...
2031,1.191129
2032,0.741744
2033,0.513865
2034,0.806485


In [200]:
p_lasso_val = lasso_df.loc[::,0].apply(lasso_correction)
p_lasso_val

0       1
1       1
2       1
3       1
4       0
       ..
2031    1
2032    1
2033    1
2034    1
2035    1
Name: 0, Length: 2036, dtype: int64

#### Decision Tree

In [179]:
tree_clf = DecisionTreeClassifier()
grid_search_tree_clf = GridSearchCV(tree_clf, param_grid_tree, cv = kfold )
results_tree_clf = grid_search_tree_clf.fit(np.array(X_train), np.array(y_train))

#tree_reg = DecisionTreeRegressor()
#grid_search_tree_reg = GridSearchCV(tree_reg, param_grid_tree, cv = kfold)
#results_tree_reg = grid_search_tree_reg.fit(np.array(X_train), np.array(y_train))


#### Random Forest

In [184]:
rf_clf = RandomForestClassifier()
grid_search_rf_clf = GridSearchCV(rf_clf, param_grid_rf, cv = kfold )
results_rf_clf = grid_search_rf_clf.fit(np.array(X_train), np.array((y_train)))

#rf_reg = RandomForestRegressor()
#grid_search_rf_reg = GridSearchCV(rf_reg, param_grid_rf, cv = kfold)
#results_rf_reg = grid_search_rf_reg.fit(np.array(X_train), np.array(y_train))


#### Stochastic Gradient Boosting

In [188]:
sgb_classifier = xgboost.XGBClassifier(min_child_weight=10, verbosity = 0)
grid_search_clf = GridSearchCV(sgb_classifier, param_grid_sgb, cv = kfold)
results_sgb_clf = grid_search_clf.fit(np.array(X_train), np.array(y_train))

#sgb_regressor = xgboost.XGBRegressor(min_child_weight=10)
#grid_search = GridSearchCV(sgb_regressor, param_grid_sgb, cv = kfold )
#results_sgb_reg = grid_search.fit(np.array(X_train), np.array(y_train))

### Best Parameters

In [189]:
#Classification
print("Best parameters of \n")
print("Alpha Value of Lasso: \n{}\n".format(lasso_reg.alpha_))
print("Best parameters of Decision Tree: \n{}\n".format(results_tree_clf.best_params_))
print("Best parameters of Random Tree: \n{}\n".format(results_rf_clf.best_params_))
print("Best parameters of Stochastic Gradient Boosting: \n{}\n".format(results_sgb_clf.best_params_))

#Regression
#print("Alpha Value of Lasso: \n{}\n".format(lasso_reg.alpha_))
#print("Best parameters of Decision Tree: \n{}\n".format(results_tree_reg.best_params_))
#print("Best parameters of Random Tree: \n{}\n".format(results_rf_reg.best_params_))
#print("Best parameters of Stochastic Gradient Boosting: \n{}\n".format(results_sgb_reg.best_params_))

Best parameters of 

Alpha Value of Lasso: 
0.0026567506265212882

Best parameters of Decision Tree: 
{'ccp_alpha': 0.001, 'min_samples_leaf': 2}

Best parameters of Random Tree: 
{'max_features': 5, 'min_samples_leaf': 5, 'n_estimators': 500}

Best parameters of Stochastic Gradient Boosting: 
{'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 100}



### Accuracy Score and  Best Score

In [201]:
d = ["LassoCV", "Decision Tree","Random Forest","Stochastic Gradient Boosting"]
score_table = pd.DataFrame(d, columns=['Models'])

accuracy_list = [accuracy_score(p_lasso_val,(y_test)),
                 accuracy_score(results_tree_clf.best_estimator_.predict(np.array(X_test)),(y_test)),
                 accuracy_score(results_rf_clf.best_estimator_.predict(np.array(X_test)), (y_test)),
                 accuracy_score(results_sgb_clf.best_estimator_.predict(np.array(X_test)), (y_test))]

score_table['Accuracy Score'] = accuracy_list

bestscore_list = [np.nan, results_tree_clf.best_score_, results_rf_clf.best_score_, results_sgb_clf.best_score_]
score_table['Best Score'] = bestscore_list
score_table

Unnamed: 0,Models,Accuracy Score,Best Score
0,LassoCV,0.146365,
1,Decision Tree,0.583006,0.569462
2,Random Forest,0.551572,0.547966
3,Stochastic Gradient Boosting,0.602652,0.585307


In [202]:
accuracy_score(results_tree_clf.best_estimator_.predict(np.array(X_test)),(y_test))

0.5830058939096268

### Classification Report

In [203]:
from sklearn.metrics import classification_report
print('Decision Tree')
print(classification_report(y_test,results_tree_clf.best_estimator_.predict(np.array(X_test))))
print('Random Forest')
print(classification_report(y_test,results_rf_clf.best_estimator_.predict(np.array(X_test))))
print('Stochastic Gradient Boost')
print(classification_report(y_test,results_sgb_clf.best_estimator_.predict(np.array(X_test))))

Decision Tree
              precision    recall  f1-score   support

           0       0.62      0.81      0.71      1098
           1       0.00      0.00      0.00       227
           2       0.49      0.41      0.45       711

    accuracy                           0.58      2036
   macro avg       0.37      0.41      0.38      2036
weighted avg       0.51      0.58      0.54      2036

Random Forest
              precision    recall  f1-score   support

           0       0.55      0.99      0.71      1098
           1       0.00      0.00      0.00       227
           2       0.69      0.04      0.08       711

    accuracy                           0.55      2036
   macro avg       0.41      0.35      0.26      2036
weighted avg       0.54      0.55      0.41      2036

Stochastic Gradient Boost
              precision    recall  f1-score   support

           0       0.63      0.86      0.72      1098
           1       0.40      0.03      0.05       227
           2       0.

In [204]:
print(r2_score(p_lasso_val, y_test) / len(y_test))

-0.007916807588565089


# Comment Section

In this classification data set, the diabetes data set is the hardest one to manipulate compared to other ones since it was hard to choose the ones that would be dummy encoding and it was hard to determine if I should keep the null ones and fill them. As you can see, even though I personally think the weight can be related with diabetes, I dropped it since %97 was null data.

In this data set, there is 101766 rows and 50 columns. After the manipulation, number of columns increased more than 200. Thus, it took 10 minutes even for decision tree. Random forrest never ended so I couldn't run stochastic gradient boosting model. Therefore, I worked with ten percent of instances. It still took lots of time.

Accuracy scores for the models are quite low and the lasso model is far worst compared to tree-based models. Thus, I can say that lasso regressor is not enough for the classification problems. Tree-based models' accuracy scores are around 0.55 and the best predictor is stochastic gradient boosting for this particular data set with 0.60 accuracy score. 