In [1]:
import pandas as pd
import numpy as np
import pandas_profiling
import seaborn as sns
import re

from scipy import stats
from scipy.stats import skew
from sklearn import linear_model
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import LogisticRegression

import sklearn.model_selection as ms
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

pd.set_option('display.max_columns',99)
pd.set_option('display.max_rows',300)

In [2]:
test_dataset = pd.read_csv('./test_dataset.csv')
train_dataset = pd.read_csv('./train_dataset.csv')
#import datasets

In [3]:
print(train_dataset.shape)
print(test_dataset.shape)
#get shape of train and test data set

(91589, 50)
(10177, 50)


In [4]:
train_dataset['dataset'] = 'train'
test_dataset['dataset'] = 'test'
#add new column to make separating easier later

In [5]:
merged_data = pd.concat([train_dataset, test_dataset], axis=0)
merged_data.shape
#merge data set

(101766, 51)

In [6]:
df = merged_data.copy(deep=True)
#make a copy for data exploration

### Data Exploration

#### Exploring the target variable, readmitted rate

In [7]:
def summarize_feature(dataframe,feature):
    feature_summary = {'values':dataframe[feature].value_counts().index.tolist(),
                     'counts':dataframe[feature].value_counts().values.tolist(),
                     'frequency': dataframe[feature].value_counts(normalize = True).values.tolist()}
    feature_summary = pd.DataFrame(feature_summary)
    return feature_summary


In [8]:
summarize_feature(df, 'readmitted')

Unnamed: 0,values,counts,frequency
0,NO,54864,0.539119
1,>30,35545,0.349282
2,<30,11357,0.111599


In [9]:
df['readmitted'] = df['readmitted'].replace('<30', 1)
df['readmitted'] = df['readmitted'].replace('>30', 0)
df['readmitted'] = df['readmitted'].replace('NO', 0)
#replace target variable with numerical value


In [10]:
summarize_feature(df, 'readmitted')

Unnamed: 0,values,counts,frequency
0,0,90409,0.888401
1,1,11357,0.111599


### feature examination

In [11]:
summarize_feature(df, 'race')

Unnamed: 0,values,counts,frequency
0,Caucasian,76099,0.747784
1,AfricanAmerican,19210,0.188766
2,?,2273,0.022336
3,Hispanic,2037,0.020017
4,Other,1506,0.014799
5,Asian,641,0.006299


In [12]:
dict_race = {'?': -999, 'Caucasian':1, 'AfricanAmerican':2,'Asian':3,'Other':4,'Hispanic':5}
df.race.replace(dict_race, inplace=True)
#impute race feature with nominal values, '?' is imputed with -999

In [13]:
summarize_feature(df, 'race')

Unnamed: 0,values,counts,frequency
0,1,76099,0.747784
1,2,19210,0.188766
2,-999,2273,0.022336
3,5,2037,0.020017
4,4,1506,0.014799
5,3,641,0.006299


In [14]:
summarize_feature(df, 'gender')

Unnamed: 0,values,counts,frequency
0,Female,54708,0.537586
1,Male,47055,0.462384
2,Unknown/Invalid,3,2.9e-05


In [15]:
df = df[df.gender != 'Unknown/Invalid']
#only 3 unknown/invalid, drop rows

In [16]:
dict_gender = {'Female':0, 'Male':1}
df.gender.replace(dict_gender, inplace=True)
#impute gender feature with nominal value


In [17]:
summarize_feature(df, 'gender')

Unnamed: 0,values,counts,frequency
0,0,54708,0.537602
1,1,47055,0.462398


In [18]:
summarize_feature(df, 'age')

Unnamed: 0,values,counts,frequency
0,[70-80),26066,0.256144
1,[60-70),22482,0.220925
2,[50-60),17256,0.16957
3,[80-90),17197,0.168991
4,[40-50),9685,0.095172
5,[30-40),3775,0.037096
6,[90-100),2793,0.027446
7,[20-30),1657,0.016283
8,[10-20),691,0.00679
9,[0-10),161,0.001582


In [19]:
dict_age = {'[60-70)': 65, 
            '[40-50)':45, 
            '[80-90)':85,
               '[70-80)':75,
               '[50-60)':55,
               '[20-30)':25,
               '[90-100)':95,
               '[30-40)':35,
               '[0-10)':5,
               '[10-20)':15}
df.age.replace(dict_age, inplace=True)
#impute age feature with median num


In [20]:
summarize_feature(df, 'age')

Unnamed: 0,values,counts,frequency
0,75,26066,0.256144
1,65,22482,0.220925
2,55,17256,0.16957
3,85,17197,0.168991
4,45,9685,0.095172
5,35,3775,0.037096
6,95,2793,0.027446
7,25,1657,0.016283
8,15,691,0.00679
9,5,161,0.001582


In [21]:
# Creating additional columns for diagnosis
df['level1_diag1'] = df['diag_1']
df['level1_diag2'] = df['diag_2']
df['level1_diag3'] = df['diag_3']


In [22]:
df.loc[df['diag_1'].str.contains('V'), ['level1_diag1']] = 0
df.loc[df['diag_1'].str.contains('E'), ['level1_diag1']] = 0
df.loc[df['diag_2'].str.contains('V'), ['level1_diag2']] = 0
df.loc[df['diag_2'].str.contains('E'), ['level1_diag2']] = 0
df.loc[df['diag_3'].str.contains('V'), ['level1_diag3']] = 0
df.loc[df['diag_3'].str.contains('E'), ['level1_diag3']] = 0
df['level1_diag1'] = df['level1_diag1'].replace('?', -1)
df['level1_diag2'] = df['level1_diag2'].replace('?', -1)
df['level1_diag3'] = df['level1_diag3'].replace('?', -1)


In [23]:
df['level1_diag1'] = df['level1_diag1'].astype(float)
df['level1_diag2'] = df['level1_diag2'].astype(float)
df['level1_diag3'] = df['level1_diag3'].astype(float)


In [None]:
for index, row in df.iterrows():
    if (row['level1_diag1'] >= 390 and row['level1_diag1'] < 460) or (np.floor(row['level1_diag1']) == 785):
        df.loc[index, 'level1_diag1'] = 1
    elif (row['level1_diag1'] >= 460 and row['level1_diag1'] < 520) or (np.floor(row['level1_diag1']) == 786):
        df.loc[index, 'level1_diag1'] = 2
    elif (row['level1_diag1'] >= 520 and row['level1_diag1'] < 580) or (np.floor(row['level1_diag1']) == 787):
        df.loc[index, 'level1_diag1'] = 3
    elif (np.floor(row['level1_diag1']) == 250):
        df.loc[index, 'level1_diag1'] = 4
    elif (row['level1_diag1'] >= 800 and row['level1_diag1'] < 1000):
        df.loc[index, 'level1_diag1'] = 5
    elif (row['level1_diag1'] >= 710 and row['level1_diag1'] < 740):
        df.loc[index, 'level1_diag1'] = 6
    elif (row['level1_diag1'] >= 580 and row['level1_diag1'] < 630) or (np.floor(row['level1_diag1']) == 788):
        df.loc[index, 'level1_diag1'] = 7
    elif (row['level1_diag1'] >= 140 and row['level1_diag1'] < 240):
        df.loc[index, 'level1_diag1'] = 8
    else:
        df.loc[index, 'level1_diag1'] = 0
        
    if (row['level1_diag2'] >= 390 and row['level1_diag2'] < 460) or (np.floor(row['level1_diag2']) == 785):
        df.loc[index, 'level1_diag2'] = 1
    elif (row['level1_diag2'] >= 460 and row['level1_diag2'] < 520) or (np.floor(row['level1_diag2']) == 786):
        df.loc[index, 'level1_diag2'] = 2
    elif (row['level1_diag2'] >= 520 and row['level1_diag2'] < 580) or (np.floor(row['level1_diag2']) == 787):
        df.loc[index, 'level1_diag2'] = 3
    elif (np.floor(row['level1_diag2']) == 250):
        df.loc[index, 'level1_diag2'] = 4
    elif (row['level1_diag2'] >= 800 and row['level1_diag2'] < 1000):
        df.loc[index, 'level1_diag2'] = 5
    elif (row['level1_diag2'] >= 710 and row['level1_diag2'] < 740):
        df.loc[index, 'level1_diag2'] = 6
    elif (row['level1_diag2'] >= 580 and row['level1_diag2'] < 630) or (np.floor(row['level1_diag2']) == 788):
        df.loc[index, 'level1_diag2'] = 7
    elif (row['level1_diag2'] >= 140 and row['level1_diag2'] < 240):
        df.loc[index, 'level1_diag2'] = 8
    else:
        df.loc[index, 'level1_diag2'] = 0
    
    if (row['level1_diag3'] >= 390 and row['level1_diag3'] < 460) or (np.floor(row['level1_diag3']) == 785):
        df.loc[index, 'level1_diag3'] = 1
    elif (row['level1_diag3'] >= 460 and row['level1_diag3'] < 520) or (np.floor(row['level1_diag3']) == 786):
        df.loc[index, 'level1_diag3'] = 2
    elif (row['level1_diag3'] >= 520 and row['level1_diag3'] < 580) or (np.floor(row['level1_diag3']) == 787):
        df.loc[index, 'level1_diag3'] = 3
    elif (np.floor(row['level1_diag3']) == 250):
        df.loc[index, 'level1_diag3'] = 4
    elif (row['level1_diag3'] >= 800 and row['level1_diag3'] < 1000):
        df.loc[index, 'level1_diag3'] = 5
    elif (row['level1_diag3'] >= 710 and row['level1_diag3'] < 740):
        df.loc[index, 'level1_diag3'] = 6
    elif (row['level1_diag3'] >= 580 and row['level1_diag3'] < 630) or (np.floor(row['level1_diag3']) == 788):
        df.loc[index, 'level1_diag3'] = 7
    elif (row['level1_diag3'] >= 140 and row['level1_diag3'] < 240):
        df.loc[index, 'level1_diag3'] = 8
    else:
        df.loc[index, 'level1_diag3'] = 0

In [25]:
df.to_csv('./temp_merged_data_preprocessed.csv')
#above line takes time to process, making a temp csv

In [26]:
df = pd.read_csv('./temp_merged_data_preprocessed.csv')
#read the temp file back in

In [27]:
summarize_feature(df,'max_glu_serum')

Unnamed: 0,values,counts,frequency
0,,96417,0.947466
1,Norm,2597,0.02552
2,>200,1485,0.014593
3,>300,1264,0.012421


In [28]:
dict_max_glu_seru = {'None': -999, 
                     'Norm':0, 
                     '>300':2,
                    '>200':1}
df.max_glu_serum.replace(dict_max_glu_seru, inplace=True)
#replace max_glu_serum with numerical values

In [29]:
summarize_feature(df,'max_glu_serum')
#check again

Unnamed: 0,values,counts,frequency
0,-999,96417,0.947466
1,0,2597,0.02552
2,1,1485,0.014593
3,2,1264,0.012421


In [30]:
summarize_feature(df,'A1Cresult')

Unnamed: 0,values,counts,frequency
0,,84745,0.832768
1,>8,8216,0.080737
2,Norm,4990,0.049036
3,>7,3812,0.03746


In [31]:
dict_A1Cresult = {'None': -999, 
                     'Norm':0, 
                     '>7':1,
                    '>8':2}
df.A1Cresult.replace(dict_A1Cresult, inplace=True)
#replace A1C with numerical values

In [32]:
summarize_feature(df,'A1Cresult')
#check again

Unnamed: 0,values,counts,frequency
0,-999,84745,0.832768
1,2,8216,0.080737
2,0,4990,0.049036
3,1,3812,0.03746


In [33]:
#here we want to note any changes in the dosage of medicine.
keys = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide', 'metformin-pioglitazone','metformin-rosiglitazone', 'glimepiride-pioglitazone', 'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide']
for col in keys:
    colname = str(col) + 'temp'
    df[colname] = df[col].apply(lambda x: 0 if (x == 'No' or x == 'Steady') else 1)
df['med_dosage_change'] = 0
for col in keys:
    colname = str(col) + 'temp'
    df['med_dosage_change'] = df['med_dosage_change'] + df[colname]
    del df[colname]
#here 0 means no changes in doses of medicine, >0 means there is increase or decrease in dosage

In [34]:
df['number_of_medicine'] = 0
for col in keys:
    df['number_of_medicine'] = df['number_of_medicine'] + df[col].apply(lambda x: 0 if (x == 'No') else 1)
#new column to account for the total number of medicine patient is taking

In [35]:
drug_df = df.loc[:, 'metformin':'diabetesMed']
for drug in list(drug_df):
    print('\n'+drug)
    print('-'*50)
    n = summarize_feature(drug_df,  drug)
    print(n)
#values in drug list is has similar values, make 1 library


metformin
--------------------------------------------------
   values  counts  frequency
0      No   81776   0.803593
1  Steady   18345   0.180272
2      Up    1067   0.010485
3    Down     575   0.005650

repaglinide
--------------------------------------------------
   values  counts  frequency
0      No  100224   0.984877
1  Steady    1384   0.013600
2      Up     110   0.001081
3    Down      45   0.000442

nateglinide
--------------------------------------------------
   values  counts  frequency
0      No  101060   0.993092
1  Steady     668   0.006564
2      Up      24   0.000236
3    Down      11   0.000108

chlorpropamide
--------------------------------------------------
   values  counts  frequency
0      No  101677   0.999155
1  Steady      79   0.000776
2      Up       6   0.000059
3    Down       1   0.000010

glimepiride
--------------------------------------------------
   values  counts  frequency
0      No   96572   0.948989
1  Steady    4670   0.045891
2      Up   

In [36]:
dict_drug = {'No': 0, 
            'Down':1, 
            'Steady':2,
            'Up':3,
             'Ch':1,
             'Yes':1
            }
#replace drug text with numerical value

In [37]:
drug_df = df.loc[:, 'metformin':'diabetesMed']

In [38]:
for drug in drug_df:
    drug_df[drug] = drug_df[drug].map(dict_drug)


In [39]:
df.loc[:, 'metformin':'diabetesMed'] = drug_df

In [40]:
df

Unnamed: 0.1,Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,dataset,level1_diag1,level1_diag2,level1_diag3,med_dosage_change,number_of_medicine
0,0,7733208,3291489,1,0,65,?,1,1,7,2,?,Cardiology,51,3,11,0,0,0,786,530,250,4,-999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,train,4.0,7.0,0.0,0,0
1,1,152449578,84529188,1,1,45,?,1,2,7,3,HM,?,86,1,15,1,0,1,511,276,276,9,-999,-999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,train,0.0,1.0,0.0,0,0
2,2,440311646,121372727,1,1,45,?,1,3,7,13,?,?,88,5,34,0,0,0,507,453,518,9,-999,-999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,train,6.0,6.0,6.0,1,1
3,3,106684962,24066279,1,0,85,?,5,3,17,4,MC,Orthopedics-Reconstructive,18,2,17,3,2,0,820,285,428,9,0,-999,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,1,1,0,train,2.0,0.0,0.0,0,3
4,4,139779162,86645961,2,1,65,?,1,4,1,3,?,?,22,0,11,1,0,2,428,491,295,6,-999,-999,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,1,1,0,train,1.0,7.0,1.0,0,3
5,5,223277988,59559255,1,1,75,?,3,22,1,5,MC,?,68,1,25,0,1,0,822,585,427,9,-999,-999,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,train,8.0,4.0,0.0,0,2
6,6,127005696,72034335,1,1,85,?,3,1,1,4,?,Surgery-Thoracic,66,4,21,1,0,1,414,250,440,6,-999,-999,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,train,3.0,4.0,1.0,0,2
7,7,37320456,23560803,1,0,65,?,5,3,17,6,?,Family/GeneralPractice,17,0,9,0,1,0,331,250.01,294,3,0,-999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,train,5.0,0.0,0.0,1,1
8,8,144033276,6382269,2,1,55,?,1,1,7,2,CP,InternalMedicine,43,6,13,0,0,0,414,593,250,6,-999,-999,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,1,0,train,2.0,1.0,0.0,0,2
9,9,166008048,89445645,1,1,75,?,2,1,1,2,?,InternalMedicine,52,3,13,1,0,0,410,414,250,9,-999,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,train,4.0,1.0,0.0,0,1


### Missing Values

In [41]:
df = df.replace('?', np.nan)
#saw in dataset missing value or unknown has '?', will replace with nan


In [42]:
df.isnull().sum().sort_values(ascending=False).head(10)
#list col with missing values


weight                98566
medical_specialty     49947
payer_code            40255
diag_3                 1423
diag_2                  358
diag_1                   21
number_of_medicine        0
num_medications           0
metformin                 0
A1Cresult                 0
dtype: int64

In [43]:
df = df.drop(['weight', 'payer_code', 'medical_specialty'], axis = 1)
#dropping columns with too many missing values

df = df.drop(['examide', 'citoglipton'], axis = 1)
#these 2 drugs were all 'no'


In [44]:
df = df.loc[~df.discharge_disposition_id.isin([11,13,14,19,20,21])]
#11, 13, 14, 19, 20, 21 are patients who are expired so no re-admission possible


In [45]:
df.columns

Index(['Unnamed: 0', 'encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'dataset', 'level1_diag1', 'level1_diag2', 'level1_diag3',
       'med_dosage_change', 'number_of_medicine'],
      dtype='object')

In [46]:
df = df.drop(['diag_1', 
              'diag_2', 
              'diag_3'], axis = 1)
#remove the columns that were engineered into other columns


In [47]:
df.shape

(99340, 49)

In [48]:
df['random'] = np.random.randint(-999, 999, df.shape[0])
df.random

0         785
1          44
2         651
3        -878
4         631
5        -864
6        -787
7         843
8         756
9         375
10       -382
11         23
12        674
13        722
14        -85
15       -470
16        488
17       -242
18       -468
19       -413
20        839
22       -406
23       -317
24        337
25       -495
26        268
27       -590
28        247
29        589
30       -100
31        205
32        906
33       -306
34        505
35       -225
36        642
37        594
38       -862
39         11
40         75
41        399
42        955
43        156
44        174
45        650
46        576
47        834
48         99
49        846
50       -758
51       -932
52        678
53       -733
54       -609
55        893
56       -805
57        784
58        -36
59        527
60        -96
61        232
62        572
63       -346
64       -824
65       -652
66       -285
67        495
68       -135
69        946
70       -860
71       -544
73    

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99340 entries, 0 to 101762
Data columns (total 50 columns):
Unnamed: 0                  99340 non-null int64
encounter_id                99340 non-null int64
patient_nbr                 99340 non-null int64
race                        99340 non-null int64
gender                      99340 non-null int64
age                         99340 non-null int64
admission_type_id           99340 non-null int64
discharge_disposition_id    99340 non-null int64
admission_source_id         99340 non-null int64
time_in_hospital            99340 non-null int64
num_lab_procedures          99340 non-null int64
num_procedures              99340 non-null int64
num_medications             99340 non-null int64
number_outpatient           99340 non-null int64
number_emergency            99340 non-null int64
number_inpatient            99340 non-null int64
number_diagnoses            99340 non-null int64
max_glu_serum               99340 non-null int64
A1Cresul

In [50]:
df.columns

Index(['Unnamed: 0', 'encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'dataset', 'level1_diag1', 'level1_diag2', 'level1_diag3',
       'med_dosage_change', 'number_of_medicine', 'random'],
      dtype='object')

In [51]:
#convert to object dtype
df[['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'dataset', 'level1_diag1', 'level1_diag2', 'level1_diag3']] = df[['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'dataset', 'level1_diag1', 'level1_diag2', 'level1_diag3']].astype('object')


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99340 entries, 0 to 101762
Data columns (total 50 columns):
Unnamed: 0                  99340 non-null int64
encounter_id                99340 non-null object
patient_nbr                 99340 non-null object
race                        99340 non-null object
gender                      99340 non-null object
age                         99340 non-null object
admission_type_id           99340 non-null object
discharge_disposition_id    99340 non-null object
admission_source_id         99340 non-null object
time_in_hospital            99340 non-null int64
num_lab_procedures          99340 non-null int64
num_procedures              99340 non-null int64
num_medications             99340 non-null int64
number_outpatient           99340 non-null int64
number_emergency            99340 non-null int64
number_inpatient            99340 non-null int64
number_diagnoses            99340 non-null int64
max_glu_serum               99340 non-null object

In [53]:
df.to_csv('./merged_data_preprocessed.csv')

In [54]:
df.drop

<bound method DataFrame.drop of         Unnamed: 0 encounter_id patient_nbr  race gender age  \
0                0      7733208     3291489     1      0  65   
1                1    152449578    84529188     1      1  45   
2                2    440311646   121372727     1      1  45   
3                3    106684962    24066279     1      0  85   
4                4    139779162    86645961     2      1  65   
5                5    223277988    59559255     1      1  75   
6                6    127005696    72034335     1      1  85   
7                7     37320456    23560803     1      0  65   
8                8    144033276     6382269     2      1  55   
9                9    166008048    89445645     1      1  75   
10              10    255458850    84752271     1      0  55   
11              11     80899056    21250872     1      0  85   
12              12     30278250    74280573     1      0  85   
13              13     25723410      105264     1      1  75   
14      

In [55]:
df.to_csv('./merged_data_preprocessed.csv')

In [66]:
df = pd.read_csv('./merged_data_preprocessed.csv')

In [67]:
df_train = df[df.dataset=='train']
df_train = df_train.drop(columns='dataset')
df_train.sample(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,level1_diag1,level1_diag2,level1_diag3,med_dosage_change,number_of_medicine,random
78380,80297,80299,53468418,17273673,1,1,75,1,18,7,2,44,0,17,0,0,0,9,-999,-999,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2.0,0.0,0.0,0,1,-429
7448,7646,7646,252983448,44325009,1,1,75,3,1,1,1,18,6,11,0,0,0,8,-999,-999,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,1,1,0.0,7.0,1.0,1,2,-397
85107,87180,87182,88738794,7467201,1,0,85,2,3,1,5,40,0,14,0,0,0,5,-999,-999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0.0,0.0,1.0,0,1,725
28132,28820,28822,106475526,7948755,-999,0,75,2,1,1,3,29,0,9,0,0,0,5,-999,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,1,0,3.0,1.0,4.0,0,3,567
74565,76391,76393,229254690,85166847,1,1,85,2,6,1,6,49,5,9,7,0,0,9,-999,-999,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3.0,0.0,3.0,0,1,-374


In [68]:
df_test = df[df.dataset=='test']
df_test = df_test.drop(columns='dataset')
df_test.sample(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,level1_diag1,level1_diag2,level1_diag3,med_dosage_change,number_of_medicine,random
93597,95892,4306,32707902,389637,2,0,45,1,1,7,7,59,0,8,0,0,0,3,-999,-999,3,0,0,0,2,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,1,0,0.0,4.0,1.0,1,3,-591
97425,99809,8223,37662246,18101385,1,0,65,1,1,7,6,41,0,11,0,0,0,8,-999,-999,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,2.0,0.0,0,1,-709
99153,101572,9986,254180364,37176165,1,1,65,3,1,1,2,33,4,22,0,0,1,9,-999,-999,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,6.0,4.0,0.0,0,1,529
94749,97071,5485,46282254,10350018,1,1,65,1,18,7,3,49,1,7,0,0,1,9,-999,-999,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2.0,1.0,2.0,0,1,533
91368,93600,2014,383093114,45575883,1,0,75,2,4,7,2,49,0,16,0,1,1,9,-999,-999,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,4.0,0.0,1.0,1,2,386


In [71]:
df_test.to_csv('./df_test_cleaned.csv')
df_train.to_csv('./df_train_cleaned.csv')

In [63]:
df_test.sample(5)

Unnamed: 0.1,Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,level1_diag1,level1_diag2,level1_diag3,med_dosage_change,number_of_medicine,random
97923,6337,36973062,380691,1,1,75,1,1,7,3,49,4,15,0,0,0,5,-999,-999,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,690
98170,6584,173914326,25401933,1,0,65,2,1,1,9,40,4,39,0,0,0,9,-999,-999,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,1,0,8,8,0,0,3,-53
95238,3652,166865976,80618877,1,1,85,1,3,7,9,86,0,25,0,0,2,9,-999,-999,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,1,0,0,2,2,0,2,-714
98776,7190,131691174,73911672,1,0,65,8,1,1,1,1,3,20,0,0,0,9,-999,-999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,949
94189,2603,227609100,67682817,1,0,55,2,1,1,4,1,4,8,1,0,0,9,-999,-999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,3,0,0,0,-787


In [58]:
y_test_dataset = df_test.readmitted
X_test_dataset = df_test.drop(columns='readmitted')

In [59]:
y_train_dataset = df_train.readmitted
X_train_dataset = df_train.drop(columns='readmitted')

In [60]:
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [61]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_dataset, y_train_dataset, test_size=0.2)


In [62]:
#logistic regression

from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression(C=1e4)
logistic.fit(X_train, y_train)
logistic.score(X_train, y_train)



ValueError: Unknown label type: 'unknown'

In [52]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, logistic.predict(X_train))

array([[63340,     0],
       [ 8187,     0]])

In [53]:
y_train.value_counts()

0    63340
1     8187
Name: readmitted, dtype: int64

In [54]:
logistic.score(X_valid, y_valid)

0.8885471423778101

In [55]:
y_valid_predict = logistic.predict(X_valid)


In [56]:
pd.crosstab(pd.Series(y_valid, name = 'Actual'), pd.Series(y_valid_predict, name = 'Predict'), margins = True)


Predict,0,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3087,3087
1,395,395
All,3482,3482


In [61]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score


In [62]:
thresh = 0.5


In [64]:
def print_report(y_actual, y_pred, thresh):
    
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity

In [69]:
y_train_preds = logistic.predict_proba(X_train)
y_valid_preds = logistic.predict_proba(X_valid)

print('Logistic Regression')
print('Training:')
lr_train_auc, lr_train_accuracy, lr_train_recall, lr_train_precision, lr_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, lr_valid_precision, lr_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Logistic Regression
Training:


ValueError: bad input shape (71527, 2)