In [1]:
import pandas as pd
import numpy as np
import pandas_profiling
import seaborn as sns
import re

from scipy import stats
from scipy.stats import skew
from sklearn import linear_model
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import LogisticRegression

import sklearn.model_selection as ms
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

pd.set_option('display.max_columns',99)
pd.set_option('display.max_rows',300)

In [2]:
test_dataset = pd.read_csv('./test_dataset.csv')
train_dataset = pd.read_csv('./train_dataset.csv')
#import datasets

In [3]:
print(train_dataset.shape)
print(test_dataset.shape)
#get shape of train and test data set

(91589, 50)
(10177, 50)


In [4]:
train_dataset['dataset'] = 'train'
test_dataset['dataset'] = 'test'
#add new column to make separating easier later

In [5]:
merged_data = pd.concat([train_dataset, test_dataset], axis=0)
merged_data.shape
#merge data set

(101766, 51)

In [6]:
df = merged_data.copy(deep=True)
#make a copy for data exploration

### Data Exploration

#### Exploring the target variable, readmitted rate

In [2]:
def summarize_feature(dataframe,feature):
    feature_summary = {'values':dataframe[feature].value_counts().index.tolist(),
                     'counts':dataframe[feature].value_counts().values.tolist(),
                     'frequency': dataframe[feature].value_counts(normalize = True).values.tolist()}
    feature_summary = pd.DataFrame(feature_summary)
    return feature_summary


In [8]:
summarize_feature(df, 'readmitted')

Unnamed: 0,values,counts,frequency
0,NO,54864,0.539119
1,>30,35545,0.349282
2,<30,11357,0.111599


In [10]:
df['readmitted'] = df['readmitted'].replace('<30', 1)
df['readmitted'] = df['readmitted'].replace('>30', 0)
df['readmitted'] = df['readmitted'].replace('NO', 0)
#replace target variable with numerical value


In [11]:
summarize_feature(df, 'readmitted')

Unnamed: 0,values,counts,frequency
0,0,90409,0.888401
1,1,11357,0.111599


### feature examination

In [12]:
summarize_feature(df, 'race')

Unnamed: 0,values,counts,frequency
0,Caucasian,76099,0.747784
1,AfricanAmerican,19210,0.188766
2,?,2273,0.022336
3,Hispanic,2037,0.020017
4,Other,1506,0.014799
5,Asian,641,0.006299


In [13]:
dict_race = {'?': -999, 'Caucasian':1, 'AfricanAmerican':2,'Asian':3,'Other':4,'Hispanic':5}
df.race.replace(dict_race, inplace=True)
#impute race feature with nominal values, '?' is imputed with -999

In [14]:
summarize_feature(df, 'race')

Unnamed: 0,values,counts,frequency
0,1,76099,0.747784
1,2,19210,0.188766
2,-999,2273,0.022336
3,5,2037,0.020017
4,4,1506,0.014799
5,3,641,0.006299


In [15]:
summarize_feature(df, 'gender')

Unnamed: 0,values,counts,frequency
0,Female,54708,0.537586
1,Male,47055,0.462384
2,Unknown/Invalid,3,2.9e-05


In [16]:
df = df[df.gender != 'Unknown/Invalid']
#only 3 unknown/invalid, drop rows

In [13]:
dict_gender = {'Female':0, 'Male':1}
df.gender.replace(dict_gender, inplace=True)
#impute gender feature with nominal value


In [17]:
summarize_feature(df, 'gender')

Unnamed: 0,values,counts,frequency
0,Female,54708,0.537602
1,Male,47055,0.462398


In [18]:
summarize_feature(df, 'age')

Unnamed: 0,values,counts,frequency
0,[70-80),26066,0.256144
1,[60-70),22482,0.220925
2,[50-60),17256,0.16957
3,[80-90),17197,0.168991
4,[40-50),9685,0.095172
5,[30-40),3775,0.037096
6,[90-100),2793,0.027446
7,[20-30),1657,0.016283
8,[10-20),691,0.00679
9,[0-10),161,0.001582


In [19]:
dict_age = {'[60-70)': 65, 
            '[40-50)':45, 
            '[80-90)':85,
               '[70-80)':75,
               '[50-60)':55,
               '[20-30)':25,
               '[90-100)':95,
               '[30-40)':35,
               '[0-10)':5,
               '[10-20)':15}
df.age.replace(dict_age, inplace=True)
#impute age feature with median num


In [21]:
summarize_feature(df, 'age')

Unnamed: 0,values,counts,frequency
0,75,26066,0.256144
1,65,22482,0.220925
2,55,17256,0.16957
3,85,17197,0.168991
4,45,9685,0.095172
5,35,3775,0.037096
6,95,2793,0.027446
7,25,1657,0.016283
8,15,691,0.00679
9,5,161,0.001582


In [22]:
# Creating additional columns for diagnosis
df['level1_diag1'] = df['diag_1']
df['level1_diag2'] = df['diag_2']
df['level1_diag3'] = df['diag_3']


In [23]:
df.loc[df['diag_1'].str.contains('V'), ['level1_diag1']] = 0
df.loc[df['diag_1'].str.contains('E'), ['level1_diag1']] = 0
df.loc[df['diag_2'].str.contains('V'), ['level1_diag2']] = 0
df.loc[df['diag_2'].str.contains('E'), ['level1_diag2']] = 0
df.loc[df['diag_3'].str.contains('V'), ['level1_diag3']] = 0
df.loc[df['diag_3'].str.contains('E'), ['level1_diag3']] = 0
df['level1_diag1'] = df['level1_diag1'].replace('?', -1)
df['level1_diag2'] = df['level1_diag2'].replace('?', -1)
df['level1_diag3'] = df['level1_diag3'].replace('?', -1)


In [24]:
df['level1_diag1'] = df['level1_diag1'].astype(float)
df['level1_diag2'] = df['level1_diag2'].astype(float)
df['level1_diag3'] = df['level1_diag3'].astype(float)


In [25]:
for index, row in df.iterrows():
    if (row['level1_diag1'] >= 390 and row['level1_diag1'] < 460) or (np.floor(row['level1_diag1']) == 785):
        df.loc[index, 'level1_diag1'] = 1
    elif (row['level1_diag1'] >= 460 and row['level1_diag1'] < 520) or (np.floor(row['level1_diag1']) == 786):
        df.loc[index, 'level1_diag1'] = 2
    elif (row['level1_diag1'] >= 520 and row['level1_diag1'] < 580) or (np.floor(row['level1_diag1']) == 787):
        df.loc[index, 'level1_diag1'] = 3
    elif (np.floor(row['level1_diag1']) == 250):
        df.loc[index, 'level1_diag1'] = 4
    elif (row['level1_diag1'] >= 800 and row['level1_diag1'] < 1000):
        df.loc[index, 'level1_diag1'] = 5
    elif (row['level1_diag1'] >= 710 and row['level1_diag1'] < 740):
        df.loc[index, 'level1_diag1'] = 6
    elif (row['level1_diag1'] >= 580 and row['level1_diag1'] < 630) or (np.floor(row['level1_diag1']) == 788):
        df.loc[index, 'level1_diag1'] = 7
    elif (row['level1_diag1'] >= 140 and row['level1_diag1'] < 240):
        df.loc[index, 'level1_diag1'] = 8
    else:
        df.loc[index, 'level1_diag1'] = 0
        
    if (row['level1_diag2'] >= 390 and row['level1_diag2'] < 460) or (np.floor(row['level1_diag2']) == 785):
        df.loc[index, 'level1_diag2'] = 1
    elif (row['level1_diag2'] >= 460 and row['level1_diag2'] < 520) or (np.floor(row['level1_diag2']) == 786):
        df.loc[index, 'level1_diag2'] = 2
    elif (row['level1_diag2'] >= 520 and row['level1_diag2'] < 580) or (np.floor(row['level1_diag2']) == 787):
        df.loc[index, 'level1_diag2'] = 3
    elif (np.floor(row['level1_diag2']) == 250):
        df.loc[index, 'level1_diag2'] = 4
    elif (row['level1_diag2'] >= 800 and row['level1_diag2'] < 1000):
        df.loc[index, 'level1_diag2'] = 5
    elif (row['level1_diag2'] >= 710 and row['level1_diag2'] < 740):
        df.loc[index, 'level1_diag2'] = 6
    elif (row['level1_diag2'] >= 580 and row['level1_diag2'] < 630) or (np.floor(row['level1_diag2']) == 788):
        df.loc[index, 'level1_diag2'] = 7
    elif (row['level1_diag2'] >= 140 and row['level1_diag2'] < 240):
        df.loc[index, 'level1_diag2'] = 8
    else:
        df.loc[index, 'level1_diag2'] = 0
    
    if (row['level1_diag3'] >= 390 and row['level1_diag3'] < 460) or (np.floor(row['level1_diag3']) == 785):
        df.loc[index, 'level1_diag3'] = 1
    elif (row['level1_diag3'] >= 460 and row['level1_diag3'] < 520) or (np.floor(row['level1_diag3']) == 786):
        df.loc[index, 'level1_diag3'] = 2
    elif (row['level1_diag3'] >= 520 and row['level1_diag3'] < 580) or (np.floor(row['level1_diag3']) == 787):
        df.loc[index, 'level1_diag3'] = 3
    elif (np.floor(row['level1_diag3']) == 250):
        df.loc[index, 'level1_diag3'] = 4
    elif (row['level1_diag3'] >= 800 and row['level1_diag3'] < 1000):
        df.loc[index, 'level1_diag3'] = 5
    elif (row['level1_diag3'] >= 710 and row['level1_diag3'] < 740):
        df.loc[index, 'level1_diag3'] = 6
    elif (row['level1_diag3'] >= 580 and row['level1_diag3'] < 630) or (np.floor(row['level1_diag3']) == 788):
        df.loc[index, 'level1_diag3'] = 7
    elif (row['level1_diag3'] >= 140 and row['level1_diag3'] < 240):
        df.loc[index, 'level1_diag3'] = 8
    else:
        df.loc[index, 'level1_diag3'] = 0

In [3]:
df.to_csv('./temp_merged_data_preprocessed.csv')
#above line takes time to process, making a temp csv

NameError: name 'df' is not defined

In [5]:
df = pd.read_csv('./temp_merged_data_preprocessed.csv')
#read the temp file back in

FileNotFoundError: File b'./temp_merged_data_preprocessed.csv' does not exist

In [22]:
summarize_feature(df,'max_glu_serum')

Unnamed: 0,values,counts,frequency
0,,96417,0.947466
1,Norm,2597,0.02552
2,>200,1485,0.014593
3,>300,1264,0.012421


In [23]:
dict_max_glu_seru = {'None': -999, 
                     'Norm':0, 
                     '>300':2,
                    '>200':1}
df.max_glu_serum.replace(dict_max_glu_seru, inplace=True)
#replace max_glu_serum with numerical values

In [24]:
summarize_feature(df,'max_glu_serum')
#check again

Unnamed: 0,values,counts,frequency
0,-999,96417,0.947466
1,0,2597,0.02552
2,1,1485,0.014593
3,2,1264,0.012421


In [25]:
summarize_feature(df,'A1Cresult')

Unnamed: 0,values,counts,frequency
0,,84745,0.832768
1,>8,8216,0.080737
2,Norm,4990,0.049036
3,>7,3812,0.03746


In [26]:
dict_A1Cresult = {'None': -999, 
                     'Norm':0, 
                     '>7':1,
                    '>8':2}
df.A1Cresult.replace(dict_A1Cresult, inplace=True)
#replace A1C with numerical values

In [27]:
summarize_feature(df,'A1Cresult')
#check again

Unnamed: 0,values,counts,frequency
0,-999,84745,0.832768
1,2,8216,0.080737
2,0,4990,0.049036
3,1,3812,0.03746


In [28]:
#here we want to note any changes in the dosage of medicine.
keys = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide', 'metformin-pioglitazone','metformin-rosiglitazone', 'glimepiride-pioglitazone', 'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide']
for col in keys:
    colname = str(col) + 'temp'
    df[colname] = df[col].apply(lambda x: 0 if (x == 'No' or x == 'Steady') else 1)
df['med_dosage_change'] = 0
for col in keys:
    colname = str(col) + 'temp'
    df['med_dosage_change'] = df['med_dosage_change'] + df[colname]
    del df[colname]
#here 0 means no changes in doses of medicine, >0 means there is increase or decrease in dosage

In [32]:
df['number_of_medicine'] = 0
for col in keys:
    df['number_of_medicine'] = df['number_of_medicine'] + df[col].apply(lambda x: 0 if (x == 'No') else 1)
#new column to account for the total number of medicine patient is taking

In [34]:
drug_df = df.loc[:, 'metformin':'diabetesMed']
for drug in list(drug_df):
    print('\n'+drug)
    print('-'*50)
    n = summarize_feature(drug_df,  drug)
    print(n)
#values in drug list is has similar values, make 1 library


metformin
--------------------------------------------------
   values  counts  frequency
0      No   81776   0.803593
1  Steady   18345   0.180272
2      Up    1067   0.010485
3    Down     575   0.005650

repaglinide
--------------------------------------------------
   values  counts  frequency
0      No  100224   0.984877
1  Steady    1384   0.013600
2      Up     110   0.001081
3    Down      45   0.000442

nateglinide
--------------------------------------------------
   values  counts  frequency
0      No  101060   0.993092
1  Steady     668   0.006564
2      Up      24   0.000236
3    Down      11   0.000108

chlorpropamide
--------------------------------------------------
   values  counts  frequency
0      No  101677   0.999155
1  Steady      79   0.000776
2      Up       6   0.000059
3    Down       1   0.000010

glimepiride
--------------------------------------------------
   values  counts  frequency
0      No   96572   0.948989
1  Steady    4670   0.045891
2      Up   

In [40]:
dict_drug = {'No': 0, 
            'Down':1, 
            'Steady':2,
            'Up':3,
             'Ch':1,
             'Yes':1
            }
#replace drug text with numerical value

In [41]:
drug_df = df.loc[:, 'metformin':'diabetesMed']

In [45]:
for drug in df:
    df[drug] = df[drug].map(dict_drug)


In [47]:
df

Unnamed: 0.1,Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,dataset,level1_diag1,level1_diag2,level1_diag3,numchange,nummed
0,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,1,1,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,1,1,,,,,,,
5,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,,,,,,,
6,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,1,,,,,,,
7,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,,,,,,,
8,,,,,,,,,,,,,,,,,,,,,,,,,,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,1,,,,,,,
9,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,,,,,


### Missing Values

In [31]:
df = df.replace('?', np.nan)
#saw in dataset missing value or unknown has '?', will replace with nan


In [32]:
df.isnull().sum().sort_values(ascending=False).head(10)
#list col with missing values


weight               98569
medical_specialty    49949
payer_code           40256
diag_3                1423
diag_2                 358
diag_1                  21
level1_diag3             0
num_medications          0
metformin                0
A1Cresult                0
dtype: int64

In [33]:
df = df.drop(['weight', 'payer_code', 'medical_specialty'], axis = 1)
#dropping columns with too many missing values

df = df.drop(['examide', 'citoglipton'], axis = 1)
#these 2 drugs were all 'no'


In [34]:
df = df.loc[~df.discharge_disposition_id.isin([11,13,14,19,20,21])]
#11, 13, 14, 19, 20, 21 are patients who are expired so no re-admission possible


In [35]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'dataset', 'level1_diag1', 'level1_diag2', 'level1_diag3'],
      dtype='object')

In [36]:
df = df.drop(['diag_1', 
              'diag_2', 
              'diag_3'], axis = 1)
#remove the columns that were engineered into other columns


In [37]:
df.shape

(99343, 46)

In [38]:
df['random'] = np.random.randint(-999, 999, df.shape[0])
df.random

0        179
1        270
2       -213
3         83
4        380
5        490
6        972
7       -855
8        527
9        898
10       -32
11      -146
12       321
13       165
14      -300
15        47
16      -707
17       389
18      -105
19       -86
20      -857
22       588
23      -145
24       388
25       886
26       889
27       464
28       624
29       807
30       440
        ... 
10146   -118
10147    266
10148    933
10149   -176
10150    422
10151   -545
10152   -105
10153   -170
10154    317
10155    289
10156    734
10157    694
10158    161
10159    395
10160    154
10161   -266
10162    454
10163    873
10164    430
10165   -481
10166    185
10167    559
10168   -514
10169    112
10170    339
10171    922
10172   -511
10174      4
10175    106
10176    -29
Name: random, Length: 99343, dtype: int64

In [39]:
df.to_csv('./merged_data_preprocessed.csv')

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99343 entries, 0 to 10176
Data columns (total 47 columns):
encounter_id                99343 non-null int64
patient_nbr                 99343 non-null int64
race                        99343 non-null int64
gender                      99343 non-null int64
age                         99343 non-null int64
admission_type_id           99343 non-null int64
discharge_disposition_id    99343 non-null int64
admission_source_id         99343 non-null int64
time_in_hospital            99343 non-null int64
num_lab_procedures          99343 non-null int64
num_procedures              99343 non-null int64
num_medications             99343 non-null int64
number_outpatient           99343 non-null int64
number_emergency            99343 non-null int64
number_inpatient            99343 non-null int64
number_diagnoses            99343 non-null int64
max_glu_serum               99343 non-null int64
A1Cresult                   99343 non-null int64
metformin

In [41]:
df.isnull().sum().sort_values(ascending=False)


random                      0
num_medications             0
nateglinide                 0
repaglinide                 0
metformin                   0
A1Cresult                   0
max_glu_serum               0
number_diagnoses            0
number_inpatient            0
number_emergency            0
number_outpatient           0
num_procedures              0
glimepiride                 0
num_lab_procedures          0
time_in_hospital            0
admission_source_id         0
discharge_disposition_id    0
admission_type_id           0
age                         0
gender                      0
race                        0
patient_nbr                 0
chlorpropamide              0
acetohexamide               0
level1_diag3                0
glipizide-metformin         0
level1_diag2                0
level1_diag1                0
dataset                     0
readmitted                  0
diabetesMed                 0
change                      0
metformin-pioglitazone      0
metformin-

In [42]:
df.to_csv('./merged_data_preprocessed.csv')

In [43]:
df.dataset.unique()

array(['train', 'test'], dtype=object)

In [44]:
df_train = df[df.dataset=='train']
df_train = df_train.drop(columns='dataset')
df_train.sample(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,level1_diag1,level1_diag2,level1_diag3,random
49564,230116170,87185628,2,2,65,1,1,7,4,46,...,0,0,0,0,0,0,3.0,0.0,0.0,282
56802,156643446,35687133,1,2,65,1,1,7,7,37,...,0,0,0,1,1,0,5.0,2.0,1.0,193
45217,73228146,1699029,1,1,65,3,3,1,7,24,...,0,0,0,0,1,0,6.0,4.0,0.0,655
74592,431476676,41827590,0,2,85,1,6,7,5,45,...,0,0,0,0,1,0,2.0,1.0,1.0,960
29477,41239698,24485670,1,1,55,2,1,1,2,51,...,0,0,0,0,1,0,3.0,4.0,0.0,402


In [45]:
df_test = df[df.dataset=='test']
df_test = df_test.drop(columns='dataset')
df_test.sample(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,level1_diag1,level1_diag2,level1_diag3,random
9091,136287744,24481782,1,1,75,1,3,7,8,64,...,0,0,0,1,1,0,2.0,1.0,2.0,-79
4221,122782782,99637119,1,2,75,2,6,7,4,25,...,0,0,0,1,1,0,1.0,1.0,0.0,43
1744,190664100,60206454,1,2,55,1,1,7,4,49,...,0,0,0,1,1,0,5.0,4.0,1.0,-865
129,148870182,74016756,2,2,55,1,1,7,5,46,...,0,0,0,1,1,0,0.0,4.0,0.0,-228
4295,15254370,4383171,1,2,55,1,2,7,4,39,...,0,0,0,1,1,0,1.0,1.0,4.0,-408


In [46]:
df.corr().abs()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,level1_diag1,level1_diag2,level1_diag3,random
encounter_id,1.0,0.513446,0.017388,0.005916,0.070989,0.157088,0.143416,0.110612,0.063806,0.029005,...,0.002121,0.00672,0.007379,0.096787,0.052211,0.007992,0.014005,0.032001,0.007295,0.000896
patient_nbr,0.513446,1.0,0.001953,0.006925,0.07008,0.009455,0.145774,0.0306,0.025033,0.01172,...,0.001056,0.009588,0.001058,0.056542,0.019005,0.008765,0.013789,0.012821,0.009122,0.00147
race,0.017388,0.001953,1.0,0.032038,0.139067,0.020048,0.001065,0.005506,0.006321,0.009122,...,0.001246,0.007545,0.001246,0.007123,0.002401,0.003157,0.000263,0.006677,0.000731,0.001125
gender,0.005916,0.006925,0.032038,1.0,0.050532,0.014805,0.023388,0.003925,0.030237,0.002746,...,0.002939,0.004843,0.002939,0.015297,0.016956,0.002512,0.018798,0.023048,0.009421,0.00013
age,0.070989,0.07008,0.139067,0.050532,1.0,0.004857,0.096318,0.040766,0.107077,0.016323,...,0.000148,0.002605,0.000148,0.033694,0.018653,0.022196,0.018116,0.015802,0.043006,0.003437
admission_type_id,0.157088,0.009455,0.020048,0.014805,0.004857,1.0,0.09364,0.103586,0.012865,0.141733,...,0.00226,9.4e-05,0.002127,0.006844,0.001169,0.013218,0.054168,0.018127,0.017616,0.002009
discharge_disposition_id,0.143416,0.145774,0.001065,0.023388,0.096318,0.09364,1.0,0.009605,0.161163,0.009782,...,0.001541,1.5e-05,0.000317,0.005675,0.022022,0.063374,0.028421,0.001891,0.005912,0.005007
admission_source_id,0.110612,0.0306,0.005506,0.003925,0.040766,0.103586,0.009605,1.0,0.007218,0.052851,...,0.00099,0.001911,0.003693,0.003962,0.003561,0.007877,0.070802,0.022565,0.023092,0.002812
time_in_hospital,0.063806,0.025033,0.006321,0.030237,0.107077,0.012865,0.161163,0.007218,1.0,0.319855,...,0.002543,0.000573,0.001732,0.107613,0.060719,0.046886,0.001306,0.015729,0.035342,0.003027
num_lab_procedures,0.029005,0.01172,0.009122,0.002746,0.016323,0.141733,0.009782,0.052851,0.319855,1.0,...,0.000794,0.00128,0.003221,0.065333,0.033979,0.024044,0.034308,0.004042,0.023004,0.0022


In [47]:
y_test_dataset = df_test.readmitted
X_test_dataset = df_test.drop(columns='readmitted')

In [48]:
y_train_dataset = df_train.readmitted
X_train_dataset = df_train.drop(columns='readmitted')

In [49]:
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [50]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_dataset, y_train_dataset, test_size=0.2)


In [51]:
#logistic regression

from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression(C=1e4)
logistic.fit(X_train, y_train)
logistic.score(X_train, y_train)



0.8855397262572176

In [52]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, logistic.predict(X_train))

array([[63340,     0],
       [ 8187,     0]])

In [53]:
y_train.value_counts()

0    63340
1     8187
Name: readmitted, dtype: int64

In [54]:
logistic.score(X_valid, y_valid)

0.8885471423778101

In [55]:
y_valid_predict = logistic.predict(X_valid)


In [56]:
pd.crosstab(pd.Series(y_valid, name = 'Actual'), pd.Series(y_valid_predict, name = 'Predict'), margins = True)


Predict,0,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3087,3087
1,395,395
All,3482,3482


In [61]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score


In [62]:
thresh = 0.5


In [64]:
def print_report(y_actual, y_pred, thresh):
    
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity

In [69]:
y_train_preds = logistic.predict_proba(X_train)
y_valid_preds = logistic.predict_proba(X_valid)

print('Logistic Regression')
print('Training:')
lr_train_auc, lr_train_accuracy, lr_train_recall, lr_train_precision, lr_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, lr_valid_precision, lr_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Logistic Regression
Training:


ValueError: bad input shape (71527, 2)