Braden Anderson and Hien Lam  
DS7333: Quantifying the World, Fall 2022
# <center> <u>**Case Study 2**</u> </center>
Your case study is to build a classifier using logistic regression to predict hospital readmittance. There is missing data that must be imputed. Once again, discuss variable importances as part of your submission.

In [2]:
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer

In [3]:
# Read the main dataset and the id -> description lookup file from the local data folder.
diabetes = pd.read_csv(filepath_or_buffer='dataset_diabetes/diabetic_data.csv')
id_map = pd.read_csv(filepath_or_buffer='dataset_diabetes/IDs_mapping.csv')

# Preprocess
- `id_map` contains the lookup tables for three coded columns in the `diabetes` dataset: `admission_type_id`, `discharge_disposition_id`, `admission_source_id`. We split it into three DataFrames and join them to the `diabetes` DataFrame for EDA purposes
- convert ? to nan
- drop columns with more than 90% NaN values (`weight`, ~97% missing)
- drop columns with single values
- check if there are any duplicated columns
- decide imputation methods for necessary columns


In [3]:
# Lift the column cap so every column is shown when previewing the frame.
pd.options.display.max_columns = None
diabetes.head(n=5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
# (rows, columns) of the main dataset
diabetes_dims = diabetes.shape
diabetes_dims

(101766, 50)

In [5]:
# First 15 rows of the lookup file
id_map.iloc[:15]

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available
5,6,
6,7,Trauma Center
7,8,Not Mapped
8,,
9,discharge_disposition_id,description


In [6]:
# (rows, columns) of the lookup file
id_map_dims = id_map.shape
id_map_dims

(67, 2)

## Clean id_map and map to diabetes df

In [7]:
# `id_map` stacks three lookup tables in one two-column frame. The preview of its
# first rows shows an empty row (NaN in both columns) followed by a repeated header
# row between the first and second table, so the positional slices below can pick
# up an empty separator row, and the id column holds header text next to the ids.
def _extract_lookup(rows, id_name, desc_name):
    """Return one clean lookup table cut out of the stacked `id_map` frame.

    Parameters
    ----------
    rows : pandas.DataFrame
        Positional slice of `id_map` covering one lookup table.
    id_name : str
        Name to give the id column (read from the file as `admission_type_id`).
    desc_name : str
        Name to give the `description` column.

    Returns
    -------
    pandas.DataFrame
        The slice with both columns renamed, rows without an id removed, and the
        id column cast to int64. The original row index is kept.

    Raises
    ------
    ValueError
        If a remaining id cannot be converted to an integer (for example when a
        slice accidentally includes a header row).
    """
    renamed = rows.rename(columns={"admission_type_id": id_name, "description": desc_name})
    # Drop the empty separator row(s); rows with an id but no description are kept.
    with_id = renamed.dropna(subset=[id_name])
    # Cast so the key can be merged with the integer id columns of `diabetes`.
    return with_id.astype({id_name: "int64"})


admission_type_id = _extract_lookup(id_map.iloc[0:9], "admission_type_id", "admission_type_desc")
discharge_disposition_id = _extract_lookup(id_map.iloc[10:41], "discharge_disposition_id", "discharge_desc")
admission_source_id = _extract_lookup(id_map.iloc[42:67], "admission_source_id", "admission_source_desc")

In [8]:
#def mapping_function(row):
#    return dictionary[row["admission_type_id"]]
#df["admission_decoded"] = df["admission_type_id"].apply(lambda x: mapping_function(x))

In [9]:
# Total number of missing values across the three coded id columns.
id_columns = ["admission_type_id", "discharge_disposition_id", "admission_source_id"]
diabetes[id_columns].isna().sum().sum()

0

## NAs, single values, duplicates

In [4]:
# Figure out columns with missing values: they are marked with the "?" placeholder.
# One vectorised comparison builds the mask for the whole frame; columns that hold
# no strings simply compare as all-False.
placeholder_mask = diabetes.eq("?")

# Record the affected columns BEFORE replacing, so the list stays available for the
# imputation step (recomputing it after the replacement would always give []).
missing_cols = placeholder_mask.columns[placeholder_mask.any()].tolist()

# Swap the placeholder for a real NaN so pandas' missing-data tools recognise it.
for col in missing_cols:
    diabetes.loc[placeholder_mask[col], col] = np.nan


In [5]:
# Percentage of NA values. Drop `weight`
na_counts = diabetes.isna().sum()
na_counts[na_counts > 0] / diabetes.shape[0] * 100

race                  2.233555
weight               96.858479
payer_code           39.557416
medical_specialty    49.082208
diag_1                0.020636
diag_2                0.351787
diag_3                1.398306
dtype: float64

In [12]:
# `weight` is almost entirely missing, so it is removed rather than imputed.
# errors="ignore" keeps this cell safe to re-run: without it a second execution
# raises KeyError because the column is already gone.
diabetes = diabetes.drop(columns="weight", errors="ignore")

In [6]:
# Remove columns with single values: a column is constant when every entry
# compares equal to the entry in the first row.
first_row = diabetes.iloc[0]
is_constant = diabetes.eq(first_row).all()
single_vals = is_constant[is_constant].index.tolist()

shape_before = diabetes.shape
print(f"There are {len(single_vals)} columns with single values: {single_vals}")
print(f"Original data dimiension: {shape_before}")

diabetes.drop(columns=single_vals, inplace=True)
print(f"Final data dim after dropping aforementioned columns: {diabetes.shape}")

# Quick data-quality summary of what is left.
na_total = diabetes.isna().sum().sum()
duplicate_total = diabetes.duplicated().sum()
print(f"There are {na_total} NA values and {duplicate_total} duplicated records")

There are 2 columns with single values: ['examide', 'citoglipton']
Original data dimiension: (101766, 50)
Final data dim after dropping aforementioned columns: (101766, 48)
There are 192849 NA values and 0 duplicated records


## EDA

In [8]:
# How many columns there are of each dtype
dtype_per_column = diabetes.dtypes
dtype_per_column.value_counts()

object    35
int64     13
dtype: int64

In [28]:
# Summary statistics with pandas' default column selection
numeric_summary = diabetes.describe()
numeric_summary

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [31]:
# Summary (count / unique / top / freq) of every non-numeric column
non_numeric_summary = diabetes.describe(exclude=[np.number])
non_numeric_summary

Unnamed: 0,race,gender,age,payer_code,medical_specialty,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
count,99493,101766,101766,61510,51817,101745,101408,100343,101766.0,101766.0,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766,101766
unique,5,3,10,17,72,716,748,789,4.0,4.0,4,4,4,4,4,2,4,4,2,4,4,4,4,2,3,4,4,2,2,2,2,2,2,3
top,Caucasian,Female,[70-80),MC,InternalMedicine,428,276,250,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
freq,76099,54708,26068,32439,14635,6862,6752,11555,96420.0,84748.0,81778,100227,101063,101680,96575,101765,89080,91116,101743,94438,95401,101458,101728,101763,101727,47383,101060,101753,101765,101764,101765,54755,78363,54864


In [12]:
import seaborn as sns, matplotlib.pyplot as plt

In [18]:
# for i in diabetes.columns:
#     sns.displot(diabetes[i], kde=True)
#     plt.show()

## Imputation

### `medical_specialty`
- 49% missing values
- categorical dtype
- imputation method: 

In [16]:
# Share of each medical specialty among the non-missing values
diabetes["medical_specialty"].value_counts(normalize=True)

InternalMedicine                    0.282436
Emergency/Trauma                    0.145995
Family/GeneralPractice              0.143582
Cardiology                          0.103287
Surgery-General                     0.059807
                                      ...   
Perinatology                        0.000019
Neurophysiology                     0.000019
Psychiatry-Addictive                0.000019
Pediatrics-InfectiousDiseases       0.000019
Surgery-PlasticwithinHeadandNeck    0.000019
Name: medical_specialty, Length: 72, dtype: float64

### `payer_code`
- 40% missing values
- categorical dtype
- imputation method: 

In [17]:
# Share of each payer code among the non-missing values
diabetes["payer_code"].value_counts(normalize=True)

MC    0.527378
HM    0.102000
SP    0.081401
BC    0.075679
MD    0.057422
CP    0.041180
UN    0.039798
CM    0.031491
OG    0.016794
PO    0.009624
DM    0.008925
CH    0.002374
WC    0.002195
OT    0.001544
MP    0.001284
SI    0.000894
FR    0.000016
Name: payer_code, dtype: float64

### `race`
- 2% missing values
- categorical dtype
- imputation method: 

In [18]:
# Share of each race category among the non-missing values
diabetes["race"].value_counts(normalize=True)

Caucasian          0.764868
AfricanAmerican    0.193079
Hispanic           0.020474
Other              0.015137
Asian              0.006443
Name: race, dtype: float64

### `diag_3`
- Additional secondary diagnosis; 789 distinct values
- 1.4% missing values
- categorical dtype (diagnosis codes stored as strings, e.g. `E826`)
- imputation method: 

In [24]:
# Number of distinct non-missing codes in diag_3
diabetes["diag_3"].nunique()

789

In [19]:
# Share of each diag_3 code among the non-missing values
diabetes["diag_3"].value_counts(normalize=True)

250     0.115155
401     0.082607
276     0.051573
428     0.045614
427     0.039415
          ...   
657     0.000010
684     0.000010
603     0.000010
E826    0.000010
971     0.000010
Name: diag_3, Length: 789, dtype: float64

### `diag_2`
- Secondary diagnosis; 748 distinct values
- 0.35% missing values
- categorical dtype (diagnosis codes stored as strings, e.g. `E817`)
- imputation method: 

In [23]:
# Number of distinct non-missing codes in diag_2
diabetes["diag_2"].nunique()

748

In [20]:
# Share of each diag_2 code among the non-missing values
diabetes["diag_2"].value_counts(normalize=True)

276     0.066583
428     0.065695
250     0.059867
427     0.049661
401     0.036841
          ...   
232     0.000010
908     0.000010
52      0.000010
E817    0.000010
927     0.000010
Name: diag_2, Length: 748, dtype: float64

### `diag_1`
- The primary diagnosis; 716 distinct values
- 0.02% missing values
- categorical dtype (diagnosis codes stored as strings, e.g. `V51`)
- imputation method: 

In [22]:
# Number of distinct non-missing codes in diag_1
diabetes["diag_1"].nunique()

716

In [21]:
# Share of each diag_1 code among the non-missing values
diabetes["diag_1"].value_counts(normalize=True)

428    0.067443
414    0.064681
786    0.039471
410    0.035520
486    0.034478
         ...   
817    0.000010
61     0.000010
148    0.000010
870    0.000010
V51    0.000010
Name: diag_1, Length: 716, dtype: float64

# Modeling
- Scaler used:
- Imputation methods used:
- Summary stats after imputation

# Feature Importance

# References/Links
- Research paper/data dictionary: https://www.hindawi.com/journals/bmri/2014/781670/
- https://scikit-learn.org/stable/modules/impute.html
- https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py