Braden Anderson and Hien Lam  
DS7333: Quantifying the World, Fall 2022
# <center> <u>Case Study 2</u>
Your case study is to build a classifier using logistic regression to predict hospital readmittance. There is missing data that must be imputed. Once again, discuss variable importances as part of your submission.

In [1]:
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PowerTransformer
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.compose import ColumnTransformer, make_column_selector as selector

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures
from diabetes_case_study import *

In [2]:
diabetes = pd.read_csv('dataset_diabetes/diabetic_data.csv')
id_map = pd.read_csv('dataset_diabetes/IDs_mapping.csv')

# Preprocess
- `id_map` contains mapping to three columns in `diabetes` dataset: `admission_type_id`, `discharge_disposition_id`, `admission_source_id`. Will separate them into 3 dfs and join to diabetes df for eda purposes
- convert ? to nan
- drop column with 90% NAN
- drop columns with single values
- check if there are any duplicated columns
- decide imputation methods for necessary columns


In [3]:
pd.set_option('display.max_columns', None)
diabetes.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
diabetes.shape

(101766, 50)

In [5]:
diabetes = diabetes.loc[diabetes["gender"] != 'Unknown/Invalid', :]

In [6]:
id_map.head(15)

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available
5,6,
6,7,Trauma Center
7,8,Not Mapped
8,,
9,discharge_disposition_id,description


In [7]:
id_map.shape

(67, 2)

## Clean id_map and map to diabetes df

In [8]:
admission_type_id = id_map.iloc[0:9].rename(columns={"description" : "admission_type_desc"})
admission_type_id["admission_type_id"] = admission_type_id["admission_type_id"].astype(float)


discharge_disposition_id = id_map.iloc[10:41].rename(columns={"admission_type_id":"discharge_disposition_id",
                                                             "description" : "discharge_desc"})
discharge_disposition_id["discharge_disposition_id"] = discharge_disposition_id["discharge_disposition_id"].astype(float)


admission_source_id = id_map.iloc[42:67].rename(columns={"admission_type_id":"admission_source_id",
                                                        "description" : "admission_source_desc"})
admission_source_id["admission_source_id"] = admission_source_id["admission_source_id"].astype(float)

In [9]:
def get_code_map(df, code_col, desc_col):
    return {code:desc for code, desc in zip(df[code_col], df[desc_col])}

# Map admission type id to its description, creating new admission_type_desc column
admit_type_map = get_code_map(df=admission_type_id, 
                              code_col="admission_type_id", 
                              desc_col="admission_type_desc")
diabetes["admission_type_desc"] = [admit_type_map[code] for code in diabetes["admission_type_id"]]


# Map discharge dispo id to its description, creating new discharge_desc column
discharge_map = get_code_map(df=discharge_disposition_id, 
                             code_col="discharge_disposition_id", 
                             desc_col="discharge_desc")
diabetes["discharge_desc"] = [discharge_map[code] for code in diabetes["discharge_disposition_id"]]

# Map admission source id to its description, creating new admission_source_desc column
admit_source_map = get_code_map(df=admission_source_id, 
                                code_col="admission_source_id", 
                                desc_col="admission_source_desc")
diabetes["admission_source_desc"] = [admit_source_map[code] for code in diabetes["admission_source_id"]]

In [10]:
diabetes["admission_type_desc"] = diabetes["admission_type_desc"].fillna(value="Unknown")
diabetes["discharge_desc"] = diabetes["discharge_desc"].fillna(value="Unknown")
diabetes["admission_source_desc"] = diabetes["admission_source_desc"].fillna(value="Unknown")

for column in ["discharge_desc", "admission_type_desc", "admission_source_desc"]:
    print(f"Column: {column}, Number of NAs:{diabetes[column].isna().sum()}")

Column: discharge_desc, Number of NAs:0
Column: admission_type_desc, Number of NAs:0
Column: admission_source_desc, Number of NAs:0


In [11]:
#discharge_disposition_id
#diabetes.loc[diabetes["discharge_disposition_id"]==18,:]
#admission_source_id
#diabetes.loc[diabetes["admission_source_id"]==17,:].shape
#admission_type_id
#diabetes.loc[diabetes["admission_type_id"]==6,:].shape

In [12]:
id_desc_cols = ["admission_type_id", "admission_type_desc", 
                "discharge_disposition_id", "discharge_desc", 
                "admission_source_id", "admission_source_desc"]

diabetes.loc[:, id_desc_cols].head(10)

Unnamed: 0,admission_type_id,admission_type_desc,discharge_disposition_id,discharge_desc,admission_source_id,admission_source_desc
0,6,Unknown,25,Not Mapped,1,Physician Referral
1,1,Emergency,1,Discharged to home,7,Emergency Room
2,1,Emergency,1,Discharged to home,7,Emergency Room
3,1,Emergency,1,Discharged to home,7,Emergency Room
4,1,Emergency,1,Discharged to home,7,Emergency Room
5,2,Urgent,1,Discharged to home,2,Clinic Referral
6,3,Elective,1,Discharged to home,2,Clinic Referral
7,1,Emergency,1,Discharged to home,7,Emergency Room
8,2,Urgent,1,Discharged to home,4,Transfer from a hospital
9,3,Elective,3,Discharged/transferred to SNF,4,Transfer from a hospital


## NAs, single values, duplicates

In [13]:
# Figure out columns with missing values
missing_cols = [col for col in diabetes.columns if "?" in diabetes[col].unique().tolist()]
for col in missing_cols:
    diabetes.loc[diabetes[col]=="?", col] = np.nan

In [14]:
# Percentage of NA values. Drop `weight`
diabetes.isna().sum()[diabetes.isna().sum() > 0]/diabetes.shape[0]*100

race                  2.231656
weight               96.858387
payer_code           39.557600
medical_specialty    49.081690
diag_1                0.020636
diag_2                0.351798
diag_3                1.398347
dtype: float64

In [15]:
diabetes["age"] = diabetes["age"].map({"[0-10)":'Young', 
                                           "[10-20)":'Young', 
                                           "[20-30)":'Young', 
                                           "[30-40)":'Middle', 
                                           "[40-50)":'Middle', 
                                           "[50-60)":'Middle', 
                                           "[60-70)":'Old', 
                                           "[70-80)":'Old', 
                                           "[80-90)":'Old', 
                                           "[90-100)":'Old'})
diabetes.drop(columns="weight", inplace=True)

In [16]:
# Remove columns with single values
single_vals = diabetes.columns[diabetes.eq(diabetes.iloc[0]).all()].tolist()
print(f"There are {len(single_vals)} columns with single values: {single_vals}")
print(f"Original data dimiension: {diabetes.shape}")
diabetes.drop(columns=single_vals, inplace=True)
print(f"Final data dim after dropping aforementioned columns: {diabetes.shape}")
print(f"There are {diabetes.isna().sum().sum()} NA values and {diabetes.duplicated().sum()} duplicated records")

There are 2 columns with single values: ['examide', 'citoglipton']
Original data dimiension: (101763, 52)
Final data dim after dropping aforementioned columns: (101763, 50)
There are 94275 NA values and 0 duplicated records


## EDA

In [17]:
diabetes.dtypes.value_counts()

object    37
int64     13
dtype: int64

In [18]:
diabetes.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0
mean,165200800.0,54329650.0,2.024017,3.715515,5.754459,4.396018,43.095909,1.339691,16.021835,0.369368,0.197842,0.635585,7.422649
std,102641000.0,38696580.0,1.445414,5.279919,4.06411,2.985092,19.67422,1.705792,8.127589,1.267282,0.930485,1.262877,1.933578
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84959750.0,23412960.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152388300.0,45500490.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230269800.0,87545710.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [19]:
diabetes.describe(exclude=np.number)

Unnamed: 0,race,gender,age,payer_code,medical_specialty,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_desc,discharge_desc,admission_source_desc
count,99492,101763,101763,61508,51816,101742,101405,100340,101763.0,101763.0,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763,101763
unique,5,2,3,17,72,716,748,789,4.0,4.0,4,4,4,4,4,2,4,4,2,4,4,4,4,2,3,4,4,2,2,2,2,2,2,3,8,26,17
top,Caucasian,Female,Old,MC,InternalMedicine,428,276,250,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Emergency,Discharged to home,Emergency Room
freq,76099,54708,68538,32439,14635,6862,6752,11555,96417.0,84745.0,81776,100224,101060,101677,96572,101762,89078,91113,101740,94436,95399,101455,101725,101760,101724,47380,101057,101750,101762,101761,101762,54754,78361,54861,53988,60232,57492


In [20]:
diabetes.select_dtypes(include=object).head()

Unnamed: 0,race,gender,age,payer_code,medical_specialty,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_desc,discharge_desc,admission_source_desc
0,Caucasian,Female,Young,,Pediatrics-Endocrinology,250.83,,,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Unknown,Not Mapped,Physician Referral
1,Caucasian,Female,Young,,,276.0,250.01,255,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
2,AfricanAmerican,Female,Young,,,648.0,250.0,V27,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Emergency,Discharged to home,Emergency Room
3,Caucasian,Male,Middle,,,8.0,250.43,403,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
4,Caucasian,Male,Middle,,,197.0,157.0,250,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room


In [21]:
#cols = ["A1Cresult", "metformin", 'metformin-pioglitazone', "glipizide-metformin", "metformin-rosiglitazone"]
#diabetes.loc[:,cols]

In [22]:
# TODO: pipeline without encoder, plot after imputing

# numeric_pipeline = Pipeline(steps=[('num_imputer', SimpleImputer(strategy="median")),
#                                    ('scaler', PowerTransformer(method="yeo-johnson"))])

# nominal_pipeline = Pipeline(steps=[('nom_imputer', SimpleImputer(strategy="most_frequent")), 
#                                    ('nom_encoder', OneHotEncoder(handle_unknown="infrequent_if_exist"))])

# numeric_features = ["diag_1", "diag_2", "diag_3"]
# nominal_cat_feats = ["race", "payer_code", "medical_specialty"]

# preprocess = ColumnTransformer(transformers=[("numeric_feats", numeric_pipeline, selector(dtype_include=int)), 
#                                              ("nom_cat_feats", nominal_pipeline, selector(dtype_include=object))],
#                                remainder="passthrough",
#                                sparse_threshold=0.3)

In [23]:
# for i in diabetes.columns:
#     #fig, ax = plt.subplots(nrows=1, ncols=1, figsize(10, 20))
#     sns.displot(diabetes[i], kde=True)
#     plt.show()

In [24]:
diabetes["age"].nunique()

3

## Imputation

### `medical_speciality`
- 45% missing values
- categorical dtype
- imputation method: 

### `payer_code`
- 40% missing values
- categorical dtype
- imputation method: 

In [25]:
diabetes.payer_code.value_counts(normalize=True)

MC    0.527395
HM    0.102003
SP    0.081404
BC    0.075681
MD    0.057423
CP    0.041149
UN    0.039800
CM    0.031492
OG    0.016795
PO    0.009625
DM    0.008926
CH    0.002374
WC    0.002195
OT    0.001545
MP    0.001284
SI    0.000894
FR    0.000016
Name: payer_code, dtype: float64

In [26]:
# Fill missing payer code values with unknown
diabetes["payer_code"] = diabetes["payer_code"].fillna(value="Unknown")

### `race`
- 2% missing values
- categorical dtype
- imputation method: 

In [27]:
diabetes.race.value_counts(normalize=True)

Caucasian          0.764876
AfricanAmerican    0.193081
Hispanic           0.020474
Other              0.015127
Asian              0.006443
Name: race, dtype: float64

### `diag_3`
- Additional secondary diagnosis; 789 distinct values
- 1.4% missing values
- numerical dtype
- imputation method: 

In [28]:
diabetes.diag_3.nunique()

789

In [29]:
diabetes.diag_3.value_counts(normalize=True)

250     0.115158
401     0.082599
276     0.051575
428     0.045615
427     0.039416
          ...   
657     0.000010
684     0.000010
603     0.000010
E826    0.000010
971     0.000010
Name: diag_3, Length: 789, dtype: float64

### `diag_2`
- Secondary dignosis; 748 distinct values
- 0.35% missing values
- numerical dtype
- imputation method: 

In [30]:
diabetes.diag_2.nunique()

748

In [31]:
diabetes.diag_2.value_counts(normalize=True)

276     0.066584
428     0.065697
250     0.059869
427     0.049662
401     0.036842
          ...   
232     0.000010
908     0.000010
52      0.000010
E817    0.000010
927     0.000010
Name: diag_2, Length: 748, dtype: float64

### `diag_1`
- The primary diagnosis; 716 distinct values
- 0.02% missing values
- numerical dtype
- imputation method: 

Map diag_1 column values per Table 2 of Impact of HbA1c Measurement on Hospital Readmission Rates:
Analysis of 70,000 Clinical Database Patient Records

In [32]:
def combine_dicts(*args):
    combined = {}
    for d in args:
        for k, v in d.items():
            combined[k]=v
    return combined

m1 = {str(num):"Circulatory" for num in list(range(390, 460)) + [785]}
m2 = {str(num):"Respiratory" for num in list(range(460, 520)) + [786]}
m3 = {str(num):"Digestive" for num in list(range(520, 580)) + [787]}
m4 = {str(num):"Diabetes" for num in [val for val in diabetes["diag_1"].unique().tolist() if str(val).startswith("250")]}
m5 = {str(num):"Injury" for num in range(800, 1_000)}
m6 = {str(num):"Musculoskeletal" for num in range(710, 740)}
m7 = {str(num):"Genitourinary" for num in list(range(580, 630)) + [788]}
m8 = {str(num):"Neoplasms" for num in list(range(140, 240)) + 
      [780, 781, 782, 784] + list(range(790, 800)) + [val for val in range(240, 280) if val != 250] + 
      list(range(680, 710)) + list(range(1, 140))}

combined_map = combine_dicts(m1,m2,m3,m4,m5,m6,m7,m8)
other_map = {val:"Other" for val in diabetes["diag_1"].to_numpy() if val not in combined_map.keys()}
full_map = combine_dicts(combined_map,other_map)
diabetes["diag_1"] = diabetes["diag_1"].map(full_map)

In [33]:
diabetes.diag_1.nunique()

9

In [34]:
diabetes.diag_1.value_counts(normalize=True)

Circulatory        0.299087
Respiratory        0.141731
Neoplasms          0.138125
Digestive          0.093108
Diabetes           0.086053
Other              0.074389
Injury             0.068512
Genitourinary      0.050284
Musculoskeletal    0.048711
Name: diag_1, dtype: float64

train_df# Modeling
- Scaler used:
- Imputation methods used:
- Summary stats after imputation

In [35]:
# plot_categorical_interactions(df=diabetes, 
#                               x_categorical="age", 
#                               interaction_categorical="num_procedures", 
#                               response_variable="readmitted", 
#                               response_success_level="NO", 
#                               conf_level=0.95, 
#                               figsize=(18, 6))


# plot_categorical_interactions(df=diabetes, 
#                               x_categorical="age", 
#                               interaction_categorical="diag_1", 
#                               response_variable="readmitted", 
#                               response_success_level="NO", 
#                               conf_level=0.95, 
#                               figsize=(18, 6))


# plot_categorical_interactions(df=model_df, 
#                               x_categorical="race", 
#                               interaction_categorical="gender", 
#                               response_variable="readmitted", 
#                               response_success_level="NO", 
#                               conf_level=0.95, 
#                               figsize=(18, 6))


# plot_categorical_proportions(df=model_df, 
#                               grouping_variable_list=["race", "gender"], response_variable="readmitted",
#                               response_success_level="NO")

In [36]:
model_df = diabetes.drop(columns=['admission_type_id', 'discharge_disposition_id', 'admission_source_id']).copy(deep=True)

keep_diag1 = model_df["diag_1"].value_counts(normalize=True).sort_values(ascending=True).index.to_numpy()[:10]
model_df.loc[~model_df["diag_1"].isin(keep_diag1),"diag_1"] = "Other"
model_df.drop(columns=["diag_2", "diag_3"], inplace=True)

columns_with_missings = model_df.isna().sum()[model_df.isna().sum() > 0].index.tolist()

model_df.loc[:,columns_with_missings]

Unnamed: 0,race,medical_specialty
0,Caucasian,Pediatrics-Endocrinology
1,Caucasian,
2,AfricanAmerican,
3,Caucasian,
4,Caucasian,
...,...,...
101761,AfricanAmerican,
101762,AfricanAmerican,
101763,Caucasian,
101764,Caucasian,Surgery-General


In [37]:
poly_features = ["num_procedures", "num_medications", "number_inpatient", "time_in_hospital", 
                 "number_diagnoses", "number_outpatient", "number_emergency"]

p_feats = PolynomialFeatures(interaction_only=True, include_bias=False)

poly_df = pd.DataFrame(p_feats.fit_transform(model_df.loc[:,poly_features]), 
                       columns=p_feats.get_feature_names_out())

poly_df = poly_df.loc[:,[col for col in poly_df.columns if col not in model_df.columns]]

In [38]:
model_df.reset_index(drop=True, inplace=True)
poly_df.reset_index(drop=True, inplace=True)

In [39]:
model_df.loc[:,poly_df.columns] = poly_df

In [40]:
model_df.isna().sum()[model_df.isna().sum() > 0 ]

race                  2271
medical_specialty    49947
dtype: int64

In [41]:
# train test split
# cross validation with training --> pick model
# with picked model, test 
train_df, test_df = train_test_split(model_df,
                                     test_size=0.05, 
                                     stratify=model_df["readmitted"],
                                     random_state=77)

In [42]:
X_train, y_train = train_df.drop(columns="readmitted"), train_df["readmitted"]
X_test, y_test = test_df.drop(columns="readmitted"), test_df["readmitted"]

In [52]:
# TODO: pipeline without encoder, plot after imputing

numeric_pipeline = Pipeline(steps=[('num_imputer', SimpleImputer(strategy="median")),
                                   ('scaler', PowerTransformer(method="yeo-johnson"))])

nominal_pipeline = Pipeline(steps=[('nom_imputer', SimpleImputer(strategy="most_frequent")), 
                                   ('nom_encoder', OneHotEncoder(handle_unknown="infrequent_if_exist"))])



preprocess = ColumnTransformer(transformers=[("num", numeric_pipeline, selector(dtype_include=[int,float])), 
                                             ("cat", nominal_pipeline, selector(dtype_include=[object, "category"]))],
                               remainder="passthrough",
                               sparse_threshold=0.3)

model = Pipeline(steps=[("preprocess", preprocess), 
                        ("logreg", LogisticRegression(random_state=77, 
                                                      solver="saga",
                                                      multi_class="multinomial", 
                                                      max_iter=10_000))])

In [53]:
## Baseline model
cv_results = cross_validate(estimator=model, 
                            X=X_train,
                            y=y_train, 
                            cv=StratifiedKFold(n_splits=5, 
                                               shuffle=True, 
                                               random_state=77), 
                            n_jobs=-1,
                            return_train_score=True, 
                            return_estimator=True,
                            scoring=["accuracy", 
                                     "f1_weighted", 
                                     "precision_weighted"], 
                            error_score="raise")

coef_df = pd.DataFrame({"parameter":cv_results['estimator'][0][:-1].get_feature_names_out(),
                        "model1_coefs":cv_results['estimator'][0].named_steps["logreg"].coef_[0,:], 
                        "model2_coefs":cv_results['estimator'][0].named_steps["logreg"].coef_[1,:],
                        "model3_coefs":cv_results['estimator'][0].named_steps["logreg"].coef_[2,:]})

model1_coef_df = coef_df.reindex(coef_df["model1_coefs"].abs().sort_values(ascending=False).index)
model2_coef_df = coef_df.reindex(coef_df["model2_coefs"].abs().sort_values(ascending=False).index)
model3_coef_df = coef_df.reindex(coef_df["model3_coefs"].abs().sort_values(ascending=False).index)

print(f"Train Accuracy: {cv_results['train_accuracy']}")
print(f"Test Accuracy: {cv_results['test_accuracy']}")

Train Accuracy: [0.59752518 0.59607701 0.59743467 0.59831392 0.59868115]
Test Accuracy: [0.59534523 0.59829325 0.59550039 0.59405224 0.59010034]


In [54]:
cv_results['test_accuracy'].mean()

0.5946582891165386

In [57]:
%%time

# Smaller value = Stronger Regularization
reg_strength = [0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1.0, 1.1, 1.3, 1.5, 2.0, 2.5, 5.0, 10.0]

parameter_grid = [{"logreg__penalty":["l1", "l2"], 
                   "logreg__class_weight":['balanced', None],
                   "logreg__C":reg_strength}, 
                  
                  {"logreg__penalty":["elasticnet"], 
                   "logreg__class_weight":["balanced", None], 
                   "logreg__C":reg_strength, 
                   "logreg__l1_ratio":[0.25, 0.5, 0.75]}]

gs = run_gridsearch(X=X_train, 
                    y=y_train, 
                    estimator=model, 
                    param_grid=parameter_grid, 
                    save_name=get_gs_save_name(model_name="logreg"))

gs_df = pd.DataFrame(gs.cv_results_)

#g = load_gs_from_pickle(pickle_filepath="./models/logreg_20220914_2311_gs.pkl")
#gs_df = pd.DataFrame(g.cv_results_)

Fitting 5 folds for each of 150 candidates, totalling 750 fits
CPU times: total: 3min 42s
Wall time: 4h 27min 25s


In [60]:
gs.best_score_

0.5951651713997503

In [59]:
gs.best_params_

{'logreg__C': 0.9, 'logreg__class_weight': None, 'logreg__penalty': 'l2'}

In [43]:
# No variance issue, model suffering from bias
gs_df.sort_values(by="rank_test_accuracy").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logreg__C,param_logreg__class_weight,param_logreg__penalty,param_logreg__l1_ratio,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,split4_train_accuracy,mean_train_accuracy,std_train_accuracy,split0_test_f1_weighted,split1_test_f1_weighted,split2_test_f1_weighted,split3_test_f1_weighted,split4_test_f1_weighted,mean_test_f1_weighted,std_test_f1_weighted,rank_test_f1_weighted,split0_train_f1_weighted,split1_train_f1_weighted,split2_train_f1_weighted,split3_train_f1_weighted,split4_train_f1_weighted,mean_train_f1_weighted,std_train_f1_weighted,split0_test_precision_weighted,split1_test_precision_weighted,split2_test_precision_weighted,split3_test_precision_weighted,split4_test_precision_weighted,mean_test_precision_weighted,std_test_precision_weighted,rank_test_precision_weighted,split0_train_precision_weighted,split1_train_precision_weighted,split2_train_precision_weighted,split3_train_precision_weighted,split4_train_precision_weighted,mean_train_precision_weighted,std_train_precision_weighted
7,21.782903,0.988658,0.6522,0.017452,0.3,,l2,,"{'logreg__C': 0.3, 'logreg__class_weight': Non...",0.580265,0.581765,0.581484,0.582105,0.580657,0.581255,0.000689,1,0.582705,0.582046,0.582025,0.582142,0.583344,0.582452,0.000511,0.525883,0.529178,0.527676,0.528666,0.527054,0.527691,0.001169,1,0.529411,0.528244,0.528626,0.52833,0.530207,0.528963,0.000745,0.51734,0.526117,0.54049,0.57784,0.502257,0.532809,0.025707,67,0.532377,0.542366,0.529339,0.533513,0.532867,0.534092,0.004378
68,89.098782,9.186466,0.6522,0.031536,0.5,,elasticnet,0.5,"{'logreg__C': 0.5, 'logreg__class_weight': Non...",0.580161,0.581816,0.581846,0.581795,0.580553,0.581234,0.000727,2,0.582589,0.581929,0.581961,0.582064,0.58306,0.582321,0.00044,0.525809,0.529059,0.528022,0.528276,0.526928,0.527619,0.001133,5,0.529293,0.528049,0.528519,0.528199,0.529873,0.528786,0.000692,0.538378,0.503816,0.540898,0.558928,0.50214,0.528832,0.022274,72,0.532247,0.540606,0.52926,0.535526,0.528162,0.53316,0.004518
22,168.511868,20.864463,0.680217,0.029533,1.0,,l1,,"{'logreg__C': 1.0, 'logreg__class_weight': Non...",0.580213,0.581661,0.581795,0.581691,0.580657,0.581203,0.000644,3,0.582705,0.581916,0.581935,0.582064,0.583163,0.582357,0.000496,0.525925,0.528925,0.528019,0.528216,0.527102,0.527637,0.001035,3,0.529465,0.528139,0.528553,0.528253,0.530024,0.528887,0.000735,0.523688,0.503706,0.531576,0.547662,0.502337,0.521794,0.017171,111,0.537034,0.535862,0.528317,0.534684,0.523435,0.531867,0.005182
74,106.123342,8.874493,0.6468,0.011686,0.7,,elasticnet,0.5,"{'logreg__C': 0.7, 'logreg__class_weight': Non...",0.58011,0.581558,0.581691,0.58195,0.580553,0.581172,0.000712,4,0.582679,0.581968,0.582012,0.582038,0.583215,0.582383,0.000492,0.525821,0.528826,0.527918,0.528485,0.526971,0.527604,0.001091,9,0.529397,0.528183,0.528608,0.52822,0.530073,0.528896,0.000733,0.523578,0.503596,0.531452,0.547939,0.502247,0.521762,0.017282,112,0.53381,0.539354,0.527489,0.529923,0.526119,0.531339,0.004784
67,67.59851,4.25647,0.676564,0.063606,0.5,,elasticnet,0.25,"{'logreg__C': 0.5, 'logreg__class_weight': Non...",0.58011,0.581609,0.581691,0.58195,0.580502,0.581172,0.000727,5,0.582692,0.582007,0.582051,0.58209,0.583267,0.582421,0.000491,0.525794,0.528888,0.5279,0.528485,0.526895,0.527593,0.001122,10,0.529398,0.528193,0.528646,0.528258,0.530129,0.528925,0.000739,0.5291,0.503654,0.540744,0.547939,0.502152,0.524718,0.018805,81,0.532349,0.539227,0.529365,0.533466,0.526178,0.532117,0.004365


# Feature Importance

In [46]:
model1_coef_df.head()

Unnamed: 0,parameter,model1_coefs,model2_coefs,model3_coefs
222,cat__discharge_desc_Expired,-1.681648,-2.0329,3.714548
206,cat__discharge_desc_Admitted as an inpatient t...,1.06742,-0.603056,-0.464365
52,cat__medical_specialty_Hematology,0.92775,-0.297245,-0.630505
218,cat__discharge_desc_Discharged/transferred wit...,0.894149,0.061394,-0.955544
230,cat__discharge_desc_Still patient or expected ...,0.882618,-0.463921,-0.418697


In [47]:
model2_coef_df.head()

Unnamed: 0,parameter,model1_coefs,model2_coefs,model3_coefs
222,cat__discharge_desc_Expired,-1.681648,-2.0329,3.714548
226,cat__discharge_desc_Hospice / medical facility,0.137117,-1.539188,1.402071
140,cat__chlorpropamide_Up,-0.265802,0.903053,-0.637251
79,cat__medical_specialty_Pediatrics-Pulmonology,-0.384556,0.87639,-0.491834
76,cat__medical_specialty_Pediatrics-Hematology-O...,0.283303,-0.775389,0.492087


In [48]:
model3_coef_df.head()

Unnamed: 0,parameter,model1_coefs,model2_coefs,model3_coefs
222,cat__discharge_desc_Expired,-1.681648,-2.0329,3.714548
226,cat__discharge_desc_Hospice / medical facility,0.137117,-1.539188,1.402071
203,cat__admission_type_desc_Trauma Center,-0.359086,-0.660756,1.019842
225,cat__discharge_desc_Hospice / home,-0.734975,-0.280386,1.015362
75,cat__medical_specialty_Pediatrics-Endocrinology,-0.85188,-0.147373,0.999253


In [49]:
model3_coef_df.tail()

Unnamed: 0,parameter,model1_coefs,model2_coefs,model3_coefs
197,cat__diabetesMed_Yes,-0.06152,0.056177,0.005343
235,cat__admission_source_desc_ Not Available,0.365935,-0.361391,-0.004545
33,cat__payer_code_SI,0.368525,-0.366562,-0.001963
130,cat__repaglinide_No,0.069995,-0.071551,0.001556
131,cat__repaglinide_Steady,0.023072,-0.024387,0.001315


# References/Links
- Research paper/data dictionary: https://www.hindawi.com/journals/bmri/2014/781670/
- https://scikit-learn.org/stable/modules/impute.html
- https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py