# Setup

In [861]:
import os
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import seaborn as sns
import pickle

#I may change environs between desktop & laptop, so I run the below to check my WD and that multicore processing is still available.
print("WD:", os.getcwd())
print("CPU cores:", os.cpu_count())

WD: D:\School\2024 Spring\CSC74020 Machine Learning\Assignment_2
CPU cores: 16



# Part A: Model Code and Exploration (100 pts)

1. Perform Exploratory Data Analysis (EDA) and discuss the data and what you observe
prior to beginning modeling and how impact how to proceed [10 pts]

In [862]:
full_train_data = pd.read_csv("8k_diabetes_train.csv",
                              na_values="?")
print("Length: ", len(full_train_data))
print("Width: ", len(full_train_data.columns))
print("Variables: ", full_train_data.columns)
full_train_data.head()

Length:  8000
Width:  40
Variables:  Index(['discharge_disposition_id', 'admission_source_id', 'payer_code',
       'medical_specialty', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_diagnoses', 'max_glu_serum', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'examide', 'citoglipton', 'insulin', 'glyburide.metformin',
       'glipizide.metformin', 'glimepiride.pioglitazone',
       'metformin.rosiglitazone', 'metformin.pioglitazone', 'change',
       'diabetesMed', 'readmitted', '2nd_diag', '3rd_diag', 'ai_response'],
      dtype='object')


Unnamed: 0,discharge_disposition_id,admission_source_id,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_diagnoses,...,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,2nd_diag,3rd_diag,ai_response
0,Discharged to home,Transfer from a hospital,,,24,2,17,0,0,8,...,No,No,No,No,No,Yes,False,414,428,"Based on the diagnosis code 250.01, which indi..."
1,Discharged to home,Transfer from a hospital,,,37,3,14,0,0,8,...,No,No,No,No,No,Yes,False,410,414,"Based on the information provided, I recommend..."
2,Discharged to home,Emergency Room,SP,Emergency/Trauma,60,4,17,0,0,9,...,No,No,No,No,No,No,True,537,786,"Based on the information provided, the patient..."
3,Discharged to home,Emergency Room,,,40,3,25,1,0,9,...,No,No,No,No,Ch,Yes,True,425,428,"Based on the information provided, the patient..."
4,Discharged to home,Physician Referral,MD,Surgery-General,31,2,18,0,0,7,...,No,No,No,No,No,Yes,True,682,998,"Based on the information provided, the patient..."


In [863]:
# I investigated value counts for every variable. For brevity, I include only this one example in my final submission.
full_train_data.discharge_disposition_id.value_counts(dropna=False)

Discharged to home                                                                                             4864
Discharged/transferred to SNF                                                                                   954
Discharged/transferred to home with home health service                                                         920
NaN                                                                                                             364
Expired                                                                                                         153
Discharged/transferred to another short term hospital                                                           139
Discharged/transferred to another rehab fac including rehab units of a hospital.                                134
Discharged/transferred to another  type of inpatient care institution                                           130
Not Mapped                                                              

In [864]:
full_train_data.payer_code.value_counts()

MC    1988
HM     364
BC     300
SP     242
UN     221
MD     189
CP     159
CM      88
DM      46
OG      39
PO      34
WC       9
SI       8
OT       6
CH       5
Name: payer_code, dtype: int64

For "Payer Code" & "Medical Specialty", "?" is the largest class. I re-wrote my read_csv command to explicitly identify these as NAs. This is a lot of missing data for these columns. Next, I'll look at the distributions of the continuous variables.

In [865]:
# I also did the below for all other continuous variables. Those cells have been deleted from the final submission for brevity.
full_train_data.num_lab_procedures.describe()

count    8000.000000
mean       43.183375
std        19.518187
min         1.000000
25%        32.000000
50%        44.000000
75%        57.000000
max       120.000000
Name: num_lab_procedures, dtype: float64

In [866]:
# I use the below code to identify which columns have any NAs.
pd.isna(full_train_data).max()

discharge_disposition_id     True
admission_source_id          True
payer_code                   True
medical_specialty            True
num_lab_procedures          False
num_procedures              False
num_medications             False
number_outpatient           False
number_emergency            False
number_diagnoses            False
max_glu_serum                True
metformin                   False
repaglinide                 False
nateglinide                 False
chlorpropamide              False
glimepiride                 False
acetohexamide               False
glipizide                   False
glyburide                   False
tolbutamide                 False
pioglitazone                False
rosiglitazone               False
acarbose                    False
miglitol                    False
troglitazone                False
tolazamide                  False
examide                     False
citoglipton                 False
insulin                     False
glyburide.metf

We have 8000 records of 39 features plus one target feature for readmission. We have a variety of continuous, categorical, and boolean variables, plus one string variable of text called "ai_response." For most of the continuous variables (and several of the others), there appear to be a small but not insignificant number of extreme outliers, I will want to keep this in mind when performing test_train_split and training.

Further, we have na values that we will want to somehow deal with in the variables: # Discharge_disposition_id, Admission_source_id, and Max_glu_serum, "Payer Code" & "Medical Specialty". I may also want to rename "?" values to proper NAs before doing so.

"2nd_diag" and "3rd_diag" are unclear. The name, repeated values, occasional letters & decimals all seem to imply diagnosis codes. As such, it may not be reasonable to try to impute the few NAs.

2. Pre-processed categorical data for use in the model and justified pre-processing
method. Note this may be different for each algorithm you try. [10 pts]

I address the columns in order, one at a time. First,  "discharge_disposition_id".

I believe this variable is best left as categorical (rather than ordinal), as there is no clear hierarchy of values. I believe "Not Mapped" should be NAs, and I reason that patients who die in care necessarily can't be readmitted, so other data about them may not be useful for predicting readmission. On inspection, I see no reasonable way to impute NAs as any other values.--The AI output seems to agree for many of those patients.

In [867]:
full_train_data.discharge_disposition_id = full_train_data.discharge_disposition_id.replace("Not Mapped", np.nan)

In [868]:
# I confirm that there are no zombies in our data:
full_train_data.loc[(full_train_data['discharge_disposition_id'] == "Expired")].readmitted.value_counts()

False    153
Name: readmitted, dtype: int64

In [869]:
# I use the following method for dropping rows because alternative methods can result in missing indices.
# This method ensures that I will not encounter missing indices errors later on.
indices_to_drop = full_train_data[full_train_data['discharge_disposition_id'] == "Expired"].index
full_train_data.drop(index=indices_to_drop, inplace=True)
#See how the indices are not decremented
print("Index '135' is now named: ", (full_train_data.iloc[135].name))
# Check that rows have been successfully dropped.
print("Undropped rows: ", len(full_train_data.loc[(full_train_data['discharge_disposition_id'] == "Expired")].readmitted.value_counts()))

Index '135' is now named:  136
Undropped rows:  0


In [870]:
# I correct the index
full_train_data.set_index(np.arange(len(full_train_data)), inplace=True)
print("Index '135' is now named: ", (full_train_data.iloc[135].name))

Index '135' is now named:  135


Next, admission_source_id. I replace "Not Mapped" with NA.
Additionally, I do not believe it is possible to impute NAs. The distribution is relatively split across the two most prevalent categories, and there is no way to convert to numeric values.

In [871]:
full_train_data.admission_source_id.replace("Not Mapped", np.nan, inplace=True)

Next, payer_code. "?" have already been read as NAs by my read_csv command.
My intuition for imputing NAs is that "?" may be patients who either do not have a way to pay right away, or who cannot pay at all. This may correlate with socioeconomic status, an external variable that may influence readmission. However, I cannot confirm this, and it may be a risk to impute NAs (or designate them their own category) when they represent the bulk of the data. I *try* imputing NAs as their own category, but I may change this later and compare the effect on model performance.

In [872]:
full_train_data.payer_code.fillna("No_code", inplace=True)
full_train_data.payer_code.value_counts(dropna=False)

No_code    4229
MC         1933
HM          360
BC          296
SP          239
UN          218
MD          186
CP          155
CM           86
DM           45
OG           38
PO           34
WC            9
SI            8
OT            6
CH            5
Name: payer_code, dtype: int64

Next, medical_specialty. I replace obvious NAs. I also do not believe it is possible to simply impute NAs, as the data is categorical and NAs represent a sizable chunk of the data.

Given more time, it might be possible to do so using information from the "ai_response" column.

I do not change any other outliers or 'suspicious' values, as I am unsure if or how they should be recategorized. e.g. "Surgery-PlasticwithinHeadandNeck".

In [873]:
# Replacing obvious NA values
full_train_data.medical_specialty = full_train_data.medical_specialty.replace("PhysicianNotFound", np.nan)

Additionally, I prep medical_specialty for one-hot encoding by setting all but the 10 most common categories to NA. These also happen to be the only categories which each account for more than 1% of the data.

In [874]:
cols_to_keep = ("InternalMedicine", "Family/GeneralPractice", "Cardiology","Emergency/Trauma", "Surgery-General", "Orthopedics-Reconstructive", "Nephrology", "Psychiatry", "Orthopedics", "ObstetricsandGynecology")

full_train_data.loc[~full_train_data["medical_specialty"].isin(cols_to_keep), "medical_specialty"] = np.nan

In [875]:
full_train_data.medical_specialty.value_counts(dropna=False)

NaN                           3927
InternalMedicine              1494
Family/GeneralPractice         675
Cardiology                     552
Emergency/Trauma               387
Surgery-General                269
Orthopedics-Reconstructive     132
Nephrology                     123
Psychiatry                     104
Orthopedics                    101
ObstetricsandGynecology         83
Name: medical_specialty, dtype: int64

Next, max_glu_serum. I would consider this ordinal data, but not interval data, as I cannot confirm that the distance between "Norm" and ">200" is equal to the distance between ">200" and ">300". So I will One-Hot encode this variable at the end of this section, instead of label encoding.
Assuming these are readings for blood glucose serum test, I'm assuming it is only administered when there is a reasonable chance that the patient's blood glucose might be elevated. As such, I believe I could impute NAs for this column as "Norm". However, I wound up making NAs their own category, since information may be gleaned from the fact that a doctor thought administration of a test was necessary, even if blood glucose came back normal.

In [876]:
full_train_data.max_glu_serum = full_train_data.max_glu_serum.fillna("No_test")
full_train_data.max_glu_serum.value_counts(dropna=False)

No_test    7333
Norm        267
>200        152
>300         95
Name: max_glu_serum, dtype: int64

Next, "Metformin" through "Insulin" appear to be ordinal, but not interval, same problem as above. I will one-hot encode these later. I drop acetohexamide, troglitazone, examide, and citoglipton because they contain only one value.

In [877]:
for col in full_train_data.columns[11:28]:
    print(col,":", len(full_train_data[col].unique()))

metformin : 4
repaglinide : 4
nateglinide : 4
chlorpropamide : 3
glimepiride : 4
acetohexamide : 1
glipizide : 4
glyburide : 4
tolbutamide : 2
pioglitazone : 4
rosiglitazone : 4
acarbose : 3
miglitol : 4
troglitazone : 1
tolazamide : 2
examide : 1
citoglipton : 1


In [878]:
full_train_data = full_train_data.drop(["acetohexamide",
                                        "troglitazone",
                                        "examide",
                                        "citoglipton"],
                                       axis=1)

Next, I drop nateglinide, chlorpropamide, tolbutamide, acarbose, miglitol, and tolazamide, because the "No" class represents 99% or more of records. 99% is an arbitrary threshold; realistically, I could probably drop even more of these columns.

In [879]:
for col in full_train_data.columns[11:24]:
    print(col,":\n", full_train_data[col].value_counts())

metformin :
 No        6279
Steady    1435
Up          95
Down        38
Name: metformin, dtype: int64
repaglinide :
 No        7741
Steady      91
Up          11
Down         4
Name: repaglinide, dtype: int64
nateglinide :
 No        7804
Steady      41
Down         1
Up           1
Name: nateglinide, dtype: int64
chlorpropamide :
 No        7836
Steady      10
Up           1
Name: chlorpropamide, dtype: int64
glimepiride :
 No        7457
Steady     356
Up          26
Down         8
Name: glimepiride, dtype: int64
glipizide :
 No        6800
Steady     934
Up          74
Down        39
Name: glipizide, dtype: int64
glyburide :
 No        6920
Steady     799
Up          80
Down        48
Name: glyburide, dtype: int64
tolbutamide :
 No        7846
Steady       1
Name: tolbutamide, dtype: int64
pioglitazone :
 No        7295
Steady     525
Up          20
Down         7
Name: pioglitazone, dtype: int64
rosiglitazone :
 No        7247
Steady     576
Up          17
Down         7
Name: ros

In [880]:
full_train_data = full_train_data.drop(["nateglinide",
                                        "chlorpropamide",
                                        "tolbutamide",
                                        "acarbose",
                                        "miglitol",
                                        "tolazamide"],
                                       axis=1)

Next, I drop "glyburide.metformin" and "glipizide.metformin" because nearly all values are "no", and I drop "glimepiride.pioglitazone" through "metformin.pioglitazone" because they *only* contain "no" values.


In [881]:
#Print number of unique values
for col in full_train_data.columns[29:34]:
    print(col,":", len(full_train_data[col].unique()))

ai_response : 6802


In [882]:
pd.value_counts(full_train_data["glyburide.metformin"])
# 99.43% of the data is "no".

No        7803
Steady      42
Down         1
Up           1
Name: glyburide.metformin, dtype: int64

In [883]:
pd.value_counts(full_train_data["glipizide.metformin"])

No        7845
Steady       2
Name: glipizide.metformin, dtype: int64

In [884]:
full_train_data = full_train_data.drop(["glyburide.metformin",
                                        "glipizide.metformin",
                                        "glimepiride.pioglitazone",
                                        "metformin.rosiglitazone",
                                        "metformin.pioglitazone"],
                                       axis=1)

"change", "diabetes_med" and "readmitted" I encode as a binary. This is both to ensure it reads as a 0-1 or true-false to my model(s), and for interpretability.

In [885]:
full_train_data.change = full_train_data.change.replace("Ch", 1)
full_train_data.change = full_train_data.change.replace("No", 0)

full_train_data.diabetesMed = full_train_data.diabetesMed.replace("Yes", 1)
full_train_data.diabetesMed = full_train_data.diabetesMed.replace("No", 0)

full_train_data.readmitted = full_train_data.readmitted.replace(True, 1)
full_train_data.readmitted = full_train_data.readmitted.replace(False, 0)

"2nd_diag" and "3rd_diag". As stated above, I believe these may be diagnosis codes. As such, I can't impute NAs and I consider them as categorical. Below I ensure these columns are set as strings, rather than floats.

In [886]:
full_train_data["2nd_diag"] = full_train_data["2nd_diag"].astype(str)
full_train_data["3rd_diag"] = full_train_data["3rd_diag"].astype(str)

In [887]:
# It is not possible for a column to have more than one datatype. I re-confirm that my index has been set correctly (by the absense of errors) and no glitches have occurred in type processing.
for j in ("2nd_diag", "3rd_diag"):
    for i in np.arange(len(full_train_data)):
        if type(full_train_data[j][i]) != str:
            print(type(full_train_data[j][i]))

My biggest lingering concern is how to encode these categorical variables. On one hand, multi-label encoding may cause my models to misinterpret categorical data as having an ordered, ratio relationship; on the other, one-hot encoding would explode the number of columns given the number of categories and categorical variables I have, possibly leading to high-dimensionality problems. For simplicity, I will one-hot encode columns with <100 categoires.

In [888]:
EncCat_full_train_data = full_train_data[full_train_data.dtypes[full_train_data.dtypes == "object"].index[0:25]].copy()

for col in EncCat_full_train_data.columns:
    oneHotCols = 0
    print(col,":", len(EncCat_full_train_data[col].unique()))

discharge_disposition_id : 18
admission_source_id : 10
payer_code : 16
medical_specialty : 11
max_glu_serum : 4
metformin : 4
repaglinide : 4
glimepiride : 4
glipizide : 4
glyburide : 4
pioglitazone : 4
rosiglitazone : 4
insulin : 4
2nd_diag : 390
3rd_diag : 425
ai_response : 6802


In [889]:
full_train_data.examide.value_counts()

AttributeError: 'DataFrame' object has no attribute 'examide'

In [None]:
ohe = skl.preprocessing.OneHotEncoder()
EncCat_full_train_data_matrix = ohe.fit_transform(EncCat_full_train_data)


In [None]:
EncCat_full_train_data.shape

3. Pre-processed numerical data appropriately including handling missing data and
justified methods used. Note this may be different for each algorithm you try. [10 pts]

We already confirmed above that there are no NAs in our continuous data, so no need to impute.

I choose to scale the numeric columns for use in my RF and NN models.

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = skl.preprocessing.StandardScaler()
#Create new df for scaled numeric data
scaledNum_full_train_data = full_train_data[full_train_data.dtypes[full_train_data.dtypes == "int64"].index[1:6]].copy()
numeric_columns = list(full_train_data.dtypes[full_train_data.dtypes == "int64"].index)[0:6]

# I use a for loop to quickly transform all numeric columns
for column in numeric_columns:
    transformed_column = scaler.fit_transform(np.array(full_train_data[column]).reshape(-1,1))
    scaledNum_full_train_data[column] = transformed_column

In [None]:
sns.pairplot(full_train_data[['num_lab_procedures', 'num_procedures',"number_outpatient", 'number_emergency', 'number_diagnoses', 'num_medications']])

4. Implement a model to make predictions using text data using tf-idf [20 pts]

5. Use model stacking to incorporate tf-idf predictions for the text field
(diag_desc_combined) in the downstream algorithm [20 pts]

6. Perform experimentation for multiple modeling algorithms and justify why you
selected the experiments you chose [20 pts]

7. Final model selection and discussion of your model choice and the model weaknesses
(generally, where model doesn’t perform well, etc.) [10 pts]

# Part B: Model Performance (100 pts)

Achieve AUC >= .675