In [66]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [13]:
pd.set_option('display.max_columns', None)

# Part I - Clean Data
- ## In this section we will load and merge the data, putting it in a form that will be ready to be used by our model

In [42]:
def prep_data(): 
    ''''
    
    
    
    '''
    #load csv files
    patient_data = pd.read_csv('../Data_Clean/RTED_ADIMERGE.csv')
    geographic_data = pd.read_csv('../Data_Clean/geographic_data_clean.csv')

    #drop state since it is already in the patient data frame
    geographic_data = geographic_data.drop(labels = ['STATE'], axis = 1)

    #encode ZIP_5 as a object for the merge on geographic data frame 
    geographic_data['ZIP_5'] = geographic_data['ZIP_5'].astype(str)

    #encode ZIP_5 as a object for merge on patient data frame
    patient_data['ZIP_5'] = patient_data['ZIP_5'].astype(str)

    #still some float values in there which we are causing data loss on the merge - grab first 5 characters of the string
    patient_data['ZIP_5'] = patient_data['ZIP_5'].str[:5]

    df_full = pd.merge(patient_data, geographic_data, how = 'left', on = 'ZIP_5')

    #drop these columns as they are directly related to our response and must be dropped, or redundant because there are other columns that represent the data in a simplified way
    cols_drop = ['RET_CSN', 'RET_DAYS', 'RET_HOSPITAL', 'RET_ED_DISPO', 'RET_CHIEF_COMPLAINT', 
                'RET_CLINICAL_IMPRESSION', 'RET_HB_PRIM_DX_CODE', 'RET_HB_PRIM_DX_NAME', 'RET_ED_DENOM',
                'RET_ED30_NUMER', 'EDRevisitDischargedPatient', 'READMISSION', 'Readmission90', 'EDRevisit90', 
                'PAT_ZIP', 'COUNTY', 'CITY', 'location', 'Latitude', 'Longitude', 'WEIGHTED_ADI', 'PAYOR_NAME', 'FINANCIAL_CLASS_NAME', 'OR_LOG_ROW', 
                'LOCATION_NAME', 'SERVICE_NAME', 'PRIMARY_PROCEDURE_NM', 'PRIMARY_PROCEDURE_CPT']
    
    #drop cols that we won't/can't use
    df_simplified = df_full.drop(labels = cols_drop, axis = 1)

    #list of pre-encoded columns from PI that we will drop so we can do the data engineering ourselves
    pre_encoded = ['FinancialClass_Commercial', 'FinancialClass_Liability', 'FinancialClass_ManagedCare', 'FinancialClass_MedicaidPending', 
               'FinancialClass_MediCARE', 'FinancialClass_MedicareAdvantage', 'FinancialClass_CommercialBlueCross', 'FinancialClass_MedicaidNC',
               'FinancialClass_MedicaidManaged', 'FinancialClass_CommercialBCOOS', 'FinancialClass_Medcaid', 'FinancialClass_12Unkonwn',
               'FinancialClass_13', 'FinancialClass_14', 'FinancialClass_15', 'FinancialClass_WorkersComp', 'RaceDummy_4', 'RaceDummy_5', 
               'Hispanic_1', 'Hispanic_2', 'Hispanic_3', 'Hispanic_4', 'Hispanic_5', 'Hispanic_NotHispanic', 'Hispanic_7', 
               'Race_NotValid', 'Race_AmericanIndian', 'Race_Asian', 'Race_Black', 'Race_White', 'Race_NativeHawaiian', 'Race_NotReported', 'Race_Other',
               'Race_OtherAsian', 'WhiteNonHipanic', 'Sex_Female', 'Sex_2', 'Sex_3']
    
    #drop pre-encoded columns
    df_simplified = df_simplified.drop(labels = pre_encoded, axis = 1)

    #columns where we will drop NA
    cols_dropna = ['RACE', 'ETHNIC_GROUP', 'DX_HYPERTENSION', 'DX_RENAL_FAILURE', 'DX_COPD', 'DX_TYPE_2_DM', 'DX_HIP_FRACTURE', 'DX_OSTEOPOROSIS', 'STATE', 'PRIMARY_PROC_CPT_CODE']
    
    #columns where we will fill na with the median using imputation
    cols_fillna = ['BMI', 'distance_to_hospital']   

    #drop na from the columns with low levels of missingness 
    df_simplified = df_simplified.dropna(subset = cols_dropna, axis = 0)

    #fill BMI and distance with the median due to right tailed distribution (will handle outliers later)  
    df_simplified[cols_fillna] = df_simplified[cols_fillna].fillna(df_simplified[cols_fillna].median())

    #rename response columns 
    df_simplified['RETURN_ED_90DAY'] = df_simplified['ED90Day']

    #drop original
    df_simplified = df_simplified.drop(labels = ['ED90Day'], axis = 1)

    return df_simplified

In [43]:
#creates simplified data frame with irrelevant features dropped and missingness handles
df_simplified = prep_data() 

  patient_data = pd.read_csv('../Data_Clean/RTED_ADIMERGE.csv')


Unnamed: 0,PAT_CLASS,PAT_BASE_CLASS,AGE,SEX,RACE,ETHNIC_GROUP,PAT_LANGUAGE,BMI,DX_HYPERTENSION,DX_RENAL_FAILURE,DX_COPD,DX_TYPE_2_DM,DX_HIP_FRACTURE,DX_OSTEOPOROSIS,DISCH_LOC_ABBR,DISCH_DEPT,LOS_DAYS,DISCHARGE_DISPO,ATTENDING_PROV,OR_LOGS,CASE_CLASS_NM,LOCATION_ID,LOCATION_NM,PRIMARY_PHYSICIAN_NM,CLIN_DEP,CLIN_DIV,PRIMARY_PROC_CPT_CODE,MedicaidBinary,SevereObesity,Elderly65,ZIP_5,STATE,distance_to_hospital,RETURN_ED_90DAY
0,Inpatient,Inpatient,41.0,Male,Black or African American,Not Hispanic/Latino,English,23.83,0.0,0.0,0.0,0.0,0.0,0.0,DRAH,DRAH NP 3200N,4.0,Home or Self Care,"CHUANG, ELISEU YUNG",1.0,Level 3,10703005.0,DRAH OR,"REEG, SCOT ERIC",ORTHOPAEDICS-PDC,SPINE,10061.0,0.0,0.0,0.0,27591,NC,36.432272,0.0
1,Inpatient,Inpatient,77.0,Male,Caucasian/White,Not Hispanic/Latino,English,28.58,1.0,0.0,0.0,0.0,0.0,1.0,DRAH,DRAH NP 4200,10.9,Skilled Nursing Facility,"GOPAL, VANITHA ANDAL",1.0,Level 5,10703005.0,DRAH OR,"REEG, SCOT ERIC",ORTHOPAEDICS-PDC,SPINE,22513.0,0.0,0.0,1.0,27614,NC,18.058521,0.0
2,Inpatient,Inpatient,66.0,Female,Black or African American,Not Hispanic/Latino,English,27.12,1.0,1.0,0.0,1.0,0.0,0.0,DRAH,DRAH SP 3700,32.5,Home Health Service,"AMOO, AKUA ADWUBI PANYIN",1.0,Elective,10703005.0,DRAH OR,"REEG, SCOT ERIC",ORTHOPAEDICS-PDC,SPINE,22551.0,0.0,0.0,1.0,27589,NC,48.669063,1.0
3,Inpatient,Inpatient,50.0,Female,Asian,Not Hispanic/Latino,Burmese,22.29,0.0,0.0,0.0,0.0,0.0,0.0,DRAH,DRAH NP 2200N,16.6,Skilled Nursing Facility,"DORR, VICTORIA JOAN",1.0,Elective,10703005.0,DRAH OR,"REEG, SCOT ERIC",ORTHOPAEDICS-PDC,SPINE,22614.0,1.0,0.0,0.0,28334,NC,58.136057,1.0
4,Inpatient,Inpatient,44.0,Male,Black or African American,Not Hispanic/Latino,English,19.11,1.0,0.0,0.0,0.0,0.0,0.0,DRAH,DRAH SP 3700,8.1,Skilled Nursing Facility,"REEG, SCOT ERIC",1.0,Level 5,10703005.0,DRAH OR,"REEG, SCOT ERIC",ORTHOPAEDICS-PDC,SPINE,22850.0,0.0,0.0,0.0,27889,NC,110.376278,1.0


In [52]:
def handle_outliers_engineer_feats(df_simplified): 
    '''
    Removes the points considered to be outliers using  the standard deviation and k =3. Engineers binary flag variables for language, ethnicity and race

    Inputs:
        df_simplified(pd.DataFrame): dataframe contained the cleaned data including outliers

    Returns:
        df_engineered(pd.DataFrame): dataframe with outliers removed
    ''' 

    #creates list of BMI anomolies to drop 
    k = 3
    col = df_simplified['BMI']
    col_std = np.std(df_simplified['BMI'])
    col_mean = np.mean(df_simplified['BMI'])
    thresh = col_std * k
    lower_limit  = col_mean - thresh
    upper_limit = col_mean + thresh
    BMI_anomalies = list(col.index[(col>upper_limit) | (col<lower_limit)])

    #drops BMI outliers
    df_no_outliers = df_simplified.drop(BMI_anomalies, axis =0)

    #drops instances when LOS is >90 days 
    df_no_outliers = df_no_outliers[df_no_outliers['LOS_DAYS'] <= 90]

    #next we will create binary flag variables for minority status and ethnic minority status
    minority_races = ['Black or African American', 'Other', 'Asian', 'American Indian or Alaskan Native', 'Native Hawaiian or Other Pacific Islander']
    ethnic_minority = ['Hispanic Other', 'Hispanic Mexican', 'Hispanic Puerto Rican', 'Hispanic Cuban']

    #creates racial minority column 
    df_no_outliers['RacialMinority'] = np.where(df_no_outliers['RACE'].isin(minority_races), 1,0)

    #creates ethnic minority column 
    df_no_outliers['EthnicMinority'] = np.where(df_no_outliers['ETHNIC_GROUP'].isin(ethnic_minority), 1,0)

    #creates language not english column 
    df_no_outliers['LanguageNotEnglish'] = np.where(df_no_outliers['PAT_LANGUAGE'] == 'English', 0,1)

    #now we have the data ready to be modelled
    df_modelling = df_no_outliers


    return df_modelling

In [56]:
df_modelling = handle_outliers_engineer_feats(df_simplified)
df_modelling.head()



Unnamed: 0,PAT_CLASS,PAT_BASE_CLASS,AGE,SEX,RACE,ETHNIC_GROUP,PAT_LANGUAGE,BMI,DX_HYPERTENSION,DX_RENAL_FAILURE,DX_COPD,DX_TYPE_2_DM,DX_HIP_FRACTURE,DX_OSTEOPOROSIS,DISCH_LOC_ABBR,DISCH_DEPT,LOS_DAYS,DISCHARGE_DISPO,ATTENDING_PROV,OR_LOGS,CASE_CLASS_NM,LOCATION_ID,LOCATION_NM,PRIMARY_PHYSICIAN_NM,CLIN_DEP,CLIN_DIV,PRIMARY_PROC_CPT_CODE,MedicaidBinary,SevereObesity,Elderly65,ZIP_5,STATE,distance_to_hospital,RETURN_ED_90DAY,RacialMinority,EthnicMinority,LanguageNotEnglish
0,Inpatient,Inpatient,41.0,Male,Black or African American,Not Hispanic/Latino,English,23.83,0.0,0.0,0.0,0.0,0.0,0.0,DRAH,DRAH NP 3200N,4.0,Home or Self Care,"CHUANG, ELISEU YUNG",1.0,Level 3,10703005.0,DRAH OR,"REEG, SCOT ERIC",ORTHOPAEDICS-PDC,SPINE,10061.0,0.0,0.0,0.0,27591,NC,36.432272,0.0,1,0,0
1,Inpatient,Inpatient,77.0,Male,Caucasian/White,Not Hispanic/Latino,English,28.58,1.0,0.0,0.0,0.0,0.0,1.0,DRAH,DRAH NP 4200,10.9,Skilled Nursing Facility,"GOPAL, VANITHA ANDAL",1.0,Level 5,10703005.0,DRAH OR,"REEG, SCOT ERIC",ORTHOPAEDICS-PDC,SPINE,22513.0,0.0,0.0,1.0,27614,NC,18.058521,0.0,0,0,0
2,Inpatient,Inpatient,66.0,Female,Black or African American,Not Hispanic/Latino,English,27.12,1.0,1.0,0.0,1.0,0.0,0.0,DRAH,DRAH SP 3700,32.5,Home Health Service,"AMOO, AKUA ADWUBI PANYIN",1.0,Elective,10703005.0,DRAH OR,"REEG, SCOT ERIC",ORTHOPAEDICS-PDC,SPINE,22551.0,0.0,0.0,1.0,27589,NC,48.669063,1.0,1,0,0
3,Inpatient,Inpatient,50.0,Female,Asian,Not Hispanic/Latino,Burmese,22.29,0.0,0.0,0.0,0.0,0.0,0.0,DRAH,DRAH NP 2200N,16.6,Skilled Nursing Facility,"DORR, VICTORIA JOAN",1.0,Elective,10703005.0,DRAH OR,"REEG, SCOT ERIC",ORTHOPAEDICS-PDC,SPINE,22614.0,1.0,0.0,0.0,28334,NC,58.136057,1.0,1,0,1
4,Inpatient,Inpatient,44.0,Male,Black or African American,Not Hispanic/Latino,English,19.11,1.0,0.0,0.0,0.0,0.0,0.0,DRAH,DRAH SP 3700,8.1,Skilled Nursing Facility,"REEG, SCOT ERIC",1.0,Level 5,10703005.0,DRAH OR,"REEG, SCOT ERIC",ORTHOPAEDICS-PDC,SPINE,22850.0,0.0,0.0,0.0,27889,NC,110.376278,1.0,1,0,0


In [92]:
def train_model(df_modelling):
  ''' 
  
  '''
  #define target  
  target_col = 'RETURN_ED_90DAY'

  #define features 
  feature_cols = [col for col in df_modelling.columns if col != target_col]

  #define continuous columns
  cont_cols = ['distance_to_hospital', 'LOS_DAYS']

  #define catagorical columns
  cat_cols = [col for col in df_modelling.columns if col != target_col and col not in cont_cols]

  #convert all catagorical columns to catagories
  df_modelling[cat_cols] = df_modelling[cat_cols].astype('category')

  #split training and test sets
  train_full_data, test_data = train_test_split(df_modelling, test_size=0.2, random_state=0)

  #split training and validation sets
  train_data, val_data = train_test_split(train_full_data, test_size=0.2, random_state=0)

  # Define the data configuration
  data_config = DataConfig(
      target=['RETURN_ED_90DAY'],
      continuous_cols=cont_cols,
      categorical_cols= cat_cols
      normalize_continuous_features= True)

  #Define the optimizer configuration
  optimizer_config = OptimizerConfig()

  #Define the trainer configuration
  trainer_config = TrainerConfig(
      auto_lr_find=True,
      batch_size=1024,
      accelerator = 'gpu'
      max_epochs=100 )

  #Define the model configuration
  model_config = CategoryEmbeddingModelConfig(
    task='classification',
    layers='256-128',
    activation='LeakyReLU',
    learning_rate=1e-3,
    embedding_dropout=0.0,
    use_batch_norm=True)

  #Create the model
  model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config)
  
  # Fit the model
  model.fit(train=train_data, validation=val_data)

  return model, cat_cols

In [100]:
trained_model = train_model1(df_modelling)

2023-04-24 15:22:11,929 - {pytorch_tabular.tabular_model:102} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-04-24 15:22:12,022 - {pytorch_tabular.tabular_model:465} - INFO - Preparing the DataLoaders
2023-04-24 15:22:12,028 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for classification task


TypeError: Cannot setitem on a Categorical with a new category (0), set the categories first


# Evaluate the model
result = model.evaluate(X_test, y_test)

# Make predictions with the model
preds = model.predict(X_test)

# Save the trained model
model.save_model('my_model')

# Load a trained model
loaded_model = TabularModel.load_from_checkpoint('my_model.ckpt')