In [1]:
# --- Numerical / plotting stack ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# --- Standard library and project-local helpers ---
import glob, os
import my_lib as ml  # project module; provides clean_data(), called after loading
import json
import pprint
pp = pprint.PrettyPrinter(indent=4)  # shared pretty-printer, 4-space indent

# --- scikit-learn: splitting, label encoding, metrics ---
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import classification_report, f1_score, make_scorer, accuracy_score

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  # None = no cap on the number of columns displayed

# --- Notebook-wide constants ---
SEED = 42  # passed as random_state to train_test_split
target = "TARGET"  # name of the label column removed from the feature frame

In [2]:
# Render matplotlib figures inside the notebook output cells.
%matplotlib inline
# Load the autoreload extension and set it to mode 2, so edited modules
# (e.g. the local my_lib helper) are re-imported without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Load one sample shard of the training data. dtype=str keeps every column as
# text so coded values (e.g. 'D', 'XX', 'V5789') are not coerced to numbers.
DATA_PATH = "data/df_train_sample_00_of_20.csv"
df = pd.read_csv(DATA_PATH, dtype=str)

# `df2` is an independent copy of the freshly loaded frame. Copying in memory
# gives the same result as parsing the CSV a second time, without the extra I/O.
df2 = df.copy()

display(df.head(2))
display(df.shape)

Unnamed: 0,RECORD_ID,DISCHARGE,THCIC_ID,PROVIDER_NAME,TYPE_OF_ADMISSION,SOURCE_OF_ADMISSION,PAT_STATE,PAT_COUNTRY,COUNTY,PUBLIC_HEALTH_REGION,PAT_STATUS,SEX_CODE,RACE,ETHNICITY,ADMIT_WEEKDAY,LENGTH_OF_STAY,PAT_AGE,FIRST_PAYMENT_SRC,TYPE_OF_BILL,TOTAL_CHARGES,TOTAL_NON_COV_CHARGES,TOTAL_CHARGES_ACCOMM,TOTAL_NON_COV_CHARGES_ACCOMM,TOTAL_CHARGES_ANCIL,TOTAL_NON_COV_CHARGES_ANCIL,POA_PROVIDER_INDICATOR,ADMITTING_DIAGNOSIS,PRINC_DIAG_CODE,OTH_DIAG_CODE_1,MS_MDC,MS_DRG,MS_GROUPER_VERSION_NBR,MS_GROUPER_ERROR_CODE,APR_MDC,APR_DRG,RISK_MORTALITY,ILLNESS_SEVERITY,APR_GROUPER_VERSION_NBR,APR_GROUPER_ERROR_CODE,ATTENDING_PHYSICIAN_UNIF_ID,ENCOUNTER_INDICATOR,CERT_STATUS,TARGET
0,320136748870,2013Q3,838400,Memorial Hermann Rehab Hospital Katy,3,4,TX,US,201,6,7,F,4,2,2,1,20,MA,111,1671.0,0.0,1145.0,0.0,526.0,0.0,X,V5789,V5789,1919,23,945,1300,0,23,860,2,3,7300,0,1229763162,1,1,short
1,120130546450,2013Q1,409000,John Peter Smith Hospital,1,1,TX,US,367,3,1,M,5,1,2,2,13,MA,111,53064.01,0.0,4092.0,0.0,48972.01,0.0,M,78650,41401,42822,5,247,1300,0,5,175,2,2,7300,0,1578252829,1,2,short


(49984, 43)

In [4]:
# Encode the string class labels in TARGET as integer codes and keep the
# code -> label mapping for later reporting.
if not pd.api.types.is_integer_dtype(df['TARGET']):
    le_target = LabelEncoder()
    df['TARGET'] = le_target.fit_transform(df['TARGET'])
    # Build the mapping from the fitted encoder rather than assuming that
    # exactly three classes (codes 0, 1, 2) exist.
    target_labels = dict(enumerate(le_target.classes_.tolist()))
elif 'target_labels' not in globals():
    # TARGET is already integer-coded and no mapping exists in this kernel
    # (nothing was fitted here), so the codes serve as their own labels.
    # On a plain re-run of this cell the mapping from the first run is kept.
    target_labels = {int(code): int(code) for code in sorted(df['TARGET'].unique())}

print(target_labels)

{0: 'long', 1: 'medium', 2: 'short'}


In [5]:
# Read the feature-label lookup from its JSON file; the name is bound to an
# empty dict first and then replaced by the parsed content.
feature_labels = {}
with open("data/feature_labels.json") as f:
    feature_labels = json.load(f)

# Project-local cleaning helper. Its return value (if any) is not captured.
# NOTE(review): presumably it modifies `df` in place - confirm in my_lib.
ml.clean_data(df)

TYPE_OF_ADMISSION: -> ['3' '1' '2' '4' '5' '9']
SOURCE_OF_ADMISSION: -> ['4' '1' '2' '8' '5' '9' '6' 'D']
PAT_STATE: -> ['TX' 'XX' 'ZZ']
SEX_CODE: -> ['F' 'M' 'U']
RACE: -> ['4' '5' '3' '2' '1']
ETHNICITY: -> ['2' '1' '3']
PAT_AGE: -> ['5' '4' '2' '3' '1']
PAT_COUNTRY: -> ['US' 'MX' 'XX']


## Splitting the dataset into train and test sets

In [6]:
# Feature frame = every column except the label; label vector = TARGET.
x = df.drop(columns=target)
y = df["TARGET"]

# Hold out 40% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both parts; SEED makes the split repeatable.
df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    stratify=y,
    random_state=SEED,
)

## Encoding Categorical Features

In [7]:
# Categorical columns that get one-hot encoded for the models.
target_features = [
    "SOURCE_OF_ADMISSION",
    "PAT_STATE",
    "SEX_CODE",
    "RACE",
    "ETHNICITY",
    "ADMIT_WEEKDAY",
    "PAT_AGE",
    "PAT_COUNTRY",
]

def encode_features(df_x_train, df_x_test, target_features, debug=False):
    """One-hot encode categorical columns, fitting on the training frame only.

    Every feature is cast to ``str`` and a ``LabelBinarizer`` is fitted on the
    training values; the same fitted classes are then used to encode the test
    values. Output columns are named ``<feature>_<class>``, one per class seen
    in training, and the name of each generated column is printed.

    Parameters
    ----------
    df_x_train, df_x_test : pandas.DataFrame
        Feature frames; both must contain every column in ``target_features``.
    target_features : list of str
        Names of the columns to encode.
    debug : bool, default False
        When True, additionally prints, per feature, the test values that do
        not occur in the training data.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames, indexed like the inputs and sharing the
        same columns in the same order.
    """

    def _one_hot(lb, values):
        """Return an (n_rows, n_classes) 0/1 matrix for ``values``."""
        if len(lb.classes_) < 3:
            # LabelBinarizer collapses a two-class feature into ONE column, so
            # indexing one column per class name would raise IndexError.
            # Build the indicator columns directly for these small cases.
            raw = np.asarray(values)
            return np.column_stack([raw == cls for cls in lb.classes_]).astype(int)
        return lb.transform(values)

    # Nothing to encode: return column-less frames that keep the row index.
    if len(target_features) == 0:
        return df_x_train.loc[:, []], df_x_test.loc[:, []]

    column_names = []
    train_blocks, test_blocks = [], []

    for feature in target_features:
        train_values = df_x_train[feature].astype("str")
        test_values = df_x_test[feature].astype("str")

        # Fit on the training data only so nothing leaks from the test set.
        lb = LabelBinarizer()
        lb.fit(train_values)
        names = [f'{feature}_{l}' for l in lb.classes_]

        train_blocks.append(_one_hot(lb, train_values))
        test_blocks.append(_one_hot(lb, test_values))
        column_names.extend(names)

        for name in names:
            print(name)

        if debug:
            unseen = sorted(set(test_values) - set(lb.classes_))
            if unseen:
                print(f"[debug] {feature}: values only present in test data: {unseen}")

    # Assemble each frame in one go instead of inserting columns one at a time
    # into a slice of the input frame.
    dfx_train_model = pd.DataFrame(np.hstack(train_blocks), index=df_x_train.index, columns=column_names)
    dfx_test_model = pd.DataFrame(np.hstack(test_blocks), index=df_x_test.index, columns=column_names)

    return dfx_train_model, dfx_test_model

# Encode the selected categorical columns; the encoding is fitted on the
# training part and re-applied to the test part.
dfx_train_model, dfx_test_model = encode_features(
    df_x_train=df_x_train,
    df_x_test=df_x_test,
    target_features=target_features,
)

SOURCE_OF_ADMISSION_1
SOURCE_OF_ADMISSION_2
SOURCE_OF_ADMISSION_4
SOURCE_OF_ADMISSION_5
SOURCE_OF_ADMISSION_6
SOURCE_OF_ADMISSION_8
SOURCE_OF_ADMISSION_9
SOURCE_OF_ADMISSION_D
PAT_STATE_TX
PAT_STATE_XX
PAT_STATE_ZZ
SEX_CODE_F
SEX_CODE_M
SEX_CODE_U
RACE_1
RACE_2
RACE_3
RACE_4
RACE_5
ETHNICITY_1
ETHNICITY_2
ETHNICITY_3
ADMIT_WEEKDAY_1
ADMIT_WEEKDAY_2
ADMIT_WEEKDAY_3
ADMIT_WEEKDAY_4
ADMIT_WEEKDAY_5
ADMIT_WEEKDAY_6
ADMIT_WEEKDAY_7
PAT_AGE_1
PAT_AGE_2
PAT_AGE_3
PAT_AGE_4
PAT_AGE_5
PAT_COUNTRY_MX
PAT_COUNTRY_US
PAT_COUNTRY_XX


In [18]:
#Basic Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier#GradientTreeBoosting

# Candidate classifiers, keyed by the names also used in `param_space`.
# Every estimator that has a random component is seeded with SEED so that
# re-running the notebook reproduces the same models and scores
# (KNeighborsClassifier has no random_state parameter).
classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=SEED),
    "KNearestNeighbors": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=SEED),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=SEED),
    "RandomForestClassifier": RandomForestClassifier(random_state=SEED),
    "ExtraTreeClassifier": ExtraTreesClassifier(n_estimators=200, random_state=SEED)
}

In [19]:
# Pull the underlying arrays out of the encoded frames / label series for
# use with the scikit-learn estimators below.
x_train = dfx_train_model.values
y_train = df_y_train.values
x_test = dfx_test_model.values
y_test = df_y_test.values

In [20]:
from sklearn.model_selection import GridSearchCV

# Fitted GridSearchCV objects, filled in by the tuning loop; keyed like `classifiers`.
best_classifiers = {}

# Hyperparameter grid per classifier; keys must match those of `classifiers`.
param_space = {
    "LogisticRegression": {
        "solver": ["liblinear", "saga"],
        "penalty": ['l1', 'l2'],
        "C": [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "KNearestNeighbors": {
        # Was range(2, 5, 10): a step of 10 yields only [2], so the number of
        # neighbours was never actually searched.
        "n_neighbors": [2, 5, 10],
        "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]
    },
    "DecisionTreeClassifier": {
        "criterion": ["gini", "entropy"],
        "max_depth": range(2, 8),
        "min_samples_leaf": range(1, 8)
    },
    "AdaBoostClassifier": {
        # NOTE(review): "SAMME.R" is deprecated in recent scikit-learn
        # releases - confirm against the installed version.
        "algorithm": ["SAMME", "SAMME.R"],
        "n_estimators": [10, 30, 50, 80]
    },
    "RandomForestClassifier": {
        "min_samples_leaf": range(3, 6),
        "max_depth": range(2, 4),
        "criterion": ["gini", "entropy"]
    },
    "ExtraTreeClassifier": {
        "min_samples_leaf": range(3, 6),
        "max_depth": range(2, 4),
        "criterion": ["gini", "entropy"]
    }
}

In [None]:
# Tune every classifier over its grid in `param_space` with 5-fold CV and keep
# the fitted search object in `best_classifiers`.
#
# `metric` is defined here because this cell runs before the baseline cell
# that also sets it; without it the final print raised NameError on a fresh
# kernel. It is passed as `scoring` so the reported number really is the
# metric named in the message.
metric = "recall_macro"

for name, estimator in classifiers.items():
    print("\n\nPerforming GridSearchCV on %s..." % name)
    clf = GridSearchCV(estimator, param_space[name], scoring=metric, cv=5)

    clf.fit(x_train, y_train)
    best_classifiers[name] = clf

    # best_score_ is the mean cross-validated score of the winning parameter
    # set. Calling cross_val_score on the search object itself would repeat
    # the entire grid search once per outer fold.
    print(clf.best_params_)
    print("%s Cross Validation Score (%s): %.2f%%" % (name, metric, 100 * clf.best_score_))



Performing GridSearchCV on LogisticRegression...




In [None]:
# Audible "finished" signal for the long-running search above. Relies on the
# `say` shell command (available on macOS); on other systems the shell just
# reports that the command was not found.
!say "Doneeee"

## Model Training

In [None]:
# Baseline: 10-fold cross-validated score of every untuned classifier.
metric = "recall_macro"

for key, classifier in classifiers.items():
    # cross_val_score works on clones, so this fit is what leaves the
    # estimators stored in `classifiers` trained on the full training set.
    classifier.fit(x_train, y_train)
    # Pass `metric` explicitly; without `scoring` the classifier's default
    # (accuracy) is used and the variable above has no effect.
    training_score = cross_val_score(classifier, x_train, y_train, cv=10, scoring=metric)
    # Scale to percent before rounding: round(x, 2) * 100 can produce float
    # artefacts such as 56.99999999999999.
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean() * 100, 2), "%", metric)

In [None]:
# Cross-validated (10-fold) predictions on the training set, one entry per classifier.
classifier_predictions = {
    name: cross_val_predict(clf, x_train, y_train, cv=10)
    for name, clf in classifiers.items()
}

# Add the actual labels so predictions can be compared row by row.
classifier_predictions["True"] = y_train

df_pred = pd.DataFrame(classifier_predictions)
df_pred.head(5)

### Hyperparameter Tuning