# Texas Hospital Discharge - EDA

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  

import glob, os

DEBUG = False
SEED = 42

In [32]:
%load_ext autoreload
%autoreload 2

import my_lib

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# Load one training sample file. dtype=str reads every column as text, so
# category codes are compared as strings (e.g. "9") in the cells below.
df = pd.read_csv("data/df_train_sample_00_of_20.csv", dtype=str)
df.shape

(49984, 195)

## 1. Encode Target

In [34]:
from sklearn.preprocessing import LabelEncoder

# Encode the string TARGET labels as integer codes. The guard keeps the cell
# safe to re-run: once TARGET holds integers it is not fitted a second time.
# is_integer_dtype is used instead of `dtype != int` so the check does not
# depend on the platform's default integer width.
if not pd.api.types.is_integer_dtype(df['TARGET']):
    le_target = LabelEncoder()
    df['TARGET'] = le_target.fit_transform(df['TARGET'])
elif 'le_target' not in globals():
    # TARGET is already integer-coded but no fitted encoder exists in this
    # kernel, so the original label names cannot be recovered from here.
    raise RuntimeError(
        "TARGET is already integer-encoded but `le_target` is not defined; "
        "reload `df` and re-run this cell."
    )

# code -> original label, taken from the fitted encoder so the mapping covers
# however many classes were actually present (not a hard-coded 0, 1, 2).
target_labels = dict(enumerate(le_target.classes_))
print(target_labels)

{0: 'long', 1: 'medium', 2: 'short'}


## 1.1 Applying Cleaned Categorical Features

In [35]:
def clean_categories(df):
    """Normalise missing and junk codes in three categorical columns of ``df``.

    The frame is modified in place and nothing is returned. After each column
    is cleaned, its remaining distinct values are printed.

    Columns handled:
      * ``TYPE_OF_ADMISSION``   - missing values and "`" become "9".
      * ``SOURCE_OF_ADMISSION`` - missing values, "`", "3" and the literal
        text "NaN" become "9".
      * ``PAT_STATE``           - missing values become "XX"; "`", "FC", "AR",
        "OK", "LA" and "NM" become "ZZ".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three columns above, holding string codes.
    """

    # Each column is re-assigned (`df[col] = df[col].fillna(...)`) instead of
    # calling `df[col].fillna(..., inplace=True)`: the in-place form operates
    # on an intermediate object and does not update `df` when pandas
    # copy-on-write is active.

    # TYPE_OF_ADMISSION
    feature = "TYPE_OF_ADMISSION"
    df[feature] = df[feature].fillna("9")
    df.loc[df[feature] == "`", feature] = "9"
    print("Feture %s -> %s" % (feature, df[feature].unique()))

    # SOURCE_OF_ADMISSION
    feature = "SOURCE_OF_ADMISSION"
    df[feature] = df[feature].fillna("9")
    df.loc[df[feature].isin(["`", "3", "NaN"]), feature] = "9"
    print("Feture %s -> %s" % (feature, df[feature].unique()))

    # PAT_STATE
    feature = "PAT_STATE"
    df[feature] = df[feature].fillna("XX")
    df.loc[df[feature].isin(["`", "FC", "AR", "OK", "LA", "NM"]), feature] = "ZZ"
    # NOTE(review): this remap can never match, because "`" and "FC" were
    # already rewritten to "ZZ" by the line above. If they were meant to end
    # up as "XX", this statement has to run first - confirm the intent.
    df.loc[df[feature].isin(["`", "FC"]), feature] = "XX"
    print("Feture %s -> %s" % (feature, df[feature].unique()))

    
# Clean the training frame in place; the function prints the remaining
# distinct values of each cleaned column.
clean_categories(df)

Feture TYPE_OF_ADMISSION -> ['3' '1' '2' '4' '5' '9']
Feture SOURCE_OF_ADMISSION -> ['4' '1' '2' '8' '5' '9' '6' 'D']
Feture PAT_STATE -> ['TX' 'ZZ' 'XX']


## 1.2 Test-Train Split

In [36]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns.
target = "TARGET"
y = df[target]
X = df.drop(columns=target)

# Hold out 40% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and SEED makes the split repeatable.
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=SEED,
)

## 1.3 Encoding of Categorical Features

In [29]:
# Categorical columns that the encoding loop below turns into indicator columns.
cat_features = ["TYPE_OF_ADMISSION", "ADMIT_WEEKDAY"]

In [7]:
# Start from frames that keep the row index of each split but select no
# columns; the encoded feature columns are added to them afterwards.
df_X_train_model = df_X_train.loc[:, []]
df_X_test_model = df_X_test.loc[:, []]

In [8]:


# Imported in this cell so the loop runs when the notebook is executed top to
# bottom; the only other import of LabelBinarizer is in a later cell.
from sklearn.preprocessing import LabelBinarizer

# NOTE: this loop performs the same steps as `encode_features`, which is
# defined in a later cell of this notebook.
for feature in cat_features:

    # Fit and transform on train -> the fit decides which column each category gets.
    lb = LabelBinarizer()
    lb_results = lb.fit_transform(df_X_train[feature].astype("str"))
    # One output column per fitted class, named "<feature>_<class>"; the same
    # names are reused for the test frame because they depend only on the fit.
    names = [feature + "_" + l for l in lb.classes_]
    for k, name in enumerate(names):
        df_X_train_model[name] = lb_results[:, k]

    # Transform on test -> reuses the fit stored in `lb`, nothing is re-fitted.
    lb_results = lb.transform(df_X_test[feature].astype("str"))
    for k, name in enumerate(names):
        df_X_test_model[name] = lb_results[:, k]

    # Not all categorical features should be encoded using LabelBinarizer.

In [38]:
from sklearn.preprocessing import LabelBinarizer

def _one_hot_with(lb, values):
    """Encode ``values`` with the fitted ``lb`` as one indicator column per class.

    ``LabelBinarizer.transform`` may return a single column rather than one
    per class (scikit-learn documents this for two-class targets). When the
    column count does not match ``len(lb.classes_)``, the matrix is rebuilt by
    comparing every value with every fitted class, so callers can always index
    columns ``0 .. len(lb.classes_) - 1``.
    """
    encoded = lb.transform(values)
    if encoded.shape[1] != len(lb.classes_):
        as_text = np.asarray(values).astype(str)
        known = np.asarray(lb.classes_).astype(str)
        encoded = (as_text[:, None] == known[None, :]).astype(int)
    return encoded


def encode_features(df_X_train, df_X_test, cat_LabelBinarize=None, debug=False):
    """Build model-ready train/test frames from selected categorical columns.

    For every feature in ``cat_LabelBinarize`` a ``LabelBinarizer`` is fitted
    on the training column (cast to ``str``) and the same fitted encoder is
    applied to the test column, so both results get identical columns named
    ``<feature>_<category>``.

    Not every categorical feature should be encoded this way; only the ones
    listed in ``cat_LabelBinarize`` are processed.

    Parameters
    ----------
    df_X_train, df_X_test : pandas.DataFrame
        Predictor frames of the train and test split.
    cat_LabelBinarize : iterable of str, optional
        Column names to encode. ``None`` (the default) encodes nothing.
    debug : bool, default False
        When true, print each feature name and its generated column names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_X_train_model, df_X_test_model)``: frames that keep the row
        index of their inputs and contain only the generated columns.
    """
    # A None default replaces the original mutable `[]` default argument.
    if cat_LabelBinarize is None:
        cat_LabelBinarize = []

    # Frames with the original row index and no columns, to be populated below.
    df_X_train_model = df_X_train.loc[:, []]
    df_X_test_model = df_X_test.loc[:, []]

    for feature in cat_LabelBinarize:

        if debug: print(feature)

        train_values = df_X_train[feature].astype("str")
        test_values = df_X_test[feature].astype("str")

        # Fit on train only -> the fit decides which column each category gets.
        lb = LabelBinarizer().fit(train_values)
        names = [feature + "_" + l for l in lb.classes_]
        if debug: print("\t", names)

        # The helper guarantees one column per name, which a plain
        # fit_transform/transform does not for a two-class feature.
        train_encoded = _one_hot_with(lb, train_values)
        test_encoded = _one_hot_with(lb, test_values)
        for k, name in enumerate(names):
            df_X_train_model[name] = train_encoded[:, k]
            df_X_test_model[name] = test_encoded[:, k]

    return df_X_train_model, df_X_test_model

# Encode the two selected categorical columns for both splits.
# NOTE(review): `debug` is left at its default (False), so this call prints
# nothing; the feature/column listing stored as this cell's output must come
# from an earlier run with debug enabled.
cat_LabelBinarize = ["TYPE_OF_ADMISSION", "ADMIT_WEEKDAY"]
df_X_train_model, df_X_test_model = encode_features(df_X_train, df_X_test, cat_LabelBinarize)

TYPE_OF_ADMISSION
	 ['TYPE_OF_ADMISSION_1', 'TYPE_OF_ADMISSION_2', 'TYPE_OF_ADMISSION_3', 'TYPE_OF_ADMISSION_4', 'TYPE_OF_ADMISSION_5', 'TYPE_OF_ADMISSION_9']
ADMIT_WEEKDAY
	 ['ADMIT_WEEKDAY_1', 'ADMIT_WEEKDAY_2', 'ADMIT_WEEKDAY_3', 'ADMIT_WEEKDAY_4', 'ADMIT_WEEKDAY_5', 'ADMIT_WEEKDAY_6', 'ADMIT_WEEKDAY_7']


In [9]:
# Preview the first rows of the encoded training features.
df_X_train_model.head()

Unnamed: 0,TYPE_OF_ADMISSION_1,TYPE_OF_ADMISSION_2,TYPE_OF_ADMISSION_3,TYPE_OF_ADMISSION_4,TYPE_OF_ADMISSION_5,TYPE_OF_ADMISSION_9,ADMIT_WEEKDAY_1,ADMIT_WEEKDAY_2,ADMIT_WEEKDAY_3,ADMIT_WEEKDAY_4,ADMIT_WEEKDAY_5,ADMIT_WEEKDAY_6,ADMIT_WEEKDAY_7
44930,0,0,1,0,0,0,0,0,0,1,0,0,0
14686,0,0,0,1,0,0,0,0,1,0,0,0,0
44674,0,0,1,0,0,0,0,0,0,1,0,0,0
504,0,0,1,0,0,0,1,0,0,0,0,0,0
7052,1,0,0,0,0,0,0,0,0,0,0,1,0


In [10]:
# df_X_train_model.drop(columns=features, inplace=True)

In [11]:
# Plain NumPy arrays for the estimators: encoded features and integer targets.
X_train = df_X_train_model.values
y_train = df_y_train.values
X_test = df_X_test_model.values
y_test = df_y_test.values

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Baseline candidates keyed by display name, all with default hyper-parameters.
classifiers = {
    "LogisticRegression": LogisticRegression(),
    "KNearestNeighbors": KNeighborsClassifier(),
    "SupportVectorClassifier": SVC(),
    # Seeded with the notebook-wide SEED so repeated runs build the same tree.
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=SEED)
}

## 1.4 Model Training

In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer

# The scorer does not depend on the classifier, so it is built once up front.
macro_f1_scorer = make_scorer(f1_score, average='macro')

for name, clf in classifiers.items():
    # Fit on the full training set so the estimator stored in `classifiers`
    # is left fitted after this cell.
    clf.fit(X_train, y_train)

    # 10-fold cross-validated macro F1 on the training data.
    training_score = cross_val_score(
        clf,
        X_train,
        y_train,
        cv=10,
        n_jobs=-1,
        scoring=macro_f1_scorer,
    )

    mean_score_pct = 100*training_score.mean()
    print("%30s has a training score (macro f1-score) of %.2f%% " % (name, mean_score_pct))

            LogisticRegression has a training score (macro f1-score) of 33.58% 
             KNearestNeighbors has a training score (macro f1-score) of 36.94% 
       SupportVectorClassifier has a training score (macro f1-score) of 34.46% 
        DecisionTreeClassifier has a training score (macro f1-score) of 34.51% 


In [26]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score

# Collect 10-fold cross-validated predictions on the training rows,
# one entry per classifier name.
classifier_pred = {}
for name in classifiers:
    clf = classifiers[name]
    pred = cross_val_predict(clf, X_train, y_train, cv=10, n_jobs=-1)
    classifier_pred.update({name: pred})

# Add the ground truth as a final column next to the predictions.
classifier_pred.update(true=y_train)
df_pred = pd.DataFrame(data=classifier_pred)

In [28]:
# Confusion table for the logistic-regression predictions: rows are the true
# class codes, columns the predicted ones, with row/column totals ("All").
pd.crosstab(df_pred.true, df_pred.LogisticRegression, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4,4439,1580,6023
1,15,8320,3166,11501
2,4,7096,5366,12466
All,23,19855,10112,29990


In [29]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the logistic-regression predictions.
# Class names come straight from the fitted target encoder, so the report does
# not rely on there being exactly three classes coded 0, 1, 2.
print(classification_report(df_pred.true, df_pred.LogisticRegression, target_names=le_target.classes_))

              precision    recall  f1-score   support

        long       0.17      0.00      0.00      6023
      medium       0.42      0.72      0.53     11501
       short       0.53      0.43      0.48     12466

    accuracy                           0.46     29990
   macro avg       0.37      0.38      0.34     29990
weighted avg       0.42      0.46      0.40     29990



## 1.5 Hyper-Parameter Tuning

## 1.6 Grading (EXAMPLE)

In [9]:
# Load the grading set with every column read as text, matching how the
# training sample was loaded. A plain string is used for the path because it
# contains no placeholders.
df_grading = pd.read_csv("data/grading.csv", dtype=str)

In [27]:
# Apply the same in-place category clean-up that was used on the training frame.
clean_categories(df_grading)

# NOTE(review): no classifier from this notebook is applied to df_grading in
# the visible cells; TARGET is shown exactly as it was read from grading.csv.
df_grading.loc[:, ["RECORD_ID", "TARGET"]]

Feture TYPE_OF_ADMISSION -> ['2' '1' '3' '4' '5' '9']
Feture SOURCE_OF_ADMISSION -> ['1' '2' '5' '6' 'D' '4' '8' '9' '0']
Feture PAT_STATE -> ['TX' 'ZZ' 'XX']


Unnamed: 0,RECORD_ID,TARGET
0,420132203333,short
1,220130397490,short
2,120137915430,short
3,420132272963,short
4,120133877370,short
...,...,...
99995,320138426880,short
99996,220134576500,short
99997,420135765323,short
99998,220132728430,short


In [12]:
# Write the two submission columns to CSV, without the row index.
df_grading.loc[:, ["RECORD_ID", "TARGET"]].to_csv("df_grading_pred.csv", index=False)

In [19]:
# Project helper from my_lib; presumably bundles the notebooks, my_lib.py and
# the prediction CSV into the submission archive - verify in my_lib.
my_lib.make_assignment()

Creating archive: my_assignment.zip
	01-Import.ipynb - OK
	02-EDA.ipynb - OK
	03-Model.ipynb - OK
	my_lib.py - OK
	df_grading_pred.csv - OK
