# Texas Hospital Discharge - EDA

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  

import glob, os

DEBUG = False
SEED = 42

In [55]:
%load_ext autoreload
%autoreload 2

import my_lib

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
df = pd.read_csv("data/df_train_sample_00_of_20.csv", dtype=str)
df.shape

(49984, 195)

## 1. Encode Target

In [57]:
from sklearn.preprocessing import LabelEncoder
if df['TARGET'].dtype!=int:
    le_target = LabelEncoder()
    df['TARGET']= le_target.fit_transform(df['TARGET'])
else:
    pass

target_labels = {c:le_target.inverse_transform([c])[0] for c in [0,1,2]}
print(target_labels)

{0: 'long', 1: 'medium', 2: 'short'}


## 1.1 Applying Cleaned Cat. Features

In [58]:
def clean_categories(df):
    
    # TYPE_OF_ADMISSION 
    feature = "TYPE_OF_ADMISSION"
    df[feature].fillna("9", inplace=True)
    df.loc[df[feature] == "`", feature] = "9"
    print("Feture %s -> %s" % (feature, df[feature].unique()))

    # SOURCE_OF_ADMISSION
    feature = "SOURCE_OF_ADMISSION"
    df[feature].fillna("9", inplace=True)
    df.loc[df[feature].isin(["`", "3", "NaN"]), feature] = "9"
    print("Feture %s -> %s" % (feature, df[feature].unique()))

    # PAT_STATE
    feature = "PAT_STATE"
    df[feature].fillna("XX", inplace=True)
    df.loc[df[feature].isin(["`", "FC", "AR", "OK", "LA", "NM"]), feature] = "ZZ"
    df.loc[df[feature].isin(["`", "FC"]), feature] = "XX"
    print("Feture %s -> %s" % (feature, df[feature].unique()))

    # SEX_CODE
    feature = "SEX_CODE"
    df[feature].fillna("U", inplace=True)
    df.loc[df[feature].isin(["NaN"]), feature] = "U"
    print("Feture %s -> %s" % (feature, df[feature].unique()))

    # RACE
    feature = "RACE"
    df[feature].fillna("5", inplace=True)
    df.loc[df[feature].isin(["NaN", "`"]), feature] = "5"
    print("Feture %s -> %s" % (feature, df[feature].unique()))

    # ETHNICITY
    feature = "ETHNICITY"
    df[feature].fillna("3", inplace=True)
    df.loc[df[feature].isin(["NaN", "`"]), feature] = "3"
    print("Feture %s -> %s" % (feature, df[feature].unique()))

clean_categories(df)

Feture TYPE_OF_ADMISSION -> ['3' '1' '2' '4' '5' '9']
Feture SOURCE_OF_ADMISSION -> ['4' '1' '2' '8' '5' '9' '6' 'D']
Feture PAT_STATE -> ['TX' 'ZZ' 'XX']
Feture SEX_CODE -> ['F' 'M' 'U']
Feture RACE -> ['4' '5' '3' '2' '1']
Feture ETHNICITY -> ['2' '1' '3']


## 1.2 Test-Train Split

In [59]:
from sklearn.model_selection import train_test_split

target = "TARGET"
X = df.drop(target, axis=1)
y = df[target]
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(X, y, test_size=0.4, stratify=y, random_state=SEED)

## 1.3 Encoding of Categorical Features

In [60]:
from sklearn.preprocessing import LabelBinarizer

def encode_features(df_X_train, df_X_test, cat_LabelBinarize=[], debug=False):
    
    # create dataframes to populate
    df_X_train_model = df_X_train.loc[:, []]
    df_X_test_model = df_X_test.loc[:, []]
    
    # encoding features using LabelBinarizer
    for feature in cat_LabelBinarize:
    
        if debug: print(feature)
            
        # fit and transform on train -> fit decides what label goes to each category
        lb = LabelBinarizer()
        lb_results = lb.fit_transform(df_X_train[feature].astype("str"))
        names= [feature+"_"+l for l in lb.classes_]
        if debug: print("\t", names)
        for k, name in enumerate(names):
            df_X_train_model[name] = lb_results[:, k]

        # transform on test -> this transform uses the fit already stored
        lb_results = lb.transform(df_X_test[feature].astype("str"))
        names= [feature+"_"+l for l in lb.classes_]
        for k, name in enumerate(names):
            df_X_test_model[name] = lb_results[:, k]
        
        # NOT ALL CAT. FEATURES SHOULD BE ENCODED USING LABEL BINARYIZER
    
    return df_X_train_model, df_X_test_model

cat_LabelBinarize = ["TYPE_OF_ADMISSION","SOURCE_OF_ADMISSION","PAT_STATE", "SEX_CODE", "RACE", "ETHNICITY", "ADMIT_WEEKDAY"]
df_X_train_model, df_X_test_model = encode_features(df_X_train, df_X_test, cat_LabelBinarize)

In [61]:
df_X_train_model.columns

Index(['TYPE_OF_ADMISSION_1', 'TYPE_OF_ADMISSION_2', 'TYPE_OF_ADMISSION_3',
       'TYPE_OF_ADMISSION_4', 'TYPE_OF_ADMISSION_5', 'TYPE_OF_ADMISSION_9',
       'SOURCE_OF_ADMISSION_1', 'SOURCE_OF_ADMISSION_2',
       'SOURCE_OF_ADMISSION_4', 'SOURCE_OF_ADMISSION_5',
       'SOURCE_OF_ADMISSION_6', 'SOURCE_OF_ADMISSION_8',
       'SOURCE_OF_ADMISSION_9', 'SOURCE_OF_ADMISSION_D', 'PAT_STATE_TX',
       'PAT_STATE_XX', 'PAT_STATE_ZZ', 'SEX_CODE_F', 'SEX_CODE_M',
       'SEX_CODE_U', 'RACE_1', 'RACE_2', 'RACE_3', 'RACE_4', 'RACE_5',
       'ETHNICITY_1', 'ETHNICITY_2', 'ETHNICITY_3', 'ADMIT_WEEKDAY_1',
       'ADMIT_WEEKDAY_2', 'ADMIT_WEEKDAY_3', 'ADMIT_WEEKDAY_4',
       'ADMIT_WEEKDAY_5', 'ADMIT_WEEKDAY_6', 'ADMIT_WEEKDAY_7'],
      dtype='object')

In [62]:
# df_X_train_model.drop(columns=features, inplace=True)

In [63]:
X_train, y_train = df_X_train_model.values, df_y_train.values
X_test, y_test = df_X_test_model.values, df_y_test.values

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier

# classifiers = {
#    "LogisticRegression": LogisticRegression(),
#    "KNearestNeighbors": KNeighborsClassifier(),
#    "SupportVectorClassifier": SVC(),
#    "DecisionTreeClassifier": DecisionTreeClassifier(),
#    }

# experimenting with different classifiers

classifiers = {
    "LogisticRegression": LogisticRegression(),
#     "ExtraTreeClassifier": ExtraTreesClassifier(),
#     "AdaBoostClassifier": AdaBoostClassifier()
}

## 1.4 Model Training

In [68]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    training_score = cross_val_score(clf, X_train, y_train, cv=10, n_jobs=-1,
        scoring=metric)
    
    print("%30s has a training score (%s) of %.2f%% " % (name, metric, 100*training_score.mean()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


            LogisticRegression has a training score (recall_macro) of 45.31% 


In [70]:
for name in classifiers:
    classifiers[name].fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 1.4.1 Feature Importance

In [73]:
model = "LogisticRegression"
clf = classifiers[model]

if model == "AdaBoostClassifier" or model == "ExtraTreeClassifier":

    importances = clf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    
    print("Feature Ranking")
    feature_names = [df_X_train_model.columns[indices[f]] for f in range(importances.shape[0])]
    for f in range(importances.shape[0]):
        print("%2d. feature %2d %20s (%f)" % (f+1, indices[f], feature_names[f], importances[indices[f]]))

IndentationError: expected an indented block (<ipython-input-73-e3ed8b063320>, line 13)

### 1.4.2 Metrics

In [53]:
pd.crosstab(df_pred.true.map(target_labels), df_pred[model].map(target_labels), rownames=['True'], colnames=['Predicted'])

Predicted,long,medium,short
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
long,1572,2844,1607
medium,934,6230,4337
short,605,5244,6617


In [49]:
from sklearn.metrics import classification_report 
print(classification_report(df_pred.true, df_pred[model], target_names=le_target.inverse_transform([0,1,2])))

              precision    recall  f1-score   support

        long       0.51      0.26      0.34      6023
      medium       0.44      0.54      0.48     11501
       short       0.53      0.53      0.53     12466

    accuracy                           0.48     29990
   macro avg       0.49      0.44      0.45     29990
weighted avg       0.49      0.48      0.47     29990



### 1.4.3 Hyper-Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

## unused models
# param_space = {
#     "LogisticRegression": {
#         "solver":["liblinear"],
#         "penalty": ['l1', 'l2'], 
#         "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]},

#     "KNearestNeighbors": {
#         "n_neighbors": range(2,5), 
#         "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]},

#     "SupportVectorClassifier": {
#         "C": [0.5, 0.7, 0.9, 1], 
#         "kernel": ["rbf", "poly", "sigmoid", "linear"]},

#     "DecisionTreeClassifier": {
#         "criterion": ["gini", "entropy"], 
#         "max_depth": range(2,4), 
#         "min_samples_leaf": range(5,7)},
# }

param_space = {
    "LogisticRegression": {
        "solver":["liblinear"],
        "penalty": ['l1', 'l2'], 
        "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    },
    
#     "ExtraTreeClassifier": {
#         "n_estimators":[200, 500],
#         "max_features":["auto", "sqrt", "log2"],
#         "min_samples_split":[2,5,10],
#         "max_depth": [4,5,6,7,8]
#         "criterion":["gini", "entropy"],
#         "bootstrap":[True, False]
#     },
    
#     "AdaBoost": { 
#         "n_estimators": [50, 100],
#         "learning_rate" : [0.01,0.05,0.1,0.3,1],
#         "loss" : ['linear', 'square', 'exponential']
#     }
}
    

best_classifiers = {}

In [None]:
for name, clf in classifiers.items():
        
    param = param_space[name]
    print("\n\nPerforming GridSearchCV on %s..." % name)
    clf = GridSearchCV(classifiers[name], param, cv=5, scoring=metric )

    clf.fit(X_train, y_train)
    best_classifiers[name] = clf

    score = cross_val_score(clf, X_train, y_train, cv=10)
    print(best_classifiers[name])
    print("%s Cross Validation Score (%s): %.2f%%" % (name, metric, 100*score.mean()))
    
    print("Best Params: \n")
    print(best_classifiers[name].best_params_)
    
    # LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
    #               intercept_scaling=1, l1_ratio=None, max_iter=100,
    #               multi_class='auto', n_jobs=None, penalty='l2',
    #               random_state=None, solver='liblinear', tol=0.0001, verbose=0,
    #               warm_start=False)

    # KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
    #                 metric_params=None, n_jobs=None, n_neighbors=4, p=2,
    #                 weights='uniform')
    
    #     SVC(C=0.9, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    #         decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    #         max_iter=-1, probability=False, random_state=None, shrinking=True,
    #         tol=0.001, verbose=False)

    #     DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
    #                            max_depth=3, max_features=None, max_leaf_nodes=None,
    #                            min_impurity_decrease=0.0, min_impurity_split=None,
    #                            min_samples_leaf=5, min_samples_split=2,
    #                            min_weight_fraction_leaf=0.0, presort='deprecated',
    #                            random_state=None, splitter='best')

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score

best_classifier_pred = {}
for name in classifiers:
    pred = cross_val_predict(best_classifiers[name], X_train, y_train, cv=10)
    best_classifier_pred[name] = pred

best_classifier_pred['true'] = y_train
df_best_pred = pd.DataFrame(df_classifier_pred)

## 1.5 Grading (EXAMPLE)

In [16]:
df_grading = pd.read_csv(f"data/grading.csv", dtype=str)
df_grading.shape

(100000, 193)

### 1.5.1. Encoding `df_grading`

In [17]:
clean_categories(df_grading)

Feture TYPE_OF_ADMISSION -> ['2' '1' '3' '4' '5' '9']
Feture SOURCE_OF_ADMISSION -> ['1' '2' '5' '6' 'D' '4' '8' '9' '0']
Feture PAT_STATE -> ['TX' 'ZZ' 'XX']
Feture SEX_CODE -> ['F' 'M' 'U']
Feture RACE -> ['4' '5' '3' '2' '1']
Feture ETHNICITY -> ['1' '2' '3']


In [18]:
df_X_train_model, df_X_test_model = encode_features(df_X_train, df_grading, cat_LabelBinarize)

### 1.5.2. Make Predictions

In [None]:
model = "LogisticRegression"
df_grading_pred['TARGET'] = classifier[model].fit(df_X_train_model, y_train).predict(df_X_test_model)
df_grading_pred['TARGET'] = df_grading_pred['TARGET'].map(target_labels)

### 1.5.3. Save Predictions

In [19]:
df_grading.loc[:, ["RECORD_ID", "TARGET"]].to_csv("df_grading_pred.csv", index=False)

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'

### 1.5.4. Generate Archive

In [20]:
my_lib.make_assignment()

Creating archive: my_assignment.zip
	01-Import.ipynb - OK
	02-EDA.ipynb - OK
	03-Model.ipynb - OK
	my_lib.py - OK
	df_grading_pred.csv - OK
