In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

import glob, os
import my_lib as ml
import json
import pprint
pp = pprint.PrettyPrinter(indent=4)

from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import classification_report, f1_score, make_scorer, accuracy_score

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  

SEED = 42
target = "TARGET"
metric = "recall_macro"

In [2]:
df = pd.read_csv(f"data/df_train_sample_00_of_20.csv", dtype=str)
display(df.head(2))
display(df.shape)

Unnamed: 0,RECORD_ID,DISCHARGE,THCIC_ID,PROVIDER_NAME,TYPE_OF_ADMISSION,SOURCE_OF_ADMISSION,PAT_STATE,PAT_COUNTRY,COUNTY,PUBLIC_HEALTH_REGION,PAT_STATUS,SEX_CODE,RACE,ETHNICITY,ADMIT_WEEKDAY,LENGTH_OF_STAY,PAT_AGE,FIRST_PAYMENT_SRC,TYPE_OF_BILL,TOTAL_CHARGES,TOTAL_NON_COV_CHARGES,TOTAL_CHARGES_ACCOMM,TOTAL_NON_COV_CHARGES_ACCOMM,TOTAL_CHARGES_ANCIL,TOTAL_NON_COV_CHARGES_ANCIL,POA_PROVIDER_INDICATOR,ADMITTING_DIAGNOSIS,PRINC_DIAG_CODE,OTH_DIAG_CODE_1,MS_MDC,MS_DRG,MS_GROUPER_VERSION_NBR,MS_GROUPER_ERROR_CODE,APR_MDC,APR_DRG,RISK_MORTALITY,ILLNESS_SEVERITY,APR_GROUPER_VERSION_NBR,APR_GROUPER_ERROR_CODE,ATTENDING_PHYSICIAN_UNIF_ID,ENCOUNTER_INDICATOR,CERT_STATUS,TARGET
0,320136748870,2013Q3,838400,Memorial Hermann Rehab Hospital Katy,3,4,TX,US,201,6,7,F,4,2,2,1,20,MA,111,1671.0,0.0,1145.0,0.0,526.0,0.0,X,V5789,V5789,1919,23,945,1300,0,23,860,2,3,7300,0,1229763162,1,1,short
1,120130546450,2013Q1,409000,John Peter Smith Hospital,1,1,TX,US,367,3,1,M,5,1,2,2,13,MA,111,53064.01,0.0,4092.0,0.0,48972.01,0.0,M,78650,41401,42822,5,247,1300,0,5,175,2,2,7300,0,1578252829,1,2,short


(49984, 43)

## Encoding the target

In [3]:
if df['TARGET'].dtype!=int:
    le_target = LabelEncoder()
    df['TARGET']= le_target.fit_transform(df['TARGET'])
else:
    print(f'DF["TARGET"] is already of type int.')
    pass

target_labels = {c:le_target.inverse_transform([c])[0] for c in [0,1,2]}
print(target_labels)

{0: 'long', 1: 'medium', 2: 'short'}


## Cleaning of data and importing cleaned feature labels

In [4]:
# open feature label file
feature_labels = {}

with open("data/feature_labels.json") as f:
  feature_labels = json.load(f)

ml.clean_data(df)

TYPE_OF_ADMISSION: -> ['3' '1' '2' '4' '5' '9']
SOURCE_OF_ADMISSION: -> ['4' '1' '2' '8' '5' '9' '6' 'D']
PAT_STATE: -> ['TX' 'XX' 'ZZ']
SEX_CODE: -> ['F' 'M' 'U']
RACE: -> ['4' '5' '3' '2' '1']
ETHNICITY: -> ['2' '1' '3']
PAT_AGE: -> ['5' '4' '2' '3' '1']
PAT_COUNTRY: -> ['US' 'MX' 'XX']


## Splitting dataset in Test and Train

In [5]:
x = df.drop(target, axis=1)
y = df.TARGET

df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(x,y, test_size = 0.4, stratify = y, random_state = SEED)

## Encoding Categorical Features

In [6]:
target_features = [ "SOURCE_OF_ADMISSION", "PAT_STATE", "SEX_CODE", "RACE", "ETHNICITY", "ADMIT_WEEKDAY", "PAT_AGE", "PAT_COUNTRY"]

def encode_features(df_x_train, df_x_test, target_features, debug=False):
    # create dataframes to populate
    dfx_train_model = df_x_train.loc[:,[]]
    dfx_test_model = df_x_test.loc[:,[]]

    # encoding features using LabelBinarizer
    for feature in target_features:

        lb = LabelBinarizer()
        lb_result = lb.fit_transform(df_x_train[feature].astype("str"))
        names = [f'{feature}_{l}' for l in lb.classes_]

        for k, name in enumerate(names):
            dfx_train_model[name] = lb_result[:,k]
#             print(name)

        lb_result = lb.transform(df_x_test[feature].astype("str"))
        names = [f'{feature}_{l}' for l in lb.classes_]
        for k, name in enumerate(names):
            dfx_test_model[name] = lb_result[:,k]

    return dfx_train_model, dfx_test_model

dfx_train_model, dfx_test_model = encode_features(df_x_train, df_x_test, target_features)

## Clasiffication Models

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier#GradientTreeBoosting

classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=500),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "ExtraTreesClassifier": ExtraTreesClassifier()
}

In [8]:
# convert from dataframes into numrical rrays
x_train, y_train = dfx_train_model.values, df_y_train.values
x_test, y_test = dfx_test_model.values, df_y_test.values

## Model Training

In [9]:
# looping over the classifiers and getting the model scores
for key, classifier in classifiers.items():
    classifier.fit(x_train, y_train)
    training_score = cross_val_score(classifier, x_train, y_train, cv=10)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

Classifiers:  LogisticRegression Has a training score of 51.0 % accuracy score
Classifiers:  KNeighborsClassifier Has a training score of 45.0 % accuracy score
Classifiers:  DecisionTreeClassifier Has a training score of 49.0 % accuracy score
Classifiers:  AdaBoostClassifier Has a training score of 51.0 % accuracy score
Classifiers:  RandomForestClassifier Has a training score of 49.0 % accuracy score
Classifiers:  ExtraTreesClassifier Has a training score of 50.0 % accuracy score


### New Classifiers

In [10]:
# from the scores above I have picked the three highest scores to further examine

new_classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=500),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "ExtraTreesClassifier": ExtraTreesClassifier()
}

In [11]:
# visual examination of the predicted results for each classifiers comapared to the real values
classifier_predictions = {}

for name, clf in new_classifiers.items():
    pred = cross_val_predict(clf, x_train, y_train, cv=10)
    classifier_predictions[name] = pred
    
classifier_predictions["true"] = y_train
df_pred = pd.DataFrame(classifier_predictions)
df_pred.head(5)

Unnamed: 0,LogisticRegression,AdaBoostClassifier,ExtraTreesClassifier,true
0,1,1,0,0
1,2,2,2,2
2,2,2,2,2
3,0,0,1,0
4,1,1,1,2


## Feature Importance

In [12]:
for model in new_classifiers:
    new_classifiers[model].fit(x_train, y_train)

for model in new_classifiers:
    clf = new_classifiers["ExtraTreesClassifier"]
    importance = clf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    indices = np.argsort(importance)[::-1]

In [13]:
if False:
    print("Feature Ranking")
    feature_names = [dfx_train_model.columns[indices[f]] for f in range(importance.shape[0])]
    for f in range(importance.shape[0]):
        print("%2d. feature %2d %20s (%f)" % (f+1, indices[f], feature_names[f], importance[indices[f]]))

## Metrics

In [14]:
for model in new_classifiers:
    print(model)
    display(pd.crosstab(df_pred.true.map(target_labels), df_pred[model].map(target_labels), rownames=['True'], colnames=['Predicted']))

LogisticRegression


Predicted,long,medium,short
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
long,1445,3193,1385
medium,657,6549,4295
short,424,4787,7255


AdaBoostClassifier


Predicted,long,medium,short
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
long,1448,3202,1373
medium,666,6597,4238
short,430,4843,7193


ExtraTreesClassifier


Predicted,long,medium,short
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
long,1558,3116,1349
medium,967,6527,4007
short,640,5058,6768


## Hyperparameter Tuning

In [15]:
from sklearn.model_selection import GridSearchCV

best_classifiers = {}

param_space = {
    "LogisticRegression": {
        "solver":["liblinear"],
        "penalty": ['l1', 'l2'],
        "C": [0.01, 0.1, 10, 100, 1000]
#         "max_iter": [100, 300, 500]
    },
    "AdaBoostClassifier": {
        "algorithm": ["SAMME", "SAMME.R"],
        "n_estimators": [10, 30, 50, 80]    
    },
#     "RandomForestClassifier": {
#         "min_samples_leaf": range(3, 6),
#         "max_depth": range(2,4), 
#         "criterion": ["gini", "entropy"]  
#     },
    "ExtraTreesClassifier": {
        "min_samples_leaf": range(3, 6),
        "max_depth": range(2,4), 
        "criterion": ["gini", "entropy"]
    }
} 

### Logistic Regression

In [16]:
name = "LogisticRegression"
param = param_space[name]
print("\n\nPerforming GridSearchCV on %s..." % name)
clf = GridSearchCV(classifiers[name], param, cv=5)

clf.fit(x_train, y_train)
best_classifiers[name] = clf

score = cross_val_score(clf, x_train, y_train, cv=5)
print(best_classifiers[name])
print("%s Cross Validation Score (%s): %.2f%%" % (name, metric, 100*score.mean()))



Performing GridSearchCV on LogisticRegression...
GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=500, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 10, 100, 1000],
                         'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
LogisticRegression Cross Validation Score (recall_mac

In [17]:
print(f'Best parameters for Logistic Regression -> {best_classifiers[name].best_params_}')

Best parameters for Logistic Regression -> {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}


### Ada Boost Classifier

In [18]:
name = "AdaBoostClassifier"
param = param_space[name]
print("\n\nPerforming GridSearchCV on %s..." % name)
clf = GridSearchCV(classifiers[name], param, cv=5)

clf.fit(x_train, y_train)
best_classifiers[name] = clf

score = cross_val_score(clf, x_train, y_train, cv=5)
print(best_classifiers[name])
print("%s Cross Validation Score (%s): %.2f%%" % (name, metric, 100*score.mean()))



Performing GridSearchCV on AdaBoostClassifier...
GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='deprecated', n_jobs=None,
             param_grid={'algorithm': ['SAMME', 'SAMME.R'],
                         'n_estimators': [10, 30, 50, 80]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
AdaBoostClassifier Cross Validation Score (recall_macro): 50.65%


In [19]:
print(f'Best parameters for Ada Boost Classifier -> {best_classifiers[name].best_params_}')

Best parameters for Ada Boost Classifier -> {'algorithm': 'SAMME.R', 'n_estimators': 80}


### Extra Trees Classifier

In [20]:
name = "ExtraTreesClassifier"
param = param_space[name]
print("\n\nPerforming GridSearchCV on %s..." % name)
clf = GridSearchCV(classifiers[name], param, cv=5)

clf.fit(x_train, y_train)
best_classifiers[name] = clf

score = cross_val_score(clf, x_train, y_train, cv=5)
print(best_classifiers[name])
print("%s Cross Validation Score (%s): %.2f%%" % (name, metric, 100*score.mean()))



Performing GridSearchCV on ExtraTreesClassifier...
GridSearchCV(cv=5, error_score=nan,
             estimator=ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                            class_weight=None, criterion='gini',
                                            max_depth=None, max_features='auto',
                                            max_leaf_nodes=None,
                                            max_samples=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=100, n_jobs=None,
                                            oob_score=False, random_state=None,
                                            verb

In [21]:
print(f'Best parameters for Extra Trees Classifier -> {best_classifiers[name].best_params_}')

Best parameters for Extra Trees Classifier -> {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 4}


## Grading

In [22]:
df_grading = pd.read_csv(f"data/grading.csv", dtype=str)
df_grading.shape

ml.clean_data(df_grading)

TYPE_OF_ADMISSION: -> ['2' '1' '3' '4' '5' '9']
SOURCE_OF_ADMISSION: -> ['1' '2' '5' '6' 'D' '4' '8' '9' '0']
PAT_STATE: -> ['TX' 'XX' 'ZZ']
SEX_CODE: -> ['F' 'M' 'U']
RACE: -> ['4' '5' '3' '2' '1']
ETHNICITY: -> ['1' '2' '3']
PAT_AGE: -> ['3' '4' '5' '1' '2']
PAT_COUNTRY: -> ['US' 'MX' 'XX']


In [23]:
dfx_train_model, dfx_test_model = encode_features(df_x_train, df_grading, target_features)

In [24]:
# Ill be using Logistic Refression since it gave the best score 
model = "LogisticRegression"
df_grading['TARGET'] = new_classifiers[model].fit(dfx_train_model, y_train).predict(dfx_test_model)
df_grading['TARGET'] = df_grading['TARGET'].map(target_labels)

In [25]:
model = LogisticRegression(C=100, penalty= 'l1', solver= 'liblinear')
x_train, y_train = dfx_train_model.values, df_y_train.values

In [26]:
c = model.fit(dfx_train_model, y_train)
training_score = cross_val_score(classifier, x_train, y_train, cv=10)
print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

Classifiers:  ExtraTreesClassifier Has a training score of 50.0 % accuracy score


In [27]:
display(df_grading.loc[:,["RECORD_ID", "TARGET"]].head())

path = "output/df_grading_pred.csv"
if not os.path.isfile(path):
    print (f'Saving df_grading_pred.csv....')
    df_grading.loc[:,["RECORD_ID", "TARGET"]].to_csv("output/df_grading_pred.csv", index=False)
else:
    print(f'The file {path} already exists.')


Unnamed: 0,RECORD_ID,TARGET
0,420132203333,short
1,220130397490,short
2,120137915430,medium
3,420132272963,short
4,120133877370,medium


The file output/df_grading_pred.csv already exists.


In [28]:
ml.make_assignment()

Creating archive: my_assignment.zip
	01-Import.ipynb - OK
	02-EDA.ipynb - OK
	03-Model.ipynb - OK
	my_lib.py - OK
	df_grading_pred.csv - Skipped


In [29]:
!say "That is now done"