In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

import glob, os
import my_lib as ml
import pprint
pp = pprint.PrettyPrinter(indent=4)

#Basic Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
# Show every column when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

# Notebook-wide settings.
SEED = 42            # random_state passed to the train/test split
target = "TARGET"    # name of the label column
feature_labels = {}
relevant_features = [
    "TYPE_OF_ADMISSION",
    "SOURCE_OF_ADMISSION",
    "PAT_STATE",
    "SEX_CODE",
    "RACE",
    "ETHNICITY",
    "ADMIT_WEEKDAY",
    "PAT_AGE",
]

In [12]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Load the autoreload extension and set it to mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [19]:
# Read the full extract with every column parsed as a string (dtype=str).
# A smaller file, data/df_train_sample_00_of_20.csv, was previously loaded here instead.
df = pd.read_csv("data/df.csv", dtype=str)

# Preview the first two rows, then the (rows, columns) shape.
display(df.head(2), df.shape)

Unnamed: 0,RECORD_ID,DISCHARGE,THCIC_ID,PROVIDER_NAME,TYPE_OF_ADMISSION,SOURCE_OF_ADMISSION,PAT_STATE,PAT_COUNTRY,COUNTY,PUBLIC_HEALTH_REGION,PAT_STATUS,SEX_CODE,RACE,ETHNICITY,ADMIT_WEEKDAY,LENGTH_OF_STAY,PAT_AGE,FIRST_PAYMENT_SRC,TYPE_OF_BILL,TOTAL_CHARGES,TOTAL_NON_COV_CHARGES,TOTAL_CHARGES_ACCOMM,TOTAL_NON_COV_CHARGES_ACCOMM,TOTAL_CHARGES_ANCIL,TOTAL_NON_COV_CHARGES_ANCIL,POA_PROVIDER_INDICATOR,ADMITTING_DIAGNOSIS,PRINC_DIAG_CODE,OTH_DIAG_CODE_1,MS_MDC,MS_DRG,MS_GROUPER_VERSION_NBR,MS_GROUPER_ERROR_CODE,APR_MDC,APR_DRG,RISK_MORTALITY,ILLNESS_SEVERITY,APR_GROUPER_VERSION_NBR,APR_GROUPER_ERROR_CODE,ATTENDING_PHYSICIAN_UNIF_ID,ENCOUNTER_INDICATOR,CERT_STATUS,TARGET
0,320136748870,2013Q3,838400,Memorial Hermann Rehab Hospital Katy,3,4,TX,US,201,6,7,F,4,2,2,1,20,MA,111,1671.0,0.0,1145.0,0.0,526.0,0.0,X,V5789,V5789,1919,23,945,1300,0,23,860,2,3,7300,0,1229763162,1,1,2
1,120130546450,2013Q1,409000,John Peter Smith Hospital,1,1,TX,US,367,3,1,M,5,1,2,2,13,MA,111,53064.01,0.0,4092.0,0.0,48972.01,0.0,M,78650,41401,42822,5,247,1300,0,5,175,2,2,7300,0,1578252829,1,2,2


(49984, 43)

## Splitting the dataset into train and test sets

In [20]:
# Separate the label column from the remaining columns.
y = df[target]
x = df.drop(columns=[target])

# Hold out 40% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both parts, and SEED makes the split repeatable.
df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    stratify=y,
    random_state=SEED,
)

## Encoding Categorical Features

In [21]:
# Categorical columns to one-hot encode ("PAT_AGE" is currently commented out of the list).
target_features = ["SOURCE_OF_ADMISSION", "PAT_STATE", "SEX_CODE", "RACE", "ETHNICITY", "ADMIT_WEEKDAY"]  # "PAT_AGE"]


def _binarize_feature(binarizer, values, feature):
    """Encode one categorical column with an already-fitted LabelBinarizer.

    Parameters
    ----------
    binarizer : LabelBinarizer
        Binarizer that has already been fitted on the training values of
        ``feature``.
    values : pandas.Series
        Column to encode; it is cast to ``str`` before transforming.
    feature : str
        Column name, used as the prefix of the generated column names.

    Returns
    -------
    pandas.DataFrame
        One indicator column per entry of ``binarizer.classes_``, named
        ``f"{feature}_{label}"`` and indexed like ``values``.
    """
    encoded = binarizer.transform(values.astype("str"))
    if len(binarizer.classes_) == 2 and encoded.shape[1] == 1:
        # For exactly two classes LabelBinarizer returns a single column that
        # flags the second class. Expand it to one column per class so the
        # column count always matches the generated names.
        encoded = np.hstack([1 - encoded, encoded])
    names = [f'{feature}_{l}' for l in binarizer.classes_]
    return pd.DataFrame(encoded, index=values.index, columns=names)


train_parts = []
test_parts = []
for feature in target_features:
    # Fit on the training split only, then reuse the same classes for the test split.
    lb = LabelBinarizer()
    lb.fit(df_x_train[feature].astype("str"))
    train_parts.append(_binarize_feature(lb, df_x_train[feature], feature))
    test_parts.append(_binarize_feature(lb, df_x_test[feature], feature))

# Start from column-less frames so the original row indexes are kept even if
# no feature is listed.
dfx_train_model = pd.concat([df_x_train.loc[:, []], *train_parts], axis=1)
dfx_test_model = pd.concat([df_x_test.loc[:, []], *test_parts], axis=1)

In [29]:
# List the generated indicator columns. The bare expression is shown as the
# cell output, so no mid-notebook import (which would rebind `pp`) is needed.
dfx_train_model.columns

Index(['SOURCE_OF_ADMISSION_1', 'SOURCE_OF_ADMISSION_2',
       'SOURCE_OF_ADMISSION_4', 'SOURCE_OF_ADMISSION_5',
       'SOURCE_OF_ADMISSION_6', 'SOURCE_OF_ADMISSION_8',
       'SOURCE_OF_ADMISSION_9', 'SOURCE_OF_ADMISSION_D', 'PAT_STATE_TX',
       'PAT_STATE_XX', 'PAT_STATE_ZZ', 'SEX_CODE_F', 'SEX_CODE_M',
       'SEX_CODE_U', 'RACE_1', 'RACE_2', 'RACE_3', 'RACE_4', 'RACE_5',
       'ETHNICITY_1', 'ETHNICITY_2', 'ETHNICITY_3', 'ADMIT_WEEKDAY_1',
       'ADMIT_WEEKDAY_2', 'ADMIT_WEEKDAY_3', 'ADMIT_WEEKDAY_4',
       'ADMIT_WEEKDAY_5', 'ADMIT_WEEKDAY_6', 'ADMIT_WEEKDAY_7'],
      dtype='object')


In [35]:
# Candidate classifiers to compare on the encoded dataset.
# The decision tree is seeded with SEED so repeated runs give the same tree.
classifiers = {
    "LogisiticRegression":LogisticRegression(),
    "KNearest":KNeighborsClassifier(),
    "Decision Tree Classifier":DecisionTreeClassifier(random_state=SEED)
}

In [36]:
# Convert the encoded frames and the label series to plain NumPy arrays.
x_train = dfx_train_model.to_numpy()
y_train = df_y_train.to_numpy()
x_test = dfx_test_model.to_numpy()
y_test = df_y_test.to_numpy()

In [37]:
# Score every classifier with 10-fold cross-validation on the training split.
for classifier in classifiers.values():
    # Fit on the full training split so the estimator is ready to use in later
    # cells; cross_val_score works on its own copies of the estimator.
    classifier.fit(x_train, y_train)
    training_score = cross_val_score(classifier, x_train, y_train, cv=10)
    # Scale to a percentage before rounding: rounding first and multiplying
    # afterwards can print artefacts such as 56.99999999999999.
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean() * 100, 2), "% accuracy score")

Classifiers:  LogisticRegression Has a training score of 48.0 % accuracy score
Classifiers:  KNeighborsClassifier Has a training score of 44.0 % accuracy score
Classifiers:  DecisionTreeClassifier Has a training score of 48.0 % accuracy score


In [34]:
# Shell escape: runs the external `say` command with the quoted text.
# NOTE(review): presumably a spoken end-of-run cue; it only works where a `say`
# executable is installed, so it is not portable — confirm whether to keep it.
!say "sex"