# Texas Dataset Model
- gather data 
- clean data
- feature engineering
- define model
- training, testing and predict

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

import glob, os
import my_lib as ml
import json
import pprint
pp = pprint.PrettyPrinter(indent=4)

from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import classification_report, f1_score, make_scorer, accuracy_score

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  

SEED = 42
target = "TARGET"
metric = "recall_macro"

## Merging Facility to our DataFrame

In [2]:
df_raw = pd.read_csv(f"data/df_train_sample_00_of_20.csv", dtype=str)
df_facility = pd.read_csv("data/facility.csv", dtype = str)
df_grading_raw = pd.read_csv(f"data/grading.csv", dtype=str)

print(df_facility.shape)
print(df_raw.shape)
print(df_grading_raw.shape)

(597, 10)
(49984, 43)
(100000, 41)


In [3]:
df = pd.merge(df_raw, df_facility, on="THCIC_ID", how="left")
display(df.shape)

# replace empty values
df_facility_columns = []
for i in range(2, len(df_facility.columns) - 1):
    col = df_facility.columns[i]
    df[col].fillna(0, inplace=True)
    
    df_facility_columns.append(col)

(49984, 52)

## Encoding the target

In [4]:
if df['TARGET'].dtype!=int:
    le_target = LabelEncoder()
    df['TARGET']= le_target.fit_transform(df['TARGET'])
else:
    print(f'DF["TARGET"] is already of type int.')
    pass

target_labels = {c:le_target.inverse_transform([c])[0] for c in [0,1,2]}
print(target_labels)

{0: 'long', 1: 'medium', 2: 'short'}


## Cleaning of data

In [5]:
ml.clean_data(df)

TYPE_OF_ADMISSION: -> ['3' '1' '2' '4' '5' '9']
SOURCE_OF_ADMISSION: -> ['4' '1' '2' '8' '5' '9' '6' 'D']
PAT_STATE: -> ['TX' 'XX' 'ZZ']
SEX_CODE: -> ['F' 'M' 'U']
RACE: -> ['4' '5' '3' '2' '1']
ETHNICITY: -> ['2' '1' '3']
PAT_AGE: -> ['5' '4' '2' '3' '1']
PAT_COUNTRY: -> ['US' 'MX' 'XX']
POA_PROVIDER_INDICATOR: -> ['X' 'M' 'R']
ILLNESS_SEVERITY: -> ['3' '2' '1' '4']
RISK_MORTALITY: -> ['2' '1' '3' '4']


## Feature Engineering
- bucket "PAT_STATE" into people living in texas and people who doesnt live in texas.
- bucket the "ADMIT_WEEKDAY" into weekday or weekend.

In [6]:
def feature_engineering(df_input):
    feature = "PAT_STATE"
    df_input["NON_TEXAS"] = df_input[feature] != "TX"
    df_input["FROM_TEXAS"] = df_input[feature] == "TX"

    feature = "ADMIT_WEEKDAY"
    df_input["WEEK_DAY"] = ((df_input[feature] != "6") & (df_input[feature] != "7"))
    df_input["WEEK_END"] = (df_input[feature] == "6") | (df_input[feature] == "7")
    
feature_engineering(df)

## Splitting dataset in Test and Train

In [7]:
def split_train_test(input_df):
    x = input_df.drop(target, axis=1)
    y = input_df.TARGET

    df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(x, y, test_size = 0.4, stratify = y, random_state = SEED)
    
    return df_X_train, df_X_test, df_y_train, df_y_test

df_x_train, df_x_test, df_y_train, df_y_test = split_train_test(df)

## Encoding Categorical Features

In [8]:
target_features = ["SOURCE_OF_ADMISSION", "PAT_STATE", "SEX_CODE", "RACE", "ETHNICITY", "ADMIT_WEEKDAY", "PAT_AGE", "PAT_COUNTRY", "POA_PROVIDER_INDICATOR", "ILLNESS_SEVERITY", "RISK_MORTALITY"]
engineered_features = ["NON_TEXAS", "FROM_TEXAS", "WEEK_DAY", "WEEK_END"]

def encode_features(df_x_train, df_x_test, target_features, engineered_features, df_facility_columns,  debug=False):
    # create dataframes to populate
    dfx_train_model = df_x_train.loc[:,[]]
    dfx_test_model = df_x_test.loc[:,[]]

    # encoding features using LabelBinarizer
    for feature in target_features:

        lb = LabelBinarizer()
        lb_result = lb.fit_transform(df_x_train[feature].astype("str"))
        names = [f'{feature}_{l}' for l in lb.classes_]

        for k, name in enumerate(names):
            dfx_train_model[name] = lb_result[:,k]
#             print(name)

        lb_result = lb.transform(df_x_test[feature].astype("str"))
        names = [f'{feature}_{l}' for l in lb.classes_]
        for k, name in enumerate(names):
            dfx_test_model[name] = lb_result[:,k]
            
    for feature in engineered_features:
        dfx_train_model[feature] = df_x_train[feature].astype(int)
        dfx_test_model[feature] = df_x_test[feature].astype(int)
        
    for feature in df_facility_columns:
        dfx_train_model[feature] = df_x_train[feature].astype(int)
        dfx_test_model[feature] = df_x_test[feature].astype(int)  

    return dfx_train_model, dfx_test_model

dfx_train_model, dfx_test_model = encode_features(df_x_train, df_x_test, target_features, engineered_features, df_facility_columns)
print(len(dfx_train_model.columns))
print(len(dfx_test_model.columns))

59
59


## Clasiffication Models

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier#GradientTreeBoosting

classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=500),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "ExtraTreesClassifier": ExtraTreesClassifier()
}

In [10]:
# convert from dataframes into numrical rrays
x_train, y_train = dfx_train_model.values, df_y_train.values
x_test, y_test = dfx_test_model.values, df_y_test.values

## Model Training

In [11]:
# looping over the classifiers and getting the model scores
for key, classifier in classifiers.items():
    classifier.fit(x_train, y_train)
    training_score = cross_val_score(classifier, x_train, y_train, cv=10)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

Classifiers:  LogisticRegression Has a training score of 56.99999999999999 % accuracy score
Classifiers:  KNeighborsClassifier Has a training score of 52.0 % accuracy score
Classifiers:  DecisionTreeClassifier Has a training score of 48.0 % accuracy score
Classifiers:  AdaBoostClassifier Has a training score of 56.99999999999999 % accuracy score
Classifiers:  RandomForestClassifier Has a training score of 53.0 % accuracy score
Classifiers:  ExtraTreesClassifier Has a training score of 51.0 % accuracy score


### New Classifiers

In [12]:
# from the scores above I have picked the three highest scores to further examine
new_classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=500),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "ExtraTreesClassifier": ExtraTreesClassifier()
}

In [13]:
# visual examination of the predicted results for each classifiers comapared to the real values
classifier_predictions = {}

for name, clf in new_classifiers.items():
    pred = cross_val_predict(clf, x_train, y_train, cv=10)
    classifier_predictions[name] = pred
    
classifier_predictions["true"] = y_train
df_pred = pd.DataFrame(classifier_predictions)
df_pred.head(5)

Unnamed: 0,LogisticRegression,AdaBoostClassifier,ExtraTreesClassifier,true
0,0,0,0,0
1,2,2,2,2
2,2,1,2,2
3,0,0,1,0
4,1,1,1,2


## Feature Importance

In [14]:
for model in new_classifiers:
    new_classifiers[model].fit(x_train, y_train)

for model in new_classifiers:
    clf = new_classifiers["ExtraTreesClassifier"]
    importance = clf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    indices = np.argsort(importance)[::-1]

In [15]:
low_importance = []

print("Feature Ranking\n")
feature_names = [dfx_train_model.columns[indices[f]] for f in range(importance.shape[0])]
for f in range(importance.shape[0]):
    if importance[indices[f]] < 0.004:
        low_importance.append(feature_names[f])
        print("Feature: {0}, Importance: {1}".format(feature_names[f], importance[indices[f]]))


Feature Ranking

Feature: SOURCE_OF_ADMISSION_6, Importance: 0.0038688260887348672
Feature: SOURCE_OF_ADMISSION_8, Importance: 0.003684274305898153
Feature: SOURCE_OF_ADMISSION_D, Importance: 0.0035293733690079407
Feature: NON_TEXAS, Importance: 0.0031307187986432767
Feature: FROM_TEXAS, Importance: 0.0031091129327367535
Feature: PAT_STATE_TX, Importance: 0.003074277444898382
Feature: PAT_STATE_XX, Importance: 0.0029428624401178984
Feature: SOURCE_OF_ADMISSION_9, Importance: 0.0022711171983333476
Feature: RACE_1, Importance: 0.0016596799878966855
Feature: PAT_STATE_ZZ, Importance: 0.001579724345857235
Feature: PAT_COUNTRY_US, Importance: 0.0014731819721763922
Feature: PAT_COUNTRY_XX, Importance: 0.0012446524716550119
Feature: FAC_OTHER_LTC_IND, Importance: 0.0009691704648162969
Feature: PAT_COUNTRY_MX, Importance: 0.00039721890177046827


In [16]:
# drop unimportant feature
dfx_train_model.drop(columns=low_importance, inplace=True)
dfx_test_model.drop(columns=low_importance, inplace=True)
df_y_train.drop(columns=low_importance, inplace=True)
df_y_test.drop(columns=low_importance, inplace=True)

print(len(dfx_train_model.columns))
print(len(dfx_test_model.columns))

45
45


In [17]:
# convert from dataframes into numrical rrays
x_train, y_train = dfx_train_model.values, df_y_train.values
x_test, y_test = dfx_test_model.values, df_y_test.values

In [18]:
# looping over the classifiers and getting the model scores
for key, classifier in classifiers.items():
    classifier.fit(x_train, y_train)
    training_score = cross_val_score(classifier, x_train, y_train, cv=10)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

Classifiers:  LogisticRegression Has a training score of 56.99999999999999 % accuracy score
Classifiers:  KNeighborsClassifier Has a training score of 52.0 % accuracy score
Classifiers:  DecisionTreeClassifier Has a training score of 49.0 % accuracy score
Classifiers:  AdaBoostClassifier Has a training score of 56.99999999999999 % accuracy score
Classifiers:  RandomForestClassifier Has a training score of 53.0 % accuracy score
Classifiers:  ExtraTreesClassifier Has a training score of 51.0 % accuracy score


## Metrics

In [19]:
for model in new_classifiers:
    print(model)
    display(pd.crosstab(df_pred.true.map(target_labels), df_pred[model].map(target_labels), rownames=['True'], colnames=['Predicted']))

LogisticRegression


Predicted,long,medium,short
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
long,2745,2628,650
medium,1129,6188,4184
short,470,3744,8252


AdaBoostClassifier


Predicted,long,medium,short
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
long,2809,2560,654
medium,1227,6254,4020
short,541,3831,8094


ExtraTreesClassifier


Predicted,long,medium,short
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
long,2975,2134,914
medium,2036,5387,4078
short,1042,4359,7065


## Hyperparameter Tuning

In [20]:
from sklearn.model_selection import GridSearchCV

best_classifiers = {}

param_space = {
    "LogisticRegression": {
        "solver":["liblinear"],
        "penalty": ['l1', 'l2'],
        "C": [0.01, 0.1, 10, 100, 1000]
#         "max_iter": [100, 300, 500]
    },
    "AdaBoostClassifier": {
        "algorithm": ["SAMME", "SAMME.R"],
        "n_estimators": [10, 30, 50, 80]    
    },
#     "RandomForestClassifier": {
#         "min_samples_leaf": range(3, 6),
#         "max_depth": range(2,4), 
#         "criterion": ["gini", "entropy"]  
#     },
    "ExtraTreesClassifier": {
        "min_samples_leaf": range(3, 6),
        "max_depth": range(2,4), 
        "criterion": ["gini", "entropy"]
    }
} 

### Logistic Regression

In [21]:
name = "LogisticRegression"
param = param_space[name]
print("\n\nPerforming GridSearchCV on %s..." % name)
clf = GridSearchCV(classifiers[name], param, cv=5)

clf.fit(x_train, y_train)
best_classifiers[name] = clf

score = cross_val_score(clf, x_train, y_train, cv=5)
print(best_classifiers[name])
print("%s Cross Validation Score (%s): %.2f%%" % (name, metric, 100*score.mean()))



Performing GridSearchCV on LogisticRegression...
GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=500, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 10, 100, 1000],
                         'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
LogisticRegression Cross Validation Score (recall_mac

In [22]:
print(f'Best parameters for Logistic Regression -> {best_classifiers[name].best_params_}')

Best parameters for Logistic Regression -> {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}


### Ada Boost Classifier

In [23]:
name = "AdaBoostClassifier"
param = param_space[name]
print("\n\nPerforming GridSearchCV on %s..." % name)
clf = GridSearchCV(classifiers[name], param, cv=5)

clf.fit(x_train, y_train)
best_classifiers[name] = clf

score = cross_val_score(clf, x_train, y_train, cv=5)
print(best_classifiers[name])
print("%s Cross Validation Score (%s): %.2f%%" % (name, metric, 100*score.mean()))



Performing GridSearchCV on AdaBoostClassifier...
GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='deprecated', n_jobs=None,
             param_grid={'algorithm': ['SAMME', 'SAMME.R'],
                         'n_estimators': [10, 30, 50, 80]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
AdaBoostClassifier Cross Validation Score (recall_macro): 57.19%


In [24]:
print(f'Best parameters for Ada Boost Classifier -> {best_classifiers[name].best_params_}')

Best parameters for Ada Boost Classifier -> {'algorithm': 'SAMME.R', 'n_estimators': 80}


### Extra Trees Classifier

In [25]:
name = "ExtraTreesClassifier"
param = param_space[name]
print("\n\nPerforming GridSearchCV on %s..." % name)
clf = GridSearchCV(classifiers[name], param, cv=5)

clf.fit(x_train, y_train)
best_classifiers[name] = clf

score = cross_val_score(clf, x_train, y_train, cv=5)
print(best_classifiers[name])
print("%s Cross Validation Score (%s): %.2f%%" % (name, metric, 100*score.mean()))



Performing GridSearchCV on ExtraTreesClassifier...
GridSearchCV(cv=5, error_score=nan,
             estimator=ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                            class_weight=None, criterion='gini',
                                            max_depth=None, max_features='auto',
                                            max_leaf_nodes=None,
                                            max_samples=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=100, n_jobs=None,
                                            oob_score=False, random_state=None,
                                            verb

In [26]:
print(f'Best parameters for Extra Trees Classifier -> {best_classifiers[name].best_params_}')

Best parameters for Extra Trees Classifier -> {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 3}


## Grading

In [27]:
df_grading = pd.merge(df_grading_raw, df_facility, on="THCIC_ID", how="left")

# replace empty values
for i in range(2, len(df_facility.columns) - 1):
    col = df_facility.columns[i]
    df_grading[col].fillna(0, inplace=True)

In [28]:
# Clean grading data
ml.clean_data(df_grading)

TYPE_OF_ADMISSION: -> ['2' '1' '3' '4' '5' '9']
SOURCE_OF_ADMISSION: -> ['1' '2' '5' '6' 'D' '4' '8' '9' '0']
PAT_STATE: -> ['TX' 'XX' 'ZZ']
SEX_CODE: -> ['F' 'M' 'U']
RACE: -> ['4' '5' '3' '2' '1']
ETHNICITY: -> ['1' '2' '3']
PAT_AGE: -> ['3' '4' '5' '1' '2']
PAT_COUNTRY: -> ['US' 'MX' 'XX']
POA_PROVIDER_INDICATOR: -> ['M' 'X' 'R']
ILLNESS_SEVERITY: -> ['2' '1' '3' '4']
RISK_MORTALITY: -> ['1' '2' '3' '4']


In [29]:
# Feature Engineering
feature_engineering(df_grading)

In [31]:
# # split df_grading
# df_x_train, df_x_test, df_y_train, df_y_test = split_train_test(df_grading)

In [32]:
# encode features
dfx_train_model, dfx_test_model = encode_features(df_x_train, df_grading, target_features, engineered_features, df_facility_columns)
print(len(dfx_train_model.columns))
print(len(dfx_test_model.columns))

59
59


In [33]:
# final classifier
final_clf = AdaBoostClassifier(algorithm='SAMME.R', n_estimators=80)

# make our predictions
df_grading['TARGET'] = final_clf.fit(dfx_train_model, df_y_train).predict(dfx_test_model)
df_grading['TARGET'] = df_grading['TARGET'].map(target_labels)

In [35]:
# Save our prediction
df_grading.loc[:, ["RECORD_ID", "TARGET"]].to_csv("df_grading_pred.csv", index=False)

In [36]:
ml.make_assignment()

Creating archive: my_assignment.zip
	01-Import.ipynb - OK
	02-EDA.ipynb - OK
	03-Model.ipynb - OK
	my_lib.py - OK
	df_grading_pred.csv - OK


In [None]:
!say "That is now done"