In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the emergency-episode extract into a DataFrame. Two columns are forced
# to string dtype instead of letting pandas infer a type for them.
# NOTE(review): absolute, user-specific path - confirm before sharing/re-running elsewhere.
dataset = pd.read_csv(
    '/users/he145100/OneDrive - WA Health/Test Data/vw_emergency_episode_50k.csv',
    dtype={
        'standard_australian_classification_of_countries_2011_for_country_of_birth': 'str',
        'ambulance_number': 'str',
    },
)

In [4]:
# Display every column name of the loaded frame as a plain Python list.
dataset.columns.tolist()

['establishment_code',
 'sex',
 'ethnicity',
 'marital_status',
 'interpreter_required',
 'employment_status',
 'occupation',
 'triage_category',
 'visit_type',
 'mode_of_arrival',
 'referral_source',
 'primary_diagnosis',
 'presenting_complaint',
 'major_diagnostic_category',
 'human_intent_of_injury',
 'feeder_system',
 'external_cause_of_injury',
 'state',
 'claim_type',
 'treating_doctor_type',
 'senior_doctor_type',
 'ambulance_number',
 'statistical_area_2_2016',
 'payment_classification',
 'aboriginality',
 'primary_diagnosis_ICD10AM_chapter',
 'principal_diagnosis_system_code_EDIS',
 'additional_diagnosis_system_code_EDIS',
 'standard_australian_classification_of_countries_2011_for_country_of_birth',
 'australian_postcode',
 'metropolitan_hospital_flag',
 'rural_hospital_flag',
 'local_health_network',
 'attendance_length_of_episode',
 'attendance',
 'attendance_with_length_of_episode_less_than_or_equal_to_4_hours',
 'attendance_with_length_of_episode_less_than_or_equal_to_4_ho

In [5]:
# Show (row count, column count) of the loaded frame.
dataset.shape

(50000, 68)

In [6]:
# Hold back 5% of the rows as "unseen" data for a final check:
# draw a reproducible 95% sample for modelling, keep the rows that were not
# drawn as the unseen set, then give both frames a fresh 0..n-1 index.
data = dataset.sample(frac=0.95, random_state=786)
data_unseen = dataset.drop(data.index).reset_index(drop=True)
data = data.reset_index(drop=True)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

Data for Modeling: (47500, 68)
Unseen Data For Predictions: (2500, 68)


In [7]:
from pycaret.classification import *

In [8]:
# Configure the PyCaret classification experiment with 'admission' as the target.
#
# Ordinal encoding: a categorical feature whose levels have an intrinsic order
# (e.g. Low, Medium, High) must be encoded differently from a nominal feature
# that has no order (e.g. Male or Female). setup() accepts these through the
# ordinal_features parameter: a dictionary mapping each feature name to its
# levels, listed in increasing order from lowest to highest.
#
# The triage rating is declared ordinal here, with its levels given as the
# strings '7', '5', '4', '3', '2', '1' in that order.
admission_test_1 = setup(
    data=data,
    target='admission',
    session_id=123,
    normalize=True,
    ignore_features=[
        'reattendance_within_48_hours_for_the_same_condition',
        'admission_with_length_of_episode_less_than_or_equal_to_4_hours',
        'admission_with_length_of_episode_less_than_or_equal_to_4_hours_denominator',
        'departure_with_length_of_episode_less_than_or_equal_to_4_hours',
    ],
    ordinal_features={'triage_category': ['7', '5', '4', '3', '2', '1']},
    high_cardinality_features=[
        'presenting_complaint',
        'australian_postcode',
        'primary_diagnosis',
        'primary_diagnosis_ICD10AM_chapter',
        'principal_diagnosis_system_code_EDIS',
    ],
    bin_numeric_features=['age'],
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,admission
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(47500, 68)"
5,Missing Values,True
6,Numeric Features,5
7,Categorical Features,58
8,Ordinal Features,True
9,High Cardinality Features,True


In [None]:
# Run PyCaret's model comparison and keep the returned object as best_model.
# The cell output is a table of candidate models with Accuracy, AUC, Recall,
# Precision, F1, Kappa, MCC and training time.
best_model = compare_models()

IntProgress(value=0, description='Processing: ', max=84)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9252,0.9758,0.8884,0.841,0.864,0.8125,0.8131,92.53
gbc,Gradient Boosting Classifier,0.9252,0.9766,0.8852,0.8434,0.8637,0.8122,0.8128,31.498
svm,SVM - Linear Kernel,0.9246,0.0,0.906,0.8295,0.8654,0.8133,0.8154,4.114
ada,Ada Boost Classifier,0.9238,0.975,0.8813,0.8416,0.861,0.8085,0.809,24.086
ridge,Ridge Classifier,0.9188,0.0,0.8565,0.8427,0.8495,0.7939,0.794,10.772
rf,Random Forest Classifier,0.9077,0.9658,0.7521,0.8861,0.8135,0.7528,0.7573,9.665
dt,Decision Tree Classifier,0.9005,0.8779,0.8292,0.8051,0.8169,0.7486,0.7488,1.629
knn,K Neighbors Classifier,0.8619,0.906,0.6872,0.7719,0.727,0.635,0.637,354.516
nb,Naive Bayes,0.771,0.5758,0.1506,0.9613,0.26,0.2023,0.3273,2.577
qda,Quadratic Discriminant Analysis,0.685,0.584,0.3629,0.5203,0.4265,0.2934,0.3019,292.914


In [None]:
# Build the model with ID 'ada' (listed as "Ada Boost Classifier" in the
# comparison table).
# NOTE(review): 'ada' is fourth by Accuracy in that table (lr and gbc rank
# higher); the reason for choosing it is not recorded here - confirm.
ada = create_model('ada')

In [None]:
# Print the text representation of the created model object.
print(ada)

In [None]:
# Pass the created model through PyCaret's tune_model and keep the result as
# tuned_ada; every later plot and prediction cell uses this tuned object.
tuned_ada = tune_model(ada)

In [None]:
# Diagnostic plot for the tuned model: plot type 'auc'.
plot_model(tuned_ada, plot = 'auc')

In [None]:
# Diagnostic plot for the tuned model: plot type 'pr'
# (presumably the precision-recall curve - see PyCaret's plot_model docs).
plot_model(tuned_ada, plot = 'pr')

In [None]:
# Diagnostic plot for the tuned model: plot type 'feature'
# (presumably feature importance - see PyCaret's plot_model docs).
plot_model(tuned_ada, plot='feature')

In [None]:
plot_model(tuned_ada, plot = 'confusion_matrix')
# How to read the confusion matrix
# (source: https://towardsdatascience.com/understanding-confusion-matrix-a9ad42dcfd62)
#
# True Positive:
#   You predicted positive and it is true.
#   Example: you predicted that a woman is pregnant and she actually is.
# True Negative:
#   You predicted negative and it is true.
#   Example: you predicted that a man is not pregnant and he actually is not.
# False Positive (Type 1 Error):
#   You predicted positive and it is false.
#   Example: you predicted that a man is pregnant but he actually is not.
# False Negative (Type 2 Error):
#   You predicted negative and it is false.
#   Example: you predicted that a woman is not pregnant but she actually is.

In [None]:
# Open PyCaret's evaluation view for the tuned model.
evaluate_model(tuned_ada)

In [None]:
# Call predict_model without a data argument.
# Per the PyCaret docs this scores the hold-out split created by setup() - confirm
# for the installed version.
predict_model(tuned_ada)

In [None]:
# Finalize the tuned model and keep the result as final_ada, the object used
# for the unseen-data predictions below.
# Per the PyCaret docs, finalize_model refits on the full setup() dataset
# (hold-out included) - confirm for the installed version.
final_ada = finalize_model(tuned_ada)

In [None]:
# Final ADA parameters for deployment: print the finalized model's text
# representation.
print(final_ada)

In [None]:
# Score the held-back rows (data_unseen, never passed to setup()) with the
# finalized model, then preview the first rows of the result.
unseen_predictions = predict_model(final_ada, data=data_unseen)
unseen_predictions.head()

In [None]:
# Show the "admission" and "Label" columns for the first 50 unseen rows.
# "Label" is presumably the prediction column added by predict_model - confirm.
unseen_predictions[["admission","Label"]].head(50)

In [None]:
# Compute the 'Accuracy' metric on the unseen rows, comparing the 'admission'
# column against the 'Label' column.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
from pycaret.utils import check_metric
check_metric(unseen_predictions['admission'], unseen_predictions['Label'], metric = 'Accuracy')

In [None]:
# Persist the finalized AdaBoost model under the name 'ada_admission'.
# The object saved is final_ada (the result of finalize_model above). No
# variable named `best` is ever assigned in this notebook - the
# compare_models() result is stored as best_model - so passing `best` here
# would raise a NameError on a fresh Restart & Run All.
save_model(final_ada, 'ada_admission')