In [1]:
import feather
import pandas as pd
import numpy as np
from numpy import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [2]:
path = 'REDUCEDFEATURES'
df = feather.read_dataframe(path)
# subset data
OTHER =  df['TRIAGE_CATEGORY'].isin(["2","3", "4"])
df = df[OTHER]
df['TRIAGE_CATEGORY'] = df.TRIAGE_CATEGORY.cat.remove_unused_categories()
df

Unnamed: 0,ID,TRIAGE_CATEGORY,AGE,PREGNANCY_STATUS,AVPU_1,SMOKING_STATUS,Cardiovascular,Mental_Health,Toxicology,Endocrine,Neurology,HR_critical,RR_LOW,BP_critical,TEXT
0,d5db547ae8e87d127ce0d4e51447befa5ff5012b_2019-...,3,50.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,abdomen distension last days vomiting patient ...
1,8ba1ffcd025f4002f0a94b9e5414a8868b40604f_2018-...,3,49.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,gp referal low hb saw gp feeling lethargic rec...
2,75ccf97f6dbc795bd0bfc34067ab6b1a3d7e9325_2017-...,2,55.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,referred lmo ascitic tap c sob past days e abd...
3,ce3585171e5bf2dbaa366ebb9157cdbd790bcd66_2017-...,3,62.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,referred lmo w increasing abdomen distention NA
4,2e1a4fc5fb8bd00cd0ebb96848a2d72e19ff8450_2019-...,3,50.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,presented abdomen bloating taking lasix last d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556291,89a5762376eae1fdae2331f5a9f02a59b46a2c7b_2018-...,3,21.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,presents right sided facial weakness right arm...
556293,ed5fc3fcdeefd097873c74ba2e6777046aaac42a_2017-...,4,44.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,f mdh surgery right vomiting possible pancreat...
556294,c428206b9a9ec8e715f7ffc702ad06d116b670b8_2017-...,3,63.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,administration error first net clerical regist...
556296,a5d78c68d7bfa65af1a84659b28910ca5bad8e84_2019-...,2,84.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,syncope home approx hrs states sitting chair w...


In [3]:
# Remove ID column
X = df.drop(['ID','TRIAGE_CATEGORY'],axis=1) # subset all but ID and target variable
y = df['TRIAGE_CATEGORY'].values #target
print(X, y)
# recode response variable
mapping = {
        '2':'CAT2',
        '3':'CAT3_4',
        '4':'CAT3_4',
    }

y = df['TRIAGE_CATEGORY'].replace(mapping)

         AGE  PREGNANCY_STATUS  AVPU_1  SMOKING_STATUS  Cardiovascular  \
0       50.0               0.0     1.0             0.0             0.0   
1       49.0               0.0     NaN             0.0             0.0   
2       55.0               0.0     NaN             0.0             0.0   
3       62.0               0.0     1.0             0.0             0.0   
4       50.0               0.0     1.0             0.0             0.0   
...      ...               ...     ...             ...             ...   
556291  21.0               0.0     NaN             0.0             0.0   
556293  44.0               0.0     1.0             0.0             0.0   
556294  63.0               0.0     NaN             0.0             0.0   
556296  84.0               1.0     1.0             0.0             0.0   
556297  91.0               1.0     1.0             0.0             0.0   

        Mental_Health  Toxicology  Endocrine  Neurology  HR_critical  RR_LOW  \
0                 0.0         0

In [4]:
#Train test split with stratified sampling for evaluation
x_train, x_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = .3, 
                                                    shuffle = True, 
                                                    stratify = y, 
                                                    random_state = 3)

In [6]:
# build the pipeline
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
from sklearn.compose import ColumnTransformer
preprocess = ColumnTransformer(
    [('description_tfidf', TfidfVectorizer(), 'TEXT')],
    remainder='passthrough')

# Now we have a full prediction pipeline.
pipe2 = Pipeline(steps=[('preprocessor', preprocess),
                                ('xgboost', xgb.XGBClassifier(objective='binary:logistic'))])

In [7]:
# fit model
pipe2.fit(x_train, y_train)

# output scores on training data
scores = cross_val_score(pipe2, x_train, y_train, cv=5)
print('Accuracy for Tf-Idf & XGBoost Classifier : ', scores.mean())


Accuracy for Tf-Idf & XGBoost Classifier :  0.8617170442649933


In [8]:
# save model
filename = 'MODEL2.pkl'
joblib.dump(pipe2, filename)

['MODEL2.pkl']

In [9]:
# output scores on test data
y_preds = pipe2.predict(x_test)
accuracy = accuracy_score(y_test, y_preds)
print("Accuracy on Test data: %.2f%%" % (accuracy * 100.0))

Accuracy on Test data: 86.32%


In [10]:
# code to load model later
# import joblib
# model = joblib.load('MODEL2.pkl')

In [11]:
labels = np.unique(y_test)
a =  confusion_matrix(y_test, y_preds, labels=labels)
pd.DataFrame(a, index=labels, columns=labels)

Unnamed: 0,CAT2,CAT3_4
CAT2,27393,15786
CAT3_4,4960,103498


In [12]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds, target_names=labels))

              precision    recall  f1-score   support

        CAT2       0.85      0.63      0.73     43179
      CAT3_4       0.87      0.95      0.91    108458

    accuracy                           0.86    151637
   macro avg       0.86      0.79      0.82    151637
weighted avg       0.86      0.86      0.86    151637

