In [1]:
import feather
import pandas as pd
import numpy as np
from numpy import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import joblib

In [2]:
# Load the triage records from a Feather file.
# pandas' built-in Feather reader is used here; the standalone `feather`
# package is deprecated in favour of the pyarrow/pandas readers.
path = 'INITIAL_TEXT'
df = pd.read_feather(path)

# Preview the first rows to sanity-check which columns were loaded.
df.head(10)

Unnamed: 0,ID,TRIAGE_CATEGORY,CHRONIC_HISTORY,TEXT
0,d5db547ae8e87d127ce0d4e51447befa5ff5012b_2019-...,3,1.0,abdomen distension last days vomiting patient ...
1,8ba1ffcd025f4002f0a94b9e5414a8868b40604f_2018-...,3,0.0,gp referal low hb saw gp feeling lethargic rec...
2,75ccf97f6dbc795bd0bfc34067ab6b1a3d7e9325_2017-...,2,0.0,referred lmo ascitic tap c sob past days e abd...
3,ce3585171e5bf2dbaa366ebb9157cdbd790bcd66_2017-...,3,0.0,referred lmo w increasing abdomen distention NA
4,2e1a4fc5fb8bd00cd0ebb96848a2d72e19ff8450_2019-...,3,0.0,presented abdomen bloating taking lasix last d...
5,9801e2c709c01d192846b111486761d624195d3d_2017-...,3,1.0,biba bilateral oedema abdomen distension rece...
6,cbdad23f6ecfd13126b581b58b67c556bbe80ed5_2018-...,3,1.0,biba days bowel movements group home followed...
7,0e6a0a3413b9564c047f307122ff8d97413bc454_2017-...,2,0.0,patient biba home distended abdomen due parace...
8,dc6bcdf7172899c263bbd9589fd72e0bbcdb871e_2019-...,4,1.0,abdominal distension associated sob last drain...
9,cbbd84e19d92fae86072b3d154dc2a1b3ac1cd93_2019-...,3,1.0,gp abdominal distension history ascites assoc...


In [3]:
# Model inputs: the TEXT column plus the CHRONIC_HISTORY column.
feature_columns = ['TEXT', 'CHRONIC_HISTORY']
df_x = df.loc[:, feature_columns]
df_x

Unnamed: 0,TEXT,CHRONIC_HISTORY
0,abdomen distension last days vomiting patient ...,1.0
1,gp referal low hb saw gp feeling lethargic rec...,0.0
2,referred lmo ascitic tap c sob past days e abd...,0.0
3,referred lmo w increasing abdomen distention NA,0.0
4,presented abdomen bloating taking lasix last d...,0.0
...,...,...
556293,f mdh surgery right vomiting possible pancreat...,0.0
556294,administration error first net clerical regist...,1.0
556295,co itchy left foot ongoing last months nil med...,0.0
556296,syncope home approx hrs states sitting chair w...,0.0


In [4]:
# Collapse the five triage categories into three target classes:
# category 1 and category 5 keep their own class, 2-4 are pooled as OTHER.
# Keys are strings, so this relies on TRIAGE_CATEGORY holding string codes;
# any value that is not a key is left unchanged by `replace`.
mapping = {
    '1': 'CAT1',
    **dict.fromkeys(('2', '3', '4'), 'OTHER'),
    '5': 'CAT5',
}

triage_raw = df['TRIAGE_CATEGORY']
df_y = triage_raw.replace(mapping)
df_y

0         OTHER
1         OTHER
2         OTHER
3         OTHER
4         OTHER
          ...  
556293    OTHER
556294    OTHER
556295     CAT5
556296    OTHER
556297    OTHER
Name: TRIAGE_CATEGORY, Length: 556298, dtype: object

In [5]:
# Hold out 30% of the rows for evaluation. The split is shuffled with a fixed
# seed and stratified on the target so class proportions match in both parts.
split_options = {
    'test_size': .3,
    'shuffle': True,
    'stratify': df_y,
    'random_state': 3,
}
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, **split_options)

In [6]:
# Display the training features produced by the split (rows keep their
# original index labels from `df_x`).
x_train

Unnamed: 0,TEXT,CHRONIC_HISTORY
283732,gp abdominal pain associated nausea NA,0.0
395198,twisted knee football yesterday able weight be...,0.0
392491,bib parents right knee pain running fell knee ...,0.0
301729,biba home lower abdomen pain vomiting sob ambu...,0.0
213968,presented ed upper limb injury days ago fall NA,0.0
...,...,...
244714,patient biba presented suicidal ideation fall ...,0.0
180444,presents laceration nail bed top right middle ...,0.0
418327,biba presents raf history af chest heaviness s...,0.0
394912,child bib parents pain behind right knee histo...,0.0


In [7]:
# Assemble the modelling pipeline.
# Pattern follows the scikit-learn mixed-type ColumnTransformer example:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

# TEXT is vectorised with TF-IDF; all remaining input columns pass through unchanged.
text_step = ('description_tfidf', TfidfVectorizer(), 'TEXT')
preprocess = ColumnTransformer(transformers=[text_step], remainder='passthrough')

# Chain the preprocessing with a multi-class XGBoost classifier.
# NOTE: despite the `lr` in the variable name, the estimator is XGBoost.
classifier_step = ('xgboost', xgb.XGBClassifier(objective='multi:softprob'))
tfidf_lr_pipe = Pipeline(steps=[('preprocessor', preprocess), classifier_step])


In [8]:
# Train the pipeline on the full training split.
tfidf_lr_pipe.fit(x_train, y_train)

# 5-fold cross-validated score computed on the training split only;
# the mean across folds is what gets reported.
scores = cross_val_score(estimator=tfidf_lr_pipe, X=x_train, y=y_train, cv=5)
mean_cv_score = scores.mean()
print('Accuracy for Tf-Idf & XGBoost Classifier : ', mean_cv_score)

Accuracy for Tf-Idf & XGBoost Classifier :  0.9318503985905734


In [9]:
# Persist the fitted pipeline to disk with joblib.
filename = 'MODEL_1.pkl'
joblib.dump(value=tfidf_lr_pipe, filename=filename)

['MODEL_1.pkl']

In [10]:
# Evaluate the fitted pipeline on the held-out test split.
y_preds = tfidf_lr_pipe.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_preds)

# Report the accuracy as a percentage with two decimals.
accuracy_pct = accuracy * 100.0
print("Accuracy on Test data: %.2f%%" % accuracy_pct)

Accuracy on Test data: 93.21%


In [11]:
# code to load model later
# import joblib
# clf = joblib.load('MODEL_1.pkl')

In [12]:
# Confusion matrix on the test split, labelled with the sorted class names:
# rows are the true classes, columns are the predicted classes.
labels = np.unique(y_test)
a = confusion_matrix(y_true=y_test, y_pred=y_preds, labels=labels)
pd.DataFrame(data=a, index=labels, columns=labels)

Unnamed: 0,CAT1,CAT5,OTHER
CAT1,2105,0,436
CAT5,4,2774,9934
OTHER,178,779,150680


In [13]:
# Cross-tabulate true vs. predicted classes, including the 'All' margin totals.
pd.crosstab(
    index=y_test,
    columns=y_preds,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,CAT1,CAT5,OTHER,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CAT1,2105,0,436,2541
CAT5,4,2774,9934,12712
OTHER,178,779,150680,151637
All,2287,3553,161050,166890


In [14]:
# Per-class precision / recall / F1 on the test split.
# `classification_report` is imported with the other sklearn.metrics names in
# the notebook's import cell rather than mid-notebook.
# `labels` is passed explicitly so the report rows are guaranteed to line up
# with `target_names`, using the same label order as the confusion matrix.
print(classification_report(y_test, y_preds, labels=labels, target_names=labels))

              precision    recall  f1-score   support

        CAT1       0.92      0.83      0.87      2541
        CAT5       0.78      0.22      0.34     12712
       OTHER       0.94      0.99      0.96    151637

    accuracy                           0.93    166890
   macro avg       0.88      0.68      0.73    166890
weighted avg       0.92      0.93      0.91    166890

