In [36]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [37]:
# Load df 

df = pd.read_excel('datasource/data.xlsx')
df.head()

Unnamed: 0,Group,Sex,Age,Patients number per hour,Arrival mode,Injury,Chief_complain,Mental,Pain,NRS_pain,...,BT,Saturation,KTAS_RN,Diagnosis in ED,Disposition,KTAS_expert,Error_group,Length of stay_min,KTAS duration_min,mistriage
0,2,2,71.0,3,3,2,right ocular pain,1,1,2.0,...,36.6,100.0,2,Corneal abrasion,1,4,2,86.0,5.0,1
1,1,1,56.0,12,3,2,right forearm burn,1,1,2.0,...,36.5,,4,"Burn of hand, firts degree dorsum",1,5,4,64.0,3.95,1
2,2,1,67.5,8,2,2,"arm pain, Lt",1,1,2.0,...,36.6,98.0,4,"Fracture of surgical neck of humerus, closed",2,5,4,862.0,1.0,1
3,1,2,71.0,8,1,1,ascites tapping,1,1,3.0,...,36.5,,4,Alcoholic liver cirrhosis with ascites,1,5,6,108.0,9.833333,1
4,1,2,58.0,4,3,1,"distension, abd",1,1,3.0,...,36.5,,4,Ascites,1,5,8,109.0,6.6,1


### Snippet from original paper

This study was conducted retrospectively, utilizing a cross-sectional approach that involved a systematic selection of 1267 records of adult patients admitted to two emergency departments between October 2016 and September 2017. The study assessed a range of 24 variables, including the patients' chief complaints, vital signs according to initial nursing records, and clinical outcomes. To determine the true Korean Triage and Acuity Scale (KTAS), three triage experts were involved in the study, including a certified emergency nurse, a KTAS provider and instructor, and a nurse selected based on their excellent experience and competence in emergency department care. The inter-rater agreement between the expert and emergency nurse KTAS scores was used to evaluate the accuracy of triage, and the experts' comments were analyzed to determine the cause of any triage errors. The study further conducted an independent sample t-test to compare the number of patient visits per hour in terms of the accuracy and inaccuracy of triage.


In [38]:
# Columns overview 
df.columns

Index(['Group', 'Sex', 'Age', 'Patients number per hour', 'Arrival mode',
       'Injury', 'Chief_complain', 'Mental', 'Pain', 'NRS_pain', 'SBP', 'DBP',
       'HR', 'RR', 'BT', 'Saturation', 'KTAS_RN', 'Diagnosis in ED',
       'Disposition', 'KTAS_expert', 'Error_group', 'Length of stay_min',
       'KTAS duration_min', 'mistriage'],
      dtype='object')

In [39]:
# Types overview 
df.dtypes

Group                         int64
Sex                           int64
Age                         float64
Patients number per hour      int64
Arrival mode                  int64
Injury                        int64
Chief_complain               object
Mental                        int64
Pain                          int64
NRS_pain                    float64
SBP                          object
DBP                          object
HR                           object
RR                           object
BT                           object
Saturation                   object
KTAS_RN                       int64
Diagnosis in ED              object
Disposition                   int64
KTAS_expert                   int64
Error_group                   int64
Length of stay_min          float64
KTAS duration_min           float64
mistriage                     int64
dtype: object

Note:

Upon examining the saturation column, the value labeled as 측불 appears to be meaningless. 
Therefore, I have decided to remove this value and convert the column to a float df type.

In [40]:
# To clean the df, the '측불' values in the 'Saturation' column were removed using the following code:
df = df[df['Saturation'] != '측불']

# Then, the column was converted to float type using the following code:
df['Saturation'] = df['Saturation'].astype('float')

We can identify the columns containing Null values in our dfset. Upon inspection, it was found that the NRS_pain, Saturation, and Diagnosis in ED columns contain Null values.

The NRS_pain column represents the numeric rating scale of pain, which can be imputed with the average of the corresponding chief complaint. Similarly, the Saturation column represents the saturation level measured using a pulse oximeter, which can also be imputed with the average of the corresponding chief complaint. However, for the Diagnosis in ED column, it is not appropriate to fill the Null values based on any assumptions. Therefore, I suggest discarding the rows with Null values in the Diagnosis in ED column.

In [41]:
df.isna().sum()

Group                         0
Sex                           0
Age                           0
Patients number per hour      0
Arrival mode                  0
Injury                        0
Chief_complain                0
Mental                        0
Pain                          0
NRS_pain                    551
SBP                           0
DBP                           0
HR                            0
RR                            0
BT                            0
Saturation                  688
KTAS_RN                       0
Diagnosis in ED               2
Disposition                   0
KTAS_expert                   0
Error_group                   0
Length of stay_min            0
KTAS duration_min             0
mistriage                     0
dtype: int64

In the following code, I assigned mean value for NRS Pain and Saturation according to Chief Complaint. However, some of the chief complaints has no value for NRS Pain or Saturation. For those values, I excluded them from the dfset

In [42]:
# Fill Null values in NRS_pain and Saturation columns with the mean of corresponding Chief_complain values
df['NRS_pain'] = df.groupby('Chief_complain')['NRS_pain'].transform(lambda x: x.fillna(x.mean()))
df['Saturation'] = df.groupby('Chief_complain')['Saturation'].transform(lambda x: x.fillna(x.mean()))

# Drop rows with Null values in Diagnosis in ED column
df.dropna(subset=['Diagnosis in ED'], inplace=True)

# Using the transform() function with the fillna() method allows you to fill the Null values in the specified column 
# with the mean of corresponding Chief_complain values in a single line of code. 
# Also, the dropna() method can be used with the subset parameter to drop rows with Null values in a specific column.

##### Outliers

identification of outlier values refers to the process of identifying df points that are significantly different from other df points in a dfset. Outliers can be caused by various factors, such as measurement errors, df entry errors, or unusual events. Identifying outliers is an important step in df cleaning and df analysis because they can skew statistical measures and affect the accuracy of statistical models. By detecting and removing outliers, we can improve the quality of df analysis and ensure more accurate results.


In [43]:
# Next thing is identification of outlier values
from scipy import stats

def drop_numerical_outliers(df, z_thresh=3):
    constrains = df.select_dtypes(include=np.number).apply(lambda x: np.abs(stats.zscore(x)) < z_thresh).all(axis=1)
    df.drop(df[~constrains].index, inplace=True)

drop_numerical_outliers(df)

In [44]:
# Type conversion

# Select the relevant columns to convert to numeric df types
cols_to_convert = ['SBP', 'DBP', 'HR', 'RR', 'BT']

# Apply the to_numeric function to convert the columns to numeric df types and
# replace invalid values with NaN
df[cols_to_convert] = df[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# Drop any rows with NaN values in the selected columns
df.dropna(subset=cols_to_convert, inplace=True)

# Explanation:
# This code first selects the columns to convert to numeric df types, 
# then applies the pd.to_numeric() function to these columns using the apply() method. 
# The errors='coerce' argument ensures that any invalid values (including the '측불' values) are replaced with NaN values.
# Finally, the code uses the dropna() method to drop any rows with NaN values in the selected columns, 
# effectively achieving the same result as the original code but in a more concise and efficient way.

In [45]:
# Performing normalization for readability

# Define dictionaries for mapping values to labels
gender_map = {1: 'Female', 2: 'Male'}
group_map = {1: 'Local ED', 2: 'Regional ED'}
arrival_map = {
    1: 'Walking',
    2: '119 Use',
    3: 'Private Car',
    4: 'Private Ambulance',
    5: 'Public Transportation',
    6: 'Wheelchair',
    7: 'Others'
}
injury_map = {1: 'Injury', 2: 'Non Injury'}
mental_map = {
    1: 'Alert',
    2: 'Verval Response',
    3: 'Pain Response',
    4: 'Unconciousness'
}
pain_map = {1: 'Pain', 2: 'Non Pain'}
disposition_map = {
    1: 'Discharge',
    2: 'Ward admission',
    3: 'ICU admission',
    4: 'AMA discharge',
    5: 'Transfer',
    6: 'Death',
    7: 'OP fom ED'
}
error_map = {
    1: 'Vital sign',
    2: 'Physical exam',
    3: 'Psychatric',
    4: 'Pain',
    5: 'Mental',
    6: 'Underlying disease',
    7: 'Medical records of other ED',
    8: 'On set',
    9: 'Others'
}
mistriage_map = {0: 'Correct', 1: 'Over Triage', 2: 'Under Triage'}

# Use np.vectorize to map values to labels in the data frame
df['Gender'] = np.vectorize(gender_map.get)(df['Sex'])
df['GroupName'] = np.vectorize(group_map.get)(df['Group'])
df['ArrivalMethod'] = np.vectorize(arrival_map.get)(df['Arrival mode'])
df['InjuryName'] = np.vectorize(injury_map.get)(df['Injury'])
df['MentalName'] = np.vectorize(mental_map.get)(df['Mental'])
df['PainName'] = np.vectorize(pain_map.get)(df['Pain'])
df['DispositionName'] = np.vectorize(disposition_map.get)(df['Disposition'])
df['ErrorName'] = np.vectorize(error_map.get)(df['Error_group'])
df['MistriageName'] = np.vectorize(mistriage_map.get)(df['mistriage'])


ValueError: cannot call `vectorize` on size 0 inputs unless `otypes` is set

In [47]:
# Define the labels and their corresponding values
labels_values = {
    'Correct': df['mistriage'].value_counts()[0],
    'Over Triage': df['mistriage'].value_counts()[1],
    'Under Triage': df['mistriage'].value_counts()[2]
}

# Convert the dictionary to arrays
labels = list(labels_values.keys())
values = list(labels_values.values())

# Set the figure size
plt.figure(figsize=(10, 10))

# Create the pie chart
plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=200)

# Add title and axis labels with larger font size
plt.title('Mistriage Distribution', fontsize=18)
plt.xlabel('Mistriage Name', fontsize=14)
plt.ylabel('Percentage', fontsize=14)

# Add a legend to the graph
plt.legend(title='Mistriage', loc='best', fontsize=12)

# Use a more distinguishable color palette
colors = ['#fbb4ae', '#b3cde3', '#ccebc5']
plt.gca().set_prop_cycle('color', colors)

# Show the graph
plt.show()

KeyError: 'Correct'