Loading libraries

In [1]:
import pandas as pd
import numpy as np, warnings
from pathlib import Path
import os
import sweetviz as sv
from importlib import reload
import matplotlib.pyplot as plt
import seaborn as sns

# Expose the standard-library `warnings` module as the attribute `np.warnings`.
# NOTE(review): presumably a compatibility shim for an imported library that
# still accesses `np.warnings` (sweetviz?) — confirm, and drop once unnecessary.
np.warnings = warnings

In [2]:
import sys
# Add the parent directory to the import path so that the `src` package
# imported below can be resolved from this notebook's location.
sys.path.append("..")

# Import the project's utility modules. Each import is followed by reload()
# so that edits to the module files are picked up without restarting the kernel.
import src.utils as util
reload(util)

import src.load_parquet as load_parquet
reload(load_parquet)

# Base directory that every data path below is built from (pathlib keeps the
# path separators OS-agnostic).
ROOT_DIR = Path('..')

# Note: to run this notebook from the repository root directory instead, use:
# ROOT_DIR = Path('')

1. Loading datasets

In [3]:
# All input CSV files live in the `data` folder below ROOT_DIR.
DATA_DIR = ROOT_DIR / 'data'

path_labevents = DATA_DIR / "LABEVENTS.csv"
path_demographics = DATA_DIR / "demographic_data.csv"
path_pot_labevents = DATA_DIR / "potential_labevents_combined.csv"
path_diagnoses = DATA_DIR / "DIAGNOSES_ICD.csv"

In [4]:
# Load the four input tables from the paths defined in the previous cell.
df_labevents = pd.read_csv(path_labevents)
df_demographics = pd.read_csv(path_demographics)
df_pot_labevents = pd.read_csv(path_pot_labevents)
# ICD-9 codes are identifiers, not numbers: force them to be read as strings so
# that leading zeros are preserved and the string comparison against '77181'
# later in this notebook matches every row, independent of how read_csv would
# otherwise infer the column's dtype.
df_diagnoses = pd.read_csv(path_diagnoses, dtype={'ICD9_CODE': str})

In [15]:
# Admissions of patients younger than 18; used later to exclude children.
df_children = df_demographics.loc[df_demographics['AGE'] < 18]

In [16]:
# Show the admissions with AGE < 18 selected in the previous cell.
df_children

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,GENDER,DOB,EXPIRE_FLAG,AGE,IS_SEPSIS
6,27,27,134931,2191-11-30 22:16:00,2191-12-03 14:45:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,...,,,NEWBORN,0,1,F,2191-11-30,0,0,0
20,41,39,106266,2114-11-29 21:04:00,2114-12-09 15:10:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,...,,,NEWBORN,0,1,F,2114-11-29,0,0,0
27,461,358,110872,2168-10-24 23:48:00,2168-10-29 03:23:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,...,,,NEWBORN,0,1,F,2168-10-24,0,0,0
34,468,363,196503,2176-03-01 15:26:00,2176-03-03 14:04:00,,NEWBORN,CLINIC REFERRAL/PREMATURE,HOME,Government,...,,,NEWBORN,0,1,F,2176-03-01,0,0,0
51,49,50,132761,2112-06-23 19:40:00,2112-06-26 10:15:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,...,,,NEWBORN,0,1,M,2112-06-23,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55692,53991,84227,169949,2168-03-19 15:14:00,2168-03-25 11:29:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,...,2168-03-19 10:42:00,2168-03-19 16:17:00,BLADDER RUPTURE,0,1,F,2151-03-11,0,17,0
56419,58955,99934,176121,2110-02-28 03:47:00,2110-03-06 14:31:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Private,...,,,PELVIC ABSCESS,0,1,M,2092-03-18,0,17,0
57584,58652,98995,123142,2155-10-05 02:32:00,2155-10-05 12:00:00,2155-10-05 12:00:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Private,...,2155-10-04 23:56:00,2155-10-05 04:04:00,INTRACRANIAL BLEED,1,1,M,2138-12-23,1,16,0
57950,58142,97339,198692,2114-08-25 19:20:00,2114-08-28 13:00:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Private,...,2114-08-25 17:53:00,2114-08-25 19:20:00,EPIDURAL HEMATOMA,0,1,M,2097-01-08,0,17,0


In [17]:
# Plain Python list of the HADM_ID values belonging to under-18 admissions.
child_hadmid = df_children['HADM_ID'].to_list()

In [31]:
# Number of under-18 admission IDs collected above.
len(child_hadmid)

8210

In [32]:
# Total number of rows in the demographics table, for comparison with the
# number of under-18 admissions.
len(df_demographics)

58976

2. Labevents data processing

In [24]:
# Keep only the lab events whose FLAG is 'abnormal'.
# Note: df_labevents is rebound here; once the FLAG column has been dropped by
# the column selection further down, this cell needs a fresh load of the CSV
# before it can be run again.
df_labevents = df_labevents.loc[df_labevents['FLAG'].eq('abnormal')]

In [34]:
# Discard lab events that have no HADM_ID.
df_labevents = df_labevents[df_labevents['HADM_ID'].notna()]

In [35]:
# Exclude every lab event whose HADM_ID belongs to an under-18 admission.
is_child_admission = df_labevents['HADM_ID'].isin(child_hadmid)
df_labevents = df_labevents.loc[~is_child_admission]

In [36]:
# Reduce the table to the three identifier columns used below.
df_labevents = df_labevents.loc[:, ['SUBJECT_ID', 'HADM_ID', 'ITEMID']]

In [37]:
# One row per distinct (HADM_ID, ITEMID) pair: repeated abnormal results of the
# same lab item within one admission are collapsed to the first occurrence.
pair_columns = ['HADM_ID', 'ITEMID']
df_labevent2 = df_labevents.drop_duplicates(subset=pair_columns)[pair_columns]

In [38]:
# df_labevent2 holds the distinct (HADM_ID, ITEMID) pairs of abnormal lab events
# for all remaining admissions, both with and without sepsis.
df_labevent2

Unnamed: 0,HADM_ID,ITEMID
162,145834.0,50893
163,145834.0,50902
166,145834.0,50912
169,145834.0,50970
170,145834.0,50971
...,...,...
27853955,107499.0,50998
27853979,120151.0,51200
27853982,120151.0,51244
27853986,120151.0,51254


### Counting abnormal ITEMIDs with and without sepsis

In [39]:
# Count how often each ITEMID occurs across all admissions.
# Because df_labevent2 holds one row per distinct (HADM_ID, ITEMID) pair, each
# count is the number of admissions with at least one abnormal result for that item.
itemid_counts = df_labevent2['ITEMID'].value_counts()

In [40]:
# Turn the count Series into a two-column table: ITEMID and its Abnormal_Count.
itemid_counts_df = (
    itemid_counts
    .rename_axis('ITEMID')
    .reset_index(name='Abnormal_Count')
)

In [41]:
# Show the per-ITEMID abnormal counts over all admissions.
itemid_counts_df

Unnamed: 0,ITEMID,Abnormal_Count
0,51279,48216
1,50931,48199
2,51221,47818
3,51222,47649
4,51301,38975
...,...,...
285,50816,1
286,50913,1
287,50802,1
288,51113,1


In [17]:
#Saving all ITEMID counts to CSV (currently disabled)
#itemid_counts_df.to_csv('itemid_allcounts_labevent.csv', index=False)

### Counting abnormal ITEMIDs for sepsis patients

Taking HADM_IDs with newborn sepsis


In [59]:
# Select the diagnosis rows carrying the newborn-sepsis ICD-9 code '77181',
# keeping only the admission ID and the code itself.
is_newborn_sepsis = df_diagnoses['ICD9_CODE'] == '77181'
nb_sepsis = df_diagnoses.loc[is_newborn_sepsis, ['HADM_ID', 'ICD9_CODE']]

In [60]:
# Show the diagnosis rows with ICD-9 code '77181'.
nb_sepsis

Unnamed: 0,HADM_ID,ICD9_CODE
497,142807,77181
508,160481,77181
2755,154943,77181
3804,194246,77181
3825,137370,77181
...,...,...
361443,123012,77181
363625,163974,77181
371487,150568,77181
385979,173392,77181


In [61]:
# Plain Python list of the HADM_ID values that carry the newborn-sepsis code.
# NOTE(review): no later cell in this notebook appears to read nb_hadmid — confirm whether it is still needed.
nb_hadmid = nb_sepsis['HADM_ID'].to_list()

Counting abnormal ITEMIDs for adult sepsis admissions

In [42]:
# Admissions flagged as sepsis (IS_SEPSIS == 1) for patients aged 18 or older.
has_sepsis = df_demographics['IS_SEPSIS'] == 1
is_adult = df_demographics['AGE'] >= 18
df_withsepsis = df_demographics[has_sepsis & is_adult]

In [46]:
# Keep only the HADM_ID, AGE and IS_SEPSIS columns.
df_withsepsis = df_withsepsis.loc[:, ['HADM_ID', 'AGE', 'IS_SEPSIS']]

In [47]:
# Show the adult sepsis admissions (HADM_ID, AGE, IS_SEPSIS).
df_withsepsis

Unnamed: 0,HADM_ID,AGE,IS_SEPSIS
19,185910,75,1
23,145674,63,1
24,122609,63,1
37,134462,52,1
63,189535,55,1
...,...,...,...
58945,186754,84,1
58950,168288,69,1
58953,153703,69,1
58969,141860,80,1


In [48]:
# Attach the abnormal ITEMIDs to each adult sepsis admission. The inner join
# keeps only admissions that appear in both tables.
sepsis_df = df_withsepsis.merge(df_labevent2, on='HADM_ID', how='inner')

In [49]:
# Reduce to the (HADM_ID, ITEMID) columns and drop any repeated pair.
sepsis_df = sepsis_df.loc[:, ['HADM_ID', 'ITEMID']].drop_duplicates()

In [50]:
# Show the distinct (HADM_ID, ITEMID) pairs for adult sepsis admissions.
sepsis_df

Unnamed: 0,HADM_ID,ITEMID
0,185910,50893
1,185910,51221
2,185910,51222
3,185910,51265
4,185910,51274
...,...,...
209646,105447,50893
209647,105447,50910
209648,105447,50960
209649,105447,50970


In [51]:
# Count how often each ITEMID occurs among the adult sepsis admissions.
# sepsis_df holds one row per distinct (HADM_ID, ITEMID) pair, so each count is
# the number of sepsis admissions with at least one abnormal result for that item.
itemid_counts_sepsis = sepsis_df['ITEMID'].value_counts()

In [52]:
# Build the per-ITEMID sepsis count table directly from sepsis_df instead of
# reassigning itemid_counts_sepsis onto itself. The self-reassignment turned the
# Series into a DataFrame under the same name, so running this cell a second
# time added a stray index column and failed on the two-name column assignment.
# The result is the same table as before: columns ITEMID and Sepsis_abnormal,
# ordered by count (descending).
itemid_counts_sepsis = (
    sepsis_df['ITEMID']
    .value_counts()
    .rename_axis('ITEMID')
    .reset_index(name='Sepsis_abnormal')
)

In [53]:
# Show the per-ITEMID abnormal counts for adult sepsis admissions.
itemid_counts_sepsis

Unnamed: 0,ITEMID,Sepsis_abnormal
0,51222,5114
1,51279,5109
2,50931,5108
3,51221,5094
4,50893,4868
...,...,...
265,51224,1
266,51210,1
267,50894,1
268,51459,1


### Adding Counts to the potential labevents file

In [54]:
# Show the potential lab events table loaded from
# potential_labevents_combined.csv; the counts are merged onto it below.
df_pot_labevents

Unnamed: 0,ITEMID,LABEL,FLUID,ref1,ref2,chatgpt,neo4j
0,50802,BASE EXCESS,BLOOD,yes,no,no,no
1,50803,"CALCULATED BICARBONATE, WHOLE BLOOD",BLOOD,yes,no,yes,no
2,50806,"CHLORIDE, WHOLE BLOOD",BLOOD,yes,no,no,no
3,50808,FREE CALCIUM,BLOOD,yes,yes,no,no
4,50809,GLUCOSE,BLOOD,no,yes,no,yes
...,...,...,...,...,...,...,...
79,50804,CALCULATED TOTAL CO2,BLOOD,no,no,yes,yes
80,50910,CREATINE KINASE (CK),BLOOD,no,no,no,yes
81,51249,MCHC,BLOOD,no,no,no,yes
82,51516,WBC,URINE,no,no,yes,yes


In [55]:
# Add the overall Abnormal_Count to each potential lab event. The left join
# keeps every potential lab event; items without a count get NaN.
abnormal_counts = itemid_counts_df[['ITEMID', 'Abnormal_Count']]
df_final1 = df_pot_labevents.merge(abnormal_counts, on='ITEMID', how='left')

In [56]:
# Show the potential lab events with the Abnormal_Count column added.
df_final1

Unnamed: 0,ITEMID,LABEL,FLUID,ref1,ref2,chatgpt,neo4j,Abnormal_Count
0,50802,BASE EXCESS,BLOOD,yes,no,no,no,1.0
1,50803,"CALCULATED BICARBONATE, WHOLE BLOOD",BLOOD,yes,no,yes,no,2318.0
2,50806,"CHLORIDE, WHOLE BLOOD",BLOOD,yes,no,no,no,7271.0
3,50808,FREE CALCIUM,BLOOD,yes,yes,no,no,22384.0
4,50809,GLUCOSE,BLOOD,no,yes,no,yes,24322.0
...,...,...,...,...,...,...,...,...
79,50804,CALCULATED TOTAL CO2,BLOOD,no,no,yes,yes,17489.0
80,50910,CREATINE KINASE (CK),BLOOD,no,no,no,yes,16940.0
81,51249,MCHC,BLOOD,no,no,no,yes,28638.0
82,51516,WBC,URINE,no,no,yes,yes,11374.0


In [57]:
# Add the sepsis-only counts as a further column. The left join again keeps
# every potential lab event; items without a sepsis count get NaN.
df_final2 = df_final1.merge(itemid_counts_sepsis, on='ITEMID', how='left')

In [58]:
# Show the potential lab events with both count columns added.
df_final2

Unnamed: 0,ITEMID,LABEL,FLUID,ref1,ref2,chatgpt,neo4j,Abnormal_Count,Sepsis_abnormal
0,50802,BASE EXCESS,BLOOD,yes,no,no,no,1.0,
1,50803,"CALCULATED BICARBONATE, WHOLE BLOOD",BLOOD,yes,no,yes,no,2318.0,392.0
2,50806,"CHLORIDE, WHOLE BLOOD",BLOOD,yes,no,no,no,7271.0,940.0
3,50808,FREE CALCIUM,BLOOD,yes,yes,no,no,22384.0,2724.0
4,50809,GLUCOSE,BLOOD,no,yes,no,yes,24322.0,2296.0
...,...,...,...,...,...,...,...,...,...
79,50804,CALCULATED TOTAL CO2,BLOOD,no,no,yes,yes,17489.0,3252.0
80,50910,CREATINE KINASE (CK),BLOOD,no,no,no,yes,16940.0,2542.0
81,51249,MCHC,BLOOD,no,no,no,yes,28638.0,3067.0
82,51516,WBC,URINE,no,no,yes,yes,11374.0,2318.0


In [59]:
#Saving the final dataframe with all abnormal counts and sepsis abnormal counts to CSV (currently disabled)
#df_final2.to_csv('labevents_features.csv', index=False)

### Sorting ITEM_IDs based on 3 proportions


In [64]:
# Load the feature comparison table that contains the three proportion columns.
# NOTE(review): this CSV is not written by any cell in this notebook; presumably
# it was prepared outside of it from the exported counts — confirm its provenance.
path_features = ROOT_DIR.joinpath('data', "features_comparison_draft.csv")
df_features = pd.read_csv(path_features)

In [65]:
# Show the feature comparison table as loaded, before sorting.
df_features

Unnamed: 0,#,LABEL,FLUID,ref1,ref2,chatgpt,neo4j,Abnormal_Count,Sepsis_abnormal,Sepsis_proportion,Admission_proportion,Sepsis_admission_proportion
0,51279,RED BLOOD CELLS,BLOOD,no,no,no,yes,48216.0,5109.0,0.11,0.95,0.99
1,50931,GLUCOSE,BLOOD,no,yes,no,yes,48199.0,5108.0,0.11,0.95,0.99
2,51221,HEMATOCRIT,BLOOD,yes,no,no,yes,47818.0,5094.0,0.11,0.94,0.98
3,51222,HEMOGLOBIN,BLOOD,yes,no,yes,yes,47649.0,5114.0,0.11,0.94,0.99
4,51301,WHITE BLOOD CELLS,BLOOD,no,no,yes,yes,38975.0,4824.0,0.12,0.77,0.93
...,...,...,...,...,...,...,...,...,...,...,...,...
80,51106,URINE CREATININE,URINE,yes,yes,yes,no,,,0.00,0.00,0.00
81,51133,ABSOLUTE LYMPHOCYTE COUNT,BLOOD,no,yes,no,no,,,0.00,0.00,0.00
82,51253,MONOCYTE COUNT,BLOOD,no,yes,no,no,,,0.00,0.00,0.00
83,51480,HEMATOCRIT,URINE,yes,no,no,no,,,0.00,0.00,0.00


Sorting based on 3 proportions

In [66]:
# Rank the lab items by the sum of their three proportion columns, highest
# first. The sum serves only as a temporary sort key and is removed again
# once the rows are ordered.
proportion_sum = (
    df_features['Sepsis_proportion']
    + df_features['Admission_proportion']
    + df_features['Sepsis_admission_proportion']
)
df_features = (
    df_features
    .assign(composite_score=proportion_sum)
    .sort_values(by='composite_score', ascending=False)
    .drop(columns=['composite_score'])
)

In [67]:
# Show the feature table after sorting by the summed proportions.
df_features

Unnamed: 0,#,LABEL,FLUID,ref1,ref2,chatgpt,neo4j,Abnormal_Count,Sepsis_abnormal,Sepsis_proportion,Admission_proportion,Sepsis_admission_proportion
0,51279,RED BLOOD CELLS,BLOOD,no,no,no,yes,48216.0,5109.0,0.11,0.95,0.99
1,50931,GLUCOSE,BLOOD,no,yes,no,yes,48199.0,5108.0,0.11,0.95,0.99
3,51222,HEMOGLOBIN,BLOOD,yes,no,yes,yes,47649.0,5114.0,0.11,0.94,0.99
2,51221,HEMATOCRIT,BLOOD,yes,no,no,yes,47818.0,5094.0,0.11,0.94,0.98
4,51301,WHITE BLOOD CELLS,BLOOD,no,no,yes,yes,38975.0,4824.0,0.12,0.77,0.93
...,...,...,...,...,...,...,...,...,...,...,...,...
66,50833,POTASSIUM,OTHER BODY FLUID,yes,no,no,no,,,0.00,0.00,0.00
65,50830,"PCO2, BODY FLUID",OTHER BODY FLUID,yes,no,no,no,,,0.00,0.00,0.00
64,50817,OXYGEN SATURATION,BLOOD,yes,no,no,no,,,0.00,0.00,0.00
62,50802,BASE EXCESS,BLOOD,yes,no,no,no,1.0,,0.00,0.00,0.00


In [44]:
# Save the sorted feature table as CSV, without the index column.
# NOTE(review): the relative filename resolves against the current working
# directory rather than ROOT_DIR / 'data', where the inputs are read from —
# confirm that this output location is intended.
df_features.to_csv('labevents_comparison_sorted.csv', index=False)