Loading libraries

In [1]:
import pandas as pd
import numpy as np, warnings
from pathlib import Path
import os
import sweetviz as sv
from importlib import reload
import matplotlib.pyplot as plt
import seaborn as sns

# Expose the stdlib `warnings` module as `np.warnings`.
# NOTE(review): presumably a compatibility shim for a dependency that still
# accesses `np.warnings` - confirm which library needs it.
setattr(np, "warnings", warnings)

In [2]:
import sys

# setup OS agnostic pathnames
ROOT_DIR = Path('..')

# Note: to run main notebook from root directory, use:
# ROOT_DIR = Path('')

# Make the project root importable so the `src` package can be found.
# The entry is derived from ROOT_DIR (instead of a hard-coded "..") so the
# switch above also fixes the import path, and it is added only once so that
# re-running this cell does not keep growing sys.path.
_project_root = str(ROOT_DIR.resolve())
if _project_root not in sys.path:
    sys.path.append(_project_root)

# import utility functions (reloaded so edits to the modules are picked up
# without restarting the kernel)
import src.utils as util
reload(util)

import src.load_parquet as load_parquet
reload(load_parquet)

1. Loading datasets

In [3]:
# Locations of the input CSV files, all stored under <ROOT_DIR>/data
path_labevents = ROOT_DIR.joinpath('data', "LABEVENTS.csv")
path_demographics = ROOT_DIR.joinpath('data', "demographic_data.csv")
path_pot_labevents = ROOT_DIR.joinpath('data', "potential_labevents_all.csv")
path_diagnoses = ROOT_DIR.joinpath('data', "DIAGNOSES_ICD.csv")
path_labitem = ROOT_DIR.joinpath('data', "D_LABITEMS.csv")

In [4]:
# Load every input table into its own DataFrame.
df_labevents = pd.read_csv(path_labevents)
df_demographics = pd.read_csv(path_demographics)
df_pot_labevents = pd.read_csv(path_pot_labevents)
# ICD9_CODE is compared against the string '77181' later in this notebook, so
# force it to be parsed as text. Relying on type inference risks numeric-looking
# codes being read as numbers, in which case that comparison would silently
# match nothing.
df_diagnoses = pd.read_csv(path_diagnoses, dtype={'ICD9_CODE': str})
df_labitem = pd.read_csv(path_labitem)

2. Labevents data processing

In [157]:
# Keep only the lab events whose FLAG column equals 'abnormal'
df_labevents = df_labevents.loc[df_labevents['FLAG'].eq('abnormal')]

In [158]:
# Discard lab events that have no HADM_ID
df_labevents = df_labevents.loc[df_labevents['HADM_ID'].notna()]

In [159]:
# Restrict the frame to the three identifier columns used below
df_labevents = df_labevents.loc[:, ['SUBJECT_ID', 'HADM_ID', 'ITEMID']]

In [160]:
# Display the abnormal lab events (rows with a HADM_ID, identifier columns only)
df_labevents

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID
162,3,145834.0,50893
163,3,145834.0,50902
166,3,145834.0,50912
169,3,145834.0,50970
170,3,145834.0,50971
...,...,...,...
27854045,96443,103219.0,50862
27854046,96443,103219.0,50863
27854049,96443,103219.0,50878
27854051,96443,103219.0,50885


In [161]:
# One row per (HADM_ID, ITEMID) pair: repeated abnormal results of the same
# item within one admission are collapsed to their first occurrence
df_labevent2 = df_labevents.drop_duplicates(subset=['HADM_ID', 'ITEMID'])[['HADM_ID', 'ITEMID']]

In [162]:
# df_labevent2 holds the unique (HADM_ID, ITEMID) pairs of all admissions,
# i.e. it is not yet restricted to sepsis admissions
df_labevent2

Unnamed: 0,HADM_ID,ITEMID
162,145834.0,50893
163,145834.0,50902
166,145834.0,50912
169,145834.0,50970
170,145834.0,50971
...,...,...
27853955,107499.0,50998
27853979,120151.0,51200
27853982,120151.0,51244
27853986,120151.0,51254


### Counting abnormal ITEMIDs across all admissions (with and without sepsis)

In [163]:
# df_labevent2 has one row per (HADM_ID, ITEMID), so each count is the number
# of admissions with at least one abnormal result for that ITEMID
itemid_counts = df_labevent2.loc[:, 'ITEMID'].value_counts(sort=True, ascending=False)

In [164]:
# Turn the counts Series into a two-column frame: ITEMID, Abnormal_Count
itemid_counts_df = itemid_counts.rename_axis('ITEMID').reset_index(name='Abnormal_Count')

In [165]:
# Display the per-ITEMID abnormal counts over all admissions
itemid_counts_df

Unnamed: 0,ITEMID,Abnormal_Count
0,51222,52821
1,51221,51779
2,51279,50487
3,50931,48493
4,51301,41550
...,...,...
285,51532,1
286,51282,1
287,51459,1
288,50816,1


In [17]:
# Write the per-ITEMID counts to a CSV file in the current working directory
# (the row index is not written)
itemid_counts_df.to_csv(path_or_buf='itemid_allcounts_labevent.csv', index=False)

### Counting abnormal ITEMIDs for sepsis patients

Collecting the HADM_IDs diagnosed with newborn sepsis (ICD-9 code 77181)


In [166]:
# Diagnoses rows carrying the newborn sepsis ICD-9 code ('77181'),
# reduced to the admission id and the code itself
nb_sepsis = df_diagnoses.loc[df_diagnoses['ICD9_CODE'].eq('77181'), ['HADM_ID', 'ICD9_CODE']]

In [167]:
# Display the diagnoses rows with ICD9_CODE '77181'
nb_sepsis

Unnamed: 0,HADM_ID,ICD9_CODE
497,142807,77181
508,160481,77181
2755,154943,77181
3804,194246,77181
3825,137370,77181
...,...,...
361443,123012,77181
363625,163974,77181
371487,150568,77181
385979,173392,77181


In [168]:
# Plain Python list of the admission ids found in nb_sepsis
nb_hadmid = nb_sepsis.loc[:, 'HADM_ID'].to_list()

Counting ITEMIDs for sepsis patients

In [169]:
# Only the admission id and the sepsis indicator are needed from the demographics
df_demographics = df_demographics.loc[:, ['HADM_ID', 'IS_SEPSIS']]

In [170]:
# Admissions whose IS_SEPSIS indicator equals 1
df_withsepsis = df_demographics.loc[df_demographics['IS_SEPSIS'].eq(1)]

In [171]:
# Display the demographics rows with IS_SEPSIS == 1
df_withsepsis

Unnamed: 0,HADM_ID,IS_SEPSIS
19,185910,1
23,145674,1
24,122609,1
37,134462,1
63,189535,1
...,...,...
58945,186754,1
58950,168288,1
58953,153703,1
58969,141860,1


In [172]:
# Exclude every sepsis admission whose HADM_ID appears in the newborn sepsis list
df_withsepsis2 = df_withsepsis.loc[~df_withsepsis['HADM_ID'].isin(nb_hadmid)]

In [173]:
# Keep a single HADM_ID column with each admission id listed once
df_withsepsis2 = df_withsepsis2.drop_duplicates(subset=['HADM_ID'])[['HADM_ID']]

In [174]:
# Display the unique sepsis HADM_IDs after the newborn sepsis admissions were removed
df_withsepsis2

Unnamed: 0,HADM_ID
19,185910
23,145674
24,122609
37,134462
63,189535
...,...
58945,186754
58950,168288
58953,153703
58969,141860


In [178]:
# Inner join on HADM_ID: keeps the abnormal (HADM_ID, ITEMID) pairs that belong
# to one of the selected sepsis admissions
sepsis_df = df_withsepsis2.merge(df_labevent2, how='inner', on='HADM_ID')

In [179]:
# Make sure every (HADM_ID, ITEMID) pair occurs only once
sepsis_df = sepsis_df.loc[:, ['HADM_ID', 'ITEMID']].drop_duplicates()

In [180]:
# Display the unique (HADM_ID, ITEMID) pairs of the selected sepsis admissions
sepsis_df

Unnamed: 0,HADM_ID,ITEMID
0,185910,50893
1,185910,51221
2,185910,51222
3,185910,51265
4,185910,51274
...,...,...
209742,105447,50893
209743,105447,50910
209744,105447,50960
209745,105447,50970


In [181]:
# Number of rows (unique admission/item pairs) per ITEMID within sepsis_df
itemid_counts_sepsis = sepsis_df.loc[:, 'ITEMID'].value_counts(sort=True, ascending=False)

In [182]:
# Convert the counts Series into a two-column frame (ITEMID, Sepsis_abnormal).
# The result is stored under the same name as the input, so the conversion is
# guarded: running this cell a second time would otherwise call reset_index()
# on the already-converted frame and fail when assigning two column names to
# three columns.
if isinstance(itemid_counts_sepsis, pd.Series):
    itemid_counts_sepsis = itemid_counts_sepsis.reset_index()
    itemid_counts_sepsis.columns = ['ITEMID', 'Sepsis_abnormal']

In [183]:
# Display the per-ITEMID counts for the selected sepsis admissions
itemid_counts_sepsis

Unnamed: 0,ITEMID,Sepsis_abnormal
0,51222,5116
1,51279,5113
2,50931,5110
3,51221,5097
4,50893,4869
...,...,...
265,51224,1
266,51210,1
267,50894,1
268,51459,1


### Adding Counts to the potential labevents file

In [184]:
# Display the table loaded from potential_labevents_all.csv
df_pot_labevents

Unnamed: 0,ITEMID,LABEL,ref1,ref2,ref3,neo4j,All yes
0,50802,BASE EXCESS,yes,no,no,no,
1,50803,"CALCULATED BICARBONATE, WHOLE BLOOD",yes,no,yes,no,
2,50806,"CHLORIDE, WHOLE BLOOD",yes,no,no,no,
3,50808,FREE CALCIUM,yes,yes,no,no,
4,50809,GLUCOSE,no,yes,no,no,
...,...,...,...,...,...,...,...
74,50821,pO2,no,no,no,yes,
75,51248,MCH,no,no,no,yes,
76,51493,RBC,no,no,no,yes,
77,50804,Calculated Total CO2,no,no,no,yes,


In [185]:
# Attach Abnormal_Count to the potential lab events. The outer join also keeps
# ITEMIDs that have a count but are missing from the potential lab events table
# (their other columns are NaN), and vice versa.
df_final1 = df_pot_labevents.merge(
    itemid_counts_df[['ITEMID', 'Abnormal_Count']],
    how='outer',
    on='ITEMID',
)

In [186]:
# Display the potential lab events joined with the overall abnormal counts
df_final1

Unnamed: 0,ITEMID,LABEL,ref1,ref2,ref3,neo4j,All yes,Abnormal_Count
0,50802,BASE EXCESS,yes,no,no,no,,1.0
1,50803,"CALCULATED BICARBONATE, WHOLE BLOOD",yes,no,yes,no,,2342.0
2,50806,"CHLORIDE, WHOLE BLOOD",yes,no,no,no,,7305.0
3,50808,FREE CALCIUM,yes,yes,no,no,,22593.0
4,50809,GLUCOSE,no,yes,no,no,,24478.0
...,...,...,...,...,...,...,...,...
306,51532,,,,,,,1.0
307,51282,,,,,,,1.0
308,51459,,,,,,,1.0
309,50816,,,,,,,1.0


In [187]:
# Attach the sepsis counts; the left join keeps every row of df_final1 and
# leaves Sepsis_abnormal as NaN where an ITEMID has no sepsis count
df_final2 = df_final1.merge(itemid_counts_sepsis, how='left', on='ITEMID')

In [188]:
# Display the table with both Abnormal_Count and Sepsis_abnormal attached
df_final2

Unnamed: 0,ITEMID,LABEL,ref1,ref2,ref3,neo4j,All yes,Abnormal_Count,Sepsis_abnormal
0,50802,BASE EXCESS,yes,no,no,no,,1.0,
1,50803,"CALCULATED BICARBONATE, WHOLE BLOOD",yes,no,yes,no,,2342.0,392.0
2,50806,"CHLORIDE, WHOLE BLOOD",yes,no,no,no,,7305.0,940.0
3,50808,FREE CALCIUM,yes,yes,no,no,,22593.0,2725.0
4,50809,GLUCOSE,no,yes,no,no,,24478.0,2296.0
...,...,...,...,...,...,...,...,...,...
306,51532,,,,,,,1.0,
307,51282,,,,,,,1.0,
308,51459,,,,,,,1.0,1.0
309,50816,,,,,,,1.0,


In [99]:
# Write the table with overall and sepsis abnormal counts to a CSV file in the
# current working directory (the row index is not written)
df_final2.to_csv(path_or_buf='labevents_features.csv', index=False)

### Sorting ITEMIDs based on three proportions


In [5]:
# Load the features comparison draft from <ROOT_DIR>/data
path_features = ROOT_DIR.joinpath('data', "features_comparison_draft.csv")
df_features = pd.read_csv(path_features)

In [6]:
# Display the table loaded from features_comparison_draft.csv
df_features

Unnamed: 0,ITEMID,LABEL,ref1,ref2,ref3,neo4j,All yes,Abnormal_Count,Sepsis_abnormal_count,Sepsis_Proportion,Admission_proportion,Sepsis_admission_proportion
0,51222,HEMOGLOBIN,yes,no,yes,yes,,52821.0,5116.0,0.10,0.90,0.99
1,51279,Red Blood Cells,no,no,no,yes,,50487.0,5113.0,0.10,0.86,0.99
2,50931,GLUCOSE,no,yes,no,yes,,48493.0,5110.0,0.11,0.82,0.99
3,51221,HEMATOCRIT,yes,no,no,yes,,51779.0,5097.0,0.10,0.88,0.98
4,50893,"CALCIUM, TOTAL",yes,yes,no,yes,,36137.0,4869.0,0.13,0.61,0.94
...,...,...,...,...,...,...,...,...,...,...,...,...
306,51095,"PHOSPHATE, URINE",yes,no,no,no,,,,0.00,0.00,0.00
307,51106,URINE CREATININE,yes,yes,yes,no,,,,0.00,0.00,0.00
308,51133,ABSOLUTE LYMPHOCYTE COUNT,no,yes,no,no,,,,0.00,0.00,0.00
309,51253,MONOCYTE COUNT,no,yes,no,no,,,,0.00,0.00,0.00


Sorting by the sum of the three proportions (highest first)

In [7]:
# Rank the rows by the sum of the three proportion columns, highest first.
# The helper column only exists inside the chain and is removed again after
# sorting, so the resulting frame has the same columns as before.
df_features = (
    df_features
    .assign(
        composite_score=lambda frame: (
            frame['Sepsis_Proportion']
            + frame['Admission_proportion']
            + frame['Sepsis_admission_proportion']
        )
    )
    .sort_values(by='composite_score', ascending=False)
    .drop(columns=['composite_score'])
)

In [8]:
# Display the features table after sorting by the summed proportions
df_features

Unnamed: 0,ITEMID,LABEL,ref1,ref2,ref3,neo4j,All yes,Abnormal_Count,Sepsis_abnormal_count,Sepsis_Proportion,Admission_proportion,Sepsis_admission_proportion
0,51222,HEMOGLOBIN,yes,no,yes,yes,,52821.0,5116.0,0.10,0.90,0.99
3,51221,HEMATOCRIT,yes,no,no,yes,,51779.0,5097.0,0.10,0.88,0.98
1,51279,Red Blood Cells,no,no,no,yes,,50487.0,5113.0,0.10,0.86,0.99
2,50931,GLUCOSE,no,yes,no,yes,,48493.0,5110.0,0.11,0.82,0.99
7,51301,White Blood Cells,no,no,no,yes,,41550.0,4825.0,0.12,0.70,0.93
...,...,...,...,...,...,...,...,...,...,...,...,...
284,51129,,,,,,,1.0,,0.00,0.00,0.00
285,50934,,,,,,,1.0,,0.00,0.00,0.00
286,51532,,,,,,,1.0,,0.00,0.00,0.00
287,51282,,,,,,,1.0,,0.00,0.00,0.00


In [None]:
# Write the sorted features table to a CSV file in the current working
# directory (the row index is not written)
df_features.to_csv(path_or_buf='features_comparison_sorted.csv', index=False)

Adding the labels from D_LABITEMS (as LABEL2) for ITEMIDs with missing labels

In [9]:
# Look up the label of every ITEMID in df_labitem. Both frames have a LABEL
# column, so the merge yields LABEL_x (from df_features) and LABEL_y (from
# df_labitem).
labeled_df = pd.merge(
    df_features,
    df_labitem[['ITEMID', 'LABEL']],
    how='left',
    on='ITEMID',
)

In [10]:
# Restore the original LABEL name and call the label taken from df_labitem
# LABEL2. Reassigning the result (instead of inplace=True) avoids mutating the
# merged frame in place; rename() silently ignores keys that are not present,
# so re-running this cell is harmless.
labeled_df = labeled_df.rename(columns={'LABEL_y': 'LABEL2', 'LABEL_x': 'LABEL'})

In [11]:
# Display the features table with the additional LABEL2 column
labeled_df

Unnamed: 0,ITEMID,LABEL,ref1,ref2,ref3,neo4j,All yes,Abnormal_Count,Sepsis_abnormal_count,Sepsis_Proportion,Admission_proportion,Sepsis_admission_proportion,LABEL2
0,51222,HEMOGLOBIN,yes,no,yes,yes,,52821.0,5116.0,0.10,0.90,0.99,Hemoglobin
1,51221,HEMATOCRIT,yes,no,no,yes,,51779.0,5097.0,0.10,0.88,0.98,Hematocrit
2,51279,Red Blood Cells,no,no,no,yes,,50487.0,5113.0,0.10,0.86,0.99,Red Blood Cells
3,50931,GLUCOSE,no,yes,no,yes,,48493.0,5110.0,0.11,0.82,0.99,Glucose
4,51301,White Blood Cells,no,no,no,yes,,41550.0,4825.0,0.12,0.70,0.93,White Blood Cells
...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,51129,,,,,,,1.0,,0.00,0.00,0.00,Young
307,50934,,,,,,,1.0,,0.00,0.00,0.00,H
308,51532,,,,,,,1.0,,0.00,0.00,0.00,PLASMGN
309,51282,,,,,,,1.0,,0.00,0.00,0.00,"Reticulocyte Count, Absolute"


In [12]:
# Write the labeled features table to a CSV file in the current working
# directory (the row index is not written)
labeled_df.to_csv(path_or_buf='features_comparison_labeled.csv', index=False)