In [1]:
import pandas as pd
import numpy as np, warnings
from pathlib import Path
import os
import sweetviz as sv
from importlib import reload

# Expose the standard-library `warnings` module under the name `np.warnings`.
# NOTE(review): presumably a compatibility shim for a dependency that still accesses
# `np.warnings` (possibly sweetviz) — confirm it is still required.
np.warnings = warnings

# Import the project's utility functions; `reload` re-imports the module so that
# edits made to it are picked up without restarting the kernel.
import src.utils as util
reload(util)

# Same import-and-reload pattern for the parquet loading helpers.
import src.load_parquet as load_parquet
reload(load_parquet)

# Set up OS-agnostic path names with pathlib.
# Alternative root (presumably for running the notebook from a subdirectory):
# ROOT_DIR = Path('..')

# Note: to run the main notebook from the repository root directory, use:
ROOT_DIR = Path('')

In [2]:
# Locations of the CSV files used by this notebook; all of them sit in <ROOT_DIR>/data.
path_patients = ROOT_DIR.joinpath('data', "PATIENTS.csv")
path_admissions = ROOT_DIR.joinpath('data', "ADMISSIONS.csv")
path_labevents = ROOT_DIR.joinpath('data', "LABEVENTS.csv")
path_combined_df_hasHADM = ROOT_DIR.joinpath('data', "combined_df_hasHADM.csv")
path_desc_labitems = ROOT_DIR.joinpath('data', "D_LABITEMS.csv")

In [3]:
# Read the patients table first, then the lab events table, from their CSV files.
df_patients, df_labevents = (
    pd.read_csv(csv_path) for csv_path in (path_patients, path_labevents)
)


In [4]:
#combined_df_hasHADM = pd.read_csv(path_combined_df_hasHADM)

In [5]:
# Read the admissions table first, then the lab-item description table, from their CSV files.
df_admissions, df_desc_labitems = (
    pd.read_csv(csv_path) for csv_path in (path_admissions, path_desc_labitems)
)

In [6]:
# Parse the timestamp columns below into pandas datetimes, in this order:
#   patients:   DOB
#   admissions: ADMITTIME, DISCHTIME
#   labevents:  CHARTTIME
_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'
for _frame, _column in (
    (df_patients, 'DOB'),
    (df_admissions, 'ADMITTIME'),
    (df_admissions, 'DISCHTIME'),
    (df_labevents, 'CHARTTIME'),
):
    # The column is replaced on the frame itself, exactly as a direct assignment would do.
    _frame[_column] = pd.to_datetime(_frame[_column], format=_TIMESTAMP_FORMAT)
# microbiologyevents (currently disabled): CHARTDATE and CHARTTIME
#df_microbiologyevents['CHARTDATE'] = pd.to_datetime(df_microbiologyevents['CHARTDATE'], format='%Y-%m-%d %H:%M:%S')
#df_microbiologyevents['CHARTTIME'] = pd.to_datetime(df_microbiologyevents['CHARTTIME'], format='%Y-%m-%d %H:%M:%S')

In [7]:
# Load the demographic table (admissions joined with patient attributes, plus AGE and
# IS_SEPSIS) from a cached CSV when one exists; otherwise build it and cache it.
#
# The cache is looked up in two places: ROOT_DIR / 'demographic.csv' (the location this
# cell has always read from) and ROOT_DIR / 'data' / 'demographic.csv' (the location this
# cell writes to). Previously only the first location was read, so a cache written by
# this cell was never found on the next run, and a bare `except:` turned every kind of
# failure (not just a missing file) into a silent rebuild.
path_demographic_out = ROOT_DIR / 'data' / 'demographic.csv'
pathname_demographic = next(
    (
        candidate
        for candidate in (ROOT_DIR / 'demographic.csv', path_demographic_out)
        if candidate.exists()
    ),
    path_demographic_out,
)

if pathname_demographic.exists():
    df_demographic = pd.read_csv(pathname_demographic)
else:
    # merge the patients and admission tables to a demographic dataframe
    df_demographic = pd.merge(df_admissions, df_patients[['SUBJECT_ID', 'GENDER', 'DOB', 'EXPIRE_FLAG']], on='SUBJECT_ID')
    # create an age column for each case: whole days between DOB and admission,
    # floor-divided by 365 and expressed as a number of days-per-day (i.e. years).
    # NOTE(review): the subtraction is done on Python `date` objects rather than on
    # datetime64 values — presumably to avoid overflow for far-apart dates; confirm.
    df_demographic['AGE'] = (((df_demographic['ADMITTIME'].dt.date - df_demographic['DOB'].dt.date) // 365) / pd.Timedelta(days=1)).astype('int16')
    # add column IS_SEPSIS to demographic data indicating which case is diagnosed with sepsis
    # NOTE(review): `check_sepsis` and `df_diagnoses_icd` are not defined in the visible
    # part of this notebook; this branch raises NameError unless they are defined elsewhere.
    df_demographic['IS_SEPSIS'] = df_demographic.apply(lambda x: check_sepsis(x['SUBJECT_ID'], x['HADM_ID'], df_diagnoses_icd), axis=1)
    util.save_csv(df_demographic, path_demographic_out)

# convert admittime and dischtime to datetime (they come back as strings when read from CSV)
df_demographic['ADMITTIME'] = pd.to_datetime(df_demographic['ADMITTIME'], format='%Y-%m-%d %H:%M:%S')
df_demographic['DISCHTIME'] = pd.to_datetime(df_demographic['DISCHTIME'], format='%Y-%m-%d %H:%M:%S')

In [8]:
# labevents: CHARTTIME
# NOTE: the same conversion is already applied to df_labevents in the datetime-conversion
# cell (In [6]) above; this cell repeats it on the same column with the same format.
df_labevents['CHARTTIME'] = pd.to_datetime(df_labevents['CHARTTIME'], format='%Y-%m-%d %H:%M:%S')

In [9]:
df_admissions

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,58594,98800,191113,2131-03-30 21:13:00,2131-04-02 15:02:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Private,ENGL,NOT SPECIFIED,SINGLE,WHITE,2131-03-30 19:44:00,2131-03-30 22:41:00,TRAUMA,0,1
58972,58595,98802,101071,2151-03-05 20:00:00,2151-03-06 09:10:00,2151-03-06 09:10:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,DEAD/EXPIRED,Medicare,ENGL,CATHOLIC,WIDOWED,WHITE,2151-03-05 17:23:00,2151-03-05 21:06:00,SAH,1,1
58973,58596,98805,122631,2200-09-12 07:15:00,2200-09-20 12:08:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,RENAL CANCER/SDA,0,1
58974,58597,98813,170407,2128-11-11 02:29:00,2128-12-22 13:11:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,ENGL,CATHOLIC,MARRIED,WHITE,2128-11-10 23:48:00,2128-11-11 03:16:00,S/P FALL,0,0


In [10]:
# Fill in missing HADM_IDs on lab events.
#
# A lab event without a HADM_ID is paired with every admission of the same patient; if
# its CHARTTIME lies inside an admission's [ADMITTIME, DISCHTIME] interval (inclusive),
# it receives that admission's HADM_ID. Results:
#   filtered_data_hasHADM - events whose HADM_ID could be recovered
#   filtered_data_noHADM  - events that fall inside no admission of their patient
# Because the pairing is an inner join, events of patients that have no row in
# df_admissions appear in neither result.

# Split the lab events by whether HADM_ID is present. `.copy()` makes the missing-HADM
# part an independent frame, so the assignments below do not raise SettingWithCopyWarning.
df_empty_hadm_labevents = df_labevents[df_labevents['HADM_ID'].isnull()].copy()
df_filled_hadm_labevents = df_labevents[pd.to_numeric(df_labevents["HADM_ID"], errors='coerce').notnull()]

# Make sure the time columns are datetimes before sorting and comparing on them.
df_admissions['ADMITTIME'] = pd.to_datetime(df_admissions['ADMITTIME'])
df_admissions['DISCHTIME'] = pd.to_datetime(df_admissions['DISCHTIME'])
df_empty_hadm_labevents['CHARTTIME'] = pd.to_datetime(df_empty_hadm_labevents['CHARTTIME'])

# Order both tables by patient and time. As before, df_admissions itself ends up sorted.
df_admissions = df_admissions.sort_values(by=["SUBJECT_ID", "ADMITTIME"])
df_empty_hadm_labevents = df_empty_hadm_labevents.sort_values(by=["SUBJECT_ID", "CHARTTIME"])

# Pair each HADM_ID-less lab event with every admission of the same patient. Columns
# present in both tables get suffixes: *_x comes from labevents, *_y from admissions.
merged_data = df_empty_hadm_labevents.merge(df_admissions, on='SUBJECT_ID', how='inner')
mask = (merged_data['CHARTTIME'] >= merged_data['ADMITTIME']) & (merged_data['CHARTTIME'] <= merged_data['DISCHTIME'])

# Main task: where CHARTTIME falls between admission and discharge time, take the
# HADM_ID from the admissions table.
merged_data.loc[mask, 'HADM_ID_x'] = merged_data.loc[mask, 'HADM_ID_y']

# Rename the column back to 'HADM_ID'
merged_data = merged_data.rename(columns={'HADM_ID_x': 'HADM_ID'})

# Restore lab-event order. The sort is stable so that, for a given lab event, its
# candidate admissions stay in ADMITTIME order and `drop_duplicates` below always keeps
# the same one (the default sort algorithm does not guarantee this).
merged_data = merged_data.sort_values(by=["ROW_ID_x"], kind="stable")
# Drop the admission-side helper columns that are no longer needed.
merged_data = merged_data.drop(columns=['HADM_ID_y', 'ROW_ID_y'])

# One row per lab event in each result (the merge produced one row per event/admission pair).
has_hadm = merged_data['HADM_ID'].notna()
filtered_data_hasHADM = (
    merged_data[has_hadm]
    .drop_duplicates(subset=['ROW_ID_x'])
    .rename(columns={'ROW_ID_x': 'ROW_ID'})
)
filtered_data_noHADM = (
    merged_data[~has_hadm]
    .drop_duplicates(subset=['ROW_ID_x'])
    .rename(columns={'ROW_ID_x': 'ROW_ID'})
)

# An event that matched one admission still has unmatched rows for the patient's other
# admissions; remove those events from the "no HADM_ID" result.
row_ids_in_hasHADM = set(filtered_data_hasHADM['ROW_ID'])
filtered_data_noHADM = filtered_data_noHADM[~filtered_data_noHADM['ROW_ID'].isin(row_ids_in_hasHADM)]

columns_to_remove = ['ADMITTIME','DISCHTIME','DEATHTIME', 'ADMISSION_TYPE',  'ADMISSION_LOCATION', 'DISCHARGE_LOCATION', 'INSURANCE',    'LANGUAGE', 'RELIGION', 'MARITAL_STATUS',   'ETHNICITY', 'EDREGTIME'    ,'EDOUTTIME'    ,'DIAGNOSIS'    ,'HOSPITAL_EXPIRE_FLAG' ,'HAS_CHARTEVENTS_DATA']

# Drop the admission columns so both results have the lab-event layout again.
filtered_data_noHADM = filtered_data_noHADM.drop(columns=columns_to_remove)
filtered_data_hasHADM = filtered_data_hasHADM.drop(columns=columns_to_remove)

# Export both results to CSV in the current working directory
# (comment these two lines out to skip the export).
filtered_data_hasHADM.to_csv('filtered_data_hasHADM.csv', index=False)
filtered_data_noHADM.to_csv('filtered_data_noHADM.csv', index=False)

In [11]:
# Combine all lab events that carry a HADM_ID: those that had one originally
# (df_filled_hadm_labevents) and those recovered by the admission-window matching
# (filtered_data_hasHADM).
# Original note: together with filtered_data_noHADM these parts are meant to add up to the
# whole lab-events table, i.e.
# len(df_filled_hadm_labevents) + len(filtered_data_hasHADM) + len(filtered_data_noHADM).

combined_df_hasHADM = pd.concat([df_filled_hadm_labevents, filtered_data_hasHADM], ignore_index=True)
# Every remaining HADM_ID is non-null, so the column can be stored as an integer.
combined_df_hasHADM['HADM_ID'] = combined_df_hasHADM['HADM_ID'].astype(int)
combined_df_hasHADM

In [12]:
# Ensure CHARTTIME on the combined frame is a datetime column
# (the dtypes listing in the next cell shows datetime64[ns]).
combined_df_hasHADM['CHARTTIME'] = pd.to_datetime(combined_df_hasHADM['CHARTTIME'], format='%Y-%m-%d %H:%M:%S')

In [13]:
combined_df_hasHADM.dtypes

Unnamed: 0             int64
ROW_ID                 int64
SUBJECT_ID             int64
HADM_ID                int64
ITEMID                 int64
CHARTTIME     datetime64[ns]
VALUE                 object
VALUENUM             float64
VALUEUOM              object
FLAG                  object
dtype: object

In [18]:
# Length of the observation window.
# NOTE(review): presumably intended as the `hours` argument of
# create_labevents_with_time — confirm.
t = 4   # hours

In [17]:
# ITEMIDs of the 30 lab tests kept as features: lab events are filtered down to these
# IDs, and each ID becomes a column of the final table.
# NOTE(review): labels for these IDs can presumably be looked up in df_desc_labitems
# (D_LABITEMS.csv) — confirm.
feature_list = [
    51279, 51222, 51221, 50931, 51006,
    51244, 51256, 51301, 51274, 50912,
    51237, 50893, 50882, 50813, 51277,
    50821, 50902, 50970, 51265, 50820,
    50862, 51003, 50818, 51275, 51248,
    50878, 50863, 51493, 50804, 50983,
]

In [18]:
df_admissions.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


In [19]:
# Keep only the lab events whose ITEMID is one of the selected feature IDs.
df_filtered_labevent = combined_df_hasHADM.loc[
    lambda events: events["ITEMID"].isin(feature_list)
]


In [20]:
def get_admittime(x):
    """Return the effective admission time for one group of lab events.

    `x` is a DataFrame with CHARTTIME and ADMITTIME columns. The row with the
    earliest CHARTTIME is located; if that CHARTTIME precedes the row's ADMITTIME
    it is returned, otherwise the row's ADMITTIME is returned.
    """
    earliest_event = x.sort_values(['CHARTTIME']).iloc[0]
    first_charttime = earliest_event['CHARTTIME']
    recorded_admittime = earliest_event['ADMITTIME']
    # Strict comparison: on a tie (or an incomparable value) the recorded ADMITTIME wins.
    return first_charttime if first_charttime < recorded_admittime else recorded_admittime

 

def create_labevents_with_time(hours=48, labevents=None, demographic=None):
    """Return the lab events recorded within `hours` of each admission's start.

    Each lab event is joined to its admission's demographic data. The start of an
    admission (NEW_ADMITTIME) is its ADMITTIME, or the admission's earliest CHARTTIME
    when a lab event was charted before ADMITTIME. This is the rule implemented by
    `get_admittime`, computed here for all admissions at once instead of calling a
    Python function per admission. TIME is the number of hours from NEW_ADMITTIME to
    CHARTTIME, rounded up to a whole hour.

    Parameters
    ----------
    hours : int or float, default 48
        Only rows with TIME <= hours are returned.
    labevents : pandas.DataFrame, optional
        Lab events with at least SUBJECT_ID, HADM_ID and a datetime CHARTTIME column.
        Defaults to the notebook-level `df_filtered_labevent`.
    demographic : pandas.DataFrame, optional
        One row per admission with SUBJECT_ID, HADM_ID, ADMITTIME (datetime), AGE,
        GENDER and IS_SEPSIS. Defaults to the notebook-level `df_demographic`.

    Returns
    -------
    pandas.DataFrame
        The joined lab events with the extra columns NEW_ADMITTIME and TIME, limited
        to rows inside the time window.
    """
    if labevents is None:
        labevents = df_filtered_labevent
    if demographic is None:
        demographic = df_demographic

    admission_keys = ['SUBJECT_ID', 'HADM_ID']
    potential_cases = labevents.merge(
        demographic[admission_keys + ['ADMITTIME', 'AGE', 'GENDER', 'IS_SEPSIS']],
        on=admission_keys,
    )

    # Earliest CHARTTIME of each admission, broadcast back onto every row of that admission.
    first_charttime = potential_cases.groupby(admission_keys)['CHARTTIME'].transform('min')
    # Use the earliest CHARTTIME only where it is strictly before ADMITTIME.
    potential_cases['NEW_ADMITTIME'] = first_charttime.where(
        first_charttime < potential_cases['ADMITTIME'], potential_cases['ADMITTIME']
    )

    elapsed_seconds = (potential_cases['CHARTTIME'] - potential_cases['NEW_ADMITTIME']).dt.total_seconds()
    potential_cases['TIME'] = np.ceil(elapsed_seconds / 3600)

    return potential_cases[potential_cases['TIME'] <= hours]

 
# Lab events from the first `t` hours of each admission. `t` is the window length (in
# hours) defined in an earlier cell; using it here keeps that setting and this call in
# sync instead of repeating the literal 4.
df_in_t = create_labevents_with_time(t)

In [21]:
df_in_t


Unnamed: 0.1,Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG,ADMITTIME,AGE,GENDER,IS_SEPSIS,NEW_ADMITTIME,TIME
0,1,442,3,145834,50882,2101-10-20 16:40:00,25,25.00,mEq/L,,2101-10-20 19:08:00,76,M,0,2101-10-20 16:40:00,0.0
1,2,443,3,145834,50893,2101-10-20 16:40:00,8.2,8.20,mg/dL,abnormal,2101-10-20 19:08:00,76,M,0,2101-10-20 16:40:00,0.0
2,3,444,3,145834,50902,2101-10-20 16:40:00,99,99.00,mEq/L,abnormal,2101-10-20 19:08:00,76,M,0,2101-10-20 16:40:00,0.0
3,6,447,3,145834,50912,2101-10-20 16:40:00,3.2,3.20,mg/dL,abnormal,2101-10-20 19:08:00,76,M,0,2101-10-20 16:40:00,0.0
4,7,448,3,145834,50931,2101-10-20 16:40:00,91,91.00,mg/dL,,2101-10-20 19:08:00,76,M,0,2101-10-20 16:40:00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12936991,22263726,27591926,97747,134290,50813,2176-08-06 13:13:00,1.2,1.20,mmol/L,,2176-08-06 13:00:00,56,M,0,2176-08-06 13:00:00,1.0
12936992,22263727,27591927,97747,134290,50818,2176-08-06 13:13:00,45,45.00,mm Hg,,2176-08-06 13:00:00,56,M,0,2176-08-06 13:00:00,1.0
12936993,22263728,27591928,97747,134290,50820,2176-08-06 13:13:00,7.41,7.41,units,,2176-08-06 13:00:00,56,M,0,2176-08-06 13:00:00,1.0
12936994,22263729,27591929,97747,134290,50821,2176-08-06 13:13:00,160,160.00,mm Hg,abnormal,2176-08-06 13:00:00,56,M,0,2176-08-06 13:00:00,1.0


In [102]:
# a=a.sort_values(by=['SUBJECT_ID', 'HADM_ID', 'ITEMID', 'TIME'], ascending=[True, True, True, False])

In [22]:

# The feature ITEMIDs as strings, in the same order as feature_list.
str_feature_list = list(map(str, feature_list))

In [23]:
# Build one row per admission containing SUBJECT_ID, HADM_ID, AGE, GENDER and, for every
# lab feature measured inside the time window, the VALUE of its most recent measurement
# (largest TIME; the first such row on ties). Features without a measurement are left
# out of the row and therefore show up as NaN in df_final.

# Only the first 5988 rows of the demographic table are processed.
n_admissions = 5988
admissions_subset = df_demographic[:n_admissions]
admission_keys = ["SUBJECT_ID", "HADM_ID"]

# Split df_in_t by admission once, keeping only the admissions needed below, instead of
# scanning the whole frame again for every admission.
wanted_admissions = set(zip(admissions_subset["SUBJECT_ID"], admissions_subset["HADM_ID"]))
labs_by_admission = {
    key: group.reset_index(drop=True)
    for key, group in df_in_t.groupby(admission_keys, sort=False)
    if key in wanted_admissions
}

# Collect plain dicts and build the DataFrame once at the end; growing a DataFrame with
# pd.concat inside the loop copies all previous rows on every iteration.
records = []
for _, admission in admissions_subset.iterrows():
    row_data = {
        "SUBJECT_ID": admission["SUBJECT_ID"],
        "HADM_ID": admission["HADM_ID"],
        "AGE": admission["AGE"],
        "GENDER": admission["GENDER"],
    }

    df_required = labs_by_admission.get((admission["SUBJECT_ID"], admission["HADM_ID"]))
    if df_required is not None:
        for f in str_feature_list:
            times = df_required.loc[df_required["ITEMID"] == int(f), "TIME"]
            # Skip features with no usable measurement explicitly rather than relying
            # on a bare `except:` to swallow the resulting error.
            if times.notna().any():
                row_data[f] = df_required.loc[times.idxmax(), "VALUE"]

    records.append(row_data)

df_final = pd.DataFrame(records)
   

In [24]:
df_final

Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,GENDER,51279,51222,51221,50931,51006,51244,...,51275,51248,50878,50863,51493,50804,50983,51003,50813,50862
0,22,165315,64,F,4.00,12.2,35.9,102,17,32.2,...,30.1,30.4,26,75,0,22,140,,,
1,23,152223,71,M,,,,,,,...,,,,,,24,,,,
2,23,124321,75,M,,,,,,,...,,,,,,,,,,
3,24,161859,39,M,,,,,,,...,,,,,,27,,,,
4,25,129635,58,M,4.17,13.1,37.9,364,51,9.5,...,34.6,31.3,,,,,134,0.11,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5983,5786,167296,57,M,2.56,9.7,28.9,56,18,8.9,...,36.0,37.9,75,56,,24,133,,3.3,1.7
5984,5787,104038,0,F,4.77,15.7,48.4,,,65,...,,33.0,,,,,,,,
5985,5788,139503,79,M,3.10,9.1,25.8,147,82,,...,24.2,29.5,,,,,146,,,3.6
5986,5789,159128,55,M,5.06,16.0,44.1,109,18,17.1,...,65.9,31.6,,,,,142,,,


In [26]:
# Replace every NaN in df_final (e.g. features with no measurement in the time window)
# with the sentinel value -999.
# NOTE(review): the lab VALUE column has dtype object, so the filled feature columns
# presumably mix strings with the integer -999 — confirm that downstream code converts
# them to numeric.
df_final = df_final.fillna(-999)

In [27]:
df_final

Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,GENDER,51279,51222,51221,50931,51006,51244,...,51275,51248,50878,50863,51493,50804,50983,51003,50813,50862
0,22,165315,64,F,4.00,12.2,35.9,102,17,32.2,...,30.1,30.4,26,75,0,22,140,-999,-999,-999
1,23,152223,71,M,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,24,-999,-999,-999,-999
2,23,124321,75,M,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,24,161859,39,M,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,27,-999,-999,-999,-999
4,25,129635,58,M,4.17,13.1,37.9,364,51,9.5,...,34.6,31.3,-999,-999,-999,-999,134,0.11,-999,-999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5983,5786,167296,57,M,2.56,9.7,28.9,56,18,8.9,...,36.0,37.9,75,56,-999,24,133,-999,3.3,1.7
5984,5787,104038,0,F,4.77,15.7,48.4,-999,-999,65,...,-999,33.0,-999,-999,-999,-999,-999,-999,-999,-999
5985,5788,139503,79,M,3.10,9.1,25.8,147,82,-999,...,24.2,29.5,-999,-999,-999,-999,146,-999,-999,3.6
5986,5789,159128,55,M,5.06,16.0,44.1,109,18,17.1,...,65.9,31.6,-999,-999,-999,-999,142,-999,-999,-999
