### Imports


In [54]:
import os
os.chdir('../')

from src.data import constants
import pandas as pd
from pandas import read_csv
import datetime as dt
import numpy as np

from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()

### Read data from local source

MIMIC-IV Dataset

Core

In [2]:
df_admissions = read_csv(constants.admissions)
df_patients = read_csv(constants.patients)
df_transfers = read_csv(constants.transfers)


ICU

In [3]:
df_icustays = read_csv(constants.icustays)


MIMIC-CXR

In [4]:
df_mimic_cxr_chexpert = read_csv(constants.mimic_cxr_chexpert)
df_mimic_cxr_metadata = read_csv(constants.mimic_cxr_metadata)


#### CORE dataset fusion

In [5]:
df_core_fusion = df_admissions.merge(df_patients, on=("subject_id")).merge(df_transfers, on=('subject_id', 'hadm_id'))


In [6]:
df_core_fusion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1840955 entries, 0 to 1840954
Data columns (total 25 columns):
 #   Column                Dtype 
---  ------                ----- 
 0   subject_id            int64 
 1   hadm_id               int64 
 2   admittime             object
 3   dischtime             object
 4   deathtime             object
 5   admission_type        object
 6   admission_location    object
 7   discharge_location    object
 8   insurance             object
 9   language              object
 10  marital_status        object
 11  ethnicity             object
 12  edregtime             object
 13  edouttime             object
 14  hospital_expire_flag  int64 
 15  gender                object
 16  anchor_age            int64 
 17  anchor_year           int64 
 18  anchor_year_group     object
 19  dod                   object
 20  transfer_id           int64 
 21  eventtype             object
 22  careunit              object
 23  intime                object
 24

In [7]:
df_core_icu_fusion = df_core_fusion.merge(df_icustays, on=('subject_id', 'hadm_id'))

In [8]:
df_core_icu_fusion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 420372 entries, 0 to 420371
Data columns (total 31 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   subject_id            420372 non-null  int64  
 1   hadm_id               420372 non-null  int64  
 2   admittime             420372 non-null  object 
 3   dischtime             420372 non-null  object 
 4   deathtime             47063 non-null   object 
 5   admission_type        420372 non-null  object 
 6   admission_location    420372 non-null  object 
 7   discharge_location    417901 non-null  object 
 8   insurance             420372 non-null  object 
 9   language              420372 non-null  object 
 10  marital_status        393342 non-null  object 
 11  ethnicity             420372 non-null  object 
 12  edregtime             288815 non-null  object 
 13  edouttime             288815 non-null  object 
 14  hospital_expire_flag  420372 non-null  int64  
 15  

#### CXR dataset fusion

In [9]:
df_cxr_merged = df_mimic_cxr_chexpert.merge(df_mimic_cxr_metadata, on= ('subject_id', 'study_id'))

In [10]:
df_cxr_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 377095 entries, 0 to 377094
Data columns (total 26 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   subject_id                                  377095 non-null  int64  
 1   study_id                                    377095 non-null  int64  
 2   Atelectasis                                 82830 non-null   float64
 3   Cardiomegaly                                97781 non-null   float64
 4   Consolidation                               35377 non-null   float64
 5   Edema                                       96892 non-null   float64
 6   Enlarged Cardiomediastinum                  31333 non-null   float64
 7   Fracture                                    10413 non-null   float64
 8   Lung Lesion                                 14547 non-null   float64
 9   Lung Opacity                                87445 non-null   float64
 

#### Core_ICU_CXR Fusion

In [11]:
df_core_icu_cxr_fusion = df_core_icu_fusion.merge(df_cxr_merged, on=("subject_id"))

In [12]:
df_core_icu_cxr_fusion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3227874 entries, 0 to 3227873
Data columns (total 56 columns):
 #   Column                                      Dtype  
---  ------                                      -----  
 0   subject_id                                  int64  
 1   hadm_id                                     int64  
 2   admittime                                   object 
 3   dischtime                                   object 
 4   deathtime                                   object 
 5   admission_type                              object 
 6   admission_location                          object 
 7   discharge_location                          object 
 8   insurance                                   object 
 9   language                                    object 
 10  marital_status                              object 
 11  ethnicity                                   object 
 12  edregtime                                   object 
 13  edouttime                  

Demographics categorical features creation

In [13]:
demo_embeddings = ['anchor_age', 'gender', 'ethnicity', 'marital_status', 'language', 'insurance']

for i in range (len(demo_embeddings)):
      
    df_core_icu_cxr_fusion['de_'+str(i)] = df_core_icu_cxr_fusion[demo_embeddings[i]]

Demographics categorical features encoding

In [23]:
for i in range(5):
          df_core_icu_cxr_fusion['de_'+str(i+1)] = le.fit_transform(df_core_icu_cxr_fusion['de_'+str(i+1)])

In [39]:
df_core_icu_cxr_fusion["de_5"].unique()

array([2, 1, 0])

In [40]:
df_core_icu_cxr_fusion = df_core_icu_cxr_fusion.drop(['anchor_age', 'gender', 'ethnicity', 'marital_status', 'language', 'insurance'], axis=1)


In [41]:
df_core_icu_cxr_fusion['death_status'] = np.where(df_core_icu_cxr_fusion['discharge_location'] == 'DIED' ,1,0)


In [42]:
df_core_icu_cxr_fusion = df_core_icu_cxr_fusion.drop(['discharge_location'], axis =1)


In [43]:
df_core_icu_cxr_fusion.columns

Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admission_location', 'edregtime', 'edouttime',
       'hospital_expire_flag', 'anchor_year', 'anchor_year_group', 'dod',
       'transfer_id', 'eventtype', 'careunit', 'intime_x', 'outtime_x',
       'stay_id', 'first_careunit', 'last_careunit', 'intime_y', 'outtime_y',
       'los', 'study_id', 'Atelectasis', 'Cardiomegaly', 'Consolidation',
       'Edema', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion',
       'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other',
       'Pneumonia', 'Pneumothorax', 'Support Devices', 'dicom_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'de_0', 'de_1', 'de_2',
       'de_3', 'de_4', 'de_5', 'death_status'],
      dtype='object')

In [44]:
df_core_icu_cxr_fusion = df_core_icu_cxr_fusion.loc[: , ['subject_id', 'hadm_id','stay_id',  'admittime','de_0', 'de_1', 'de_2',
       'de_3', 'de_4', 'de_5', 'death_status', 'study_id', 'Atelectasis', 'Cardiomegaly', 'Consolidation',
       'Edema', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion',
       'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other',
       'Pneumonia', 'Pneumothorax', 'Support Devices', 'dicom_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', ]]

In [45]:
df_core_icu_cxr_fusion.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'admittime', 'de_0', 'de_1', 'de_2',
       'de_3', 'de_4', 'de_5', 'death_status', 'study_id', 'Atelectasis',
       'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum',
       'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding',
       'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax',
       'Support Devices', 'dicom_id', 'PerformedProcedureStepDescription',
       'ViewPosition', 'Rows', 'Columns', 'StudyDate', 'StudyTime',
       'ProcedureCodeSequence_CodeMeaning', 'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning'],
      dtype='object')

In [46]:
len(df_core_icu_cxr_fusion['subject_id'].unique())

20245

In [47]:
df_core_icu_cxr_fusion

Unnamed: 0,subject_id,hadm_id,stay_id,admittime,de_0,de_1,de_2,de_3,de_4,de_5,...,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,16296962,29416427,31295236,2163-01-12 17:53:00,29,1,5,4,0,2,...,b3a2415a-df19a752-86089f92-63020694-4aa485e9,CHEST (PORTABLE AP),AP,3056,2544,21630113,34411.031,CHEST (PORTABLE AP),antero-posterior,Erect
1,16296962,29416427,31295236,2163-01-12 17:53:00,29,1,5,4,0,2,...,b3a2415a-df19a752-86089f92-63020694-4aa485e9,CHEST (PORTABLE AP),AP,3056,2544,21630113,34411.031,CHEST (PORTABLE AP),antero-posterior,Erect
2,16296962,29416427,31295236,2163-01-12 17:53:00,29,1,5,4,0,2,...,b3a2415a-df19a752-86089f92-63020694-4aa485e9,CHEST (PORTABLE AP),AP,3056,2544,21630113,34411.031,CHEST (PORTABLE AP),antero-posterior,Erect
3,16296962,29416427,31295236,2163-01-12 17:53:00,29,1,5,4,0,2,...,b3a2415a-df19a752-86089f92-63020694-4aa485e9,CHEST (PORTABLE AP),AP,3056,2544,21630113,34411.031,CHEST (PORTABLE AP),antero-posterior,Erect
4,16296962,29035203,31832800,2163-06-13 07:15:00,29,1,5,4,1,2,...,b3a2415a-df19a752-86089f92-63020694-4aa485e9,CHEST (PORTABLE AP),AP,3056,2544,21630113,34411.031,CHEST (PORTABLE AP),antero-posterior,Erect
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3227869,18227829,20330892,39672431,2177-11-18 11:46:00,91,1,7,1,1,2,...,eadebc12-079e0e8d-17b034c1-9333cb57-a7d8bd62,CHEST (PORTABLE AP),AP,3050,2539,21771122,73509.937,CHEST (PORTABLE AP),antero-posterior,Erect
3227870,18227829,20330892,39672431,2177-11-18 11:46:00,91,1,7,1,1,2,...,20376daa-31970cf3-938a9fd6-82cb5f44-80c1c211,CHEST (PORTABLE AP),AP,2544,3056,21771125,153228.078,CHEST (PORTABLE AP),antero-posterior,Erect
3227871,18227829,20330892,39672431,2177-11-18 11:46:00,91,1,7,1,1,2,...,226d77aa-c6da08cd-f7ede193-5e1b4e81-7226691e,CHEST (PORTABLE AP),AP,2544,3056,21771121,14131.515,CHEST (PORTABLE AP),antero-posterior,Erect
3227872,18227829,20330892,39672431,2177-11-18 11:46:00,91,1,7,1,1,2,...,f9a82111-ef932e19-948ff9f2-a147a2e1-c8674d21,CHEST (PORTABLE AP),AP,2544,3056,21771119,50543.375,CHEST (PORTABLE AP),antero-posterior,Erect


Selecting 1 patient

In [48]:
df_core_icu_cxr_fusion14811141 = df_core_icu_cxr_fusion.loc[df_core_icu_cxr_fusion['subject_id'] == 14811141 ]


In [49]:
df_core_icu_cxr_fusion14811141

Unnamed: 0,subject_id,hadm_id,stay_id,admittime,de_0,de_1,de_2,de_3,de_4,de_5,...,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
2790418,14811141,26884919,38772375,2189-08-17 00:46:00,82,0,7,3,1,1,...,26b72926-4afaa4c9-b24ece3a-ee2a554a-4add0f4a,CHEST (PORTABLE AP),AP,3043,2539,21890817,125809.906,CHEST (PORTABLE AP),antero-posterior,Erect
2790419,14811141,26884919,38772375,2189-08-17 00:46:00,82,0,7,3,1,1,...,26b72926-4afaa4c9-b24ece3a-ee2a554a-4add0f4a,CHEST (PORTABLE AP),AP,3043,2539,21890817,125809.906,CHEST (PORTABLE AP),antero-posterior,Erect
2790420,14811141,26884919,38772375,2189-08-17 00:46:00,82,0,7,3,1,1,...,26b72926-4afaa4c9-b24ece3a-ee2a554a-4add0f4a,CHEST (PORTABLE AP),AP,3043,2539,21890817,125809.906,CHEST (PORTABLE AP),antero-posterior,Erect
2790421,14811141,26884919,38772375,2189-08-17 00:46:00,82,0,7,3,1,1,...,26b72926-4afaa4c9-b24ece3a-ee2a554a-4add0f4a,CHEST (PORTABLE AP),AP,3043,2539,21890817,125809.906,CHEST (PORTABLE AP),antero-posterior,Erect


In [55]:
#Sample of 10 patients

#sample = read_csv(constants.icu_cxr_patients_sample10)

#df_core_icu_cxr_fusion_sample = df_core_icu_cxr_fusion.loc[df_core_icu_cxr_fusion['subject_id'].isin(sample)]

'''
0    14811141
1    18874374
2    11272213
3    13762583
4    18087960
5    13500443
6    12189736
7    14024750
8    19136566
9    18481208'''

'\n0    14811141\n1    18874374\n2    11272213\n3    13762583\n4    18087960\n5    13500443\n6    12189736\n7    14024750\n8    19136566\n9    18481208'

In [53]:
df_core_icu_cxr_fusion_sample

Unnamed: 0,subject_id,hadm_id,stay_id,admittime,de_0,de_1,de_2,de_3,de_4,de_5,...,dicom_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
1214362,18874374,26560714,31332432,2143-10-14 04:32:00,80,0,1,1,0,1,...,4d9de708-0d2bc7fe-d09123f4-cddd314f-81bceaad,CHEST (PORTABLE AP),AP,3056,2544,21431014,24132.578,CHEST (PORTABLE AP),antero-posterior,Erect
1214363,18874374,26560714,31332432,2143-10-14 04:32:00,80,0,1,1,0,1,...,4d9de708-0d2bc7fe-d09123f4-cddd314f-81bceaad,CHEST (PORTABLE AP),AP,3056,2544,21431014,24132.578,CHEST (PORTABLE AP),antero-posterior,Erect
1214364,18874374,26560714,31332432,2143-10-14 04:32:00,80,0,1,1,0,1,...,4d9de708-0d2bc7fe-d09123f4-cddd314f-81bceaad,CHEST (PORTABLE AP),AP,3056,2544,21431014,24132.578,CHEST (PORTABLE AP),antero-posterior,Erect
1214365,18874374,26560714,31332432,2143-10-14 04:32:00,80,0,1,1,0,1,...,4d9de708-0d2bc7fe-d09123f4-cddd314f-81bceaad,CHEST (PORTABLE AP),AP,3056,2544,21431014,24132.578,CHEST (PORTABLE AP),antero-posterior,Erect
1214366,18874374,26560714,31332432,2143-10-14 04:32:00,80,0,1,1,0,1,...,4d9de708-0d2bc7fe-d09123f4-cddd314f-81bceaad,CHEST (PORTABLE AP),AP,3056,2544,21431014,24132.578,CHEST (PORTABLE AP),antero-posterior,Erect
1214367,18874374,25548314,36493337,2143-10-05 13:19:00,80,0,1,1,0,1,...,4d9de708-0d2bc7fe-d09123f4-cddd314f-81bceaad,CHEST (PORTABLE AP),AP,3056,2544,21431014,24132.578,CHEST (PORTABLE AP),antero-posterior,Erect
1214368,18874374,25548314,36493337,2143-10-05 13:19:00,80,0,1,1,0,1,...,4d9de708-0d2bc7fe-d09123f4-cddd314f-81bceaad,CHEST (PORTABLE AP),AP,3056,2544,21431014,24132.578,CHEST (PORTABLE AP),antero-posterior,Erect
1214369,18874374,25548314,36493337,2143-10-05 13:19:00,80,0,1,1,0,1,...,4d9de708-0d2bc7fe-d09123f4-cddd314f-81bceaad,CHEST (PORTABLE AP),AP,3056,2544,21431014,24132.578,CHEST (PORTABLE AP),antero-posterior,Erect
1214370,18874374,25548314,36493337,2143-10-05 13:19:00,80,0,1,1,0,1,...,4d9de708-0d2bc7fe-d09123f4-cddd314f-81bceaad,CHEST (PORTABLE AP),AP,3056,2544,21431014,24132.578,CHEST (PORTABLE AP),antero-posterior,Erect
2790418,14811141,26884919,38772375,2189-08-17 00:46:00,82,0,7,3,1,1,...,26b72926-4afaa4c9-b24ece3a-ee2a554a-4add0f4a,CHEST (PORTABLE AP),AP,3043,2539,21890817,125809.906,CHEST (PORTABLE AP),antero-posterior,Erect
