### Imports


In [1]:
import os

# Step up one directory before importing `src` below.
# Guarded so that re-running this cell does not climb a further level:
# once a `src` directory is visible from the working directory, stay put.
# NOTE(review): presumably the parent directory is the project root and the
# paths in `constants` are relative to it -- confirm.
if not os.path.isdir('src'):
    os.chdir('../')

from src.data import constants
import pandas as pd
from pandas import read_csv
import datetime as dt
import numpy as np


### Read data from local source

MIMIC-IV Dataset

Core

In [2]:
# Load the three core tables; each file location comes from `constants`.
df_admissions = pd.read_csv(constants.admissions)
df_patients = pd.read_csv(constants.patients)
df_transfers = pd.read_csv(constants.transfers)


ICU

In [3]:
# Load the ICU stays table from the location configured in `constants`.
df_icustays = pd.read_csv(constants.icustays)


MIMIC-CXR

In [4]:
# Load the two MIMIC-CXR tables (CheXpert labels and image metadata);
# file locations come from `constants`.
df_mimic_cxr_chexpert = pd.read_csv(constants.mimic_cxr_chexpert)
df_mimic_cxr_metadata = pd.read_csv(constants.mimic_cxr_metadata)


#### CORE dataset fusion

In [5]:
# Inner-join admissions with patients on subject_id, then attach the
# transfers rows that share both subject_id and hadm_id.
df_core_fusion = (
    df_admissions
    .merge(df_patients, on='subject_id', how='inner')
    .merge(df_transfers, on=['subject_id', 'hadm_id'], how='inner')
)


In [6]:
# Print a summary (row count, column names, dtypes) of the fused core table.
df_core_fusion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1840955 entries, 0 to 1840954
Data columns (total 25 columns):
 #   Column                Dtype 
---  ------                ----- 
 0   subject_id            int64 
 1   hadm_id               int64 
 2   admittime             object
 3   dischtime             object
 4   deathtime             object
 5   admission_type        object
 6   admission_location    object
 7   discharge_location    object
 8   insurance             object
 9   language              object
 10  marital_status        object
 11  ethnicity             object
 12  edregtime             object
 13  edouttime             object
 14  hospital_expire_flag  int64 
 15  gender                object
 16  anchor_age            int64 
 17  anchor_year           int64 
 18  anchor_year_group     object
 19  dod                   object
 20  transfer_id           int64 
 21  eventtype             object
 22  careunit              object
 23  intime                object
 24

In [7]:
# Inner-join the ICU stays onto the core table by patient and admission.
# Column names present on both sides (other than the keys) receive the
# default _x (core side) / _y (ICU side) suffixes.
df_core_icu_fusion = df_core_fusion.merge(
    df_icustays,
    on=['subject_id', 'hadm_id'],
    how='inner',
)

In [8]:
# Print a summary (row count, non-null counts, dtypes) of the core + ICU table.
df_core_icu_fusion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 420372 entries, 0 to 420371
Data columns (total 31 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   subject_id            420372 non-null  int64  
 1   hadm_id               420372 non-null  int64  
 2   admittime             420372 non-null  object 
 3   dischtime             420372 non-null  object 
 4   deathtime             47063 non-null   object 
 5   admission_type        420372 non-null  object 
 6   admission_location    420372 non-null  object 
 7   discharge_location    417901 non-null  object 
 8   insurance             420372 non-null  object 
 9   language              420372 non-null  object 
 10  marital_status        393342 non-null  object 
 11  ethnicity             420372 non-null  object 
 12  edregtime             288815 non-null  object 
 13  edouttime             288815 non-null  object 
 14  hospital_expire_flag  420372 non-null  int64  
 15  

#### CXR dataset fusion

In [9]:
# Inner-join the CheXpert label table with the image metadata table,
# matching on patient and study.
df_cxr_merged = df_mimic_cxr_chexpert.merge(
    df_mimic_cxr_metadata,
    on=['subject_id', 'study_id'],
    how='inner',
)

In [10]:
# Print a summary (row count, non-null counts, dtypes) of the merged CXR table.
df_cxr_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 377095 entries, 0 to 377094
Data columns (total 29 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   subject_id                                  377095 non-null  int64  
 1   study_id                                    377095 non-null  int64  
 2   Atelectasis                                 82830 non-null   float64
 3   Cardiomegaly                                97781 non-null   float64
 4   Consolidation                               35377 non-null   float64
 5   Edema                                       96892 non-null   float64
 6   Enlarged Cardiomediastinum                  31333 non-null   float64
 7   Fracture                                    10413 non-null   float64
 8   Lung Lesion                                 14547 non-null   float64
 9   Lung Opacity                                87445 non-null   float64
 

#### Core_ICU_CXR Fusion

In [11]:
# Inner-join the core + ICU table with the merged CXR table on subject_id.
# NOTE(review): the only key is subject_id, so each core/ICU row is paired
# with every CXR row of the same patient, regardless of admission or study
# time. Confirm this many-to-many expansion is intended.
df_core_icu_cxr_fusion = df_core_icu_fusion.merge(
    df_cxr_merged,
    on='subject_id',
    how='inner',
)

In [12]:
# Print a summary (row count, column names, dtypes) of the fully fused table.
df_core_icu_cxr_fusion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3227874 entries, 0 to 3227873
Data columns (total 59 columns):
 #   Column                                      Dtype  
---  ------                                      -----  
 0   subject_id                                  int64  
 1   hadm_id                                     int64  
 2   admittime                                   object 
 3   dischtime                                   object 
 4   deathtime                                   object 
 5   admission_type                              object 
 6   admission_location                          object 
 7   discharge_location                          object 
 8   insurance                                   object 
 9   language                                    object 
 10  marital_status                              object 
 11  ethnicity                                   object 
 12  edregtime                                   object 
 13  edouttime                  

In [23]:
# Demographic source columns; the position in this list fixes the de_<n> name.
demo_embeddings = [
    'anchor_age',
    'gender',
    'ethnicity',
    'marital_status',
    'language',
    'insurance',
]

# Copy each demographic column into a positional alias: de_0, de_1, ..., de_5.
for i, source_column in enumerate(demo_embeddings):
    df_core_icu_cxr_fusion['de_' + str(i)] = df_core_icu_cxr_fusion[source_column]

In [57]:
# Binary label: 1 where discharge_location equals 'DIED', 0 otherwise
# (missing discharge_location values also map to 0).
# `np` is the alias imported in the notebook's first cell, so no
# import is repeated here.
df_core_icu_cxr_fusion['death_status'] = np.where(
    df_core_icu_cxr_fusion['discharge_location'] == 'DIED',
    1,
    0,
)

In [None]:
# Remove the original demographic columns; their values were copied into
# de_0 .. de_5 earlier in the notebook.
df_core_icu_cxr_fusion = df_core_icu_cxr_fusion.drop(
    columns=[
        'anchor_age',
        'gender',
        'ethnicity',
        'marital_status',
        'language',
        'insurance',
    ]
)

In [63]:
# Remove discharge_location; the death_status column was derived from it
# earlier in the notebook.
df_core_icu_cxr_fusion = df_core_icu_cxr_fusion.drop(columns=['discharge_location'])

In [37]:
# Final column order of the fused table.
# This single selection replaces two consecutive reorder cells. The first
# one still listed 'discharge_location' (already dropped above) and left out
# 'death_status' (required by the second), so a top-to-bottom run raised
# KeyError. The order below is the one the second cell produced.
fusion_column_order = [
    # identifiers
    'subject_id', 'hadm_id', 'stay_id',
    # positional demographic copies and the derived label
    'de_0', 'de_1', 'de_2', 'de_3', 'de_4', 'de_5', 'death_status',
    # remaining columns from the core + ICU merge
    'admittime', 'dischtime', 'deathtime', 'admission_type',
    'admission_location', 'edregtime', 'edouttime', 'hospital_expire_flag',
    'anchor_year', 'anchor_year_group', 'dod', 'transfer_id', 'eventtype',
    'careunit', 'intime_x', 'outtime_x', 'first_careunit', 'last_careunit',
    'intime_y', 'outtime_y', 'los',
    # columns contributed by the CXR merge
    'study_id', 'Atelectasis',
    'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum',
    'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding',
    'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax',
    'Support Devices', 'dicom_id', 'PerformedProcedureStepDescription',
    'ViewPosition', 'Rows', 'Columns', 'StudyDate', 'StudyTime',
    'ProcedureCodeSequence_CodeMeaning', 'ViewCodeSequence_CodeMeaning',
    'PatientOrientationCodeSequence_CodeMeaning', 'StudyDateForm',
    'StudyTimeForm', 'cxrtime',
]

# .loc with a label list raises KeyError if any listed column is missing,
# which doubles as a check that the upstream cells ran as expected.
df_core_icu_cxr_fusion = df_core_icu_cxr_fusion.loc[:, fusion_column_order]

In [69]:
# Display the column index to confirm the final ordering.
df_core_icu_cxr_fusion.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'de_0', 'de_1', 'de_2', 'de_3',
       'de_4', 'de_5', 'death_status', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admission_location', 'edregtime', 'edouttime',
       'hospital_expire_flag', 'anchor_year', 'anchor_year_group', 'dod',
       'transfer_id', 'eventtype', 'careunit', 'intime_x', 'outtime_x',
       'first_careunit', 'last_careunit', 'intime_y', 'outtime_y', 'los',
       'study_id', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
       'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity',
       'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia',
       'Pneumothorax', 'Support Devices', 'dicom_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'StudyDateForm',
       'StudyTi

In [70]:
# Number of distinct subject_id values in the fused table.
# dropna=False keeps the count identical to len(Series.unique()).
df_core_icu_cxr_fusion['subject_id'].nunique(dropna=False)

20245

In [68]:
# Write the fused table to CSV, relative to the current working directory
# (which the first cell changes with os.chdir). The row index is written as
# the first, unnamed column.
df_core_icu_cxr_fusion.to_csv("df_core_icu_cxr_fusion.csv", index=True)