## Install CXR foundation model

In [None]:
!pip install cxr-foundation

## Import libraries

In [None]:
import pandas as pd
from IPython.display import clear_output
import io
import os
import glob
import zipfile
import shutil

import numpy as np
import random as python_random
import matplotlib.pyplot as plt
from numpy import linalg as LA
import seaborn as sns
import sklearn.metrics as sklm
from sklearn.utils import shuffle

from cxr_foundation.mimic import parse_embedding_file_pattern
from cxr_foundation import embeddings_data

# Fix the random seed for both NumPy's global generator and the standard
# library `random` module so stochastic steps are repeatable across runs.
seed = 42
python_random.seed(seed)
np.random.seed(seed)

## Extract the zipped file to get embedding vectors for each patient

In [None]:
# Archive holding the embedding files; expected in the current working directory.
zip_file_name = "generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0.zip"

# Unpack every member of the archive into the current working directory.
with zipfile.ZipFile(zip_file_name, mode='r') as zip_ref:
    zip_ref.extractall(path='./')

print("Extraction complete.")

Extraction complete.


In [None]:
# Name of the column that holds the path of each embedding file.
SOURCE_COL_NAME = "embedding_path"

# SHA256SUMS.txt is read as a space-delimited table without a header row.
# Column 0 holds the checksum and column 1 the file path relative to the
# dataset root. The first line of the file is skipped.
# NOTE(review): presumably line 0 is not an embedding file -- confirm.
df_embeddings = pd.read_csv(
    "./generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0/SHA256SUMS.txt",
    delimiter=" ",
    header=None,
    skiprows=[0],
)

display(df_embeddings.head(1))

# Keep only the path column. `rename` (not in-place) returns a new frame, so the
# column assignments below do not write into a slice of the raw table, which
# avoids SettingWithCopyWarning on pandas versions without copy-on-write.
df_embeddings = df_embeddings[[1]].rename(columns={1: SOURCE_COL_NAME})

# Create additional columns from file path components
df_embeddings[["subject_id", "study_id", "dicom_id"]] = df_embeddings.apply(
    lambda row: parse_embedding_file_pattern(row[SOURCE_COL_NAME]),
    axis=1,
    result_type="expand",
)

# Prefix the dataset directory so the paths resolve from the working directory.
df_embeddings[SOURCE_COL_NAME] = (
    "generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0/"
    + df_embeddings[SOURCE_COL_NAME]
)

df_embeddings.head(2)

Unnamed: 0,0,1
0,70e8c87d38d5dfba586218ccafb7428f76a3c7b08f6179...,files/p10/p10000032/s50414267/02aa804e-bde0afd...


Unnamed: 0,embedding_path,subject_id,study_id,dicom_id
0,generalized-image-embeddings-for-the-mimic-che...,10000032,50414267,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014
1,generalized-image-embeddings-for-the-mimic-che...,10000032,53189527,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab


In [None]:
# Sanity check: load the record stored at position 10 of the path list and
# print it to inspect the features it contains.
example = embeddings_data.read_record_example(
    df_embeddings["embedding_path"].iloc[10]
)
print(example)

features {
  feature {
    key: "image/id"
    value {
      bytes_list {
        value: "gs://superrad/inputs/mimic-cxr/files/p10/p10000935/s55697293/c50494f1-90e2bff5-e9189550-1a4562fd-6ab5204c.dcm"
      }
    }
  }
  feature {
    key: "image/format"
    value {
      bytes_list {
        value: "png"
      }
    }
  }
  feature {
    key: "embedding"
    value {
      float_list {
        value: -0.732482851
        value: -2.44990683
        value: 1.28205884
        value: -2.61938357
        value: 0.555371881
        value: -0.38510558
        value: 0.451221466
        value: 0.475964963
        value: 0.673081517
        value: 0.49331975
        value: 0.0527170226
        value: -0.255719841
        value: 0.292476237
        value: 0.129536092
        value: 1.29072249
        value: 2.03499103
        value: -0.82255441
        value: 0.701173246
        value: 0.246889234
        value: 1.03137314
        value: 2.47983336
        value: 2.08443427
        value: -3.113

## Embeddings Metadata

Data sources:
- https://physionet.org/content/image-embeddings-mimic-cxr/1.0/
- https://physionet.org/content/mimiciv/1.0/
- https://physionet.org/content/mimic-cxr-jpg/2.0.0/


In [None]:
# Per-image metadata and the CheXpert label table shipped with MIMIC-CXR 2.0.0.
df_metadata = pd.read_csv(
    "./mimic-cxr-2.0.0-metadata.csv.gz",
    compression="gzip",
)
MIMIC_CXR_Labels_df = pd.read_csv(
    "./mimic-cxr-2.0.0-chexpert.csv.gz",
    compression="gzip",
)

# Admissions table (source of insurance and ethnicity) and patients table.
demographic_df = pd.read_csv(
    "./admissions.csv.gz",
    compression="gzip",
)
patients_df = pd.read_csv(
    "./patients.csv.gz",
    compression="gzip",
)


In [None]:
# Print the column names, non-null counts and dtypes of each loaded table,
# in the order the tables were read.
for _table in (df_metadata, MIMIC_CXR_Labels_df, demographic_df, patients_df):
    _table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 377110 entries, 0 to 377109
Data columns (total 12 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   dicom_id                                    377110 non-null  object 
 1   subject_id                                  377110 non-null  int64  
 2   study_id                                    377110 non-null  int64  
 3   PerformedProcedureStepDescription           341598 non-null  object 
 4   ViewPosition                                361341 non-null  object 
 5   Rows                                        377110 non-null  int64  
 6   Columns                                     377110 non-null  int64  
 7   StudyDate                                   377110 non-null  int64  
 8   StudyTime                                   377110 non-null  float64
 9   ProcedureCodeSequence_CodeMeaning           377110 non-null  object 
 

In [None]:
# Number of admission rows per insurance category.
demographic_df["insurance"].value_counts()

insurance
Other       301530
Medicare    171360
Medicaid     50850
Name: count, dtype: int64

In [None]:
# (rows, columns) of the patients table.
patients_df.shape

(382278, 6)

In [None]:
# Collapse the label values to a binary scheme: missing entries (NaN) and
# entries equal to -1 are both mapped to 0.
MIMIC_CXR_Labels_df = (
    MIMIC_CXR_Labels_df
    .replace(np.nan, 0)
    .replace(-1, 0)
)

MIMIC_CXR_Labels_df.shape

(227827, 16)

## Create one DataFrame by joining all the other DataFrames

Join the embeddings list with the demographic metadata files.

In [None]:
# Keep a single row per subject: the first row encountered for each subject_id
# is retained and all later rows for that subject are dropped.
demographic_df = demographic_df.drop_duplicates(subset=['subject_id'], keep='first')
demographic_df.shape

(256878, 15)

In [None]:
# Number of rows per recorded ethnicity value.
demographic_df["ethnicity"].value_counts()

ethnicity
WHITE                            159471
BLACK/AFRICAN AMERICAN            31190
ASIAN                             16526
UNKNOWN                           16464
OTHER                             15859
HISPANIC/LATINO                   13334
UNABLE TO OBTAIN                   3086
AMERICAN INDIAN/ALASKA NATIVE       948
Name: count, dtype: int64

In [None]:
# remove patients who have inconsistent documented race information
# credit to github.com/robintibor
#
# This check must see every admission row. demographic_df has already been
# reduced to one row per subject_id, so it can no longer contain two different
# ethnicity values for the same subject and the check would always find nothing.
# The two relevant columns are therefore re-read from the raw admissions file.
admissions_ethnicity_df = pd.read_csv(
    "./admissions.csv.gz",
    compression="gzip",
    usecols=['subject_id', 'ethnicity'],
)

# One row per distinct (subject_id, ethnicity) pair; a subject appearing more
# than once here has more than one documented ethnicity value.
ethnicity_df = admissions_ethnicity_df.drop_duplicates()
v = ethnicity_df.subject_id.value_counts()
subject_id_more_than_once = v.index[v.gt(1)]

print(f'count of duplicated subject id : {len(subject_id_more_than_once)}')

ambiguous_ethnicity_df = ethnicity_df[ethnicity_df.subject_id.isin(subject_id_more_than_once)]

# Subject ids to exclude from the joined dataset.
inconsistent_race = ambiguous_ethnicity_df.subject_id.unique()

count of duplicated subject id : 0


In [None]:
# Inner join: keep only images whose subject also has a demographics row.
data_df = pd.merge(df_metadata, demographic_df, on='subject_id')
data_df.shape

(347747, 26)

In [None]:
# Each study contains one or more DICOMs, so subject-level attributes from
# patients_df are repeated on every image row of that subject.
data_df = pd.merge(data_df, patients_df, on=['subject_id'])

# Columns that are not needed downstream; everything else is kept.
columns_to_drop = [
    'anchor_year', 'anchor_year_group', 'dod', 'hadm_id', 'admittime', 'dischtime',
    'deathtime', 'admission_type', 'admission_location', 'discharge_location',
    'language', 'marital_status', 'edregtime', 'edouttime', 'hospital_expire_flag',
    'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
    'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
    'ViewCodeSequence_CodeMeaning', 'PatientOrientationCodeSequence_CodeMeaning',
]
data_df = data_df.drop(columns=columns_to_drop)
data_df.shape

(347747, 7)

In [None]:
# Drop subjects flagged in inconsistent_race, expose the ethnicity column under
# the name "race", then attach the per-study labels (inner join on study and subject).
data_df = (
    data_df.loc[~data_df["subject_id"].isin(inconsistent_race)]
    .rename(columns={"ethnicity": "race"})
    .merge(MIMIC_CXR_Labels_df, on=['study_id', 'subject_id'])
)
data_df.shape


(347732, 21)

In [None]:

# Left join keeps every embedding row. subject_id and study_id exist on both
# sides, so pandas suffixes them with _x (embeddings) and _y (metadata).
data_df = df_embeddings.merge(data_df, how='left', on=['dicom_id'])

# Keep the first occurrence of any repeated column name, drop rows with a missing
# value in any column (this removes embeddings without a metadata match), and
# restore the plain id names from the embeddings side.
data_df = (
    data_df.loc[:, ~data_df.columns.duplicated()]
    .dropna()
    .rename(columns={'subject_id_x': 'subject_id', 'study_id_x': 'study_id'})
)
data_df.shape

(228905, 24)

In [None]:

# Final column layout: embedding path, identifiers, demographics, then the labels.
# The path column is called 'embedding_path' when df_embeddings is built, so that
# is the name selected here (selecting 'embeddings_file' raises KeyError on a
# fresh run). `rename` without inplace returns a new frame, so the insert below
# does not operate on a column-subset slice.
data_df = data_df[[
    'embedding_path', 'subject_id', 'study_id', 'dicom_id', 'gender', 'insurance',
    'anchor_age', 'race', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
    'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
    'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
    'Support Devices', 'No Finding',
]].rename(columns={'embedding_path': 'path'})

# Placeholder "split" column at position 4 (right after dicom_id), filled with
# the string "none" for every row.
data_df.insert(4, "split", "none", True)
data_df.race.value_counts()

race
WHITE                            150364
BLACK/AFRICAN AMERICAN            36572
HISPANIC/LATINO                   12427
OTHER                             10726
UNKNOWN                            9844
ASIAN                              7390
UNABLE TO OBTAIN                    918
AMERICAN INDIAN/ALASKA NATIVE       664
Name: count, dtype: int64

In [None]:
# Count of missing values per column of the assembled table.
data_df.isnull().sum()


path                          0
subject_id                    0
study_id                      0
dicom_id                      0
split                         0
gender                        0
insurance                     0
anchor_age                    0
race                          0
Enlarged Cardiomediastinum    0
Cardiomegaly                  0
Lung Opacity                  0
Lung Lesion                   0
Edema                         0
Consolidation                 0
Pneumonia                     0
Atelectasis                   0
Pneumothorax                  0
Pleural Effusion              0
Pleural Other                 0
Fracture                      0
Support Devices               0
No Finding                    0
dtype: int64