# Script to generate all splits

In [47]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path



# Folder that holds every MIMIC metadata file read or written by this notebook.
BASEDIR_MIMIC = "../data/mimic/"

# Preprocessed metadata table.
metadata_df = pd.read_csv(os.path.join(BASEDIR_MIMIC, 'mimic_metadata_preprocessed.csv'))

# The eight finding columns used as class labels (identifier columns such as
# subject_id / study_id are intentionally not part of this list).
# An earlier variant of this list put "No Finding" first:
# ["No Finding", "Atelectasis", "Cardiomegaly", "Consolidation", "Edema",
#  "Pleural Effusion", "Pneumonia", "Pneumothorax"]
numerical_cols = [
    'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
    'No Finding', 'Pleural Effusion', 'Pneumonia', 'Pneumothorax',
]
label_cols = numerical_cols

# Binarise the labels: True where the stored value is exactly 1.0, False for
# every other value (including missing ones).
metadata_df[label_cols] = metadata_df[label_cols].eq(1.0)

# Keep only rows with exactly one positive label. This drops multi-label rows
# and also rows that have no positive label at all.
positives_per_row = metadata_df[label_cols].sum(axis=1)
metadata_df = metadata_df[positives_per_row == 1]

# Disabled extra filter, kept for reference:
# metadata_df = metadata_df[metadata_df["Pleural Other"] == False]

# Number of remaining rows per class.
metadata_df[label_cols].sum()

Atelectasis         10029
Cardiomegaly        10614
Consolidation        1689
Edema                4966
No Finding          69677
Pleural Effusion     9301
Pneumonia            5385
Pneumothorax         3412
dtype: int64

In [None]:
# Load the labelled test-set file and display it.
# NOTE(review): the name `test_df` is reassigned further down to the balanced
# test split sampled in this notebook, so the cell that derives
# `test_subject_ids` must run before that reassignment.
test_set_csv = os.path.join(BASEDIR_MIMIC, 'mimic-cxr-2.1.0-test-set-labeled.csv')
test_df = pd.read_csv(test_set_csv)
test_df


In [49]:
# Map the study ids in `test_df` back to their subjects: any subject owning at
# least one of those studies is collected in `test_subject_ids`.
meta_all = pd.read_csv(os.path.join(BASEDIR_MIMIC, 'mimic-cxr-2.0.0-chexpert.csv'))
study_in_test_set = meta_all["study_id"].isin(test_df["study_id"])
test_subject_ids = set(meta_all.loc[study_in_test_set, "subject_id"])

# Number of distinct subjects found.
len(test_subject_ids)

296

In [50]:
# Row count before the exclusion, printed for comparison with the value below.
rows_before = len(metadata_df)
print(rows_before)

# Drop every row whose subject appears in `test_subject_ids`.
from_test_subject = metadata_df["subject_id"].isin(test_subject_ids)
metadata_df = metadata_df[~from_test_subject]

# Row count after the exclusion.
len(metadata_df)

115073


113891

In [51]:
# Build a class-balanced test split with `test_size` images per label.
test_size = 198

# Row labels of all sampled test images, gathered label by label.
test_indices = []
for label in label_cols:
    # Rows that are positive for the current label.
    positive_rows = metadata_df.index[metadata_df[label].to_numpy()].to_numpy()
    # The generator is re-seeded before every draw, so each label is sampled
    # from the same reproducible random stream.
    np.random.seed(1)
    test_indices.extend(np.random.choice(positive_rows, size=test_size, replace=False))

# De-duplicate (and sort) the collected row labels before selecting the rows.
test_indices = np.unique(test_indices)
test_df = metadata_df.loc[test_indices]

# Subject-level separation: the train split keeps only subjects that have no
# image in the test split.
test_subjects = set(test_df['subject_id'])
train_subjects = set(metadata_df['subject_id']) - test_subjects
train_df = metadata_df[metadata_df['subject_id'].isin(train_subjects)]

print(f"Train set size: {len(train_df)}", f"Test set size: {len(test_df)}", sep="\n")
print("\nTest set label distribution:")
print(test_df[label_cols].sum())


Train set size: 104124
Test set size: 1584

Test set label distribution:
Atelectasis         198
Cardiomegaly        198
Consolidation       198
Edema               198
No Finding          198
Pleural Effusion    198
Pneumonia           198
Pneumothorax        198
dtype: int64


In [52]:
# Build a class-balanced retrieve split with `retrieve_size` images per label,
# sampled from the current train split.
# NOTE(review): this cell reassigns `train_df`, so running it a second time
# samples from the already reduced train split.
retrieve_size = 300

# Row labels of all sampled retrieve images, gathered label by label.
retrieve_indices = []
for label in label_cols:
    # Train rows that are positive for the current label.
    positive_rows = train_df.index[train_df[label].to_numpy()].to_numpy()
    # Re-seed before every draw so each label's sample is reproducible.
    np.random.seed(1)
    retrieve_indices.extend(np.random.choice(positive_rows, size=retrieve_size, replace=False))

# De-duplicate (and sort) the collected row labels before selecting the rows.
retrieve_indices = np.unique(retrieve_indices)
retrieve_df = train_df.loc[retrieve_indices]

# Despite its name, `retrieve_subjects` holds the subjects that STAY in the
# train split: all train subjects minus those in the retrieve and test splits.
retrieve_subjects = set(train_df['subject_id']).difference(
    retrieve_df['subject_id'], test_df["subject_id"]
)
train_df = train_df[train_df['subject_id'].isin(retrieve_subjects)]

print(f"Train set size: {len(train_df)}", f"retrieve set size: {len(retrieve_df)}", sep="\n")
print("\nRetrieve set label distribution:")
print(retrieve_df[label_cols].sum())

# Per-class counts left in the train split.
train_df[label_cols].sum()

Train set size: 92365
retrieve set size: 2400

Retrieve set label distribution:
Atelectasis         300
Cardiomegaly        300
Consolidation       300
Edema               300
No Finding          300
Pleural Effusion    300
Pneumonia           300
Pneumothorax        300
dtype: int64


Atelectasis          7678
Cardiomegaly         7850
Consolidation         708
Edema                3165
No Finding          61386
Pleural Effusion     6362
Pneumonia            3761
Pneumothorax         1455
dtype: int64

In [53]:
def assert_empty_intersect(set_a, set_b):
    """Raise ``AssertionError`` unless ``set_a`` and ``set_b`` share no element.

    Args:
        set_a: A set (anything providing ``.intersection``).
        set_b: A set or other iterable to compare against ``set_a``.

    Raises:
        AssertionError: If at least one element occurs in both inputs. The
            message reports how many elements overlap and shows up to five of
            them, so a leak between splits can be traced.
    """
    overlap = set_a.intersection(set_b)
    # Raise explicitly instead of using an `assert` statement, so the check is
    # not stripped when Python runs with optimisations (-O).
    if overlap:
        preview = sorted(overlap, key=str)[:5]
        raise AssertionError(
            f"Expected disjoint sets, found {len(overlap)} shared element(s), e.g. {preview}"
        )

# Subject-level leakage check: no subject may appear in more than one split.
subject_sets = {
    "test": set(test_df["subject_id"]),
    "train": set(train_df["subject_id"]),
    "retrieve": set(retrieve_df["subject_id"]),
}
for first, second in (("test", "train"), ("test", "retrieve"), ("train", "retrieve")):
    assert_empty_intersect(subject_sets[first], subject_sets[second])

In [54]:
# Build a class-balanced subset of the train split with `balanced_train_size`
# images per label.
balanced_train_size = 700

# NOTE(review): `retrieve_indices` is reused here for the balanced-train row
# labels, so after this cell it no longer describes `retrieve_df`.
retrieve_indices = []
for label in label_cols:
    # Train rows that are positive for the current label.
    positive_rows = train_df.index[train_df[label].to_numpy()].to_numpy()
    # Re-seed before every draw so each label's sample is reproducible.
    np.random.seed(1)
    retrieve_indices.extend(
        np.random.choice(positive_rows, size=balanced_train_size, replace=False)
    )

# De-duplicate (and sort) the collected row labels before selecting the rows.
retrieve_indices = np.unique(retrieve_indices)
train_balanced_df = train_df.loc[retrieve_indices]

# Per-class counts of the balanced subset.
train_balanced_df[label_cols].sum()

Atelectasis         700
Cardiomegaly        700
Consolidation       700
Edema               700
No Finding          700
Pleural Effusion    700
Pneumonia           700
Pneumothorax        700
dtype: int64

In [55]:
# Per-class counts of the full train split, for comparison with the balanced subset.
train_df.loc[:, label_cols].sum()

Atelectasis          7678
Cardiomegaly         7850
Consolidation         708
Edema                3165
No Finding          61386
Pleural Effusion     6362
Pneumonia            3761
Pneumothorax         1455
dtype: int64

In [56]:
# Reload the balanced train split from disk when it has already been exported.
# The file is only written by the export cell that follows, so on a clean data
# directory the unconditional read raised FileNotFoundError; in that case the
# split sampled above is used, with a fresh 0..n-1 index like a CSV reload
# would give.
# NOTE(review): as before, an existing export takes precedence over the split
# sampled in this session. Delete the file to regenerate it from scratch.
balanced_train_csv = os.path.join(BASEDIR_MIMIC, 'longtail_8_balanced_train.csv')
if os.path.exists(balanced_train_csv):
    train_balanced_df = pd.read_csv(balanced_train_csv)
else:
    train_balanced_df = train_balanced_df.reset_index(drop=True)

In [57]:
# Write every split to its CSV file under BASEDIR_MIMIC (row index not saved).
split_exports = [
    ('longtail_8_balanced_train.csv', train_balanced_df),
    ('longtail_8_train.csv', train_df),
    ('longtail_8_balanced_retrieve.csv', retrieve_df),
    ('longtail_8_balanced_test.csv', test_df),
]
for file_name, split_df in split_exports:
    split_df.to_csv(os.path.join(BASEDIR_MIMIC, file_name), index=False)

In [None]:
# Build a small subset with 10 fixed positive samples per disease, taken from
# the balanced train split, and export it as one CSV.
diseases_to_undersample = ["Pneumothorax", "Atelectasis", "Cardiomegaly", "Edema", "No Finding", "Pleural Effusion", "Pneumonia", "Consolidation"]

# For each disease: the 10 positions (0-based, counted among that disease's
# positive rows in `train_balanced_df`) of the samples to keep.
# Defined once here because it does not change between loop iterations.
# NOTE(review): the positions are hard-coded, so they only select the intended
# images while the row order of `train_balanced_df` stays the same.
sample_indices_all = {
    "Consolidation": [0,1,2,3,4,5,6,7,8,11],
    "Atelectasis": [0,1,11,3,4,5,6,7,8,9],
    "Cardiomegaly": [0,1,13,3,18,19,20,7,8,21],
    "Edema": [0,1,2,3,4,5,10,7,8,11],
    "No Finding": [0,1,2,3,4,5,6,7,8,9],
    "Pleural Effusion": [0,1,2,3,4,5,11,7,8,9],
    "Pneumonia": [0,1,2,3,4,5,6,10,8,9],
    "Pneumothorax": [0,1,2,3,4,5,6,7,8,9]
}

# Nothing in this cell draws random numbers. The seed call is kept (once,
# instead of once per iteration) only so the global NumPy RNG state after the
# cell stays the same as before.
np.random.seed(5)

# One pandas Series (a single row) per selected sample, in selection order.
all_unbalanced_dfs = []

for disease_to_undersample in diseases_to_undersample:
    # All rows that are positive for this disease, in their current order.
    disease_samples = train_balanced_df[train_balanced_df[disease_to_undersample] == True]

    # Pick the configured positions among those positive rows.
    sample_indices = sample_indices_all[disease_to_undersample]
    all_unbalanced_dfs.extend(disease_samples.iloc[i] for i in sample_indices)

final_unbalanced_df = pd.DataFrame(all_unbalanced_dfs).reset_index(drop=True)
# NOTE(review): this file name has no `longtail_8_` prefix, unlike the other
# exports and unlike the 'longtail_8_unbalanced_all.csv' path evaluated in the
# last cell. Confirm which name downstream code expects.
final_unbalanced_df.to_csv(os.path.join(BASEDIR_MIMIC, 'longtail_unbalanced_all.csv'), index=False)
final_unbalanced_df

In [142]:
# Display the path of the `longtail_8_`-prefixed unbalanced CSV.
# NOTE(review): the export cell above writes 'longtail_unbalanced_all.csv'
# (no `8_`), so this path does not point at the file produced there.
unbalanced_csv_path = os.path.join(BASEDIR_MIMIC, 'longtail_8_unbalanced_all.csv')
unbalanced_csv_path

'../data/mimic/longtail_8_unbalanced_all.csv'