# Script to generate all splits

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Load the preprocessed MIMIC metadata.
metadata_df = pd.read_csv('../data/mimic/mimic_metadata_preprocessed.csv')

# Finding columns that are stored as numeric flags (identifier columns such as
# subject_id / study_id are deliberately not listed).
numerical_cols = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
                  'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion',
                  'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other',
                  'Pneumonia', 'Pneumothorax', 'Support Devices']

# Binarize the flags: True only where the value is exactly 1.0, False for
# every other value.
metadata_df[numerical_cols] = metadata_df[numerical_cols] == 1.0

# "Support Devices" is converted above but is not used as a class label.
# label_cols is built as its own list (not an alias of numerical_cols), so
# trimming it further down leaves numerical_cols untouched.
label_cols = [col for col in numerical_cols if col != "Support Devices"]

# Keep only the entries that have exactly one positive label.
metadata_df = metadata_df[metadata_df[label_cols].sum(axis=1) == 1]

# Drop the "Pleural Other" class entirely: first its rows, then the label.
metadata_df = metadata_df[~metadata_df["Pleural Other"].astype(bool)]
label_cols.remove("Pleural Other")

# Positive count per remaining label (displayed as the cell output).
metadata_df[label_cols].sum()

Atelectasis                    6439
Cardiomegaly                   8255
Consolidation                  1133
Edema                          3490
Enlarged Cardiomediastinum      983
Fracture                       1416
Lung Lesion                    1239
Lung Opacity                  10563
No Finding                    69677
Pleural Effusion               5662
Pneumonia                      2848
Pneumothorax                   2697
dtype: int64

In [2]:
# Balanced test split: draw the same number of images (test_size) per label.
test_size = 200

# One draw per label. The global NumPy seed is reset right before each draw,
# so a label's sample does not depend on the draws made for other labels.
per_label_samples = []
for label_name in label_cols:
    positive_rows = metadata_df.loc[metadata_df[label_name]].index.tolist()
    np.random.seed(1)
    per_label_samples.append(
        np.random.choice(positive_rows, size=test_size, replace=False)
    )

# Flatten into a single sorted array of distinct row labels.
test_indices = np.unique([row for chunk in per_label_samples for row in chunk])

# Rows selected for the test set.
test_df = metadata_df.loc[test_indices]

# Patient-level split: a subject with any image in the test set contributes
# nothing to the training pool.
train_subjects = set(metadata_df['subject_id']).difference(test_df['subject_id'])
train_df = metadata_df.loc[metadata_df['subject_id'].isin(train_subjects)]

print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print("\nTest set label distribution:")
print(test_df[label_cols].sum())


Train set size: 101107
Test set size: 2400

Test set label distribution:
Atelectasis                   200
Cardiomegaly                  200
Consolidation                 200
Edema                         200
Enlarged Cardiomediastinum    200
Fracture                      200
Lung Lesion                   200
Lung Opacity                  200
No Finding                    200
Pleural Effusion              200
Pneumonia                     200
Pneumothorax                  200
dtype: int64


In [3]:
# Balanced retrieval split: the same number of images (retrieve_size) per
# label, drawn only from patients that are not part of the test set.
retrieve_size = 300

# Rebuild the candidate pool from metadata_df and test_df instead of reading
# the current train_df. This cell reassigns train_df at the end, so sampling
# from train_df would make a second run of the cell draw from an already
# reduced pool and silently change both splits. On a fresh top-to-bottom run
# the pool contains exactly the rows of the train_df built alongside test_df.
candidate_df = metadata_df[~metadata_df['subject_id'].isin(test_df['subject_id'])]

# Row labels sampled for the retrieval set, collected label by label.
retrieve_indices = []

for label in label_cols:
    # Row labels of the positive cases for this label.
    label_indices = candidate_df[candidate_df[label]].index.tolist()
    # Re-seed before every draw so each label's sample is reproducible on its own.
    np.random.seed(1)
    sampled_indices = np.random.choice(label_indices, size=retrieve_size, replace=False)

    retrieve_indices.extend(sampled_indices)

# Sorted array of distinct row labels.
retrieve_indices = np.unique(retrieve_indices)

# Rows selected for the retrieval set.
retrieve_df = candidate_df.loc[retrieve_indices]

# Patient-level split: subjects with any image in the retrieval set are
# removed from training. Test subjects are already absent from the pool.
remaining_train_subjects = set(candidate_df['subject_id']) - set(retrieve_df['subject_id'])
train_df = candidate_df[candidate_df['subject_id'].isin(remaining_train_subjects)]

print(f"Train set size: {len(train_df)}")
print(f"retrieve set size: {len(retrieve_df)}")
print("\nRetrieve set label distribution:")
print(retrieve_df[label_cols].sum())

# Positive count per label left in the training split (displayed as the cell output).
train_df[label_cols].sum()

Train set size: 85953
retrieve set size: 3600

Retrieve set label distribution:
Atelectasis                   300
Cardiomegaly                  300
Consolidation                 300
Edema                         300
Enlarged Cardiomediastinum    300
Fracture                      300
Lung Lesion                   300
Lung Opacity                  300
No Finding                    300
Pleural Effusion              300
Pneumonia                     300
Pneumothorax                  300
dtype: int64


Atelectasis                    4437
Cardiomegaly                   5778
Consolidation                   306
Edema                          1918
Enlarged Cardiomediastinum      302
Fracture                        692
Lung Lesion                     432
Lung Opacity                   7207
No Finding                    58856
Pleural Effusion               3407
Pneumonia                      1756
Pneumothorax                    862
dtype: int64

In [4]:
def assert_empty_intersect(set_a, set_b):
    """Assert that two collections have no element in common.

    Args:
        set_a: First collection of hashable items (a set or any iterable).
        set_b: Second collection of hashable items (a set or any iterable).

    Raises:
        AssertionError: If at least one item occurs in both collections. The
            message reports how many items overlap and shows a few of them.
    """
    overlap = set(set_a) & set(set_b)
    if overlap:
        # Raised explicitly instead of through a bare `assert`, so the check
        # is not stripped under `python -O` and the failure names what leaked.
        preview = sorted(overlap, key=repr)[:5]
        raise AssertionError(
            f"expected disjoint sets, found {len(overlap)} shared item(s), e.g. {preview}"
        )

# Patient-level leakage check: no subject_id may appear in more than one split.
subjects_by_split = {
    "test": set(test_df["subject_id"]),
    "train": set(train_df["subject_id"]),
    "retrieve": set(retrieve_df["subject_id"]),
}
for first_split, second_split in [("test", "train"), ("test", "retrieve"), ("train", "retrieve")]:
    assert_empty_intersect(subjects_by_split[first_split], subjects_by_split[second_split])

In [5]:
# Balanced training subset: the same number of images (balanced_train_size)
# per label, drawn from the training split.
balanced_train_size = 300

# Collected under its own name: reusing `retrieve_indices` here would
# overwrite the row labels that define retrieve_df.
balanced_train_indices = []

for label in label_cols:
    # Row labels of the positive cases for this label.
    label_indices = train_df[train_df[label]].index.tolist()
    # Re-seed before every draw so each label's sample is reproducible on its own.
    np.random.seed(1)
    sampled_indices = np.random.choice(label_indices, size=balanced_train_size, replace=False)

    balanced_train_indices.extend(sampled_indices)

# Sorted array of distinct row labels.
balanced_train_indices = np.unique(balanced_train_indices)

# Rows selected for the balanced training subset.
train_balanced_df = train_df.loc[balanced_train_indices]

# Positive count per label in the balanced subset (displayed as the cell output).
train_balanced_df[label_cols].sum()

Atelectasis                   300
Cardiomegaly                  300
Consolidation                 300
Edema                         300
Enlarged Cardiomediastinum    300
Fracture                      300
Lung Lesion                   300
Lung Opacity                  300
No Finding                    300
Pleural Effusion              300
Pneumonia                     300
Pneumothorax                  300
dtype: int64

In [9]:
# Positive count per label in the full (imbalanced) training split.
train_df.loc[:, label_cols].sum()

Atelectasis                    4437
Cardiomegaly                   5778
Consolidation                   306
Edema                          1918
Enlarged Cardiomediastinum      302
Fracture                        692
Lung Lesion                     432
Lung Opacity                   7207
No Finding                    58856
Pleural Effusion               3407
Pneumonia                      1756
Pneumothorax                    862
dtype: int64

In [7]:
# Export every split as CSV into the directory the metadata was loaded from.
output_dir = Path('../data/mimic')
split_files = {
    'longtail_balanced_train.csv': train_balanced_df,
    'longtail_train.csv': train_df,
    'longtail_balanced_retrieve.csv': retrieve_df,
    'longtail_balanced_test.csv': test_df,
}
for file_name, split_df in split_files.items():
    # index=False: the DataFrame row labels are not written to the file.
    split_df.to_csv(output_dir / file_name, index=False)

# Confirmation message, so the recorded cell output is reproduced on re-run.
print("Exported train, retrieve and test sets to CSV files")

Exported train, retrieve and test sets to CSV files
