In [None]:
import os
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Load the source ECG table that the rest of the notebook filters and samples from.
all_ecg_csv_path = '../../data/mimic-acute-mi.csv'
all_ecg_df = pd.read_csv(all_ecg_csv_path)

In [None]:
# Keep only rows that have a simple_note.
all_ecg_df = all_ecg_df[all_ecg_df['simple_note'].notna()]

# A row survives when either
#   * its troponin flag is set AND both valuenum and comments are present, or
#   * its troponin flag equals 0.
# NOTE(review): presumably 'troponin' is a 0/1 (or boolean) flag -- confirm.
troponin_with_result = (
    all_ecg_df['troponin']
    & all_ecg_df['valuenum'].notna()
    & all_ecg_df['comments'].notna()
)
troponin_flag_is_zero = all_ecg_df['troponin'] == 0
all_ecg_df = all_ecg_df[troponin_with_result | troponin_flag_is_zero]

display(all_ecg_df.head())
print(all_ecg_df.shape)

In [None]:
# Load the existing modelling subset; its study_ids are removed from
# all_ecg_df in the next cell.
modelling_csv_path = '../../data/mimic-acute-mi_modelling.csv'
downsampled_df = pd.read_csv(modelling_csv_path)

In [None]:
# Remove every ECG whose study_id already appears in downsampled_df.
# NOTE: downsampled_df is rebound further down (the sampling cell builds a new
# frame under the same name), so re-running this cell after that point would
# exclude the new sample rather than the modelling subset.
already_selected = all_ecg_df['study_id'].isin(downsampled_df['study_id'])
all_ecg_df = all_ecg_df[~already_selected]

# Column sums for the two diagnosis columns that are still available ...
for diagnosis_column in ('STEMI', 'NSTEMI'):
    print(all_ecg_df[diagnosis_column].sum())
print()

# ... and for the three ECG-finding columns.
for finding_column in ('st_elevation', 'st_depression', 't_wave'):
    print(all_ecg_df[finding_column].sum())

In [None]:
# Function to sample rows for a specific condition
def sample_condition(df, condition, count, sampled_indices, label=None):
    """Randomly draw ``count`` not-yet-sampled rows of ``df`` matching ``condition``.

    Side effects / hidden inputs: the drawn rows are appended to the
    notebook-level list ``sampled_dfs`` and the draw is seeded with the
    notebook-level ``RANDOM_STATE``. Both names must be defined before the
    first call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    condition : boolean mask aligned with ``df``
        Rows eligible for this draw (e.g. ``df['STEMI'] == 1``).
    count : int
        Number of rows to draw.
    sampled_indices : set or list-like of index labels
        Rows drawn by earlier calls; they are excluded from this draw.
    label : str, optional
        Name of the condition used in the error message. Defaults to the
        ``name`` attribute of ``condition`` (the column name for a mask such
        as ``df['STEMI'] == 1``).

    Returns
    -------
    pandas.Index
        Index labels of the rows that were drawn.

    Raises
    ------
    ValueError
        If fewer than ``count`` eligible rows remain.
    """
    condition_df = df[condition & ~df.index.isin(sampled_indices)]
    if len(condition_df) < count:
        # Report a short name and the actual numbers; interpolating the mask
        # itself would dump a multi-line Series repr into the message.
        if label is None:
            label = getattr(condition, 'name', None)
        if label is None:
            label = '<unnamed condition>'
        raise ValueError(
            f"Not enough rows to sample for condition: {label} "
            f"(requested {count}, available {len(condition_df)})"
        )
    sampled = condition_df.sample(n=count, random_state=RANDOM_STATE)
    sampled_dfs.append(sampled)
    return sampled.index

# Seed shared by every random draw in this cell so the sample is reproducible
RANDOM_STATE = 42

# Minimum number of ECGs to draw for each category
required_counts = {
    'STEMI': 2000,
    'NSTEMI': 3200,
    'ST_elevation': 4000,
    'ST_depression': 600,
    'T_wave_inversion': 200
}

# (flag column in the dataframe, key in required_counts), listed in the order
# the categories are drawn. Order matters: each draw skips rows already taken.
sampling_plan = [
    ('STEMI', 'STEMI'),
    ('NSTEMI', 'NSTEMI'),
    ('st_elevation', 'ST_elevation'),
    ('st_depression', 'ST_depression'),
    ('t_wave', 'T_wave_inversion'),
]

# Work on a copy of the source frame
df = all_ecg_df.copy()

# sample_condition appends each category's sampled rows to this list
sampled_dfs = []

# Index labels drawn so far; later categories exclude them
sampled_indices = set()

# Draw the required number of rows for every category in turn
for flag_column, count_key in sampling_plan:
    drawn_index = sample_condition(
        df,
        df[flag_column] == 1,
        required_counts[count_key],
        sampled_indices
    )
    sampled_indices.update(drawn_index)

# Stack the per-category samples into one frame
downsampled_df = pd.concat(sampled_dfs)

# Rows still needed to reach a total of 30,000
remaining_rows = 30000 - len(downsampled_df)

# Top up with rows that were not drawn for any category
if remaining_rows > 0:
    remaining_df = df[~df.index.isin(sampled_indices)]

    if len(remaining_df) < remaining_rows:
        raise ValueError("Not enough remaining rows to reach 30,000 after sampling required categories.")

    remaining_sampled = remaining_df.sample(n=remaining_rows, random_state=RANDOM_STATE)
    downsampled_df = pd.concat([downsampled_df, remaining_sampled])

# Shuffle the rows and discard the old index
downsampled_df = downsampled_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Report the size of the result and the per-category totals
print("Downsampled DataFrame Shape:", downsampled_df.shape)
for heading, flag_column in (("STEMI Count:", 'STEMI'), ("NSTEMI Count:", 'NSTEMI')):
    print(heading, downsampled_df[flag_column].sum())
for heading, flag_column in (
    ("ST Elevation Count:", 'st_elevation'),
    ("ST Depression Count:", 'st_depression'),
    ("T-wave Inversion Count:", 't_wave'),
):
    print(heading, (downsampled_df[flag_column] == 1).sum())


In [None]:
study_ids = downsampled_df['study_id'].tolist()
print(study_ids[:10])
# Path to the image directory
image_dir = '../../data/image_folder'

# Integer ids parsed from the '<id>.jpeg' file names found in image_dir
image_ids = set()

for filename in os.listdir(image_dir):
    # Only JPEG files (case-insensitive '.jpeg' suffix) are considered
    if not filename.lower().endswith('.jpeg'):
        continue
    # Strip the 5-character '.jpeg' suffix to get the candidate id
    name_without_ext = filename[:-5]
    try:
        image_ids.add(int(name_without_ext))
    except ValueError:
        # File name is not an integer id; report it and move on
        print(f"Skipping file with non-integer name: {filename}")

# Distinct study_ids present in the sampled frame
study_ids_set = set(study_ids)

# study_ids for which an image file exists
matching_ids = study_ids_set.intersection(image_ids)

# Both counts are over DISTINCT ids. Dividing the distinct matches by the row
# count would understate coverage whenever a study_id occurs on several rows.
num_matching = len(matching_ids)
total_study_ids = len(study_ids_set)

# Print the results
print(f"Total number of study_ids: {total_study_ids}")
print(f"Number of study_ids with corresponding image files: {num_matching}")
if total_study_ids:
    print(f"Percentage matched: { (num_matching / total_study_ids) * 100:.2f}%")
else:
    # Empty frame: avoid ZeroDivisionError
    print("Percentage matched: n/a (no study_ids)")

In [None]:
# Keep only the rows whose study_id has a matching image file.
has_image = downsampled_df['study_id'].isin(matching_ids)
downsampled_df = downsampled_df[has_image]
print(downsampled_df.shape)

In [None]:
# index=False: the row index carries no information at this point, and the
# train/test CSVs written later in this notebook are also saved without it.
# Without it, a plain pd.read_csv of this file gets an extra unnamed column.
downsampled_df.to_csv('../../data/mimic-acute-mi_resnet.csv', index=False)

### Create Train and Test Set

In [None]:
# Split configuration
RANDOM_STATE = 42
TRAIN_RATIO = 0.8

# Output locations for the two splits
train_df_path = '../../data/train_reset_df.csv'
test_df_path = '../../data/test_reset_df.csv'


# 1. Split the distinct study_ids, so all rows sharing a study_id end up on
#    the same side of the split.
study_id_as_int = downsampled_df['study_id'].astype(int)
study_ids = study_id_as_int.unique()
train_ids, test_ids = train_test_split(
    study_ids,
    test_size=1 - TRAIN_RATIO,
    random_state=RANDOM_STATE,
    shuffle=True
)

print(f"Training study_ids: {len(train_ids)}")
print(f"Testing study_ids: {len(test_ids)}")

# 2. Select the rows belonging to each side and renumber them from 0
in_train = study_id_as_int.isin(train_ids)
in_test = study_id_as_int.isin(test_ids)
train_df = downsampled_df[in_train].reset_index(drop=True)
test_df = downsampled_df[in_test].reset_index(drop=True)

print(f"Training DataFrame shape: {train_df.shape}")
print(f"Test DataFrame shape: {test_df.shape}")

# 3. Write both splits to disk without the row index
train_df.to_csv(train_df_path, index=False)
test_df.to_csv(test_df_path, index=False)
print("Training and test DataFrames saved successfully!")
