In [1]:
import pandas as pd
import tensorflow as tf
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
from tqdm import tqdm
# Report which GPUs TensorFlow can see
print("GPU Available: ", tf.config.list_physical_devices('GPU'))

# Checksum manifest that lists every file shipped with the embeddings dataset
file_path = "/home/ahmedyra/scratch/Dataset/generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0/SHA256SUMS.txt"

# Keep only manifest rows that mention "files/" and drop the leading checksum column
with open(file_path, "r") as file:
    manifest_rows = (raw_row.strip() for raw_row in file if "files/" in raw_row)
    lines = [manifest_row.split(maxsplit=1)[-1] for manifest_row in manifest_rows]

# One-column table of relative file paths
df = pd.DataFrame({"file_paths": lines})

# Preview the first rows
print(df.head())


2025-03-14 16:18:45.747197: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-14 16:18:45.769090: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-14 16:18:45.795258: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-14 16:18:45.803245: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-14 16:18:45.822085: I tensorflow/core/platform/cpu_feature_guar

GPU Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
                                          file_paths
0  files/p10/p10000032/s50414267/02aa804e-bde0afd...
1  files/p10/p10000032/s53189527/2a2277a9-b0ded15...
2  files/p10/p10000032/s53911762/68b5c4b1-227d048...
3  files/p10/p10000032/s53911762/fffabebf-74fd3a1...
4  files/p10/p10000032/s56699142/ea030e7a-2e3b134...


In [None]:
import pandas as pd
import tensorflow as tf
import torch
from torch.utils.data import Dataset
import numpy as np
import os
import re
from tqdm import tqdm
import logging

# Logging: timestamped INFO messages for the notebook, errors only from
# TensorFlow's Python logger.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
tf.get_logger().setLevel(logging.ERROR)
# NOTE(review): TensorFlow has already been imported when this line runs, so the
# variable may be set too late to affect its native logging -- confirm, and move it
# ahead of the first `import tensorflow` if the C++ logs should be silenced.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Root directory of the embeddings dataset
base_path = "/home/ahmedyra/scratch/Dataset/generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0/"

# Label table; both id columns are cast to str so they compare equal to the
# ids that are later extracted from paths as strings.
labels_path = "/home/ahmedyra/scratch/Dataset/mimic-cxr-2.0.0-chexpert.csv"
try:
    labels_df = pd.read_csv(labels_path)
    for id_column in ('subject_id', 'study_id'):
        labels_df[id_column] = labels_df[id_column].astype(str)
    logger.info(f"Loaded labels: {labels_df.shape[0]} rows")
except Exception as e:
    # Best effort: continue with an empty table so the rest of the cell still runs.
    logger.error(f"Error loading labels: {e}")
    labels_df = pd.DataFrame(columns=['subject_id', 'study_id'])

# Label columns; their order defines the index of each label in the target vectors
label_columns = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Enlarged Cardiomediastinum',
    'Fracture',
    'Lung Lesion',
    'Lung Opacity',
    'No Finding',
    'Pleural Effusion',
    'Pleural Other',
    'Pneumonia',
    'Pneumothorax',
    'Support Devices',
]

# Schema used to parse each serialized TFRecord example
feature_description = {
    'embedding': tf.io.FixedLenFeature([1376], tf.float32),
    'image/id': tf.io.FixedLenFeature([], tf.string),
    'image/format': tf.io.FixedLenFeature([], tf.string),
}

def extract_ids_from_path(path):
    """Return the ``(subject_id, study_id)`` digit strings found in ``path``.

    Two layouts are tried in order:
      1. ``/p<group>/p<subject>/s<study>/`` -- the subject is the second ``p`` number.
      2. ``p<subject>/s<study>`` anywhere in the string (fallback).

    Returns ``(None, None)`` when neither layout is present.
    """
    # (regex, (group index of subject_id, group index of study_id))
    layouts = (
        (r'/p(\d+)/p(\d+)/s(\d+)/', (2, 3)),
        (r'p(\d+)/s(\d+)', (1, 2)),
    )
    for layout, (subject_group, study_group) in layouts:
        found = re.search(layout, path)
        if found is not None:
            return found.group(subject_group), found.group(study_group)
    return None, None

class MIMICEmbeddingDataset(Dataset):
    """In-memory dataset of TFRecord image embeddings joined with study-level labels.

    Every TFRecord listed in ``file_paths`` is read eagerly at construction time.
    The ``image/id`` of each record is parsed with ``extract_ids_from_path`` and the
    resulting ``(subject_id, study_id)`` pair is looked up in ``labels_df``.

    Args:
        file_paths: Sequence of TFRecord paths relative to ``base_path``.
        base_path: Directory the relative paths are joined onto.
        labels_df: DataFrame holding ``subject_id`` and ``study_id`` columns plus
            the module-level ``label_columns``.
        drop_unmatched: When True, records without a matching label row are not
            stored. The default (False) keeps them; ``__getitem__`` then returns
            all-zero ``labels`` and a ``labels_one_hot`` with only 'No Finding' set.

    Attributes:
        data: List of dicts with keys ``embedding``, ``image_id``, ``subject_id``,
            ``study_id`` and ``labels`` (dict of raw label values, or None).
        matched_count / unmatched_count: Records with / without a label row.
        skipped_files: Files that were missing or raised while being read.
        failed_records: Individual records that raised while being parsed.
    """

    def __init__(self, file_paths, base_path, labels_df, drop_unmatched=False):
        self.file_paths = file_paths
        self.base_path = base_path
        self.labels_df = labels_df
        self.label_columns = label_columns
        self.drop_unmatched = drop_unmatched
        self.data = []

        self.matched_count = 0
        self.unmatched_count = 0
        self.skipped_files = 0
        self.failed_records = 0

        # (subject_id, study_id) -> {label column: raw value}. Built in one pass
        # instead of row-by-row iteration; later duplicates overwrite earlier ones.
        self.label_dict = {}
        if not self.labels_df.empty:
            label_rows = self.labels_df[self.label_columns].to_dict('records')
            id_pairs = zip(self.labels_df['subject_id'], self.labels_df['study_id'])
            self.label_dict = dict(zip(id_pairs, label_rows))

        logger.info(f"Created label dictionary with {len(self.label_dict)} entries")

        # Sanity-check id extraction on the first file before the long load
        if len(self.file_paths) > 0:
            test_path = self.file_paths[0]
            full_test_path = os.path.join(self.base_path, test_path)
            if os.path.exists(full_test_path):
                self._test_extraction(full_test_path)

        # Load dataset
        logger.info("Loading TFRecord files...")
        self._load_data()
        logger.info(f"Records with matched labels: {self.matched_count}")
        logger.info(f"Records without matched labels: {self.unmatched_count}")
        logger.info(f"Skipped files: {self.skipped_files}")
        logger.info(f"Records that failed to parse: {self.failed_records}")

    def _test_extraction(self, test_path):
        """Log the ids extracted from the first record of ``test_path`` and whether they have labels."""
        try:
            dataset = tf.data.TFRecordDataset(test_path)
            for record in dataset:
                parsed = tf.io.parse_single_example(record, feature_description)
                image_id = parsed['image/id'].numpy().decode('utf-8')
                subject_id, study_id = extract_ids_from_path(image_id)
                logger.info(f"Sample image_id: {image_id}")
                logger.info(f"Extracted IDs: subject_id={subject_id}, study_id={study_id}")
                if (subject_id, study_id) in self.label_dict:
                    logger.info("✓ Found matching entry in labels")
                else:
                    logger.warning("✗ No matching entry in labels")
                return  # only the first record is inspected
        except Exception as e:
            logger.warning(f"Error testing extraction: {e}")

    def _load_data(self):
        """Read every listed TFRecord file into ``self.data``, in chunks of 1000 files."""
        import gc  # local import, hoisted out of the batch loop

        batch_size = 1000
        total_files = len(self.file_paths)
        total_batches = (total_files + batch_size - 1) // batch_size

        for batch_start in range(0, total_files, batch_size):
            batch_paths = self.file_paths[batch_start:batch_start + batch_size]
            batch_num = batch_start // batch_size + 1

            logger.info(f"Processing batch {batch_num}/{total_batches}")

            for path in tqdm(batch_paths, desc=f"Batch {batch_num}/{total_batches}"):
                full_path = os.path.join(self.base_path, path)

                if not os.path.exists(full_path):
                    self.skipped_files += 1
                    continue

                try:
                    self._load_file(full_path)
                except Exception as e:
                    # Raised while opening or iterating the file; records already
                    # appended from it are kept.
                    self.skipped_files += 1
                    logger.debug(f"Skipping unreadable file {full_path}: {e}")

            # Log progress and free memory
            logger.info(f"Batch {batch_num} complete. Total records: {len(self.data)}")
            gc.collect()

    def _load_file(self, full_path):
        """Parse each record of one TFRecord file and append it to ``self.data``."""
        # NOTE(review): a 1280 MiB read buffer and parallel reads are requested for a
        # single file here -- confirm against the TFRecordDataset docs that this is intended.
        dataset = tf.data.TFRecordDataset(
            full_path,
            buffer_size=1280*1024*1024,
            num_parallel_reads=tf.data.experimental.AUTOTUNE
        )

        for record in dataset:
            try:
                parsed = tf.io.parse_single_example(record, feature_description)
                embedding = parsed['embedding'].numpy()
                image_id = parsed['image/id'].numpy().decode('utf-8')
            except Exception as e:
                # A malformed record is counted and skipped; the file continues.
                self.failed_records += 1
                logger.debug(f"Skipping unparsable record in {full_path}: {e}")
                continue

            subject_id, study_id = extract_ids_from_path(image_id)

            # Find matching labels (None when ids are missing or unknown)
            labels = None
            if subject_id and study_id:
                labels = self.label_dict.get((subject_id, study_id))
            if labels is None:
                self.unmatched_count += 1
                if self.drop_unmatched:
                    continue
            else:
                self.matched_count += 1

            self.data.append({
                'embedding': embedding,
                'image_id': image_id,
                'subject_id': subject_id,
                'study_id': study_id,
                'labels': labels
            })

    def __len__(self):
        """Number of records held in memory."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return one record as a dict of tensors and ids.

        Keys:
            embedding: float32 tensor built from the stored embedding.
            subject_id, study_id: ids as stored (may be None).
            labels: float32 tensor of raw label values with NaN replaced by 0;
                all zeros when the record has no label row.
            labels_one_hot: float32 tensor with 1 where the raw value equals 1;
                when no entry equals 1, only 'No Finding' is set.
        """
        item = self.data[idx]
        num_labels = len(self.label_columns)

        result = {
            'embedding': torch.tensor(item['embedding'], dtype=torch.float32),
            'subject_id': item['subject_id'],
            'study_id': item['study_id']
        }

        raw_labels = item['labels']
        if raw_labels is None:
            # Default labels if none available
            labels_tensor = torch.zeros(num_labels, dtype=torch.float32)
        else:
            # Missing columns and NaN both become 0
            label_values = []
            for col in self.label_columns:
                value = raw_labels.get(col, 0)
                label_values.append(0.0 if pd.isna(value) else float(value))
            labels_tensor = torch.tensor(label_values, dtype=torch.float32)

        # Only an exact 1 counts as positive; other values (e.g. -1) do not.
        one_hot = (labels_tensor == 1).to(torch.float32)
        if not bool(one_hot.any()):
            # If no positives, mark as "No Finding"
            one_hot[self.label_columns.index('No Finding')] = 1

        result['labels'] = labels_tensor
        result['labels_one_hot'] = one_hot
        return result

# Load data and create dataset
try:
    # The checksum manifest sits in the dataset root, so derive its location from
    # base_path instead of repeating the absolute path.
    file_path = os.path.join(base_path, "SHA256SUMS.txt")

    # Keep only manifest rows that mention "files/" and drop the checksum column
    with open(file_path, "r") as file:
        lines = [line.strip().split(maxsplit=1)[-1] for line in file if "files/" in line]

    # Filter for TFRecord files
    file_paths = [path for path in lines if path.endswith('.tfrecord')]

    logger.info(f"Found {len(file_paths)} TFRecord files")
    if file_paths:
        logger.info(f"Sample paths: {file_paths[:2]}")

    # Create the dataset (reads every TFRecord eagerly)
    dataset = MIMICEmbeddingDataset(file_paths, base_path, labels_df)
    logger.info(f"Dataset size: {len(dataset)}")

    # Display sample
    if dataset.data:
        sample = dataset[0]
        logger.info(f"Embedding shape: {sample['embedding'].shape}")
        logger.info(f"Labels shape: {sample['labels'].shape}")
except Exception as e:
    # Still best effort (the cell does not raise), but keep the traceback so the
    # failure can be diagnosed. If this fires, `dataset` is unset or stale.
    logger.exception(f"Error: {e}")

2025-03-14 16:21:39,031 - INFO - Loaded labels: 227827 rows
2025-03-14 16:21:39,339 - INFO - Found 243324 TFRecord files
2025-03-14 16:21:39,339 - INFO - Sample paths: ['files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.tfrecord', 'files/p10/p10000032/s53189527/2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab.tfrecord']


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Define a simple neural network model
class ChestXrayClassifier(nn.Module):
    """Fully connected classifier that maps an embedding vector to one logit per label.

    Each hidden layer is ``Linear -> ReLU -> Dropout``. The output layer is a plain
    ``Linear`` with no activation, so the result is raw logits (suitable for
    ``BCEWithLogitsLoss``).

    Args:
        input_dim: Size of the input embedding.
        hidden_dims: Iterable of hidden layer widths, applied in order. An empty
            iterable yields a single linear layer.
        output_dim: Number of output logits.
        dropout: Dropout probability used after every hidden layer.
    """

    def __init__(self, input_dim=1376, hidden_dims=(512, 256), output_dim=14, dropout=0.2):
        super().__init__()

        layers = []
        prev_dim = input_dim

        # Create hidden layers
        for hidden_dim in hidden_dims:
            layers.extend([nn.Linear(prev_dim, hidden_dim), nn.ReLU(), nn.Dropout(dropout)])
            prev_dim = hidden_dim

        # Output layer (no activation, will use BCEWithLogitsLoss)
        layers.append(nn.Linear(prev_dim, output_dim))

        # Attribute name kept as `model` so saved state_dict keys stay the same
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits of shape ``(..., output_dim)`` for input ``(..., input_dim)``."""
        return self.model(x)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Split dataset into train and test (seeded generator keeps the split reproducible)
test_size = 0.2
dataset_to_use = dataset  # Full dataset; assign a smaller subset here for quick experiments
dataset_size = len(dataset_to_use)
test_count = int(test_size * dataset_size)
train_count = dataset_size - test_count
if train_count == 0 or test_count == 0:
    # Fail early with a clear message instead of an obscure error further down
    raise ValueError(
        f"Need at least one sample in each split, got train={train_count}, test={test_count}"
    )

train_dataset, test_dataset = random_split(
    dataset_to_use, [train_count, test_count],
    generator=torch.Generator().manual_seed(42)
)

print(f"Training on {len(train_dataset)} samples, testing on {len(test_dataset)} samples")

# Create data loaders
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Initialize model
model = ChestXrayClassifier(input_dim=1376, output_dim=len(label_columns))
model.to(device)

# Initialize loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


def _evaluate(model, loader, criterion, device):
    """Run ``model`` over ``loader`` without gradients.

    Returns:
        (mean_loss, logits, probs, targets): the sample-weighted mean loss and three
        arrays stacked over all batches; ``probs`` is the sigmoid of ``logits``.
    """
    model.eval()
    total_loss = 0.0
    logit_chunks, target_chunks = [], []

    with torch.no_grad():
        for batch in loader:
            inputs = batch['embedding'].to(device)
            targets = batch['labels_one_hot'].to(device)

            outputs = model(inputs)
            total_loss += criterion(outputs, targets).item() * inputs.size(0)
            logit_chunks.append(outputs.cpu().numpy())
            target_chunks.append(targets.cpu().numpy())

    logits = np.vstack(logit_chunks)
    stacked_targets = np.vstack(target_chunks)
    # torch.sigmoid avoids the overflow warning of 1 / (1 + exp(-x)) on large logits
    probs = torch.sigmoid(torch.from_numpy(logits)).numpy()
    return total_loss / len(loader.dataset), logits, probs, stacked_targets


def _per_label_auc(targets, probs, label_names):
    """Return ``{label: AUC}`` for every label whose targets contain both classes.

    Labels with only positives or only negatives are left out, because ROC AUC is
    undefined for them.
    """
    scores = {}
    for i, name in enumerate(label_names):
        column = targets[:, i]
        positives = column.sum()
        if 0 < positives < len(column):
            scores[name] = roc_auc_score(column, probs[:, i])
    return scores


def _mean_auc(scores):
    """Mean of the AUC values, or NaN when no label could be scored."""
    return float(np.mean(list(scores.values()))) if scores else float('nan')


# Train model
num_epochs = 20
train_losses = []
test_losses = []
test_aucs = []

# Training loop
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in progress_bar:
        # Move data to device
        inputs = batch['embedding'].to(device)
        targets = batch['labels_one_hot'].to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Update statistics (weight by batch size; the last batch may be smaller)
        train_loss += loss.item() * inputs.size(0)
        progress_bar.set_postfix({'loss': loss.item()})

    train_loss /= len(train_loader.dataset)
    train_losses.append(train_loss)

    # Evaluation on the held-out split
    test_loss, all_outputs, all_probs, all_targets = _evaluate(model, test_loader, criterion, device)
    test_losses.append(test_loss)

    # Calculate AUC for each label
    aucs = _per_label_auc(all_targets, all_probs, label_columns)
    mean_auc = _mean_auc(aucs)
    test_aucs.append(mean_auc)

    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Test Loss: {test_loss:.4f}")
    print(f"  Mean AUC: {mean_auc:.4f}")

# Plot training curves
fig, (loss_ax, auc_ax) = plt.subplots(1, 2, figsize=(12, 5))
loss_ax.plot(train_losses, label='Train Loss')
loss_ax.plot(test_losses, label='Test Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

auc_ax.plot(test_aucs, label='Mean AUC')
auc_ax.set_xlabel('Epoch')
auc_ax.set_ylabel('AUC')
auc_ax.legend()
fig.tight_layout()
plt.show()

# Final evaluation
_, all_outputs, all_probs, all_targets = _evaluate(model, test_loader, criterion, device)
all_preds = (all_probs >= 0.5).astype(int)
# Recomputed here so the summary never relies on values left over from the last epoch
aucs = _per_label_auc(all_targets, all_probs, label_columns)

# Calculate metrics for each label
print("\nFinal metrics by condition:")
for i, label in enumerate(label_columns):
    # Skip labels whose test targets do not contain both classes
    if label not in aucs:
        continue

    auc = aucs[label]
    accuracy = accuracy_score(all_targets[:, i], all_preds[:, i])
    precision = precision_score(all_targets[:, i], all_preds[:, i], zero_division=0)
    recall = recall_score(all_targets[:, i], all_preds[:, i], zero_division=0)
    f1 = f1_score(all_targets[:, i], all_preds[:, i], zero_division=0)

    print(f"{label}:")
    print(f"  AUC: {auc:.4f}")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1: {f1:.4f}")

print(f"\nOverall Mean AUC: {_mean_auc(aucs):.4f}")

# Save model
torch.save(model.state_dict(), 'chest_xray_model.pth')
print("Model saved to 'chest_xray_model.pth'")

# Testing the dataset built from the preprocessed pickle (PKL) file

In [18]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import tensorflow as tf

class MIMICDataset(Dataset):
    """
    A PyTorch Dataset for the MIMIC Chest X-ray data with embeddings.

    Rows come from a pickled DataFrame; the embedding of each row is read lazily
    from the TFRecord file named in its ``path`` column when the row is indexed.
    """
    # Length of the all-zero fallback embedding; class-level default so instances
    # created without __init__ still have it.
    embedding_dim = 1376

    def __init__(self, data_path, base_path="/home/ahmedyra/scratch/Dataset/", transform=None,
                 embedding_dim=1376):
        """
        Args:
            data_path (str): Path to the pickle file containing the preprocessed data
            base_path (str): Base path to the embeddings
            transform (callable, optional): Optional transform to be applied on a sample
            embedding_dim (int): Length of the zero vector returned when an
                embedding cannot be read
        """
        # NOTE(security): unpickling can execute arbitrary code -- only load pickle
        # files from a trusted source.
        self.data_df = pd.read_pickle(data_path)
        self.transform = transform
        self.base_path = base_path
        self.embedding_dim = embedding_dim
        self.labels = ['Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
                       'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
                       'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
                       'Support Devices', 'No Finding']
        self.demographic = ['gender', 'insurance', 'anchor_age', 'race']

    def __read_tf_record__(self, file_path):
        """Read a TensorFlow record file and parse its first example.

        Raises:
            ValueError: If the file contains no records.
        """
        full_path = f"{self.base_path}/{file_path}"
        raw_dataset = tf.data.TFRecordDataset(full_path)
        for raw_record in raw_dataset.take(1):
            example = tf.train.Example()
            example.ParseFromString(raw_record.numpy())
            return example
        # Explicit error instead of an implicit None, so the caller's message
        # names the real problem.
        raise ValueError(f"No records found in {full_path}")

    def __len__(self):
        """Return the total number of samples"""
        return len(self.data_df)

    def __getitem__(self, idx):
        """
        Args:
            idx (int): Index of the sample to fetch
        Returns:
            dict: A dictionary containing the data sample. If the embedding cannot
            be read, the error is printed and ``embedding`` is all zeros.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Get the row from the dataframe
        row = self.data_df.iloc[idx]

        # Get the embedding
        try:
            example = self.__read_tf_record__(row['path'])
            # Extract embedding values from float_list
            embedding_values = np.array(example.features.feature['embedding'].float_list.value, dtype=np.float32)
        except Exception as e:
            print(f"Error loading embedding for {row['path']}: {e}")
            # Fallback to zeros if there's an error
            embedding_values = np.zeros(self.embedding_dim, dtype=np.float32)

        # Get the labels
        labels = row[self.labels].values.astype(np.float32)

        # Get the demographic data
        # NOTE(review): this is an object-dtype array; it may not batch through the
        # default DataLoader collate function -- confirm before batching.
        demographics = row[self.demographic].values

        # Create sample dictionary
        sample = {
            'embedding': torch.tensor(embedding_values, dtype=torch.float32),
            'labels': torch.tensor(labels, dtype=torch.float32),
            'demographics': demographics,
            'gender': row['gender'],
            'insurance': row['insurance'],
            'anchor_age': row['anchor_age'],
            'race': row['race'],
            'study_id': row['study_id'],
            'dicom_id': row['dicom_id'],
            'path': row['path']
        }

        if self.transform:
            sample = self.transform(sample)

        return sample

# Instantiate the dataset from the preprocessed pickle
dataset = MIMICDataset("/home/ahmedyra/projects/def-hinat/ahmedyra/EECS_Fairness_Project/preprocessed_data.pkl")

# Report how many samples it holds
print(f"Dataset length: {len(dataset)}")

# Fetch the first sample and list its fields; tensors are summarized by shape
sample = dataset[0]
print("\nSample contents:")
for key, value in sample.items():
    if not isinstance(value, torch.Tensor):
        print(f"{key}: {value}")
        continue
    print(f"{key}: Tensor shape {value.shape}")

# Leading entries of the embedding vector
print("\nFirst 10 embedding values:")
print(sample['embedding'][:10])

# One line per label, in the order the dataset defines them
print("\nLabels:")
for i, label_name in enumerate(dataset.labels):
    print(f"{label_name}: {sample['labels'][i]}")
print(sample.keys())

Dataset length: 228905

Sample contents:
embedding: Tensor shape torch.Size([1376])
labels: Tensor shape torch.Size([14])
demographics: ['F' 'Medicaid' 52.0 'WHITE']
gender: F
insurance: Medicaid
anchor_age: 52.0
race: WHITE
study_id: 50414267
dicom_id: 02aa804e-bde0afdd-112c0b34-7bc16630-4e384014
path: generalized-image-embeddings-for-the-mimic-chest-x-ray-dataset-1.0/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.tfrecord

First 10 embedding values:
tensor([ 0.1257, -1.8030,  1.2843, -1.8088,  0.1278, -0.1912,  0.6093,  0.8306,
        -0.4438,  1.2389])

Labels:
Enlarged Cardiomediastinum: 0.0
Cardiomegaly: 0.0
Lung Opacity: 0.0
Lung Lesion: 0.0
Edema: 0.0
Consolidation: 0.0
Pneumonia: 0.0
Atelectasis: 0.0
Pneumothorax: 0.0
Pleural Effusion: 0.0
Pleural Other: 0.0
Fracture: 0.0
Support Devices: 0.0
No Finding: 1.0
dict_keys(['embedding', 'labels', 'demographics', 'gender', 'insurance', 'anchor_age', 'race', 'study_id', 'dicom_id', 'path'])


In [19]:
# Display the label tensor of the sample fetched in the previous cell
sample['labels']

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])