In [1]:
import os
import pandas as pd
import torch
from dotenv import load_dotenv
from src.XRayDataset import XRayDataset
from src.utils import compute_mean_std, compute_class_weights
from src.prePro import preprocess_metadata, calculate_balanced_label_statistics, distribution_df_split
from src.ICNTrainer import ICNTrainer
from torch.utils.data import DataLoader
import torchvision.models as models
import torch.nn as nn

# Load environment variables from the .env file; DATA_DIR supplies the folder
# name that every relative path below is built from.
load_dotenv()
data_dir = os.getenv('DATA_DIR')

# os.getenv returns None for an unset variable, and the f-strings below would
# silently render that as the literal folder name "None". Fail fast instead.
if data_dir is None:
    raise RuntimeError(
        "DATA_DIR is not set; define it in the .env file or the environment "
        "before running this notebook."
    )

# Preprocess the metadata. The arguments look like (raw metadata CSV, raw image
# folder, processed CSV destination) -- TODO confirm against src.prePro.
filtered_df = preprocess_metadata(
    f'../{data_dir}/raw/xraysMD.csv',
    f'../{data_dir}/raw/xrays',
    f'../{data_dir}/processed/xraysMD.csv'
)

# Label statistics computed from the filtered metadata.
filtered_df_stats = calculate_balanced_label_statistics(filtered_df)



In [2]:
# Split the filtered metadata into a training frame and a test/validation frame
# (requested sizes: 7000 and 3000 rows).
train_df, test_df = distribution_df_split(filtered_df, train_size=7000, test_size=3000)

# Print sizes of the resulting DataFrames
print(f"Training set size: {len(train_df)}")
print(f"Test/Validation set size: {len(test_df)}")

# Count label occurrences per split. Each 'Labels' entry is iterated as a
# collection, so a single row can contribute to several label counts.
train_distribution = pd.Series([label for labels in train_df['Labels'] for label in labels]).value_counts()
test_distribution = pd.Series([label for labels in test_df['Labels'] for label in labels]).value_counts()

# Combine both distributions into one DataFrame aligned on the label name.
# A label missing from one split shows up as NaN, which upcasts that column to
# float; fill with 0 and cast back so counts are always whole numbers.
statistics_df = pd.DataFrame({
    'Training': train_distribution,
    'Test/Validation': test_distribution
}).fillna(0).astype("int64")

# Add a total row to both columns
statistics_df.loc['Total'] = statistics_df.sum()

print("\nCombined label distribution statistics:")
print(statistics_df)


Training set size: 7000
Test/Validation set size: 3000

Combined label distribution statistics:
                    Training  Test/Validation
Atelectasis              687              268
Cardiomegaly             211               78
Consolidation            229               98
Edema                     99               39
Effusion                 654              292
Emphysema                149               61
Fibrosis                 135               49
Hernia                    26                4
Infiltration            1030              434
Mass                     270              122
No Finding              4037             1757
Nodule                   338              166
Pleural_Thickening       230               94
Pneumonia                 79               28
Pneumothorax             293              135
Total                   8467             3625


In [3]:
# Compute mean and std using a list of image paths.
# The image folder is built from DATA_DIR (data_dir) so it always matches the
# folder handed to preprocess_metadata, instead of a hard-coded "../data".
image_paths = [f"../{data_dir}/raw/xrays/{image_id}.png" for image_id in train_df['ImageID']]

# NOTE(review): the batch printed by the DataLoader check in this notebook is
# 300x300 -- confirm that 1000 is the size the statistics should be computed at.
img_size = 1000  # Set to your desired size

mean, std = compute_mean_std(image_paths, img_size)

print(f"Computed Mean: {mean}, Computed Std: {std}")


KeyboardInterrupt: 

In [4]:
# Testing the DataLoader: pull a single batch and verify its layout.
# NOTE(review): `train_loader` is not created in any cell visible in this part
# of the notebook -- confirm it is defined earlier so Restart & Run All works.
data_iter = iter(train_loader)
images, labels = next(data_iter)

# Enforce the expected layout rather than only describing it in comments.
assert images.ndim == 4, (
    f"expected images as [batch_size, channels, height, width], got {tuple(images.shape)}"
)
assert labels.ndim == 2, (
    f"expected labels as [batch_size, num_labels], got {tuple(labels.shape)}"
)
assert images.shape[0] == labels.shape[0], "images and labels disagree on batch size"

# Print the shapes of the batch
print(f"Images batch shape: {images.shape}")  # Should be [batch_size, channels, height, width]
print(f"Labels batch shape: {labels.shape}")  # Should be [batch_size, num_labels]

# Check the individual data points (optional)
print(f"First image shape: {images[0].shape}")
print(f"First label: {labels[0]}")

Images batch shape: torch.Size([20, 1, 300, 300])
Labels batch shape: torch.Size([20, 15])
First image shape: torch.Size([1, 300, 300])
First label: tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.])
