In [None]:
import os
import torch
from datasets import get_dataloader, custom_collate_fn

# The dataset code has to live in a separate folder so that the preprocessing jobs can be parallelized, which saves time.

# =============================================================================
# Adjust these paths according to your folder structure.
# =============================================================================
if __name__ == '__main__':
    # Smoke test for the data pipeline: build the train/test loaders and print
    # the contents of a single training batch.
    from torch.utils.data import DataLoader

    # Shared loader settings, named once so every loader below stays in sync.
    batch_size = 32
    num_workers = 4

    # Set this to your individual parent folder
    base_dir = "/Users/edgarsuritis/Downloads/FinalProjectData/"

    # CSV files of the "easy" split (defined for convenience; the loaders below
    # currently use the smaller James/Rashik splits instead).
    csv_train = os.path.join(base_dir, "TestTrainSplits", "train_test_easy", "train.csv")
    csv_test = os.path.join(base_dir, "TestTrainSplits", "train_test_easy", "test.csv")

    # Additional smaller dataset splits
    csv_james = os.path.join(base_dir, "TestTrainSplits", "James.csv")
    csv_rashik = os.path.join(base_dir, "TestTrainSplits", "Rashik.csv")

    # Directory containing JPEG images.
    images_dir = os.path.join(base_dir, "JPEGImage")
    # Directory containing positive XML annotations.
    annotations_dir = os.path.join(base_dir, "positive-Annotation")

    # DataLoaders for training and testing.
    train_loader = get_dataloader(
        csv_james, images_dir, annotations_dir, batch_size=batch_size, train=True
    )
    test_loader = get_dataloader(
        csv_rashik, images_dir, annotations_dir, batch_size=batch_size, train=False
    )
    # NOTE(review): test_loader is not rebuilt with custom_collate_fn below;
    # confirm that get_dataloader already sets collate_fn before iterating it.

    # The proper fix is to set collate_fn inside get_dataloader, e.g.:
    # return DataLoader(dataset, batch_size=batch_size, shuffle=train, num_workers=4, collate_fn=custom_collate_fn)
    # For this demonstration we instead re-wrap the training dataset here so
    # that batches are assembled by custom_collate_fn.
    train_loader = DataLoader(
        train_loader.dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        collate_fn=custom_collate_fn,
    )

    # Simple test: look at one batch only, then stop. Without the break the
    # loop would load and print every batch in the dataset.
    for imgs, targets in train_loader:
        print("Train Images shape:", imgs.shape)  # Expected: [batch, 3, 416, 416]
        # A list with one tensor per image; in the recorded run each tensor has
        # shape [N, 5] (presumably class index followed by four box values).
        print("Train Targets:", targets)
        break
        


Train Images shape: torch.Size([32, 3, 416, 416])
Train Targets: [tensor([[0.0000, 0.6633, 0.3523, 0.3456, 0.3917],
        [0.0000, 0.1480, 0.7422, 0.2402, 0.3882],
        [3.0000, 0.5375, 0.7625, 0.4346, 0.2016],
        [2.0000, 0.4091, 0.2468, 0.3659, 0.2248]]), tensor([[0.0000, 0.4022, 0.5041, 0.2478, 0.2341],
        [3.0000, 0.4701, 0.2202, 0.2872, 0.1483],
        [3.0000, 0.5705, 0.7265, 0.4371, 0.2410]]), tensor([], size=(0, 5)), tensor([[1.0000, 0.2789, 0.8210, 0.5121, 0.1611]]), tensor([[1.0000, 0.6328, 0.5805, 0.5184, 0.3129],
        [1.0000, 0.6595, 0.5869, 0.1626, 0.5504]]), tensor([], size=(0, 5)), tensor([], size=(0, 5)), tensor([[0.0000, 0.8488, 0.3656, 0.3024, 0.2607],
        [3.0000, 0.7992, 0.7213, 0.3532, 0.3395]]), tensor([[0.0000, 0.4130, 0.6813, 0.2643, 0.2688]]), tensor([[1.0000, 0.5775, 0.7735, 0.4435, 0.2955],
        [0.0000, 0.4714, 0.5365, 0.1347, 0.2202],
        [0.0000, 0.2802, 0.4930, 0.2020, 0.1703]]), tensor([[0.0000, 0.4701, 0.4589, 0.2490, 0.20