## Data Processing - PyTorch

In [1]:
import os
import pandas as pd
import numpy as np
import cv2
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt

In [2]:
# Define the dataset class
class ChestXRayDataset(Dataset):
    """Map-style dataset that reads images from disk with OpenCV.

    Args:
        image_paths: Indexable sequence of image file paths.
        labels: Indexable sequence of labels; ``labels[i]`` is returned
            together with the image loaded from ``image_paths[i]``.
        transform: Optional callable applied to the loaded RGB image array.
            When ``None`` the raw array is returned.
    """

    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of image paths held by the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load the image at ``idx`` and return ``(image, label)``.

        Raises:
            OSError: If OpenCV cannot read the file (missing, unreadable, or
                in an unsupported format).
        """
        image_path = self.image_paths[idx]
        image = cv2.imread(image_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than with an opaque
        # cv2.error from cvtColor below.
        if image is None:
            raise OSError(f"Could not read image file: {image_path!r}")
        # OpenCV decodes to BGR channel order; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            image = self.transform(image)

        label = self.labels[idx]
        return image, label

In [3]:
# Read the label CSV and turn the findings column into a multi-hot matrix.
data = pd.read_csv('/Users/ananyajain/Desktop/CSC413/CSC413-Final-Project/archive/sample_labels.csv')

# Each 'Finding Labels' entry is split on '|' into a list of label strings.
data['labels'] = data['Finding Labels'].apply(lambda findings: findings.split('|'))

# One column per distinct label; the column order is exposed as mlb.classes_.
mlb = MultiLabelBinarizer()
# Disabled sanity check, kept for reference:
# print("Number of classes:", len(mlb.classes_))
# if len(mlb.classes_) != 15:
#     raise ValueError("The number of target classes does not match num_classes in the model")
# Cast the binarizer output to float.
labels = mlb.fit_transform(data['labels']).astype(float)

In [4]:
image_dir = '/Users/ananyajain/Desktop/CSC413/CSC413-Final-Project/archive/sample/images'
# One full path per row of the label table, in the same row order.
image_paths = [
    os.path.join(image_dir, file_name)
    for file_name in data['Image Index']
]

# Preprocessing and augmentation pipeline applied to each loaded image array.
transform = transforms.Compose([
    # Array -> PIL image, so the PIL-based transforms below can be applied.
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    # Random augmentation: horizontal flip and rotation of up to 20 degrees.
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(20),
    transforms.ToTensor(),
    # NOTE(review): these values match the ImageNet channel statistics commonly
    # used with torchvision pretrained models - confirm against the model.
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])


In [5]:
# Split the data: 80% train, then the remaining 20% is halved into
# validation and test (10% each). Fixed seeds keep the split reproducible.
train_paths, val_test_paths, train_labels, val_test_labels = train_test_split(
    image_paths, labels, test_size=0.2, random_state=42)
val_paths, test_paths, val_labels, test_labels = train_test_split(
    val_test_paths, val_test_labels, test_size=0.5, random_state=42)

# Evaluation must be deterministic: derive a transform from the training
# pipeline with its random augmentation steps removed, so validation/test
# images are only resized, converted and normalised. Deriving it (rather than
# re-listing the steps) keeps resize/normalisation in sync with `transform`.
_RANDOM_AUGMENTATIONS = (transforms.RandomHorizontalFlip, transforms.RandomRotation)
eval_transform = transforms.Compose([
    step for step in transform.transforms
    if not isinstance(step, _RANDOM_AUGMENTATIONS)
])

# Create datasets: augmentation only on the training split.
train_dataset = ChestXRayDataset(train_paths, train_labels, transform)
val_dataset = ChestXRayDataset(val_paths, val_labels, eval_transform)
test_dataset = ChestXRayDataset(test_paths, test_labels, eval_transform)

# Create DataLoaders: shuffle only the training data; keep evaluation order
# fixed so results are comparable between runs.
batch_size = 16
loader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
loader_val = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
loader_test = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)