## Processing the data

In [2]:
import glob
import os
import random
import math
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
# import tensorflow as tf  # Commented out - not compatible with Python 3.14
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

# For data augmentation and preprocessing, we'll use torchvision instead of Keras
from torchvision import transforms
from PIL import Image

warnings.filterwarnings('ignore')


In [9]:
class DatasetParser():
    """Pair the PNG images found on disk with their rows in the labels CSV.

    Attributes:
        image_paths: Sorted list of every ``*.png`` path under
            ``root_dir/images_dir``.
        labels_df: DataFrame with columns ``Id`` (the CSV ``Image Index``
            value) and ``Label`` (list of finding names). Only rows whose
            image exists on disk are kept, and the original CSV row index is
            preserved, so the index is NOT guaranteed to be ``0..n-1``.
        labels: Ordered class names; the position of a name is its slot in
            the one-hot vectors produced by ``get_labels_df``.
    """

    def __init__(self, root_dir, images_dir, labels_csv):
        """Scan ``root_dir/images_dir`` for PNGs and load ``root_dir/labels_csv``."""
        self.image_paths = sorted(glob.glob(os.path.join(root_dir, images_dir, "*.png")))
        self.labels_df = self._labels_by_task(root_dir=root_dir, labels_csv=labels_csv, images_dir=images_dir)

        self.labels = ['Cardiomegaly', 'Emphysema', 'Effusion',
                       'Hernia', 'Nodule', 'Pneumothorax', 'Atelectasis',
                       'Pleural_Thickening', 'Mass', 'Edema', 'Consolidation',
                       'Infiltration', 'Fibrosis', 'Pneumonia', 'No Finding']

    def visualize_random_images(self, num_images=1, label=None, display_label=False):
        """Plot a random grid of images, optionally restricted to one finding.

        Args:
            num_images: Maximum number of images to draw. Fewer are shown when
                fewer candidates exist.
            label: If given, only images whose finding list contains this
                exact name are candidates.
            display_label: When True, each subplot is titled with the image's
                finding list.

        Raises:
            ValueError: If there is nothing to draw (no candidate image, or
                ``num_images`` is not positive).
            OSError: If a selected image file cannot be decoded.
        """
        ids = self.labels_df['Id'].tolist()
        findings = self.labels_df['Label'].tolist()

        # Work with row *positions*: labels_df keeps the CSV index, so
        # label-based lookups such as labels_df['Label'][i] are unsafe.
        if label is None:
            candidates = range(len(ids))
        else:
            candidates = [pos for pos, found in enumerate(findings) if label in found]

        num_images = min(num_images, len(candidates))
        if num_images <= 0:
            raise ValueError(
                f"No images to display (label={label!r}, num_images={num_images})"
            )
        chosen = random.sample(candidates, num_images)

        # Resolve each row to its file by name instead of assuming that the
        # sorted path list and the CSV rows share the same ordering.
        path_by_name = {os.path.basename(path): path for path in self.image_paths}

        num_rows = math.ceil(math.sqrt(num_images))
        num_cols = math.ceil(num_images / num_rows)

        fig = plt.figure(figsize=(20, 20))
        for slot, pos in enumerate(chosen, start=1):
            img_path = path_by_name[os.path.basename(ids[pos])]
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals failure by returning None, not by raising.
                raise OSError(f"Could not read image: {img_path}")
            ax = fig.add_subplot(num_rows, num_cols, slot)
            if display_label:
                ax.set_title(findings[pos], wrap=True)
            ax.axis('off')
            ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

        # tight_layout only affects axes that already exist, so it has to run
        # after the subplots are created.
        fig.tight_layout(pad=10.0)

    def _labels_by_task(self, root_dir=None, labels_csv=None, images_dir="."):
        """Load the labels CSV and keep only rows whose image file is present.

        Returns:
            DataFrame with ``Id`` (from the ``Image Index`` column) and
            ``Label`` (the ``Finding Labels`` column split on ``'|'``),
            indexed by the original CSV row number.
        """
        labels_df = pd.read_csv(os.path.join(root_dir, labels_csv))

        # os.path.join copes with images_dir == "." on its own, so no special
        # case is needed for images stored directly in root_dir.
        available = {
            os.path.basename(path)
            for path in glob.glob(os.path.join(root_dir, images_dir, '*.png'))
        }
        on_disk = labels_df['Image Index'].map(os.path.basename).isin(available)
        labels_df = labels_df[on_disk]

        return pd.DataFrame({
            'Id': labels_df['Image Index'],
            'Label': labels_df['Finding Labels'].apply(lambda val: val.split('|')),
        })

    def get_labels_df(self):
        """Return a copy of ``labels_df`` whose ``Label`` column is one-hot encoded.

        Each ``Label`` cell becomes a list of 0/1 ints ordered like
        ``self.labels``. ``self.labels_df`` itself is left untouched.

        Raises:
            ValueError: If a row contains a finding that is not in ``self.labels``.
        """
        slot_of = {name: pos for pos, name in enumerate(self.labels)}

        def encode(found):
            one_hot = [0] * len(self.labels)
            for name in found:
                if name not in slot_of:
                    raise ValueError(f"{name!r} is not in list")
                one_hot[slot_of[name]] = 1
            return one_hot

        encoded_df = self.labels_df.copy()
        # Replace the whole column at once: per-cell chained assignment
        # (df['Label'][i] = ...) depends on a 0..n-1 index and does not write
        # through under pandas copy-on-write.
        encoded_df['Label'] = self.labels_df['Label'].map(encode)
        return encoded_df

    def sample(self, num_samples, is_weighted=False, random_state=None):
        """Draw ``num_samples`` rows from ``labels_df`` without replacement.

        Args:
            num_samples: Number of rows to draw.
            is_weighted: When True, a row's sampling weight is proportional to
                its number of findings plus 0.04, favouring multi-finding rows.
            random_state: Optional seed/generator forwarded to
                ``DataFrame.sample`` for reproducible draws. The default
                ``None`` keeps the previous behaviour.

        Returns:
            The sampled rows, carrying their original index.
        """
        if not is_weighted:
            return self.labels_df.sample(num_samples, random_state=random_state)

        weights = self.labels_df['Label'].map(len).to_numpy(dtype=float) + 4e-2
        weights /= weights.sum()
        return self.labels_df.sample(num_samples, weights=weights, random_state=random_state)

In [10]:
# Resolve the data location from the working directory instead of a
# machine-specific absolute path.
# If the notebook is run from inside the "Project" folder, PROJECT_ROOT is its
# parent; otherwise the working directory itself is taken as PROJECT_ROOT.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..")) if os.path.basename(os.getcwd()) == "Project" else os.getcwd()

# Configure paths - update these to match your data location
ROOT_DIR = os.path.join(PROJECT_ROOT, "Project", "input")
# Images are directly in the input folder, not in a subdirectory
IMAGES_DIR = "."
LABELS_CSV = "Data_Entry_2017_v2020.csv"

# Fail early with an actionable message when the working directory does not
# lead to the expected data layout.
if not os.path.isfile(os.path.join(ROOT_DIR, LABELS_CSV)):
    raise FileNotFoundError(
        f"Labels CSV not found: {os.path.join(ROOT_DIR, LABELS_CSV)} "
        "(check the working directory or update ROOT_DIR / LABELS_CSV)"
    )

parser = DatasetParser(root_dir=ROOT_DIR,
                       images_dir=IMAGES_DIR,
                       labels_csv=LABELS_CSV)
print("Data root:", ROOT_DIR)
print("Total Trainable Data: ", parser.labels_df.shape[0])

Data root: d:\MSE\10.Deep Learning\Group_Final\ViT-Chest-Xray\Project\input
Total Trainable Data:  112120


In [11]:
# Seed NumPy's global generator before sampling: DataFrame.sample draws from it
# when no random_state is supplied, so without a seed the subset (and every
# split derived from it) changes on each run.
np.random.seed(42)
df = parser.sample(100, is_weighted=True)
df.head()

Unnamed: 0,Id,Label
5605,00001504_012.png,[Pleural_Thickening]
106005,00028526_000.png,[Infiltration]
11933,00003109_012.png,[No Finding]
100783,00026768_000.png,[No Finding]
34991,00009231_009.png,[No Finding]


In [12]:
# Hold out 20% of the sample as the test set ...
train_val, test = train_test_split(df, test_size=0.2, random_state=42)
# ... then take a quarter of the remaining 80% for validation,
# which gives a 60% / 20% / 20% train / val / test partition overall.
train, val = train_test_split(train_val, test_size=0.25, random_state=42)

# Give every partition a fresh 0..n-1 index.
train, val, test = (part.reset_index(drop=True) for part in (train, val, test))

for caption, part in (("Training set size: ", train),
                      ("Validation set size: ", val),
                      ("Test set size: ", test)):
    print(caption, len(part))

Training set size:  60
Validation set size:  20
Test set size:  20


In [13]:
# PyTorch Dataset class for chest X-ray data
class ChestXrayDataset(Dataset):
    """Map-style dataset yielding ``(image, multi-hot label)`` pairs.

    Args:
        dataframe: Frame with an ``Id`` column (image file name) and a
            ``Label`` column (iterable of finding names). Its index is reset
            so items are addressed by position.
        images_path: Directory that contains the image files.
        labels: Ordered class names; a name's position is its slot in the
            label vector.
        transform: Optional callable applied to the PIL image.
        is_training: Stored on the instance; not used by this class itself.
    """

    def __init__(self, dataframe, images_path, labels, transform=None, is_training=True):
        self.dataframe = dataframe.reset_index(drop=True)
        self.images_path = images_path
        self.labels = labels
        self.transform = transform
        self.is_training = is_training

    def __len__(self):
        """Number of rows in the backing dataframe."""
        return len(self.dataframe)

    def _encode_labels(self, label_list):
        """Return a float32 multi-hot tensor for ``label_list``.

        Names that are not in ``self.labels`` are ignored.
        """
        one_hot = [0 for _ in self.labels]
        for name in label_list:
            if name in self.labels:
                one_hot[self.labels.index(name)] = 1
        return torch.tensor(one_hot, dtype=torch.float32)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and build its label vector.

        Returns:
            ``(image, label)`` where ``image`` is the RGB PIL image (or the
            result of ``transform`` when one was given) and ``label`` is a
            float32 tensor with one entry per name in ``self.labels``.

        Raises:
            FileNotFoundError: If the image file does not exist.
            OSError: If the file exists but cannot be decoded.
        """
        row = self.dataframe.iloc[idx]
        img_path = os.path.join(self.images_path, row['Id'])

        # cv2.imread returns None instead of raising, which would otherwise
        # surface as an opaque error inside cvtColor.
        if not os.path.isfile(img_path):
            raise FileNotFoundError(f"Image file not found: {img_path}")
        image = cv2.imread(img_path)
        if image is None:
            raise OSError(f"Could not decode image: {img_path}")

        # OpenCV decodes to BGR; convert to RGB before handing off to PIL.
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        label = self._encode_labels(row['Label'])

        if self.transform:
            image = self.transform(image)

        return image, label

# Image directory path - built from the same ROOT_DIR / IMAGES_DIR pair that was
# handed to the parser, so changing IMAGES_DIR keeps both in sync.
# With IMAGES_DIR == "." this resolves to ROOT_DIR itself.
IMAGES_PATH = os.path.normpath(os.path.join(ROOT_DIR, IMAGES_DIR))

# Shared preprocessing / loading settings
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32
# Standard ImageNet per-channel statistics
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Define transforms for training (with augmentation)
train_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=5),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD)
])

# Define transforms for validation/test (no augmentation)
val_test_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD)
])

# Create PyTorch datasets
train_dataset = ChestXrayDataset(train, IMAGES_PATH, parser.labels, transform=train_transform, is_training=True)
val_dataset = ChestXrayDataset(val, IMAGES_PATH, parser.labels, transform=val_test_transform, is_training=False)
test_dataset = ChestXrayDataset(test, IMAGES_PATH, parser.labels, transform=val_test_transform, is_training=False)

# Create data loaders (only the training loader shuffles)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

print("Images path:", IMAGES_PATH)
print("Training batches:", len(train_loader))
print("Validation batches:", len(val_loader))
print("Test batches:", len(test_loader))

Images path: d:\MSE\10.Deep Learning\Group_Final\ViT-Chest-Xray\Project\input
Training batches: 2
Validation batches: 1
Test batches: 1
