# 1 Imports

In [56]:
import os
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet50

from tqdm import tqdm

# 2 Find All Available Image Files (Fixed for Your Structure)

In [57]:
# Relative paths of the twelve image shard folders: images_001_lighter/images ... images_012_lighter/images
image_folders = ["images_{:03d}_lighter/images".format(shard) for shard in range(1, 13)]

# Collect the file names (not full paths) of every image found in any shard
available_images = set()
for folder in image_folders:
    folder_path = os.path.join("nih_chest_xrays_light", folder)
    if not os.path.exists(folder_path):
        # Shard folder is absent on this machine; nothing to collect from it
        continue
    available_images.update(
        entry
        for entry in os.listdir(folder_path)
        if entry.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

print("Total image files found:", len(available_images))


Total image files found: 112120


# 3 Load CSV and Filter Valid Images

In [58]:
df = pd.read_csv("nih_chest_xrays_light/Data_Entry_2017 copy.csv")

# Rewrite the extension from .png to .jpg.
# regex=False makes this a literal substring replacement on every pandas version;
# with regex matching the leading '.' would act as a wildcard.
df['Image Index'] = df['Image Index'].str.strip().str.replace('.png', '.jpg', regex=False)

# Add binary label: 0 when the finding is exactly 'No Finding', 1 for anything else
df['label'] = df['Finding Labels'].apply(lambda x: 0 if x == 'No Finding' else 1)

# Keep only rows whose image file name was found on disk
df = df[df['Image Index'].isin(available_images)]

print("Filtered dataset size:", len(df))
print("Label distribution:\n", df['label'].value_counts())


Filtered dataset size: 112120
Label distribution:
 label
0    60361
1    51759
Name: count, dtype: int64


# 4 Define Image Transformations

In [59]:
# Preprocessing pipeline applied to every image:
#   1. resize to 224x224,
#   2. convert the PIL image to a tensor,
#   3. normalise each channel with the mean/std values commonly used with
#      ImageNet-pretrained torchvision models.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])


# 5 Define Custom Dataset Class

In [60]:
class ChestXrayDataset(Dataset):
    """Chest X-ray dataset driven by a DataFrame of image file names and labels.

    Each row of ``df`` provides an ``'Image Index'`` (file name) and a
    ``'label'`` value. Image files are looked up under
    ``<root_dir>/images_NNN_lighter/images/`` for NNN = 001 .. ``num_folders``.

    Args:
        df: DataFrame with ``'Image Index'`` and ``'label'`` columns. Its index
            is reset so samples can be addressed by position 0..len-1.
        root_dir: Directory that contains the ``images_NNN_lighter`` folders.
        transform: Optional callable applied to the loaded RGB PIL image.
        num_folders: How many numbered shard folders to search (default 12).
    """

    def __init__(self, df, root_dir, transform=None, num_folders=12):
        self.df = df.reset_index(drop=True)
        self.root_dir = root_dir
        self.transform = transform
        self.num_folders = num_folders
        # file name -> resolved path, so each file is searched for only once
        # instead of probing every shard folder on every access.
        self._path_cache = {}

    def __len__(self):
        """Return the number of samples (rows of the DataFrame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        The image is opened, converted to RGB and passed through
        ``self.transform`` when one was given.

        Raises:
            FileNotFoundError: If the image is in none of the shard folders.
        """
        img_name = self.df.loc[idx, 'Image Index']
        label = self.df.loc[idx, 'label']
        img_path = self._find_image(img_name)
        # The context manager closes the underlying file deterministically;
        # convert() returns an independent in-memory image.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label

    def _find_image(self, filename):
        """Return the path of ``filename`` inside the first shard that has it.

        Resolved paths are memoised per file name.

        Raises:
            FileNotFoundError: If no shard folder contains ``filename``.
        """
        cached_path = self._path_cache.get(filename)
        if cached_path is not None:
            return cached_path
        for i in range(1, self.num_folders + 1):
            path = os.path.join(self.root_dir, f"images_{str(i).zfill(3)}_lighter", "images", filename)
            if os.path.exists(path):
                self._path_cache[filename] = path
                return path
        raise FileNotFoundError(f"{filename} not found.")


# 6 Create Datasets and Loaders

In [61]:
def _read_split_list(list_path):
    """Read a split list file into a set of file names with .png rewritten to .jpg."""
    with open(list_path, 'r') as handle:
        return {line.strip().replace('.png', '.jpg') for line in handle}


# Load the train/val and test file-name lists
train_files = _read_split_list("nih_chest_xrays_light/train_val_list copy.txt")
test_files = _read_split_list("nih_chest_xrays_light/test_list copy.txt")

# Select the rows of df that belong to each list
in_train = df['Image Index'].isin(train_files)
in_test = df['Image Index'].isin(test_files)
train_df = df[in_train]
test_df = df[in_test]

print("Official split sizes:")
print("Train:", len(train_df))
print("Test:", len(test_df))


Official split sizes:
Train: 86524
Test: 25596


In [62]:
# Stratified random 80/20 split of the filtered DataFrame.
# random_state is fixed so the split (and every result derived from it) is
# reproducible across kernel restarts.
# NOTE(review): this overwrites any train_df/test_df defined earlier (e.g. from
# the official split lists). If the data holds several images per patient, a
# row-level random split may put the same patient in both sets — TODO confirm.
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=42
)

train_dataset = ChestXrayDataset(train_df, "nih_chest_xrays_light", transform=transform)
test_dataset = ChestXrayDataset(test_df, "nih_chest_xrays_light", transform=transform)

# Shuffle only the training data; keep evaluation order fixed
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Class proportions in the training split
print(train_df['label'].value_counts(normalize=True))

label
0    0.538363
1    0.461637
Name: proportion, dtype: float64


In [74]:
# Pick the compute device in order of preference: MPS, then CUDA, then CPU
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print(device)

mps


# 7 Training Function

In [64]:
def train_model(criterion, optimizer, model, dataloader, epochs=3):
    """Train ``model`` on ``dataloader`` and print the mean batch loss per epoch.

    Args:
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the updates.
        model: Module to train; batches are moved to the device of its
            parameters, so the model must already be on the target device.
        dataloader: Sized iterable yielding ``(images, labels)`` batches.
        epochs: Number of passes over ``dataloader`` (default 3).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        # Fail before doing any work instead of dividing by zero after the loop
        raise ValueError("dataloader is empty; nothing to train on.")

    # Use the model's own device rather than a notebook-level global so the
    # function works regardless of which cells were run before it.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for images, labels in tqdm(dataloader):
            images = images.to(target_device)
            # Float labels with a trailing dimension, to line up with a
            # (batch, 1) model output
            labels = labels.float().unsqueeze(1).to(target_device)

            optimizer.zero_grad()
            outputs = model(images)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/num_batches:.4f}")


# 8 Evaluation Function

In [70]:
def evaluate(model, dataloader, threshold=0.5):
    """Print the binary classification accuracy of ``model`` on ``dataloader``.

    Model outputs are treated as logits: a sample is predicted positive when
    ``sigmoid(output) > threshold``.

    Args:
        model: Module to evaluate; batches are moved to the device of its
            parameters, so the model must already be on the target device.
        dataloader: Iterable yielding ``(images, labels)`` batches with 0/1 labels.
        threshold: Probability cut-off for the positive class (default 0.5).
    """
    # Use the model's own device rather than a notebook-level global
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(target_device)
            labels = labels.to(target_device).reshape(-1)
            outputs = model(images)
            # Flatten explicitly so a batch of one sample keeps its batch
            # dimension (an axis-less squeeze would collapse it to a scalar)
            preds = torch.sigmoid(outputs).reshape(-1) > threshold
            correct += (preds.int() == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Nothing was evaluated; avoid dividing by zero
        print("Accuracy: n/a (no samples evaluated)")
        return
    print(f"Accuracy: {correct / total * 100:.2f}%")


# ResNet-18 Model

In [66]:
# ResNet-18 with ImageNet weights. `weights=` replaces the deprecated
# `pretrained=True`; IMAGENET1K_V1 is the checkpoint that flag loaded.
resnet18_model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
# Swap the classification head for a single-logit output (binary task)
resnet18_model.fc = nn.Linear(resnet18_model.fc.in_features, 1)
resnet18_model = resnet18_model.to(device)

# BCEWithLogitsLoss applies the sigmoid internally, so the model emits raw logits
resnet18_criterion = nn.BCEWithLogitsLoss()
# All parameters are passed to the optimizer, i.e. the whole network is fine-tuned
resnet18_optimizer = optim.Adam(resnet18_model.parameters(), lr=0.0001)



The training data I used in the run of this cell is not good!

In [None]:
# Train ResNet-18 for three epochs on the training loader, then report test accuracy
train_model(
    criterion=resnet18_criterion,
    optimizer=resnet18_optimizer,
    model=resnet18_model,
    dataloader=train_loader,
    epochs=3,
)
evaluate(model=resnet18_model, dataloader=test_loader)

100%|██████████| 2803/2803 [2:19:46<00:00,  2.99s/it]    


Epoch 1, Loss: 0.5976


100%|██████████| 2803/2803 [3:34:34<00:00,  4.59s/it]     


Epoch 2, Loss: 0.5733


100%|██████████| 2803/2803 [3:51:01<00:00,  4.95s/it]    


Epoch 3, Loss: 0.5530
Accuracy: 68.80%


# ResNet-50 Model

In [72]:
# Define ResNet-50 Model
# `weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is the
# checkpoint that flag loaded, so the starting weights are unchanged.
resnet50_model = resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
# Swap the classification head for a single-logit output (binary task)
resnet50_model.fc = nn.Linear(resnet50_model.fc.in_features, 1)
resnet50_model = resnet50_model.to(device)

# Freeze backbone
for param in resnet50_model.parameters():
    param.requires_grad = False
# Unfreeze only the final layer
for param in resnet50_model.fc.parameters():
    param.requires_grad = True

# BCEWithLogitsLoss applies the sigmoid internally, so the model emits raw logits
resnet50_criterion = nn.BCEWithLogitsLoss()
# Only the new head's parameters are handed to the optimizer
resnet50_optimizer = optim.Adam(resnet50_model.fc.parameters(), lr=1e-3)



In [None]:
# Train the ResNet-50 head for four epochs on the full training loader,
# then report test accuracy
train_model(
    criterion=resnet50_criterion,
    optimizer=resnet50_optimizer,
    model=resnet50_model,
    dataloader=train_loader,
    epochs=4,
)
evaluate(model=resnet50_model, dataloader=test_loader)

In [None]:
# Take up to 10000 samples to train faster.
# The size is capped at len(train_df) because DataFrame.sample raises when asked
# for more rows than exist (without replacement).
small_train_size = min(10000, len(train_df))
small_train_df = train_df.sample(n=small_train_size, random_state=42)
small_train_dataset = ChestXrayDataset(small_train_df, "nih_chest_xrays_light", transform=transform)
small_train_loader = DataLoader(small_train_dataset, batch_size=32, shuffle=True)

# Try training for 8 epochs.
# NOTE: this reuses resnet50_model and resnet50_optimizer, so it continues from
# whatever state earlier training calls left them in rather than starting fresh.
train_model(resnet50_criterion, resnet50_optimizer, resnet50_model, small_train_loader, epochs=8)
evaluate(resnet50_model, test_loader)

100%|██████████| 313/313 [02:58<00:00,  1.76it/s]


Epoch 1, Loss: 0.6515


100%|██████████| 313/313 [02:59<00:00,  1.75it/s]


Epoch 2, Loss: 0.6294


100%|██████████| 313/313 [03:00<00:00,  1.73it/s]


Epoch 3, Loss: 0.6249


100%|██████████| 313/313 [03:12<00:00,  1.63it/s]


Epoch 4, Loss: 0.6287


100%|██████████| 313/313 [03:11<00:00,  1.64it/s]


Epoch 5, Loss: 0.6253


100%|██████████| 313/313 [03:06<00:00,  1.68it/s]


Epoch 6, Loss: 0.6302


100%|██████████| 313/313 [03:05<00:00,  1.69it/s]


Epoch 7, Loss: 0.6238


100%|██████████| 313/313 [03:10<00:00,  1.64it/s]


Epoch 8, Loss: 0.6137
Accuracy: 65.66%
