## 1 Imports

In [None]:
import os
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet50
import random
from tqdm import tqdm
from our_models import Resnet50Model, Resnet18Model, Densenet121Model
import numpy as np
from sklearn.cluster import KMeans
from costume_dataset import ChestXrayDataset

# Preprocess

## 2 Find All Available Image Files (Fixed for Your Structure)

In [3]:
# Sub-folders that may hold images: images_001_lighter/images ... images_012_lighter/images
image_folders = ["images_{:03d}_lighter/images".format(n) for n in range(1, 13)]
available_images = set()

# Collect the bare filenames of every image found in the folders that exist on disk
for subdir in image_folders:
    candidate_dir = os.path.join("nih_chest_xrays_light", subdir)
    if not os.path.exists(candidate_dir):
        continue
    available_images.update(
        entry
        for entry in os.listdir(candidate_dir)
        if entry.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

print("Total image files found:", len(available_images))


Total image files found: 112120


## 3 Load CSV and Filter Valid Images

In [4]:
df = pd.read_csv("nih_chest_xrays_light/Data_Entry_2017 copy.csv")

# Fix the extension from .png to .jpg.
# regex=False makes '.png' a literal match: interpreted as a regex the dot matches
# any character, and regex matching was the default before pandas 2.0.
df['Image Index'] = df['Image Index'].str.strip().str.replace('.png', '.jpg', regex=False)

# Add binary label: 0 for 'No Finding', 1 for anything else
df['label'] = (df['Finding Labels'] != 'No Finding').astype('int64')

# Keep only rows where the image file actually exists
df = df[df['Image Index'].isin(available_images)]

print("Filtered dataset size:", len(df))
print("Label distribution:\n", df['label'].value_counts())


Filtered dataset size: 112120
Label distribution:
 label
0    60361
1    51759
Name: count, dtype: int64


In [5]:
# Peek at the first five rows of the filtered metadata table
print("Sample data:\n", df.head(5))

Sample data:
         Image Index          Finding Labels  Follow-up #  Patient ID  \
0  00000001_000.jpg            Cardiomegaly            0           1   
1  00000001_001.jpg  Cardiomegaly|Emphysema            1           1   
2  00000001_002.jpg   Cardiomegaly|Effusion            2           1   
3  00000002_000.jpg              No Finding            0           2   
4  00000003_000.jpg                  Hernia            0           3   

   Patient Age Patient Gender View Position  OriginalImage[Width  Height]  \
0           58              M            PA                 2682     2749   
1           58              M            PA                 2894     2729   
2           58              M            PA                 2500     2048   
3           81              M            PA                 2500     2048   
4           81              F            PA                 2582     2991   

   OriginalImagePixelSpacing[x     y]  Unnamed: 11  label  
0                        0.143

## 4 Define Image Transformations

In [59]:
# Preprocessing applied to every image: resize to 224x224, convert to a tensor,
# then normalise each channel (the constants look like the standard ImageNet
# mean/std -- confirm they suit the pretrained backbones used below).
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])


# Explore models

## Create Datasets and Loaders

In [6]:
# Rewrite both split lists in place so that every listed filename uses .jpg instead of .png
for split_path in (
    "nih_chest_xrays_light/train_val_list copy.txt",
    "nih_chest_xrays_light/test_list copy.txt",
):
    with open(split_path, 'r') as f:
        content = [line.strip().replace('.png', '.jpg') for line in f.readlines()]
    with open(split_path, 'w') as f:
        f.write('\n'.join(content))

In [8]:
# Load split lists (one filename per line) into sets for fast membership tests
with open("nih_chest_xrays_light/train_val_list copy.txt", 'r') as f:
    train_files = {line.strip() for line in f}

with open("nih_chest_xrays_light/test_list copy.txt", 'r') as f:
    test_files = {line.strip() for line in f}

# Split the metadata table according to the two filename lists
in_train = df['Image Index'].isin(train_files)
in_test = df['Image Index'].isin(test_files)
train_df = df[in_train]
test_df = df[in_test]

print("Official split sizes:")
print("Train:", len(train_df))
print("Test:", len(test_df))


Official split sizes:
Train: 86524
Test: 25596


In [None]:
# NOTE(review): this random 80/20 split REPLACES the train_df/test_df built from the
# official split lists. It splits per image, and the table contains several rows per
# 'Patient ID', so images of one patient can land in both train and test -- confirm
# that this is intended before trusting the test accuracy.
# random_state pins the split so that re-running the notebook yields the same partition.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

train_dataset = ChestXrayDataset(train_df, "nih_chest_xrays_light", transform=transform)
test_dataset = ChestXrayDataset(test_df, "nih_chest_xrays_light", transform=transform)

# Shuffle only the training batches; the evaluation order does not matter
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(train_df['label'].value_counts(normalize=True))

label
0    0.538363
1    0.461637
Name: proportion, dtype: float64


In [74]:
# Pick the compute device in order of preference: Apple MPS, then CUDA, then CPU
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print(device)

mps


## Training Function

In [None]:
def train_model(criterion, optimizer, model, dataloader, epochs=3):
    """Train ``model`` in place for ``epochs`` passes over ``dataloader``.

    Each batch must unpack as ``(images, labels, _)``; the third item is ignored.
    Labels are cast to float and given a trailing dimension before being passed
    to ``criterion`` together with the model output. Tensors are moved to the
    notebook-level ``device``. After every epoch the mean per-batch loss is printed.

    Returns:
        None. The model and optimizer are updated as a side effect.
    """
    model.train()

    def _optimise_on(batch_images, batch_labels):
        # One optimisation step; returns the batch loss as a Python float
        inputs = batch_images.to(device)
        targets = batch_labels.float().unsqueeze(1).to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    for epoch_number in range(1, epochs + 1):
        epoch_loss = sum(
            _optimise_on(batch_images, batch_labels)
            for batch_images, batch_labels, _ in tqdm(dataloader)
        )
        print(f"Epoch {epoch_number}, Loss: {epoch_loss/len(dataloader):.4f}")


## Evaluation Function

In [None]:
def evaluate(model, dataloader):
    """Print and return the binary-classification accuracy of ``model`` on ``dataloader``.

    Each batch must unpack as ``(images, labels, _)``; the third item is ignored.
    The model output is treated as one logit per sample: a sample is predicted
    positive when ``sigmoid(logit) > 0.5``. Tensors are moved to the notebook-level
    ``device``. The model is left in eval mode.

    Returns:
        Accuracy in percent as a float, or NaN if the dataloader yielded no samples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels, _ in dataloader:
            images = images.to(device)
            # Flatten both sides to shape (batch,) so that the comparison can never
            # broadcast (e.g. (B,) against (B, 1)) and a batch of one stays 1-D.
            labels = labels.to(device).reshape(-1)
            logits = model(images).reshape(-1)
            preds = (torch.sigmoid(logits) > 0.5).int()
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Avoid a ZeroDivisionError on an empty loader
        print("Accuracy: n/a (dataloader yielded no samples)")
        return float("nan")

    accuracy = correct / total * 100
    print(f"Accuracy: {accuracy:.2f}%")
    return accuracy


## ResNet-18 Model

In [66]:
# Pretrained ResNet-18 whose final layer is replaced by a single-output linear head
resnet18_model = models.resnet18(pretrained=True)
resnet18_model.fc = nn.Linear(
    in_features=resnet18_model.fc.in_features,
    out_features=1,
)
resnet18_model = resnet18_model.to(device)

# All parameters are optimised; the loss takes raw logits
resnet18_criterion = nn.BCEWithLogitsLoss()
resnet18_optimizer = optim.Adam(params=resnet18_model.parameters(), lr=1e-4)



**Note:** the training data I used in the run of this cell is not good!

In [None]:
# Train ResNet-18 for three epochs on the training loader, then score it on the test loader
train_model(
    criterion=resnet18_criterion,
    optimizer=resnet18_optimizer,
    model=resnet18_model,
    dataloader=train_loader,
    epochs=3,
)
evaluate(model=resnet18_model, dataloader=test_loader)

100%|██████████| 2803/2803 [2:19:46<00:00,  2.99s/it]    


Epoch 1, Loss: 0.5976


100%|██████████| 2803/2803 [3:34:34<00:00,  4.59s/it]     


Epoch 2, Loss: 0.5733


100%|██████████| 2803/2803 [3:51:01<00:00,  4.95s/it]    


Epoch 3, Loss: 0.5530
Accuracy: 68.80%


## ResNet-50 Model

In [72]:
# Define ResNet-50 Model: pretrained backbone with a single-output linear head
resnet50_model = resnet50(pretrained=True)
resnet50_model.fc = nn.Linear(
    in_features=resnet50_model.fc.in_features,
    out_features=1,
)
resnet50_model = resnet50_model.to(device)

# Freeze every parameter, then re-enable gradients for the final layer only
resnet50_model.requires_grad_(False)
resnet50_model.fc.requires_grad_(True)

# Only the head's parameters are handed to the optimizer
resnet50_criterion = nn.BCEWithLogitsLoss()
resnet50_optimizer = optim.Adam(params=resnet50_model.fc.parameters(), lr=1e-3)



In [None]:
# Train & Evaluate: four epochs on the full training loader, then test accuracy
train_model(
    criterion=resnet50_criterion,
    optimizer=resnet50_optimizer,
    model=resnet50_model,
    dataloader=train_loader,
    epochs=4,
)
evaluate(model=resnet50_model, dataloader=test_loader)

In [None]:
# Take just 10000 samples to train faster (fixed random_state keeps the subset stable)
small_train_df = train_df.sample(n=10000, random_state=42)
small_train_dataset = ChestXrayDataset(
    small_train_df,
    "nih_chest_xrays_light",
    transform=transform,
)
small_train_loader = DataLoader(small_train_dataset, batch_size=32, shuffle=True)

# Try training for 8 epochs.
# This reuses the same resnet50_model / resnet50_optimizer objects as the cell above,
# so it continues from whatever state they are in rather than starting fresh.
train_model(
    criterion=resnet50_criterion,
    optimizer=resnet50_optimizer,
    model=resnet50_model,
    dataloader=small_train_loader,
    epochs=8,
)
evaluate(model=resnet50_model, dataloader=test_loader)

100%|██████████| 313/313 [02:58<00:00,  1.76it/s]


Epoch 1, Loss: 0.6515


100%|██████████| 313/313 [02:59<00:00,  1.75it/s]


Epoch 2, Loss: 0.6294


100%|██████████| 313/313 [03:00<00:00,  1.73it/s]


Epoch 3, Loss: 0.6249


100%|██████████| 313/313 [03:12<00:00,  1.63it/s]


Epoch 4, Loss: 0.6287


100%|██████████| 313/313 [03:11<00:00,  1.64it/s]


Epoch 5, Loss: 0.6253


100%|██████████| 313/313 [03:06<00:00,  1.68it/s]


Epoch 6, Loss: 0.6302


100%|██████████| 313/313 [03:05<00:00,  1.69it/s]


Epoch 7, Loss: 0.6238


100%|██████████| 313/313 [03:10<00:00,  1.64it/s]


Epoch 8, Loss: 0.6137
Accuracy: 65.66%


# Define model_fn, Optimizer_fn and Criterion_fn

### Criterion

In [None]:
# NOTE: The models from our_models.py all return logits, not probabilities.
def BCEcriterion_fn():
    """Create a new ``nn.BCEWithLogitsLoss`` instance (expects raw logits as input)."""
    loss_fn = nn.BCEWithLogitsLoss()
    return loss_fn

### Densenet121

In [None]:
def densenet121_fn():
    """Factory: build a new ``Densenet121Model`` with its default constructor arguments."""
    return Densenet121Model()

### Resnet-50

In [None]:
def resnet50_model_fn():
    """Factory: build a new ``Resnet50Model`` constructed with ``pretrained=True``."""
    return Resnet50Model(pretrained=True)

### Resnet-18

In [None]:
def resnet18_model_fn():
    """Factory: build a new ``Resnet18Model`` constructed with ``pretrained=True``."""
    return Resnet18Model(pretrained=True)

### Optimizer

In [7]:
def optimizer_fn(model, lr=1e-3):
    """Build an Adam optimizer for the trainable head of ``model``.

    If the model has an ``fc`` attribute, only ``model.fc``'s parameters are
    optimised (the original behaviour). Otherwise the optimizer receives every
    parameter with ``requires_grad=True`` instead of failing with AttributeError.
    NOTE(review): whether ``Densenet121Model`` exposes ``fc`` is not visible here
    -- confirm which branch it takes.

    Args:
        model: ``nn.Module`` to optimise.
        lr: learning rate passed to Adam.

    Returns:
        A ``torch.optim.Adam`` instance.
    """
    head = getattr(model, "fc", None)
    if head is not None:
        params = head.parameters()
    else:
        params = [p for p in model.parameters() if p.requires_grad]
    return optim.Adam(params, lr=lr)

# AL pipeline


In [None]:
class ActiveLearningPipeline:
    """Pool-based active-learning loop over an image metadata table.

    Every iteration trains a fresh model (built by ``model_fn``) on the current
    train set, evaluates it on the fixed test set, and then moves
    ``budget_per_iter`` filenames from the pool into the train set.

    NOTE: the constructor argument ``creiterion_fn`` is misspelled; the name is
    kept so that existing keyword-argument callers keep working.
    """

    def __init__(self, seed,
                 test_indices,
                 pool_indices,
                 train_indices,
                 root_dir,
                 dataset,
                 device,
                 model_fn,
                 creiterion_fn,
                 optimizer_fn,
                 transform_fn,
                 selection_criterion,
                 iterations,
                 epochs_per_iter,
                 budget_per_iter,
                 batch_size=32,
                 max_train_size=60000):  # NOTE: update default values later as needed
        """
        Args:
            seed: seed used for random pool sampling.
            test_indices / pool_indices / train_indices: collections of image
                filenames (values of the 'Image Index' column).
            root_dir: image root directory handed to ``ChestXrayDataset``.
            dataset: DataFrame with an 'Image Index' column.
            device: device that the model and batches are moved to.
            model_fn: zero-argument callable returning a new model.
            creiterion_fn: zero-argument callable returning the loss.
            optimizer_fn: callable ``optimizer_fn(model)`` returning an optimizer.
            transform_fn: passed as ``transform=`` to ``ChestXrayDataset``.
            selection_criterion: ``'random'`` for random sampling; any other value
                dispatches to ``_custom_sampling``.
            iterations: number of active-learning rounds.
            epochs_per_iter: training epochs per round.
            budget_per_iter: number of pool samples added per round.
            batch_size: batch size of every DataLoader built by the pipeline.
            max_train_size: ``run_pipeline`` raises once the train set exceeds this.
        """
        self.seed = seed
        self.iterations = iterations
        self.budget_per_iter = budget_per_iter
        self.batch_size = batch_size
        self.max_train_size = max_train_size
        self.root_dir = root_dir
        self.epochs_per_iter = epochs_per_iter

        # NOTE: pool_indices, train_indices and test_indices are *sets* of image filenames.
        # They are copied because the pipeline mutates its train/pool sets; without the
        # copy, a second pipeline built from the same initial sets would start from the
        # state that the first one left behind.
        self.pool_indices = set(pool_indices)
        self.train_indices = set(train_indices)
        self.test_indices = set(test_indices)
        self.selection_criterion = selection_criterion

        self.dataset = dataset

        self.device = device
        self.model_fn = model_fn
        self.criterion_fn = creiterion_fn
        self.optimizer_fn = optimizer_fn
        self.transform_fn = transform_fn

        test_df = self.dataset[self.dataset['Image Index'].isin(self.test_indices)]
        test_dataset = ChestXrayDataset(test_df, self.root_dir, transform=self.transform_fn)
        # The evaluation metrics do not depend on sample order, so do not shuffle
        self.test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)

    def run_pipeline(self):
        """Run the active-learning rounds.

        Returns:
            ``(accuracy_scores, recall_scores)``: two lists with one entry (in
            percent) per completed training round.

        Raises:
            ValueError: if the train set is larger than ``max_train_size`` at the
                start of a round.
        """
        accuracy_scores = []
        recall_scores = []

        for iteration in range(self.iterations):
            if len(self.train_indices) > self.max_train_size:
                raise ValueError(
                    f"The train set has {len(self.train_indices)} samples, "
                    f"which exceeds max_train_size={self.max_train_size}"
                )

            print(f"Iteration {iteration + 1}/{self.iterations}")

            trained_model = self._train_model()
            accuracy, recall = self._evaluate_model(trained_model)
            accuracy_scores.append(accuracy)
            recall_scores.append(recall)

            if len(self.pool_indices) < self.budget_per_iter:
                print("Not enough samples in pool to continue.")
                break

            if self.selection_criterion == 'random':
                new_selected_indices = self._random_sampling()
            else:
                new_selected_indices = self._custom_sampling(trained_model)

            self._update_train_indices(new_selected_indices)
            self._update_pool_indices(new_selected_indices)

            print(f"Accuracy: {accuracy:.4f}")
            print(f"Recall: {recall:.4f}")
            print("----------------------------------------")

        return accuracy_scores, recall_scores

    def _train_model(self):
        """Build a fresh model and train it on the current train set.

        Returns:
            The trained model, located on ``self.device``.

        Raises:
            ValueError: if no dataset row matches ``self.train_indices``.
        """
        model = self.model_fn().to(self.device)
        criterion = self.criterion_fn()
        optimizer = self.optimizer_fn(model)
        train_df = self.dataset[self.dataset['Image Index'].isin(self.train_indices)]
        if len(train_df) == 0:
            # Fail early instead of dividing by len(train_loader) == 0 below
            raise ValueError("No dataset rows match train_indices; cannot train.")
        train_dataset = ChestXrayDataset(train_df, self.root_dir, transform=self.transform_fn)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        model.train()
        for epoch in range(self.epochs_per_iter):
            total_loss = 0
            for images, labels, _ in tqdm(train_loader):
                images = images.to(self.device)
                # The loss expects float targets shaped like the (batch, 1) model output
                labels = labels.float().unsqueeze(1).to(self.device)

                optimizer.zero_grad()
                outputs = model(images)

                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
            print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")
        return model

    def _evaluate_model(self, model):
        """Compute accuracy and recall (both in percent) on the test loader.

        A sample is predicted positive when ``sigmoid(logit) > 0.5``.

        Returns:
            ``(accuracy, recall)``; each falls back to 0 when its denominator is 0.
        """
        model.eval()
        correct = 0
        total = 0
        true_positives = 0
        actual_positives = 0
        with torch.no_grad():
            for images, labels, _ in self.test_loader:
                images = images.to(self.device)
                # Flatten both sides to (batch,) so that the comparisons cannot broadcast
                labels = labels.to(self.device).reshape(-1)
                outputs = model(images).reshape(-1)
                preds = (torch.sigmoid(outputs) > 0.5).int()
                correct += (preds == labels).sum().item()
                total += labels.size(0)
                true_positives += ((preds == 1) & (labels == 1)).sum().item()
                actual_positives += (labels == 1).sum().item()

        accuracy = (correct / total * 100) if total > 0 else 0
        recall = (true_positives / actual_positives * 100) if actual_positives > 0 else 0

        print(f"Accuracy: {accuracy:.2f}%")
        print(f"Recall: {recall:.2f}%")

        return accuracy, recall

    def _random_sampling(self):
        """Draw ``budget_per_iter`` filenames from the pool, seeded with ``self.seed``.

        Returns:
            A set of the selected filenames.
        """
        random.seed(self.seed)
        # random.sample() rejects sets on Python 3.11+, and set iteration order is not
        # stable across runs; sorting gives a sequence and makes the draw reproducible.
        population = sorted(self.pool_indices)
        return set(random.sample(population, self.budget_per_iter))

    def _custom_sampling(self, trained_model):
        """Hook for non-random selection strategies; must return a set of filenames.

        ``run_pipeline`` calls this whenever ``selection_criterion != 'random'``.
        No strategy is wired in yet, so override this method in a subclass.
        """
        raise NotImplementedError(
            f"selection_criterion={self.selection_criterion!r} requires "
            "_custom_sampling to be implemented (e.g. in a subclass)"
        )

    def _update_train_indices(self, new_selected_samples):
        """
           Update the train indices by adding newly selected samples.
           new_selected_samples should be a set of image filenames.
        """
        self.train_indices.update(new_selected_samples)

    def _update_pool_indices(self, new_selected_samples):
        """
           Update the pool indices by removing the newly selected samples.
           new_selected_samples should be a set of image filenames.
        """
        self.pool_indices.difference_update(new_selected_samples)
        
        

# Sampling techniques

## BADGE

In [None]:
def badge_sampling(model, dataloader, budget, random_state=None):
    """
    This is unchecked code that performs BADGE sampling, generated by ChatGPT.
    Changes that still need to be done:
    1. Verify that the returned samples are a set of image indices
    2. Ensure that the model has a method `gradient_embedding` that returns embeddings
    3. Ensure that the model is in evaluation mode before inference

    BADGE = Batch Active learning by Diverse Gradient Embeddings
    May improve active learning where:
    You want both uncertainty (picking hard-to-classify points).
    And diversity (picking a wide range of points, not duplicates).
    The key insight is:
    Instead of sampling based on just predictions or just diversity in feature space,
    sample based on the gradients you would get if you trained on the sample.

    Args:
        model: object providing ``eval()`` and ``gradient_embedding(imgs)``.
        dataloader: iterable of ``(imgs, _, indices)`` batches.
        budget: number of samples to select.
        random_state: optional seed forwarded to ``KMeans`` for reproducible clustering.

    Returns:
        A set with one sample index per cluster: at most ``budget`` entries, fewer
        only when the dataloader holds fewer samples than ``budget``.
    """
    model.eval()
    if budget <= 0:
        return set()

    embedding_chunks = []
    all_indices = []

    with torch.no_grad():
        for imgs, _, indices in dataloader:
            # NOTE(review): imgs is not moved to the model's device here; confirm that
            # gradient_embedding does this itself or that the model lives on the CPU.
            embeddings = model.gradient_embedding(imgs)
            if torch.is_tensor(embeddings):
                # NumPy cannot read tensors that live on an accelerator
                embeddings = embeddings.detach().cpu().numpy()
            embeddings = np.asarray(embeddings)
            embedding_chunks.append(embeddings.reshape(embeddings.shape[0], -1))
            # Indices may be collated as a tensor (integer ids) or as a list/tuple (filenames)
            if torch.is_tensor(indices):
                all_indices.extend(indices.tolist())
            else:
                all_indices.extend(list(indices))

    if not embedding_chunks:
        return set()

    all_embeddings = np.concatenate(embedding_chunks)

    # KMeans cannot form more clusters than there are samples
    n_clusters = min(budget, len(all_embeddings))

    # Perform k-means++ clustering
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        random_state=random_state,
    ).fit(all_embeddings)
    centers = kmeans.cluster_centers_

    # Choose the point closest to each cluster center, never picking a row twice,
    # so that exactly n_clusters distinct samples are returned
    taken = np.zeros(len(all_embeddings), dtype=bool)
    chosen = []
    for center in centers:
        distances = np.linalg.norm(all_embeddings - center, axis=1)
        distances[taken] = np.inf
        idx = int(np.argmin(distances))
        taken[idx] = True
        chosen.append(all_indices[idx])

    return set(chosen)


## something else