In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torchvision import transforms, datasets
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torchvision
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from PIL import ImageDraw, Image 

In [3]:
#define transformation for training set
train_transforms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
                         
])

#define transformation for validation set
val_transforms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
                         
])

In [2]:
train_dir = r"C:\Users\Aalap\Documents\ML\rnsa\training"
val_dir = r"C:\Users\Aalap\Documents\ML\rnsa\validation"

In [4]:
train_dataset = datasets.ImageFolder(root = train_dir, transform=train_transforms)
val_dataset = datasets.ImageFolder(root = val_dir, transform=val_transforms)

In [5]:
classes = train_dataset.classes
class_to_idx = train_dataset.class_to_idx
print(f"Class names: {classes} \nClass to index mapping: {class_to_idx}")

Class names: ['lung_opacity', 'no_lung_opacity_not_normal', 'normal'] 
Class to index mapping: {'lung_opacity': 0, 'no_lung_opacity_not_normal': 1, 'normal': 2}


In [3]:
BATCH_SIZE = 32

In [14]:
train_dataloader = DataLoader(
    train_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = 4,
    pin_memory = True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size = BATCH_SIZE,
    shuffle = False,
    num_workers = 4,
    pin_memory = True
)

In [3]:
# Choose a computing device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
device

device(type='cuda')

In [9]:
#load pretrained model
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

#get the numeber of input features of the last layer
num_ftrs = model.fc.in_features

#modify the last layer to match the number of classes in our dataset
model.fc = nn.Linear(num_ftrs, 3) 

#move model to device
model = model.to(device)

print("Model is read to be trained on device:", device)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to C:\Users\Aalap/.cache\torch\hub\checkpoints\resnet50-0676ba61.pth


100.0%


Model is read to be trained on device: cuda


In [10]:
#define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    #train
    model.train()
    running_loss = 0.0
    for images, labels in train_dataloader:
        images, labels = images.to(device), labels.to(device)

        #set the gradients to zero
        optimizer.zero_grad()

        #forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        #backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    
    #print average loss for the epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_dataloader):.4f}")

Epoch [1/10], Loss: 0.7754
Epoch [2/10], Loss: 0.7000
Epoch [3/10], Loss: 0.6817
Epoch [4/10], Loss: 0.6649
Epoch [5/10], Loss: 0.6566
Epoch [6/10], Loss: 0.6479
Epoch [7/10], Loss: 0.6386
Epoch [8/10], Loss: 0.6254
Epoch [9/10], Loss: 0.6203
Epoch [10/10], Loss: 0.6139


In [11]:
torch.save(model.state_dict(), 'resnet_classifier.pth')

In [None]:
# # 1. Instantiate the model architecture
# num_classes = 3
# model_classifier = models.resnet50()
# num_ftrs = model_classifier.fc.in_features
# model_classifier.fc = nn.Linear(num_ftrs, num_classes)

# # 2. Load the state dictionary from the saved file
# model_classifier.load_state_dict(torch.load('resnet_classifier.pth'))

# # Move the model to the appropriate device (CPU or GPU)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_classifier.to(device)

# # Set the model to evaluation mode
# model_classifier.eval()

# print("Model has been loaded successfully.")

In [12]:
# Validation phase
model.eval()
correct = 0
total = 0
val_loss = 0.0
with torch.no_grad(): # Disable gradient calculation for evaluation
    for images, labels in val_dataloader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        val_loss += loss.item()

        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Print validation metrics
print(f"Validation Loss: {val_loss/len(val_dataloader):.4f}")
print(f"Validation Accuracy: {100 * correct / total:.2f}%")

Validation Loss: 0.6861
Validation Accuracy: 68.30%


This model performance is only based on the images in training. This does not include the meta data yet. It can be useful to add meta data into the classification process. 

First fine tune the model for 3 more epochs.

In [17]:
##Loading the previously saved model

# 1. Instantiate the model architecture
num_classes = 3
model_classifier = models.resnet50()
num_ftrs = model_classifier.fc.in_features
model_classifier.fc = nn.Linear(num_ftrs, num_classes)

# 2. Load the state dictionary from the saved file
model_classifier.load_state_dict(torch.load('resnet_classifier.pth'))

# Move the model to the appropriate device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_classifier.to(device)

# Set the model to evaluation mode
model_classifier.eval()

print("Model has been loaded successfully.")

Model has been loaded successfully.


In [3]:
#check the model layers
for name, layer in model_classifier.named_children():
    print(name, layer)

conv1 Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
bn1 BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
relu ReLU(inplace=True)
maxpool MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
layer1 Sequential(
  (0): Bottleneck(
    (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (downsample): Sequential(
      (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (1): BatchNorm2d(256, eps=1e-05

In [18]:
#setting all the layers to not require gradients
#freezing layers
for param in model_classifier.parameters():
    param.requires_grad = False

#unfreezing the final layer
for param in model_classifier.fc.parameters():
    param.requires_grad = True

#unfreezing the layer 4 parameters
for param in model_classifier.layer4.parameters():
    param.requires_grad = True

#define a new optimizer that includes the unfrozen layers' parameters
#use a very small learning rate 
optimizer = optim.Adam(
    filter(lambda p : p.requires_grad, model_classifier.parameters()),
    lr = 1e-4
)

In [19]:
BATCH_SIZE = 16

In [21]:
train_dataloader = DataLoader(
    train_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = 2,
    pin_memory = True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size = BATCH_SIZE,
    shuffle = False,
    num_workers = 2,
    pin_memory = True
)

In [22]:
criterion = nn.CrossEntropyLoss()

num_epochs = 3

for epoch in range(num_epochs):
    model_classifier.train()
    train_loss, train_labels, train_preds = 0.0, [], []

    for images, labels in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]"):
        images, labels = images.to(device), labels.to(device)

        #set the greadients to zero
        optimizer.zero_grad()

        #forward pass
        outputs = model_classifier(images)
        loss = criterion(outputs, labels)

        #backward pass and optimization
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_preds.extend(outputs.argmax(1).detach().cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')
    
    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss/len(train_dataloader):.4f}, Acc: {train_acc:.4f}, F1: {train_f1:.4f} | ")



Epoch 1/3 [Train]:   0%|          | 0/1501 [00:00<?, ?it/s]

Epoch 1/3 [Train]: 100%|██████████| 1501/1501 [05:05<00:00,  4.91it/s]


Epoch 1/3 | Train Loss: 0.6005, Acc: 0.7328, F1: 0.7151 | 


Epoch 2/3 [Train]: 100%|██████████| 1501/1501 [03:48<00:00,  6.57it/s]


Epoch 2/3 | Train Loss: 0.5904, Acc: 0.7346, F1: 0.7189 | 


Epoch 3/3 [Train]: 100%|██████████| 1501/1501 [03:41<00:00,  6.77it/s]

Epoch 3/3 | Train Loss: 0.5839, Acc: 0.7367, F1: 0.7213 | 





In [23]:
for epoch in range(num_epochs):
    model_classifier.eval()
    val_loss, val_labels, val_preds = 0.0, [], []

    for images, labels in tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]"):
        images, labels = images.to(device), labels.to(device)

        #set the greadients to zero
        optimizer.zero_grad()

        #forward pass
        outputs = model_classifier(images)
        loss = criterion(outputs, labels)

        #backward pass and optimization
        loss.backward()
        optimizer.step()

        val_loss += loss.item()
        val_preds.extend(outputs.argmax(1).cpu().numpy())
        val_labels.extend(labels.cpu().numpy())

    val_acc   = accuracy_score(val_labels, val_preds)
    val_f1    = f1_score(val_labels, val_preds, average="macro")

    try:
        val_auc = roc_auc_score(val_labels, val_preds, multi_class="ovr")
    except:
        val_auc = float("nan")
    
    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Val Loss: {val_loss/len(val_dataloader):.4f}, Acc: {val_acc:.4f}, F1: {val_f1:.4f}, AUC: {val_auc:.4f} |")

Epoch 1/3 [Val]: 100%|██████████| 167/167 [00:41<00:00,  4.02it/s]


Epoch 1/3 | Val Loss: 0.8051, Acc: 0.8134, F1: 0.8138, AUC: nan |


Epoch 2/3 [Val]: 100%|██████████| 167/167 [00:29<00:00,  5.67it/s]


Epoch 2/3 | Val Loss: 1.0042, Acc: 0.6456, F1: 0.6248, AUC: nan |


Epoch 3/3 [Val]: 100%|██████████| 167/167 [00:29<00:00,  5.67it/s]

Epoch 3/3 | Val Loss: 0.7554, Acc: 0.6613, F1: 0.6384, AUC: nan |





Ok, there is error here. Now, here is the fixed validation loop.

In [27]:
# Corrected validation loop
model_classifier.eval() # Set model to evaluation mode
val_loss, val_labels, val_preds = 0.0, [], []

with torch.no_grad(): # Disable gradient calculation
    for images, labels in tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]"):
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model_classifier(images)
        loss = criterion(outputs, labels)
        val_loss += loss.item()

        # To fix the AUC issue, collect probability scores for each class
        # Use torch.softmax() to convert logits to probabilities
        probs = torch.softmax(outputs, dim=1) 
        val_preds.extend(probs.detach().cpu().numpy())
        val_labels.extend(labels.cpu().numpy())
    
# After the loop, you can calculate metrics
val_acc = accuracy_score(val_labels, np.argmax(val_preds, axis=1))
val_f1 = f1_score(val_labels, np.argmax(val_preds, axis=1), average="macro")
val_auc = roc_auc_score(val_labels, val_preds, multi_class="ovr")

print(f"Epoch {epoch+1}/{num_epochs} | "
      f"Val Loss: {val_loss/len(val_dataloader):.4f}, Acc: {val_acc:.4f}, F1: {val_f1:.4f}, AUC: {val_auc:.4f} |")

Epoch 3/3 [Val]: 100%|██████████| 167/167 [00:28<00:00,  5.85it/s]

Epoch 3/3 | Val Loss: 1.3829, Acc: 0.5069, F1: 0.3709, AUC: 0.8211 |





In [28]:
torch.save(model_classifier.state_dict(), 'resnet_classifier_finetuned.pth')

I want to train this again. Changes: 1. Train for 5 epochs. 2. Unfreeze more layers. 3. Add more transformations to images.

In [5]:
# This function is crucial for freeing up GPU memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("GPU cache has been cleared.")

GPU cache has been cleared.


In [31]:
#define transformation for training set
train_transforms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
                         
])

#define transformation for validation set
val_transforms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
                         
])

In [32]:
##Loading the previously saved model

# 1. Instantiate the model architecture
num_classes = 3
model_classifier = models.resnet50()
num_ftrs = model_classifier.fc.in_features
model_classifier.fc = nn.Linear(num_ftrs, num_classes)

# 2. Load the state dictionary from the saved file
model_classifier.load_state_dict(torch.load('resnet_classifier_finetuned.pth'))

# Move the model to the appropriate device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_classifier.to(device)

# Set the model to evaluation mode
model_classifier.eval()

print("Model has been loaded successfully.")

Model has been loaded successfully.


In [33]:
#setting all the layers to not require gradients
#freezing layers
for param in model_classifier.parameters():
    param.requires_grad = False

#unfreezing the final layer
for param in model_classifier.fc.parameters():
    param.requires_grad = True

#unfreezing the layer 3 parameters
for param in model_classifier.layer3.parameters():
    param.requires_grad = True

#unfreezing the layer 4 parameters
for param in model_classifier.layer4.parameters():
    param.requires_grad = True

#define a new optimizer that includes the unfrozen layers' parameters
#use a very small learning rate 
optimizer = optim.Adam(
    filter(lambda p : p.requires_grad, model_classifier.parameters()),
    lr = 1e-4
)

In [34]:
BATCH_SIZE = 16

train_dataloader = DataLoader(
    train_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = 2,
    pin_memory = True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size = BATCH_SIZE,
    shuffle = False,
    num_workers = 2,
    pin_memory = True
)

In [37]:
criterion = nn.CrossEntropyLoss()
num_epochs = 5

for epoch in range(num_epochs):
    # ==================== Training Loop ====================
    model_classifier.train()
    train_loss, train_labels, train_preds = 0.0, [], []

    for images, labels in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]"):
        images, labels = images.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model_classifier(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        
        # Append predictions and labels for evaluation
        train_preds.extend(outputs.argmax(1).detach().cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    # Calculate training metrics
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')

    # ==================== Validation Loop ====================
    model_classifier.eval() # Set model to evaluation mode
    val_loss, val_labels, val_preds_probs, val_preds_hard = 0.0, [], [], []

    with torch.no_grad():
        for images, labels in tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]"):
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model_classifier(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Collect probability scores for AUC and hard predictions for accuracy
            probs = torch.softmax(outputs, dim=1)
            val_preds_probs.extend(probs.detach().cpu().numpy())
            val_preds_hard.extend(outputs.argmax(1).detach().cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    # Calculate validation metrics
    val_acc = accuracy_score(val_labels, val_preds_hard)
    val_f1 = f1_score(val_labels, val_preds_hard, average="macro")
    try:
        val_auc = roc_auc_score(val_labels, val_preds_probs, multi_class="ovr")
    except:
        val_auc = float("nan")

    # ==================== Print Metrics ====================
    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss/len(train_dataloader):.4f}, Acc: {train_acc:.4f}, F1: {train_f1:.4f} | "
          f"Val Loss: {val_loss/len(val_dataloader):.4f}, Acc: {val_acc:.4f}, F1: {val_f1:.4f}, AUC: {val_auc:.4f} |")

Epoch 1/5 [Train]:   0%|          | 0/1501 [00:00<?, ?it/s]

Epoch 1/5 [Train]: 100%|██████████| 1501/1501 [09:26<00:00,  2.65it/s]
Epoch 1/5 [Val]: 100%|██████████| 167/167 [00:38<00:00,  4.36it/s]


Epoch 1/5 | Train Loss: 0.5294, Acc: 0.7652, F1: 0.7532 | Val Loss: 0.6275, Acc: 0.7269, F1: 0.7107, AUC: 0.8730 |


Epoch 2/5 [Train]: 100%|██████████| 1501/1501 [03:41<00:00,  6.77it/s]
Epoch 2/5 [Val]: 100%|██████████| 167/167 [00:33<00:00,  5.02it/s]


Epoch 2/5 | Train Loss: 0.5196, Acc: 0.7702, F1: 0.7583 | Val Loss: 0.6445, Acc: 0.7186, F1: 0.7025, AUC: 0.8688 |


Epoch 3/5 [Train]: 100%|██████████| 1501/1501 [03:46<00:00,  6.63it/s]
Epoch 3/5 [Val]: 100%|██████████| 167/167 [00:33<00:00,  4.95it/s]


Epoch 3/5 | Train Loss: 0.5093, Acc: 0.7764, F1: 0.7659 | Val Loss: 0.6471, Acc: 0.7164, F1: 0.6964, AUC: 0.8700 |


Epoch 4/5 [Train]: 100%|██████████| 1501/1501 [03:46<00:00,  6.63it/s]
Epoch 4/5 [Val]: 100%|██████████| 167/167 [00:33<00:00,  5.00it/s]


Epoch 4/5 | Train Loss: 0.4953, Acc: 0.7858, F1: 0.7755 | Val Loss: 0.6597, Acc: 0.7194, F1: 0.7014, AUC: 0.8669 |


Epoch 5/5 [Train]: 100%|██████████| 1501/1501 [03:48<00:00,  6.58it/s]
Epoch 5/5 [Val]: 100%|██████████| 167/167 [00:33<00:00,  4.93it/s]

Epoch 5/5 | Train Loss: 0.4814, Acc: 0.7926, F1: 0.7833 | Val Loss: 0.6735, Acc: 0.7224, F1: 0.7072, AUC: 0.8670 |





In [39]:
torch.save(model_classifier.state_dict(), 'resnet_classifier_finetuned2.pth')

Next steps:
- Need to address overfitting
- add weight decay to optimizer
- introduce dropout
- expand data agmentation
- implement early stopping
- adjust learning rate


optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)


self.classifier = nn.Sequential(
    nn.Linear(num_ftrs + 64, 256),
    nn.ReLU(),
    nn.Dropout(0.5), # Add this line
    nn.Linear(256, num_classes)
)

train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.RandomPerspective(distortion_scale=0.2, p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [3]:
# Choose a computing device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# This function is crucial for freeing up GPU memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("GPU cache has been cleared.")

GPU cache has been cleared.


In [24]:
#define transformation for training set
train_transforms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness = 0.1, contrast = 0.1), #adding color jitter
    transforms.RandomPerspective(distortion_scale=0.2, p=0.5), #adding random perspective
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
                         
])

#define transformation for validation set
val_transforms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness = 0.1, contrast = 0.1), #adding color jitter
    transforms.RandomPerspective(distortion_scale=0.2, p=0.5), #adding random perspective
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
                         
])

In [25]:
##Loading the previously saved model

# 1. Instantiate the model architecture
num_classes = 3
model_classifier = models.resnet50()
num_ftrs = model_classifier.fc.in_features
model_classifier.fc = nn.Linear(num_ftrs, num_classes)

# 2. Load the state dictionary from the saved file
saved_state_dict = torch.load('resnet_classifier_finetuned2.pth')
model_classifier.load_state_dict(saved_state_dict)

# Create the new Sequential model with the Dropout and Linear layers
num_ftrs = 2048 # ResNet-50 features
num_classes = 3
new_fc = nn.Sequential(
    nn.Dropout(p=0.5),
    nn.Linear(num_ftrs, num_classes)
)

# Extract the weights from the old state dictionary
old_fc_weight = saved_state_dict['fc.weight']
old_fc_bias = saved_state_dict['fc.bias']

# Load the weights into the new Linear layer
new_fc[1].weight = nn.Parameter(old_fc_weight)
new_fc[1].bias = nn.Parameter(old_fc_bias)

# Replace the model's fully connected layer with the new Sequential model
model_classifier.fc = new_fc

# Move the model to the appropriate device (CPU or GPU)
model_classifier.to(device)

print("Model has been loaded successfully.")

Model has been loaded successfully.


In [26]:
#setting all the layers to not require gradients
#freezing layers
for param in model_classifier.parameters():
    param.requires_grad = False

#unfreezing the final layer
for param in model_classifier.fc.parameters():
    param.requires_grad = True

#unfreezing the layer 3 parameters
for param in model_classifier.layer3.parameters():
    param.requires_grad = True

#unfreezing the layer 4 parameters
for param in model_classifier.layer4.parameters():
    param.requires_grad = True

#define a new optimizer that includes the unfrozen layers' parameters
#use a very small learning rate 
optimizer = optim.Adam(
    filter(lambda p : p.requires_grad, model_classifier.parameters()),
    lr = 1e-5, #reducing learning rate to 1e-5 from 1e-4
    weight_decay=1e-5 #adding weight decay of 1e-5
)

In [27]:
train_dataset = datasets.ImageFolder(root = train_dir, transform=train_transforms)
val_dataset = datasets.ImageFolder(root = val_dir, transform=val_transforms)

In [28]:
BATCH_SIZE = 16

train_dataloader = DataLoader(
    train_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = 2,
    pin_memory = True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size = BATCH_SIZE,
    shuffle = False,
    num_workers = 2,
    pin_memory = True
)

In [30]:
criterion = nn.CrossEntropyLoss()
num_epochs = 5

best_val_loss = float('inf')
patience = 2
patience_counter = 0

for epoch in range(num_epochs):
    # ==================== Training Loop ====================
    model_classifier.train()
    train_loss, train_labels, train_preds = 0.0, [], []

    for images, labels in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]"):
        images, labels = images.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model_classifier(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        
        # Append predictions and labels for evaluation
        train_preds.extend(outputs.argmax(1).detach().cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    # Calculate training metrics
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')

    # ==================== Validation Loop ====================
    model_classifier.eval() # Set model to evaluation mode
    val_loss, val_labels, val_preds_probs, val_preds_hard = 0.0, [], [], []

    with torch.no_grad():
        for images, labels in tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]"):
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model_classifier(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Collect probability scores for AUC and hard predictions for accuracy
            probs = torch.softmax(outputs, dim=1)
            val_preds_probs.extend(probs.detach().cpu().numpy())
            val_preds_hard.extend(outputs.argmax(1).detach().cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    # Calculate validation metrics
    val_acc = accuracy_score(val_labels, val_preds_hard)
    val_f1 = f1_score(val_labels, val_preds_hard, average="macro")
    try:
        val_auc = roc_auc_score(val_labels, val_preds_probs, multi_class="ovr")
    except:
        val_auc = float("nan")

    # ==================== Print Metrics ====================
    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss/len(train_dataloader):.4f}, Acc: {train_acc:.4f}, F1: {train_f1:.4f} | "
          f"Val Loss: {val_loss/len(val_dataloader):.4f}, Acc: {val_acc:.4f}, F1: {val_f1:.4f}, AUC: {val_auc:.4f} |")
    
    # Early Stopping Check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        print("Validation loss improved. Saving model.")
        #save the best model
        torch.save(model_classifier.state_dict(), 'best_resnet_classifier3.pth')
    else:
        patience_counter += 1
        print(f"Validation loss has not improved for {patience_counter} epochs.")
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

Epoch 1/5 [Train]:   0%|          | 0/1501 [00:00<?, ?it/s]

Epoch 1/5 [Train]: 100%|██████████| 1501/1501 [07:38<00:00,  3.28it/s]
Epoch 1/5 [Val]: 100%|██████████| 167/167 [00:46<00:00,  3.56it/s]


Epoch 1/5 | Train Loss: 0.6707, Acc: 0.7013, F1: 0.6895 | Val Loss: 0.6899, Acc: 0.6995, F1: 0.6854, AUC: 0.8540 |
Validation loss improved. Saving model.


Epoch 2/5 [Train]: 100%|██████████| 1501/1501 [05:26<00:00,  4.59it/s]
Epoch 2/5 [Val]: 100%|██████████| 167/167 [00:47<00:00,  3.51it/s]


Epoch 2/5 | Train Loss: 0.6475, Acc: 0.7083, F1: 0.6954 | Val Loss: 0.6670, Acc: 0.6939, F1: 0.6785, AUC: 0.8580 |
Validation loss improved. Saving model.


Epoch 3/5 [Train]: 100%|██████████| 1501/1501 [04:54<00:00,  5.09it/s]
Epoch 3/5 [Val]: 100%|██████████| 167/167 [00:47<00:00,  3.53it/s]


Epoch 3/5 | Train Loss: 0.6339, Acc: 0.7110, F1: 0.6979 | Val Loss: 0.6714, Acc: 0.6980, F1: 0.6815, AUC: 0.8549 |
Validation loss has not improved for 1 epochs.


Epoch 4/5 [Train]: 100%|██████████| 1501/1501 [04:40<00:00,  5.35it/s]
Epoch 4/5 [Val]: 100%|██████████| 167/167 [00:43<00:00,  3.80it/s]


Epoch 4/5 | Train Loss: 0.6324, Acc: 0.7150, F1: 0.7009 | Val Loss: 0.6598, Acc: 0.7010, F1: 0.6844, AUC: 0.8602 |
Validation loss improved. Saving model.


Epoch 5/5 [Train]: 100%|██████████| 1501/1501 [04:49<00:00,  5.19it/s]
Epoch 5/5 [Val]: 100%|██████████| 167/167 [00:39<00:00,  4.21it/s]

Epoch 5/5 | Train Loss: 0.6271, Acc: 0.7189, F1: 0.7044 | Val Loss: 0.6513, Acc: 0.7100, F1: 0.6921, AUC: 0.8633 |
Validation loss improved. Saving model.





more training ...

In [31]:
criterion = nn.CrossEntropyLoss()
num_epochs = 10

best_val_loss = float('inf')
patience = 2
patience_counter = 0

for epoch in range(num_epochs):
    # ==================== Training Loop ====================
    model_classifier.train()
    train_loss, train_labels, train_preds = 0.0, [], []

    for images, labels in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]"):
        images, labels = images.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model_classifier(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        
        # Append predictions and labels for evaluation
        train_preds.extend(outputs.argmax(1).detach().cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    # Calculate training metrics
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')

    # ==================== Validation Loop ====================
    model_classifier.eval() # Set model to evaluation mode
    val_loss, val_labels, val_preds_probs, val_preds_hard = 0.0, [], [], []

    with torch.no_grad():
        for images, labels in tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]"):
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model_classifier(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Collect probability scores for AUC and hard predictions for accuracy
            probs = torch.softmax(outputs, dim=1)
            val_preds_probs.extend(probs.detach().cpu().numpy())
            val_preds_hard.extend(outputs.argmax(1).detach().cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    # Calculate validation metrics
    val_acc = accuracy_score(val_labels, val_preds_hard)
    val_f1 = f1_score(val_labels, val_preds_hard, average="macro")
    try:
        val_auc = roc_auc_score(val_labels, val_preds_probs, multi_class="ovr")
    except:
        val_auc = float("nan")

    # ==================== Print Metrics ====================
    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss/len(train_dataloader):.4f}, Acc: {train_acc:.4f}, F1: {train_f1:.4f} | "
          f"Val Loss: {val_loss/len(val_dataloader):.4f}, Acc: {val_acc:.4f}, F1: {val_f1:.4f}, AUC: {val_auc:.4f} |")
    
    # Early Stopping Check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        print("Validation loss improved. Saving model.")
        #save the best model
        torch.save(model_classifier.state_dict(), 'best_resnet_classifier3_more_trained.pth')
    else:
        patience_counter += 1
        print(f"Validation loss has not improved for {patience_counter} epochs.")
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

Epoch 1/10 [Train]: 100%|██████████| 1501/1501 [04:19<00:00,  5.77it/s]
Epoch 1/10 [Val]: 100%|██████████| 167/167 [00:38<00:00,  4.33it/s]


Epoch 1/10 | Train Loss: 0.6232, Acc: 0.7185, F1: 0.7034 | Val Loss: 0.6631, Acc: 0.7033, F1: 0.6850, AUC: 0.8601 |
Validation loss improved. Saving model.


Epoch 2/10 [Train]: 100%|██████████| 1501/1501 [04:36<00:00,  5.44it/s]
Epoch 2/10 [Val]: 100%|██████████| 167/167 [00:37<00:00,  4.50it/s]


Epoch 2/10 | Train Loss: 0.6209, Acc: 0.7235, F1: 0.7091 | Val Loss: 0.6542, Acc: 0.7141, F1: 0.6990, AUC: 0.8622 |
Validation loss improved. Saving model.


Epoch 3/10 [Train]: 100%|██████████| 1501/1501 [04:25<00:00,  5.65it/s]
Epoch 3/10 [Val]: 100%|██████████| 167/167 [00:37<00:00,  4.46it/s]


Epoch 3/10 | Train Loss: 0.6225, Acc: 0.7198, F1: 0.7058 | Val Loss: 0.6494, Acc: 0.7051, F1: 0.6898, AUC: 0.8641 |
Validation loss improved. Saving model.


Epoch 4/10 [Train]: 100%|██████████| 1501/1501 [04:26<00:00,  5.64it/s]
Epoch 4/10 [Val]: 100%|██████████| 167/167 [00:37<00:00,  4.44it/s]


Epoch 4/10 | Train Loss: 0.6198, Acc: 0.7213, F1: 0.7067 | Val Loss: 0.6460, Acc: 0.7138, F1: 0.6960, AUC: 0.8646 |
Validation loss improved. Saving model.


Epoch 5/10 [Train]: 100%|██████████| 1501/1501 [04:29<00:00,  5.56it/s]
Epoch 5/10 [Val]: 100%|██████████| 167/167 [00:37<00:00,  4.50it/s]


Epoch 5/10 | Train Loss: 0.6175, Acc: 0.7209, F1: 0.7059 | Val Loss: 0.6447, Acc: 0.7141, F1: 0.6965, AUC: 0.8663 |
Validation loss improved. Saving model.


Epoch 6/10 [Train]: 100%|██████████| 1501/1501 [04:31<00:00,  5.54it/s]
Epoch 6/10 [Val]: 100%|██████████| 167/167 [00:37<00:00,  4.45it/s]


Epoch 6/10 | Train Loss: 0.6177, Acc: 0.7220, F1: 0.7073 | Val Loss: 0.6441, Acc: 0.7108, F1: 0.6889, AUC: 0.8660 |
Validation loss improved. Saving model.


Epoch 7/10 [Train]: 100%|██████████| 1501/1501 [04:25<00:00,  5.66it/s]
Epoch 7/10 [Val]: 100%|██████████| 167/167 [00:37<00:00,  4.47it/s]


Epoch 7/10 | Train Loss: 0.6121, Acc: 0.7275, F1: 0.7133 | Val Loss: 0.6403, Acc: 0.7104, F1: 0.6933, AUC: 0.8677 |
Validation loss improved. Saving model.


Epoch 8/10 [Train]: 100%|██████████| 1501/1501 [04:32<00:00,  5.52it/s]
Epoch 8/10 [Val]: 100%|██████████| 167/167 [00:37<00:00,  4.48it/s]


Epoch 8/10 | Train Loss: 0.6140, Acc: 0.7222, F1: 0.7075 | Val Loss: 0.6350, Acc: 0.7063, F1: 0.6917, AUC: 0.8685 |
Validation loss improved. Saving model.


Epoch 9/10 [Train]: 100%|██████████| 1501/1501 [04:24<00:00,  5.68it/s]
Epoch 9/10 [Val]: 100%|██████████| 167/167 [00:37<00:00,  4.47it/s]


Epoch 9/10 | Train Loss: 0.6149, Acc: 0.7245, F1: 0.7102 | Val Loss: 0.6424, Acc: 0.7152, F1: 0.6994, AUC: 0.8663 |
Validation loss has not improved for 1 epochs.


Epoch 10/10 [Train]: 100%|██████████| 1501/1501 [04:27<00:00,  5.60it/s]
Epoch 10/10 [Val]: 100%|██████████| 167/167 [00:40<00:00,  4.10it/s]

Epoch 10/10 | Train Loss: 0.6097, Acc: 0.7247, F1: 0.7103 | Val Loss: 0.6481, Acc: 0.7085, F1: 0.6919, AUC: 0.8638 |
Validation loss has not improved for 2 epochs.
Early stopping triggered.





Ok. This is good enough accuract for a stand alone model. Now I want to include the meta data along with the images for training. Hopefully, this will increase classification accuracy.

In [3]:
# Choose a computing device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
device

device(type='cuda')

In [5]:
# This function is crucial for freeing up GPU memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("GPU cache has been cleared.")

GPU cache has been cleared.


In [23]:
#load the latest model
num_classes = 3
num_ftrs = 2048 

model_classifier = models.resnet50()
model_classifier.fc = nn.Sequential(
    nn.Dropout(p=0.5),
    nn.Linear(num_ftrs, num_classes)
)

model_classifier.load_state_dict(torch.load('best_resnet_classifier3_more_trained.pth'))

model_classifier.to(device)
model_classifier.eval()

print(f"Model has been loaded successfully on {device}.")

  model_classifier.load_state_dict(torch.load('best_resnet_classifier3_more_trained.pth'))


Model has been loaded successfully on cuda.


I realized that validation transformations are not supposed to have augmentation. This results in model unnecessarily testing on harder images and thus showing lower validation score.
Will try to fix this for now by running a standalone validation pass using clean val_transform.

In [24]:
clean_val_transforms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

#train_dataset = datasets.ImageFolder(root = train_dir, transform=train_transforms)
val_dataset = datasets.ImageFolder(root = val_dir, transform=clean_val_transforms)

#BATCH_SIZE = 16

# train_dataloader = DataLoader(
#     train_dataset,
#     batch_size = BATCH_SIZE,
#     shuffle = True,
#     num_workers = 2,
#     pin_memory = True
# )

val_dataloader = DataLoader(
    val_dataset,
    batch_size = BATCH_SIZE,
    shuffle = False,
    num_workers = 2,
    pin_memory = True
)


model_classifier.eval()
val_labels, val_preds_probs = [], []

with torch.no_grad():
    for images, labels in tqdm(val_dataloader):
        images, labels = images.to(device), labels.to(device)
        outputs = model_classifier(images)
        probs = torch.softmax(outputs, dim=1)
        val_preds_probs.extend(probs.cpu().numpy())
        val_labels.extend(labels.cpu().numpy())

val_preds_hard = np.argmax(val_preds_probs, axis=1)
true_acc = accuracy_score(val_labels, val_preds_hard)
print(f"Your TRUE Validation Accuracy is: {true_acc:.4f}")

100%|██████████| 84/84 [00:39<00:00,  2.11it/s]

Your TRUE Validation Accuracy is: 0.7186





In [25]:
torch.save(model_classifier.state_dict(), 'final_resnet_before_adding_metadata.pth')

Now, working on the metadata to help model intake.

In [6]:
df = pd.read_csv('stage2_train_metadata.csv')
df.head()

Unnamed: 0,patientId,x,y,width,height,Target,class,age,sex,modality,position
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0,No Lung Opacity / Not Normal,51,F,CR,PA
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0,No Lung Opacity / Not Normal,48,F,CR,PA
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0,No Lung Opacity / Not Normal,19,M,CR,AP
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0,Normal,28,M,CR,PA
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1,Lung Opacity,32,F,CR,AP


In [7]:
# Changing categorical variables of interest to numerical values

#Sex: Male = 0, Female = 1
df['sex'] = df['sex'].map({'M': 0, 'F': 1})

#position: AP = 0, PA = 1
df['position'] = df['position'].map({'AP': 0, 'PA': 1})

#scale age feature to be from 0 to 1
scaler = MinMaxScaler()
df['age'] = scaler.fit_transform(df[['age']])

#we are not including modality as it has only one unique value

In [8]:
# --- Map Folder Structure to Paths ---
import os

def get_path_dict(directory):
    """Creates a dictionary mapping patientId to the absolute path of their image file."""
    path_dict = {}
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(('.png', '.jpg')):
                pid = os.path.splitext(file)[0]
                path_dict[pid] = os.path.join(root, file)
    return path_dict

train_path_dict = get_path_dict(train_dir) 
val_path_dict = get_path_dict(val_dir)

In [9]:
temp = list(train_path_dict.items())
temp[0]

('000db696-cf54-4385-b10b-6b16fbb3f985',
 'C:\\Users\\Aalap\\Documents\\ML\\rnsa\\training\\lung_opacity\\000db696-cf54-4385-b10b-6b16fbb3f985.png')

In [10]:
# --- Synchronize CSV Labels (3 Classes) ---
class_mapping = {'Normal': 0,
                 'No Lung Opacity / Not Normal' : 1,
                 'Lung Opacity' : 2}

# Ensure the 'class' column strings match folder names
df['Target_3Class'] = df['class'].map(class_mapping)

In [11]:
df.head()

Unnamed: 0,patientId,x,y,width,height,Target,class,age,sex,modality,position,Target_3Class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0,No Lung Opacity / Not Normal,0.324675,1,CR,1,1
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0,No Lung Opacity / Not Normal,0.305195,1,CR,1,1
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0,No Lung Opacity / Not Normal,0.116883,0,CR,0,1
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0,Normal,0.175325,0,CR,1,0
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1,Lung Opacity,0.201299,1,CR,0,2


In [12]:
# Check if any rows failed to map (will result in NaN)
nans = df['Target_3Class'].isna().sum()
if nans > 0:
    print(f"Warning: {nans} rows failed to map. Check your class names!")
    print("Unique values in CSV:", df['class'].unique())
else:
    print("All rows mapped successfully!")

All rows mapped successfully!


In [13]:
train_df, val_df = train_test_split(df, test_size = 0.2, stratify=df['Target_3Class'], random_state=42)

print("Training set shape: ", train_df.shape)
print("Validation set shape: ", val_df.shape)

Training set shape:  (24181, 12)
Validation set shape:  (6046, 12)


In [13]:
class MultiModalClassifier(nn.Module):
    def __init__(self, num_classes, metadata_features, fine_tuned_model):
        super(MultiModalClassifier, self).__init__()

        #Image branch: use the existing fine-tuned model
        self.image_model = fine_tuned_model

        if isinstance(self.image_model.fc, nn.Sequential):
            num_ftrs = self.image_model.fc[1].in_features
        else:
            num_ftrs = self.image_model.fc.in_features

        self.image_model.fc = nn.Identity() #remove the final layer

        #Metadata branch
        self.metadata_fc = nn.Sequential(
            nn.Linear(metadata_features, 16),
            nn.ReLU(),
            nn.BatchNorm1d(16)#,
            #nn.Dropout(0.3)
        )

        self.classifier = nn.Sequential(
            nn.Linear(num_ftrs + 16, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.4),
            nn.Linear(256, num_classes)
        )

    def forward(self, image, metadata):
            img_features = self.image_model(image)
            meta_features = self.metadata_fc(metadata)
            combined = torch.cat((img_features, meta_features), dim=1)
            logits = self.classifier(combined)
            return logits


In [16]:
num_classes = 3
num_ftrs = 2048

model_classifier = models.resnet50()
model_classifier.fc = nn.Sequential(
    nn.Dropout(0.5),
    nn.Linear(num_ftrs, num_classes)
)

#load the weights
model_classifier.load_state_dict(torch.load('final_resnet_before_adding_metadata.pth', weights_only=True))
model_classifier.to(device)

#wrap this into MultiModalModel class
final_model = MultiModalClassifier(
    num_classes = num_classes,
    metadata_features = 3,
    fine_tuned_model = model_classifier
)

final_model.to(device)

print("Multi-modal model created successfully.")

Multi-modal model created successfully.


In [17]:
print(f"Using device: {device}")
print(f"Is model on CUDA? {next(final_model.parameters()).is_cuda}")

Using device: cuda
Is model on CUDA? True


In [18]:
optimizer = optim.Adam([
    {'params': final_model.image_model.parameters(), 'lr': 1e-6},
    {'params': final_model.metadata_fc.parameters(), 'lr': 1e-4},
    {'params': final_model.classifier.parameters(), 'lr': 1e-4}
], weight_decay=1e-5)

In [14]:
from PIL import Image

class RSNA_MultimodalDataset(Dataset):
    def __init__(self, metadata_df, path_dict, transform = None):
        self.df = metadata_df[metadata_df['patientId'].isin(path_dict.keys())].reset_index(drop=True)
        self.path_dict = path_dict
        self.transform = transform

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        pid = row['patientId']
        
        img_path = self.path_dict.get(pid)
        
        # DEBUGGING: If path is missing, print it!
        if img_path is None:
            print(f"MISSING IMAGE: {pid}") 
            # If you see this scrolling endlessly, your mapping is broken.
            return self.__getitem__((idx + 1) % len(self))
        
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)
        
        # Metadata: [age, sex, position]
        metadata = torch.tensor([row['age'], row['sex'], row['position']], dtype=torch.float32)
        label = int(row['Target_3Class'])

        return image, metadata, label

In [20]:
#training transformation
train_transforms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

#validation transformation
val_transforms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

In [21]:
#initiate datasets
train_ds = RSNA_MultimodalDataset(metadata_df = train_df, path_dict = train_path_dict, transform=train_transforms)
val_ds = RSNA_MultimodalDataset(metadata_df = val_df, path_dict = val_path_dict, transform=val_transforms)

BATCH_SIZE = 16

#Create DataLoaders
train_dataloader = DataLoader(
    train_ds,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = 0,
    pin_memory = True
)

val_dataloader = DataLoader(
    val_ds,
    batch_size = BATCH_SIZE,
    shuffle = False,
    num_workers = 0,
    pin_memory = True
)

In [22]:
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(train_df['Target_3Class'])
weights = compute_class_weight(
    class_weight='balanced', 
    classes=classes, 
    y=train_df['Target_3Class']
)

class_weights = torch.tensor(weights, dtype=torch.float32).to(device)

print(f"Class mapping: {class_mapping}")
print(f"Calculated weights: {class_weights}")

Class mapping: {'Normal': 0, 'No Lung Opacity / Not Normal': 1, 'Lung Opacity': 2}
Calculated weights: tensor([1.1383, 0.8524, 1.0545], device='cuda:0')


In [28]:
#train and val loop
criterion = nn.CrossEntropyLoss(weight=class_weights)
num_epochs = 15
best_val_loss = float('inf')
patience = 3
patience_counter = 0

for epoch in range(num_epochs):

    #===================== Training Loop ====================
    final_model.train()
    train_loss, train_labels, train_preds = 0.0, [], []

    for images, metadata, labels in tqdm(train_dataloader, desc = f"Epoch {epoch+1}/{num_epochs} [Train]"):
        images = images.to(device)
        metadata = metadata.to(device).float()
        labels = labels.to(device)

        optimizer.zero_grad()

        #Forward pass with two outputs
        outputs = final_model(images, metadata)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_preds.extend(outputs.argmax(1).detach().cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')

    #===================== Validation Loop ====================

    final_model.eval()
    val_loss, val_labels, val_preds_probs, val_preds_hard = 0.0, [], [], []


    with torch.no_grad():
        for images, metadata, labels in tqdm(val_dataloader, desc = f"Epoch {epoch+1}/{num_epochs} [Val]"):
            images = images.to(device)
            metadata = metadata.to(device).float()
            labels = labels.to(device)

            outputs = final_model(images, metadata)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            probs = torch.softmax(outputs, dim = 1)
            val_preds_probs.extend(probs.detach().cpu().numpy())
            val_preds_hard.extend(outputs.argmax(1).detach().cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    
    # --- Calculate Metrics ---
    avg_train_loss = train_loss / len(train_dataloader)
    avg_val_loss = val_loss / len(val_dataloader)
    val_acc = accuracy_score(val_labels, val_preds_hard)
    val_f1 = f1_score(val_labels, val_preds_hard, average="macro")

    try:
        val_auc = roc_auc_score(val_labels, val_preds_probs, multi_class="ovr")
    except:
        val_auc = float("nan")

    # --- Print Metrics ---
    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss/len(train_dataloader):.4f}, Acc: {train_acc:.4f}, F1: {train_f1:.4f} | "
          f"Val Loss: {val_loss/len(val_dataloader):.4f}, Acc: {val_acc:.4f}, F1: {val_f1:.4f}, AUC: {val_auc:.4f} |")
    
    
    #check accuracy per class
    cm = confusion_matrix(val_labels, val_preds_hard, labels = [0,1,2])
    class_accuracies = cm.diagonal() / (cm.sum(axis=1) + 1e-6)

    print("\nClass-wise Validation Accuracies:")
    for i, class_name in enumerate(class_mapping.keys()):
        print(f"  {class_name}: {class_accuracies[i]:.4f}")

    names = list(class_mapping.keys())
    for i, class_name in enumerate(names):
        print(f"  -> {class_name}: {class_accuracies[i]:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        print("Validation loss improved. Saving MultiModal Model...")
        torch.save(final_model.state_dict(), 'best_multimodal_classifier.pth')
    else:
        patience_counter += 1
        print(f"No improvement for {patience_counter} epochs.")
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break
    print("-" * 30)
        


Epoch 1/15 [Train]: 100%|██████████| 1358/1358 [13:37<00:00,  1.66it/s]
Epoch 1/15 [Val]: 100%|██████████| 37/37 [00:19<00:00,  1.94it/s]


Epoch 1/15 | Train Loss: 0.6469, Acc: 0.7050, F1: 0.7082 | Val Loss: 0.6205, Acc: 0.7254, F1: 0.7256, AUC: 0.8857 |

Class-wise Validation Accuracies:
  Normal: 0.9176
  No Lung Opacity / Not Normal: 0.5689
  Lung Opacity: 0.7267
  -> Normal: 0.9176
  -> No Lung Opacity / Not Normal: 0.5689
  -> Lung Opacity: 0.7267
Validation loss improved. Saving MultiModal Model...
------------------------------


Epoch 2/15 [Train]: 100%|██████████| 1358/1358 [10:55<00:00,  2.07it/s]
Epoch 2/15 [Val]: 100%|██████████| 37/37 [00:13<00:00,  2.76it/s]


Epoch 2/15 | Train Loss: 0.6173, Acc: 0.7193, F1: 0.7235 | Val Loss: 0.5923, Acc: 0.7375, F1: 0.7387, AUC: 0.8883 |

Class-wise Validation Accuracies:
  Normal: 0.9066
  No Lung Opacity / Not Normal: 0.5867
  Lung Opacity: 0.7558
  -> Normal: 0.9066
  -> No Lung Opacity / Not Normal: 0.5867
  -> Lung Opacity: 0.7558
Validation loss improved. Saving MultiModal Model...
------------------------------


Epoch 3/15 [Train]: 100%|██████████| 1358/1358 [10:52<00:00,  2.08it/s]
Epoch 3/15 [Val]: 100%|██████████| 37/37 [00:13<00:00,  2.73it/s]


Epoch 3/15 | Train Loss: 0.6085, Acc: 0.7172, F1: 0.7216 | Val Loss: 0.5972, Acc: 0.7202, F1: 0.7190, AUC: 0.8878 |

Class-wise Validation Accuracies:
  Normal: 0.9231
  No Lung Opacity / Not Normal: 0.5200
  Lung Opacity: 0.7674
  -> Normal: 0.9231
  -> No Lung Opacity / Not Normal: 0.5200
  -> Lung Opacity: 0.7674
No improvement for 1 epochs.
------------------------------


Epoch 4/15 [Train]: 100%|██████████| 1358/1358 [10:49<00:00,  2.09it/s]
Epoch 4/15 [Val]: 100%|██████████| 37/37 [00:13<00:00,  2.82it/s]


Epoch 4/15 | Train Loss: 0.6027, Acc: 0.7231, F1: 0.7277 | Val Loss: 0.5948, Acc: 0.7340, F1: 0.7364, AUC: 0.8867 |

Class-wise Validation Accuracies:
  Normal: 0.8901
  No Lung Opacity / Not Normal: 0.6089
  Lung Opacity: 0.7326
  -> Normal: 0.8901
  -> No Lung Opacity / Not Normal: 0.6089
  -> Lung Opacity: 0.7326
No improvement for 2 epochs.
------------------------------


Epoch 5/15 [Train]: 100%|██████████| 1358/1358 [10:46<00:00,  2.10it/s]
Epoch 5/15 [Val]: 100%|██████████| 37/37 [00:13<00:00,  2.80it/s]

Epoch 5/15 | Train Loss: 0.5981, Acc: 0.7232, F1: 0.7281 | Val Loss: 0.5941, Acc: 0.7340, F1: 0.7327, AUC: 0.8879 |

Class-wise Validation Accuracies:
  Normal: 0.9341
  No Lung Opacity / Not Normal: 0.5422
  Lung Opacity: 0.7733
  -> Normal: 0.9341
  -> No Lung Opacity / Not Normal: 0.5422
  -> Lung Opacity: 0.7733
No improvement for 3 epochs.
Early stopping triggered.





In [29]:
torch.save(final_model.state_dict(), 'multimodal_classifier_iteration1.pth')

Saliency Map Visualizer

In [15]:
num_ftrs = 2048
model_classifier = models.resnet50()
model_classifier.fc = nn.Sequential(
    nn.Dropout(0.5),
    nn.Linear(num_ftrs, 3)
)

#recreate multimodel shell
final_model = MultiModalClassifier(
    num_classes=3,
    metadata_features=3,
    fine_tuned_model=model_classifier
)

#load the weights
checkpoint_path = 'multimodal_classifier_iteration1.pth'
state_dict = torch.load(checkpoint_path, map_location=device, weights_only=True)
final_model.load_state_dict(state_dict)

#move model to device
final_model.to(device)

#set model to evaluation mode
final_model.eval()

print("Multi-modal model loaded successfully for evaluation.")

Multi-modal model loaded successfully for evaluation.


In [24]:
import torch.nn.functional as F
from PIL import Image

def save_saliency_map(model, dataset, idx, device, save_path="saliency_check.png"):
    model.eval()
    
    # 1. Get the image and metadata
    img_tensor, meta, label = dataset[idx]
    img_input = img_tensor.unsqueeze(0).to(device) # [1, 3, 224, 224]
    meta_input = meta.unsqueeze(0).to(device).float()
    
    # 2. Hook into the last convolutional layer
    # For ResNet50, this is layer4[-1]
    target_layer = model.image_model.layer4[-1]
    
    feature_maps = []
    gradients = []
    
    def hook_fn(module, input, output):
        feature_maps.append(output)
        
    def backward_hook(module, grad_in, grad_out):
        gradients.append(grad_out[0])
        
    # Register hooks
    handle1 = target_layer.register_forward_hook(hook_fn)
    handle2 = target_layer.register_full_backward_hook(backward_hook)
    
    # 3. Forward Pass
    output = model(img_input, meta_input)
    
    # 4. Backward Pass (Focus on the 'Lung Opacity' class, which is index 2)
    model.zero_grad()
    score = output[0, 2] # Force it to explain why it thinks it's Class 2
    score.backward()
    
    # 5. Process Gradients (The Math part)
    grads = gradients[0] # [1, 2048, 7, 7]
    fmap = feature_maps[0] # [1, 2048, 7, 7]
    
    # Global Average Pooling of gradients
    weights = torch.mean(grads, dim=(2, 3), keepdim=True) # [1, 2048, 1, 1]
    
    # Weighted combination of feature maps
    cam = torch.sum(weights * fmap, dim=1, keepdim=True) # [1, 1, 7, 7]
    
    # ReLU (Remove negative correlations)
    cam = F.relu(cam)
    
    # Resize up to image size (224x224) using PyTorch Interpolation
    cam = F.interpolate(cam, size=(224, 224), mode='bilinear', align_corners=False)
    
    # Normalize 0-1
    cam = cam - cam.min()
    cam = cam / (cam.max() + 1e-7)
    
    # 6. Create the Overlay using PIL
    # Convert tensor to numpy for PIL
    heatmap_data = cam.squeeze().cpu().detach().numpy() # [224, 224]
    heatmap_uint8 = (heatmap_data * 255).astype(np.uint8)
    
    # Create a "Red" heatmap image
    # We make a red image where the transparency (Alpha) is controlled by the heatmap intensity
    red_overlay = Image.new("RGB", (224, 224), color=(255, 0, 0))
    mask = Image.fromarray(heatmap_uint8)
    
    # Get original image
    # Undo normalization to display it correctly
    inv_normalize = torch.as_tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    mean = torch.as_tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    orig_img_tensor = img_tensor.cpu() * inv_normalize + mean
    orig_img_tensor = torch.clamp(orig_img_tensor, 0, 1)
    
    # Convert original tensor to PIL
    orig_pil = transforms.ToPILImage()(orig_img_tensor)
    
    # Composite the images
    # We blend the Red Overlay onto the Original using the heatmap as the mask
    final_image = Image.composite(red_overlay, orig_pil, mask)
    
    # Blend it slightly so you can see the bones *under* the red
    final_output = Image.blend(orig_pil, final_image, alpha=0.5)
    
    # 7. Save to disk
    final_output.save(save_path)
    print(f"Saliency map saved to: {save_path}")
    
    # Clean up hooks
    handle1.remove()
    handle2.remove()

# --- Run it on a 'Lung Opacity' case ---
# Find a sample where the label is 2 (Lung Opacity)
opacity_indices = [i for i, x in enumerate(val_ds.df['Target_3Class']) if x == 2]

if len(opacity_indices) > 0:
    idx_to_check = opacity_indices[0] # Pick the first one
    print(f"Visualizing Patient ID: {val_ds.df.iloc[idx_to_check]['patientId']}")
    save_saliency_map(final_model, val_ds, idx_to_check, device, "saliency_test.png")
else:
    print("No Opacity cases found in validation set to visualize.")

Visualizing Patient ID: f99c02a3-1731-4d3b-80c2-fc0cf713787e
Saliency map saved to: saliency_test.png


Looking at saliency_test.png, the red "heat" is concentrated directly on the lung fields.

- Left Side (Patient's Right Lung): There is a clear hotspot on the mid-to-lower lung zone.

- Right Side (Patient's Left Lung): There is a massive area of activation covering almost the entire lung field.

- What it ignored: Crucially, the model is ignoring the "L" tag in the top right corner, the shoulder joints, and the empty black space above the shoulders.

This confirms the model (77% accuracy on Opacity) is not "cheating." It has learned that texture inside the ribcage is what matters for determining pneumonia. The diffuse red glow suggests it's picking up on the general "haziness" or "consolidation" characteristic of lung opacity.

The backbone and classifier are ready now. Can move ahead and build the Object Detection part using Faster R-CNN.

In [13]:
#processing metadata file to group boxes
import ast

#load the metadata csv
box_df = pd.read_csv('stage2_train_metadata.csv')

#filer only for the lung opacity classes
box_df = box_df[box_df['Target'] == 1]

#group by patientId
grouped_df = box_df.groupby('patientId').agg({
    'x': list,
    'y': list,
    'width': list,
    'height': list
}).reset_index()

print(f"Found {len(grouped_df)} images with bounding boxes.")
grouped_df.head()

Found 6012 images with bounding boxes.


Unnamed: 0,patientId,x,y,width,height
0,000db696-cf54-4385-b10b-6b16fbb3f985,"[316.0, 660.0]","[318.0, 375.0]","[170.0, 146.0]","[478.0, 402.0]"
1,000fe35a-2649-43d4-b027-e67796d412e0,"[570.0, 83.0]","[282.0, 227.0]","[269.0, 296.0]","[409.0, 438.0]"
2,001031d9-f904-4a23-b3e5-2c088acd19c6,"[66.0, 552.0]","[160.0, 164.0]","[373.0, 376.0]","[608.0, 676.0]"
3,001916b8-3d30-4935-a5d1-8eaddb1646cd,[198.0],[375.0],[114.0],[206.0]
4,0022073f-cec8-42ec-ab5f-bc2314649235,"[575.0, 161.0]","[232.0, 230.0]","[246.0, 223.0]","[528.0, 486.0]"


Detection Dataset Class

In [15]:
class RSNADetectionDataset(Dataset):
    def __init__(self, dataframe, path_dict, transform=None, resize_to=(512, 512)):
        self.df = dataframe[dataframe['patientId'].isin(path_dict.keys())].reset_index(drop=True)
        self.path_dict = path_dict
        self.transform = transform
        self.resize_to = resize_to

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        # 1. Load Image
        row = self.df.iloc[idx]
        pid = row['patientId']

        # Use path_dict for instant lookup
        image_path = self.path_dict[pid] 
        img = Image.open(image_path).convert("RGB")
        w_original, h_original = img.size

        # 2. Resize Image
        img = img.resize(self.resize_to)

        # 3. Convert Image to Tensor
        # This converts [0, 255] -> [0.0, 1.0] automatically
        img_tensor = transforms.ToTensor()(img)

        # 4. Handle Bounding Boxes
        boxes = []

        # Get lists of coordinates
        xs = row['x']
        ys = row['y']
        ws = row['width']
        hs = row['height']

        # Calculate scaling factors
        scale_x = self.resize_to[0] / w_original
        scale_y = self.resize_to[1] / h_original

        for i in range(len(xs)):
            # Convert xywh to xyxy format
            x_min = xs[i] * scale_x
            y_min = ys[i] * scale_y
            x_max = (xs[i] + ws[i]) * scale_x
            y_max = (ys[i] + hs[i]) * scale_y
            
            # --- CRITICAL FIX: CLAMPING ---
            # Ensure coordinates stay strictly within image boundaries [0, resize_to]
            # We subtract 1 from the max dimension to keep it valid (0-511 for a 512 image)
            x_min = max(0, min(x_min, self.resize_to[0] - 1))
            y_min = max(0, min(y_min, self.resize_to[1] - 1))
            x_max = max(0, min(x_max, self.resize_to[0]))
            y_max = max(0, min(y_max, self.resize_to[1]))

            # Only add the box if it still has area (width > 0 and height > 0)
            if (x_max > x_min) and (y_max > y_min):
                boxes.append([x_min, y_min, x_max, y_max])

        # 5. Create Targets
        # Handle cases where clamping removed all boxes or original list was empty
        if len(boxes) == 0:
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
            area = torch.zeros((0,), dtype=torch.float32)
        else: 
            boxes = torch.as_tensor(boxes, dtype=torch.float32)
            labels = torch.ones((len(boxes),), dtype=torch.int64) # Class 1 = Opacity
            area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        image_id = torch.tensor([idx])
        iscrowd = torch.zeros((len(boxes),), dtype=torch.int64)

        target = {}
        target['boxes'] = boxes
        target['labels'] = labels
        target['image_id'] = image_id
        target['area'] = area
        target['iscrowd'] = iscrowd

        return img_tensor, target

In [16]:
#faster R-CNN requires a custom collate function for batching
def collate_fn(batch):
    return tuple(zip(*batch))

In [19]:
train_detection_ds = RSNADetectionDataset(
    dataframe = grouped_df,
    path_dict = train_path_dict,
    resize_to = (512, 512)
)

train_detection_loader = DataLoader(
    train_detection_ds,
    batch_size = 8, 
    shuffle = True,
    num_workers = 0,
    collate_fn = collate_fn
)

In [None]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torchvision

def get_detection_model(num_classes):
    #load the pre-trained model Faster R-CNN 
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights='DEFAULT')

    #get the number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features

    #replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    return model

#Initialize the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = 2 #background and lung opacity
model = get_detection_model(num_classes)
model.to(device)

print("Faster R-CNN model initialized and moved to device.")


Faster R-CNN model initialized and moved to device.


In [21]:
# 1. Get the weights from your classifier
classifier_weights = final_model.image_model.state_dict()

# 2. Load them into the 'body' of the detector backbone
# CRITICAL CHANGE: Added .body before .load_state_dict
msg = model.backbone.body.load_state_dict(classifier_weights, strict=False)

print("Transferred medically intelligent weights to Faster R-CNN backbone.")
print(f"Transfer details: {msg}")

Transferred medically intelligent weights to Faster R-CNN backbone.
Transfer details: <All keys matched successfully>


In [28]:
#define optimizer
params = [p for p in model.parameters() if p.requires_grad]

optimizer = optim.SGD(params,
                      lr = 0.0005,
                      momentum=0.9,
                      weight_decay = 0.0005)

#learning rete scheduler
lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                         step_size=3,
                                         gamma=0.1)

num_epochs = 10

In [None]:

def save_visual_check(model, dataset, idx, device, epoch_num, save_path = 'visual_check.png'):

    model.eval()

    with torch.no_grad():
        #get the image and target
        img_tensor, _ = dataset[idx]

        #Run inference
        prediction = model([img_tensor.to(device)])[0]

        #covert tensor to PIL Image
        pil_img = transforms.ToPILImage()(img_tensor.cpu())

        #prepare drawing tool
        draw = ImageDraw.Draw(pil_img)

        #draw boxes
        boxes = prediction['boxes'].cpu().numpy()
        scores = prediction['scores'].cpu().numpy()

        found_box = False

        for i, box in enumerate(boxes):
            score = scores[i]
            if score >= 0.5:
                found_box = True
                x_min, y_min, x_max, y_max = box
                draw.rectangle([(x_min, y_min), (x_max, y_max)], outline='red', width=3)
                text = f"{score:.2f}"
                draw.text((x_min + 2, y_min - 10), text, fill = 'white')
    
    final_filename = f'epoch_{epoch_num}_' + save_path
    pil_img.save(final_filename)

    status = "Found Boxes" if found_box else "No Boxes Found > 0.5"
    print(f"Visual check saved to: {final_filename} ({status})")

    model.train()

In [None]:
def verify_dataset(dataset):
    print(f"Scanning {len(dataset)} items for invalid boxes...")
    invalid_count = 0
    for i in range(len(dataset)):
        try:
            img, target = dataset[i]
            boxes = target['boxes']
            
            # Check 1: No boxes (shouldn't happen if we filtered for Opacity)
            if boxes.shape[0] == 0:
                print(f"Warning: Index {i} has empty boxes.")
                
            # Check 2: Negative coordinates
            if (boxes < 0).any():
                print(f"FAIL: Index {i} has negative coordinates: {boxes}")
                invalid_count += 1
                
            # Check 3: x_max <= x_min
            # boxes is [x_min, y_min, x_max, y_max]
            widths = boxes[:, 2] - boxes[:, 0]
            heights = boxes[:, 3] - boxes[:, 1]
            
            if (widths <= 0).any() or (heights <= 0).any():
                print(f"FAIL: Index {i} has inverted/zero-area box: {boxes}")
                invalid_count += 1

        except Exception as e:
            print(f"CRASH at Index {i}: {e}")
            invalid_count += 1
            
    if invalid_count == 0:
        print("✅ Data Integrity Check Passed. All boxes are valid.")
    else:
        print(f"❌ Found {invalid_count} invalid items.")

# Run it
verify_dataset(train_detection_ds)

In [29]:
#train

## Freeze batchnorm
#this prevents the backbone statistics from fluctuating wildly with small batches
def freeze_batchnorm_stats(model):
    for module in model.modules():
        if isinstance(module, torch.nn.BatchNorm2d):
            module.eval()

model.train()
freeze_batchnorm_stats(model)

for epoch in range(num_epochs):
    print(f"--- Epoch {epoch+1}/{num_epochs} ---")

    epoch_loss = 0.0
    progress_bar = tqdm(train_detection_loader, desc = f"Epoch {epoch+1}/{num_epochs}")

    for images, targets in progress_bar:
        #move data to device
        images = list(image.to(device) for image in images)

        #targets must be a list of dictionaries (one dict per images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        #zero gradients
        optimizer.zero_grad()

        #forward pass (returns loss dictionary)
        loss_dict = model(images, targets)

        #sum all losses
        losses = sum(loss for loss in loss_dict.values())

        # Check for NaN immediately
        if torch.isnan(losses):
            print("ERROR: Loss went to NaN!")
            continue # Skip this batch to prevent crashing the whole run

        #backward pass
        losses.backward()

        # This forces gradients to stay small, preventing explosions
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)


        optimizer.step()

        #logging
        loss_value = losses.item()
        epoch_loss += loss_value
        progress_bar.set_postfix(loss=f"{loss_value: .4f}")

    #update learning rate
    lr_scheduler.step()

    print(f"Epoch {epoch+1} Average Loss: {epoch_loss/len(train_detection_loader):.4f}")

    #save checkpoint
    torch.save(model.state_dict(), f'faster_rcnn_epoch_{epoch+1}.pth')

    try:
        save_visual_check(model, train_detection_ds, idx = 0, device = device, epoch_num = epoch+1)
    except Exception as e:
        print(f"Visual check failed: {e}")


--- Epoch 1/10 ---


Epoch 1/10:   0%|          | 0/676 [00:00<?, ?it/s]

Epoch 1/10:   1%|▏         | 9/676 [02:20<2:53:56, 15.65s/it, loss=1886895603712.0000]


KeyboardInterrupt: 

In [30]:
# 1. Initialize STANDARD model (No custom weights)
model_control = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

# 2. Modify the head for 2 classes (Background + Opacity)
in_features = model_control.roi_heads.box_predictor.cls_score.in_features
model_control.roi_heads.box_predictor = FastRCNNPredictor(in_features, 2)
model_control.to(device)

# 3. Quick Training Check (Just 1 epoch, small LR)
params = [p for p in model_control.parameters() if p.requires_grad]
optimizer_control = optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)

print("--- STARTING CONTROL TEST ---")
model_control.train()

# We only run 20 batches to check stability
for i, (images, targets) in enumerate(train_detection_loader):
    if i > 20: break 
    
    images = list(image.to(device) for image in images)
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
    
    optimizer_control.zero_grad()
    loss_dict = model_control(images, targets)
    losses = sum(loss for loss in loss_dict.values())
    
    # Clip gradients just in case
    torch.nn.utils.clip_grad_norm_(model_control.parameters(), max_norm=2.0)
    
    losses.backward()
    optimizer_control.step()
    
    print(f"Batch {i}: Loss = {losses.item():.4f}")

print("--- CONTROL TEST FINISHED ---")

--- STARTING CONTROL TEST ---
Batch 0: Loss = 1.1104
Batch 1: Loss = 0.8452
Batch 2: Loss = 0.5505
Batch 3: Loss = 0.3050
Batch 4: Loss = 0.3250
Batch 5: Loss = 0.2132
Batch 6: Loss = 0.2926
Batch 7: Loss = 0.2744
Batch 8: Loss = 0.3157
Batch 9: Loss = 0.3188
Batch 10: Loss = 0.3718
Batch 11: Loss = 0.3343
Batch 12: Loss = 0.3473
Batch 13: Loss = 0.3905
Batch 14: Loss = 0.3219
Batch 15: Loss = 0.4499
Batch 16: Loss = 0.4459
Batch 17: Loss = 0.4141
Batch 18: Loss = 0.3990
Batch 19: Loss = 0.4001
Batch 20: Loss = 0.3943
--- CONTROL TEST FINISHED ---


CHANGE OF PLAN <br>
The model learned for classification is not helping Faster RCNN model with region proposals. Probably, it has not learned to detect edges as that was not the goal of its training. This has caused the loss values to explode. Continuing training for such a model is pointless. The plan therefore now, is to train Faster RCNN with standard weights and use the previouly trained classifier in a pipeline. So only if the classification is "lung_opacity", the image will be passed on the Faster RCNN model. The previous code cell demonstrated that using standard weights will not be too troublesome either. 

In [21]:
train_detection_ds = RSNADetectionDataset(
    dataframe=grouped_df,
    path_dict=train_path_dict,
    resize_to=(512, 512)
)

# Collate function for batching
def collate_fn(batch):
    return tuple(zip(*batch))

train_detection_loader = DataLoader(
    train_detection_ds,
    batch_size=8,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_fn
)

In [None]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torchvision

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_classes = 2 #background and lung opacity
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights='DEFAULT')
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

model.to(device)

print("Standard Faster R-CNN initialized (COCO Weights).")


Standard Faster R-CNN initialized (COCO Weights).


In [22]:
#define optimizer
params = [p for p in model.parameters() if p.requires_grad]

optimizer = optim.SGD(params,
                      lr = 0.001,
                      momentum=0.9,
                      weight_decay = 0.0005)

#learning rete scheduler
lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                         step_size=3,
                                         gamma=0.1)

In [23]:
## Freeze batchnorm
#this prevents the backbone statistics from fluctuating wildly with small batches
def freeze_batchnorm_stats(model):
    for module in model.modules():
        if isinstance(module, torch.nn.BatchNorm2d):
            module.eval()

In [24]:
from PIL import ImageDraw

def save_visual_check(model, dataset, idx, device, epoch_num, save_path = 'visual_check.png'):

    model.eval()

    with torch.no_grad():
        #get the image and target
        img_tensor, _ = dataset[idx]

        #Run inference
        prediction = model([img_tensor.to(device)])[0]

        #covert tensor to PIL Image
        pil_img = transforms.ToPILImage()(img_tensor.cpu())

        #prepare drawing tool
        draw = ImageDraw.Draw(pil_img)

        #draw boxes
        boxes = prediction['boxes'].cpu().numpy()
        scores = prediction['scores'].cpu().numpy()

        found_box = False

        for i, box in enumerate(boxes):
            score = scores[i]
            if score >= 0.5:
                found_box = True
                x_min, y_min, x_max, y_max = box
                draw.rectangle([(x_min, y_min), (x_max, y_max)], outline='red', width=3)
                text = f"{score:.2f}"
                draw.text((x_min + 2, y_min - 10), text, fill = 'white')
    
    final_filename = f'epoch_{epoch_num}_' + save_path
    pil_img.save(final_filename)

    status = "Found Boxes" if found_box else "No Boxes Found > 0.5"
    print(f"Visual check saved to: {final_filename} ({status})")

    model.train()

In [27]:
num_epochs = 10

print("--- STARTING TRAINING ---")
model.train()
freeze_batchnorm_stats(model) # Freeze BN stats once before starting

for epoch in range(num_epochs):
    print(f"--- Epoch {epoch+1}/{num_epochs} ---")
    
    epoch_loss = 0.0
    progress_bar = tqdm(train_detection_loader, desc=f"Epoch {epoch+1}")
    
    for images, targets in progress_bar:
        # Move to device
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        
        optimizer.zero_grad()
        
        # Forward pass
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        
        # Safety Check
        if not torch.isfinite(losses):
            print(f"WARNING: Loss is {losses.item()}, skipping batch.")
            continue
            
        # Backward pass
        losses.backward()
        
        # Clip Gradients (Stability)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        
        # Logging
        loss_value = losses.item()
        epoch_loss += loss_value
        progress_bar.set_postfix(loss=f"{loss_value:.4f}")
    
    # End of Epoch Updates
    lr_scheduler.step()
    print(f"Epoch {epoch+1} Average Loss: {epoch_loss/len(train_detection_loader):.4f}")
    
    # Save Model
    torch.save(model.state_dict(), f'faster_rcnn_epoch_{epoch+1}.pth')
    
    # Visual Check
    try:
        save_visual_check(model, train_detection_ds, idx=0, device=device, epoch_num=epoch+1)
    except Exception as e:
        print(f"Visual check failed: {e}")

print("Training Complete!")

--- STARTING TRAINING ---
--- Epoch 1/10 ---


Epoch 1:  13%|█▎        | 87/676 [28:22<3:12:08, 19.57s/it, loss=0.3173]


KeyboardInterrupt: 

This setup is taking too long to train. <br>
Possible solutions:
- Pre-resize images
- use 320x320 instead of 512x512

In [14]:
#creating new resized folders
import os

os.makedirs("fast_train_images", exist_ok=True)

print("Pre-resizing training images...")
for pid, path in tqdm(train_path_dict.items()):
    #open image and resize
    img = Image.open(path).convert("RGB")
    img = img.resize((320,320))
    img.save(f"fast_train_images/{pid}.png")

Pre-resizing training images...


100%|██████████| 24015/24015 [22:54<00:00, 17.48it/s]


In [14]:
fast_train_path_dict = {}

for pid in train_path_dict.keys():
    # Point to the new .png file
    fast_train_path_dict[pid] = f"fast_train_images/{pid}.png"

print(f"Updated path dict for {len(fast_train_path_dict)} images.")

Updated path dict for 24015 images.


In [15]:
# --- 2. THE CACHED DATASET (RAM LOADING) ---
class CachedRSNADataset(Dataset):
    def __init__(self, dataframe, path_dict):
        self.df = dataframe[dataframe['patientId'].isin(path_dict.keys())].reset_index(drop=True)
        self.path_dict = path_dict
        
        self.current_size = (320, 320)
        self.original_size = (1024, 1024) 
        
        # LOAD ALL IMAGES INTO RAM NOW
        self.images_in_ram = []
        print(f"Loading {len(self.df)} images into RAM (approx 1-2 mins)...")
        
        for idx in tqdm(range(len(self.df))):
            row = self.df.iloc[idx]
            pid = row['patientId']
            image_path = self.path_dict[pid]
            
            with Image.open(image_path) as img:
                self.images_in_ram.append(img.convert("RGB"))
                
        print("✅ All images loaded. Training will be fast.")

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        # Instant access from RAM
        img = self.images_in_ram[idx]
        img_tensor = transforms.ToTensor()(img)

        # Handle Boxes
        row = self.df.iloc[idx]
        boxes = []
        xs = row['x']; ys = row['y']; ws = row['width']; hs = row['height']
        
        scale_x = self.current_size[0] / self.original_size[0]
        scale_y = self.current_size[1] / self.original_size[1]

        for i in range(len(xs)):
            x_min = max(0, min(xs[i] * scale_x, self.current_size[0] - 1))
            y_min = max(0, min(ys[i] * scale_y, self.current_size[1] - 1))
            x_max = max(0, min((xs[i] + ws[i]) * scale_x, self.current_size[0]))
            y_max = max(0, min((ys[i] + hs[i]) * scale_y, self.current_size[1]))

            if (x_max > x_min) and (y_max > y_min):
                boxes.append([x_min, y_min, x_max, y_max])

        if len(boxes) == 0:
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
            area = torch.zeros((0,), dtype=torch.float32)
        else: 
            boxes = torch.as_tensor(boxes, dtype=torch.float32)
            labels = torch.ones((len(boxes),), dtype=torch.int64)
            area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {}
        target['boxes'] = boxes
        target['labels'] = labels
        target['image_id'] = torch.tensor([idx])
        target['area'] = area
        target['iscrowd'] = torch.zeros((len(boxes),), dtype=torch.int64)

        return img_tensor, target

In [16]:
# --- 3. DATALOADER ---
def collate_fn(batch):
    return tuple(zip(*batch))

# Initialize Dataset
train_detection_ds = CachedRSNADataset(
    dataframe=grouped_df,
    path_dict=fast_train_path_dict 
)

# INCREASED BATCH SIZE TO 16 for extra speed
train_detection_loader = DataLoader(
    train_detection_ds,
    batch_size=16, 
    shuffle=True,
    num_workers=0,
    collate_fn=collate_fn
)

Loading 5404 images into RAM (approx 1-2 mins)...


100%|██████████| 5404/5404 [00:16<00:00, 336.50it/s]

✅ All images loaded. Training will be fast.





In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights='DEFAULT')
model.roi_heads.box_predictor = FastRCNNPredictor(model.roi_heads.box_predictor.cls_score.in_features, 2)
model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Helper for visualization
def save_visual_check(model, dataset, idx, device, epoch_num, save_path='visual_check.png'):
    model.eval()
    with torch.no_grad():
        img_tensor, _ = dataset[idx]
        prediction = model([img_tensor.to(device)])[0]
        pil_img = transforms.ToPILImage()(img_tensor.cpu())
        draw = ImageDraw.Draw(pil_img)
        boxes = prediction['boxes'].cpu().numpy()
        scores = prediction['scores'].cpu().numpy()
        
        found_box = False
        for i, box in enumerate(boxes):
            if scores[i] >= 0.5:
                found_box = True
                x_min, y_min, x_max, y_max = box
                draw.rectangle([(x_min, y_min), (x_max, y_max)], outline='red', width=3)
                draw.text((x_min, y_min - 10), f"{scores[i]:.2f}", fill='white')
    
    final_filename = f'epoch_{epoch_num}_{save_path}'
    pil_img.save(final_filename)
    model.train()

def freeze_batchnorm_stats(model):
    for module in model.modules():
        if isinstance(module, torch.nn.BatchNorm2d):
            module.eval()

# --- 5. TRAINING LOOP ---
print("--- STARTING OPTIMIZED TRAINING ---")
model.train()
freeze_batchnorm_stats(model)

num_epochs = 10

for epoch in range(num_epochs):
    print(f"--- Epoch {epoch+1}/{num_epochs} ---")
    
    epoch_loss = 0.0
    progress_bar = tqdm(train_detection_loader, desc=f"Epoch {epoch+1}")
    
    for images, targets in progress_bar:
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        
        optimizer.zero_grad()
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        
        if not torch.isfinite(losses):
            continue
            
        losses.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        loss_value = losses.item()
        epoch_loss += loss_value
        progress_bar.set_postfix(loss=f"{loss_value:.4f}")
    
    lr_scheduler.step()
    print(f"Epoch {epoch+1} Average Loss: {epoch_loss/len(train_detection_loader):.4f}")
    
    torch.save(model.state_dict(), f'faster_rcnn_epoch_{epoch+1}.pth')
    
    try:
        save_visual_check(model, train_detection_ds, idx=0, device=device, epoch_num=epoch+1)
    except Exception:
        pass

print("Done.")

--- STARTING OPTIMIZED TRAINING ---
--- Epoch 1/10 ---


Epoch 1:  12%|█▏        | 40/338 [50:37<6:17:12, 75.95s/it, loss=0.3660]


KeyboardInterrupt: 

In [18]:
import torch
import time

print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"Current Device: {torch.cuda.get_device_name(0)}")
    print(f"Device Count: {torch.cuda.device_count()}")
else:
    print("❌ CRITICAL: Running on CPU. This explains the slowness.")

# Speed Test
print("\n--- GPU SPEED TEST ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = torch.randn(8, 3, 320, 320).to(device)

start = time.time()
# Run a dummy pass through the model's backbone
if 'model' in locals():
    model.to(device)
    model.backbone(x)
    torch.cuda.synchronize() if torch.cuda.is_available() else None
    print(f"Batch processing time: {time.time() - start:.4f} seconds")
else:
    print("Model not loaded yet, skipping speed test.")

PyTorch Version: 2.5.1
CUDA Available: True
Current Device: NVIDIA GeForce GTX 1660 Ti
Device Count: 1

--- GPU SPEED TEST ---
Batch processing time: 1.3117 seconds


Since the model is taking too long to train after many attempts, I am shifting the further part of the training to a new notebook to use RAM better.