In [2]:
# Importing libraries
import pandas as pd
from preprocessing_utils import prepare_dataset, split_train_val, prepare_test_dataset
import numpy as np
from IPython.display import display
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import pandas as pd
from dataset import CheXpertDataset
import matplotlib.pyplot as plt
import numpy as np
import torchvision.models as models
import torch.nn as nn
from training_utils import train_model, upload_pretrained, upload_pretrained_vit
from torchsummary import summary
import torchvision.models as models
from tqdm.notebook import tqdm
import torchvision.models as models
import pickle


In [27]:
class_names = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']
policies = ['ones', 'zeroes', 'mixed']
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to 224x224 pixels
    transforms.ToTensor(),          # Convert images to PyTorch tensors
    transforms.Normalize(mean=[0.5], std=[0.5])  # Normalize grayscale images
])


# Import the validation dataset as final test set
test_df = pd.read_csv('CheXpert-v1.0-small/valid.csv')

# Prepare the test dataset using the last policy in the list and the defined class names
test_image_paths, test_label_matrix = prepare_dataset(test_df, policies[-1], class_names)

# Create a DataFrame for the test image paths
test_image_paths_df = pd.DataFrame({'path': test_image_paths})

# Create a DataFrame for the test labels with the class names as columns
test_labels_df = pd.DataFrame(test_label_matrix, columns=class_names)

# Combine the test image paths DataFrame and the test labels DataFrame
test_df = pd.concat([test_image_paths_df, test_labels_df], axis=1)


In [9]:
resnet18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model_resnet = upload_pretrained(resnet18, add_layers=True, n_labels=len(class_names), freeze_layers=True)
state_dict = torch.load("model_resnet.pth", map_location=torch.device('cpu'))

model_resnet.load_state_dict(state_dict)

<All keys matched successfully>

In [33]:
import torch
from tqdm import tqdm

def evaluate_model(model, test_loader: DataLoader, criterion, device='cuda'):
    """
    Evaluate a trained PyTorch model on a test dataset.

    Parameters:
    - model: The trained PyTorch model.
    - test_loader: DataLoader for the test dataset.
    - criterion: The loss function used during training.
    - device: Device to evaluate the model on ('cpu' or 'cuda').

    Returns:
    - test_loss: Average loss on the test dataset.
    - test_accuracy: Overall accuracy on the test dataset.
    - all_predictions: List of predicted values for all samples.
    - all_labels: List of ground-truth labels for all samples.
    """
    # model.to(device)
    # model.eval()  # Set the model to evaluation mode
    
    test_loss = 0.0
    correct = 0
    total = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc="Evaluating", unit="batch"):
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()

            # Convert outputs to binary predictions (multi-label classification)
            predicted = (torch.sigmoid(outputs) > 0.5).float()
            
            # Store predictions and labels
            all_predictions.append(predicted.cpu())
            all_labels.append(labels.cpu())

            correct += (predicted == labels).sum().item()
            total += labels.numel()  # Total elements (samples × labels)

    test_loss /= len(test_loader)
    test_accuracy = correct / total

    # Convert pre`dictions and labels to tensors
    all_predictions = torch.cat(all_predictions, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

    return test_loss, test_accuracy, all_predictions, all_labels



In [34]:
test_dataset = CheXpertDataset(test_df, class_names, transform=transform)

# Create DataLoader for the test dataset
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [35]:
criterion = nn.BCEWithLogitsLoss()
test_loss, test_accuracy, predictions, true_labels = evaluate_model(model_resnet, test_loader, criterion)

Evaluating: 100%|██████████| 13/13 [00:08<00:00,  1.61batch/s]

Test Loss: 0.5618, Test Accuracy: 0.7228





In [44]:
predictions[2]

tensor([0., 0., 0., 0., 0.])

In [43]:
true_labels[2]

tensor([0., 1., 0., 1., 0.])

In [55]:
predicted_labels = (predictions > 0.5).int()

In [57]:
predictions

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0.],
        ...,
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

In [31]:
import torch
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    hamming_loss,
    classification_report
)

# Convert predictions to binary values based on a threshold (0.5)
predicted_labels = (predictions > 0.5).int()

# Convert tensors to numpy arrays for sklearn metrics
true_labels_np = true_labels.numpy()
predicted_labels_np = predicted_labels.numpy()

# 1. Accuracy 
subset_accuracy = accuracy_score(true_labels_np, predicted_labels_np)
print("Multi-label subset accuracy:", subset_accuracy)

# 2. Calculate precision, recall, and F1 score using 'samples', 'macro' and 'micro' averaging
precision_samples = precision_score(true_labels_np, predicted_labels_np, average='samples', zero_division=0)
recall_samples = recall_score(true_labels_np, predicted_labels_np, average='samples', zero_division=0)
f1_samples = f1_score(true_labels_np, predicted_labels_np, average='samples', zero_division=0)

print("Precision (samples average):", precision_samples)
print("Recall (samples average):", recall_samples)
print("F1 Score (samples average):", f1_samples)
print('_________________________________________________________________________________________________________')


precision_macro = precision_score(true_labels_np, predicted_labels_np, average='macro', zero_division=0)
recall_macro = recall_score(true_labels_np, predicted_labels_np, average='macro', zero_division=0)
f1_macro = f1_score(true_labels_np, predicted_labels_np, average='macro', zero_division=0)

print("Precision (macro average):", precision_macro)
print("Recall (macro average):", recall_macro)
print("F1 Score (macro average):", f1_macro)
print('_________________________________________________________________________________________________________')


precision_micro = precision_score(true_labels_np, predicted_labels_np, average='micro', zero_division=0)
recall_micro = recall_score(true_labels_np, predicted_labels_np, average='micro', zero_division=0)
f1_micro = f1_score(true_labels_np, predicted_labels_np, average='micro', zero_division=0)

print("Precision (micro average):", precision_micro)
print("Recall (micro average):", recall_micro)
print("F1 Score (micro average):", f1_micro)
print('_________________________________________________________________________________________________________')


print("\nClassification Report:")
print(classification_report(true_labels_np, predicted_labels_np, zero_division=0))


Multi-label subset accuracy: 0.3465346534653465
Precision (samples average): 0.09900990099009901
Recall (samples average): 0.03118811881188118
F1 Score (samples average): 0.046888260254596886
_________________________________________________________________________________________________________
Precision (macro average): 0.3
Recall (macro average): 0.06726190476190477
F1 Score (macro average): 0.08622540250447228
_________________________________________________________________________________________________________
Precision (micro average): 0.5121951219512195
Recall (micro average): 0.07526881720430108
F1 Score (micro average): 0.13125
_________________________________________________________________________________________________________

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        75
           1       0.00      0.00      0.00        66
           2       0.00      0.00      0.00        32
    