In [1]:
import torch
from torch import nn
from datahandling import get_dev_data
from tqdm import tqdm
import config 
from models.vqa_model import VQA
from os.path import join
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
from datahandling import load_multilabel_binarizer, QUESTIONS_TO_BE_ANSWERED

2024-11-28 13:10:07.453497: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-28 13:10:07.465940: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732795807.481345 2367689 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732795807.485890 2367689 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-28 13:10:07.502114: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Find logs at /home/schafhdaniel@edu.local/med/medVQA-imageClef-Polyps/models/logs/20241128-131009


In [None]:
# Checkpoint file name (looked up inside config.trained_model_path by load_model)
# -> Hugging Face id of the vision backbone that checkpoint belongs to.
# Keeping the pairs in one table means switching models is a one-line change and
# the checkpoint can never be combined with the wrong backbone id by accident.
CHECKPOINT_BACKBONES = {
    "vqa_model_dinov2.pth": "facebook/dinov2-large",
    "vqa_aimv2.pth": "apple/aimv2-large-patch14-224",
    "vqa_beit.pth": "microsoft/beit-base-patch16-224-pt22k-ft22k",
}

# MODEL_NAME is the file name of the model in the trained-models folder;
# model_id is the id of the matching model on Hugging Face.
# A checkpoint that is not listed above raises KeyError here: add it to the table first.
MODEL_NAME = "vqa_beit.pth"
model_id = CHECKPOINT_BACKBONES[MODEL_NAME]

mlb = load_multilabel_binarizer()

# Do not change this unless more answers are possible than before.
NUM_LABELS = len(mlb.classes_)


def evaluation():
    """Run the trained model over the dev test split and collect its raw outputs.

    The model is loaded with ``load_model`` and evaluated on the third loader
    returned by ``get_dev_data`` under ``torch.no_grad()``.

    Returns:
        tuple: ``(y_true, y_pred, device, mean_loss)`` where

        * ``y_true`` - all label batches concatenated, on the CPU,
        * ``y_pred`` - all raw model outputs (logits, no sigmoid applied)
          concatenated, on the CPU,
        * ``device`` - the ``torch.device`` the model was run on,
        * ``mean_loss`` - ``BCEWithLogitsLoss`` averaged over the batches,
          as a Python float.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Running on device: {device}")

    model = load_model(device)
    _, _, test_loader = get_dev_data(image_processor_model_id=model_id, debug_mode=False)
    criterion = nn.BCEWithLogitsLoss()

    # Separate accumulator: the per-batch loss tensor must not rebind it,
    # otherwise only the last batch would contribute to the reported value.
    running_loss = 0.0
    y_pred = []
    y_true = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            (images, questions, question_attention_mask), labels = batch
            images = images.to(device)
            questions = questions.to(device)
            question_attention_mask = question_attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(images, questions, question_attention_mask)
            running_loss += criterion(outputs, labels).item()

            # Keep the collected results on the CPU: the metric cells hand them to
            # NumPy / scikit-learn, which cannot read CUDA tensors, and this also
            # avoids holding every batch in GPU memory.
            y_pred.append(outputs.cpu())
            y_true.append(labels.cpu())

    y_pred = torch.cat(y_pred)
    y_true = torch.cat(y_true)

    return y_true, y_pred, device, running_loss / len(test_loader)
   


def load_model(device) -> torch.nn.Module:
    """Build the VQA network and restore its trained weights for inference.

    Args:
        device: Target device; used both as ``map_location`` when reading the
            checkpoint and as the device the model is moved to.

    Returns:
        The ``VQA`` model with the weights from ``MODEL_NAME`` loaded, placed on
        ``device`` and switched to evaluation mode.
    """
    vqa = VQA(vision_model_id=model_id)

    # weights_only=True restricts unpickling to plain tensors / state-dict content.
    checkpoint_path = join(config.trained_model_path, MODEL_NAME)
    state_dict = torch.load(checkpoint_path, map_location=device, weights_only=True)
    vqa.load_state_dict(state_dict)

    vqa.to(device)
    vqa.eval()
    return vqa


In [None]:
# Run inference over the dev test split once; the metric cells below reuse
# y_true (labels) and y_pred (raw model outputs) instead of re-running the model.
y_true, y_pred, device, test_loss = evaluation()

In [None]:
from collections import defaultdict

# Indices of samples whose label row decodes to no answer at all
# (mlb.inverse_transform returns an empty tuple for them).
idx_without = []

# question id -> indices (rows of y_true / y_pred) of the samples for that question.
# The question id is the part of the answer class name before the first "_".
per_question = {answer.split("_")[0]: [] for answer in mlb.classes_}

results = {
    "question": [],
    "accuracy": [],
    "precision": [],
    "recall": [],
    "f1": []
}

# Convert once to host-side integer arrays: NumPy / scikit-learn cannot consume
# tensors that live on a CUDA device, and thresholding once avoids recomputing
# the sigmoid for every metric of every question.
y_true_np = y_true.detach().cpu().numpy().astype(int)
y_pred_np = (torch.sigmoid(y_pred) >= 0.5).detach().cpu().numpy().astype(int)

# Decode all label rows in one call and bucket each sample by the question of
# its first correct answer.
# NOTE(review): this assumes every answer of a sample belongs to the same question - confirm.
for idx, correct_answers in enumerate(mlb.inverse_transform(y_true_np)):
    if len(correct_answers) > 0:
        key = correct_answers[0].split("_")[0]
        per_question[key].append(idx)
    else:
        idx_without.append(idx)

# From y_pred / y_true take the rows of each question, then compute the metrics on them.
for question, indexes in per_question.items():
    results["question"].append(question)

    if not indexes:
        # No sample of this question in the split: the metrics are undefined, so
        # record NaN explicitly instead of passing empty arrays to scikit-learn.
        # pandas' mean() skips NaN, so the aggregated numbers are unaffected.
        for metric in ("accuracy", "precision", "recall", "f1"):
            results[metric].append(float("nan"))
        continue

    truth = y_true_np[indexes]
    preds = y_pred_np[indexes]
    results["accuracy"].append(round(accuracy_score(truth, preds), 4))
    results["precision"].append(round(precision_score(truth, preds, average="samples", zero_division=0), 4))
    results["recall"].append(round(recall_score(truth, preds, average="samples", zero_division=0), 4))
    results["f1"].append(round(f1_score(truth, preds, average="samples", zero_division=0), 4))

In [None]:
# Number of samples whose label row decoded to an empty tuple, i.e. no answer
# class was marked for them. TODO: find out why the dev data contains such rows.
len(idx_without)

In [None]:
# One row per question, one column per metric (the dict keys become the columns).
df = pd.DataFrame(results)

In [None]:
# Display the per-question metrics table.
df

In [None]:

# Unweighted mean of the per-question metrics (every question counts equally).
# Stored under its own name: rebinding `results` to this string would clobber the
# dict the DataFrame is built from and break re-running that cell.
aggregated_report = f""" Aggregated 
    accuracy: {df.accuracy.mean():.4f},  
    precision: {df.precision.mean():.4f},  
    recall: {df.recall.mean():.4f}
    F1 Score: {df.f1.mean():.4f},  
"""

print(aggregated_report)

In [None]:

# Metrics over the whole dev test split at once (every sample counts equally).
# Threshold the logits a single time and hand scikit-learn host-side arrays;
# it cannot read tensors that live on a CUDA device.
dev_truth = y_true.detach().cpu().numpy().astype(int)
dev_pred = (torch.sigmoid(y_pred) >= 0.5).detach().cpu().numpy().astype(int)

# Stored under its own name so the `results` dict used to build the DataFrame
# is not overwritten by a string.
dev_report = f""" Performance on Dev test set
    accuracy: {accuracy_score(dev_truth, dev_pred):.4f},  
    precision: {precision_score(dev_truth, dev_pred, average="samples", zero_division=0):.4f},  
    recall: {recall_score(dev_truth, dev_pred, average="samples", zero_division=0):.4f}
    F1 Score: {f1_score(dev_truth, dev_pred, average="samples", zero_division=0):.4f},  
"""

print(dev_report)

Below the per-question rows of the table, insert the following summary-statistics row (the mean of each metric over all questions).

In [None]:

# Mean of every metric column, formatted to 4 decimals and wrapped in \textbf{},
# in the same order as the table columns.
bold_means = [
    f"\\textbf{{{df[metric].mean():.4f}}}"
    for metric in ("accuracy", "precision", "recall", "f1")
]

# LaTeX row "All & <accuracy> & <precision> & <recall> & <f1> \\" framed by \hline.
# The irregular spacing around the first two "&" is intentional: it reproduces
# the previously generated text exactly.
summary_stats = (
    " \n"
    "\\hline\n"
    "\\textbf{All} &   "
    + bold_means[0]
    + "  & "
    + " & ".join(bold_means[1:])
    + " \\\\\n"
    "\\hline\n"
)


In [None]:
# Render the per-question metrics as a LaTeX tabular: no index column,
# floats printed with 4 decimals.
latex_table = df.to_latex(index=False, float_format="%.4f")

In [None]:
# Put the summary row directly above the closing \bottomrule.
insertion_point = latex_table.rfind(r"\bottomrule")
if insertion_point == -1:
    # str.rfind signals "not found" with -1; slicing with it would silently
    # splice the summary row into the wrong place.
    raise ValueError(r"no \bottomrule found in the generated LaTeX table")

# Guarded so that re-running this cell does not insert the summary row twice.
if summary_stats not in latex_table:
    latex_table = latex_table[:insertion_point] + summary_stats + latex_table[insertion_point:]

# Replacement for everything up to and including the generated \midrule:
# centred metric columns and bold column titles.
new_header = r"""
\begin{tabular}{lcccc}
\toprule
\textbf{question} & \textbf{accuracy} & \textbf{precision} & \textbf{recall} & \textbf{f1} \\
\hline
\midrule
"""

midrule_index = latex_table.find(r"\midrule")
if midrule_index == -1:
    # Same reasoning as above: never slice with the -1 "not found" marker.
    raise ValueError(r"no \midrule found in the generated LaTeX table")

# Keep only the body (rows, summary and footer) and prepend the custom header.
modified_table = new_header + latex_table[midrule_index + len(r"\midrule"):]


In [None]:
# Print the final LaTeX source (custom header, per-question rows, summary row).
print(modified_table)