In [1]:
import pandas as pd
import os
import torch
from typing import Tuple

import sys

sys.path.extend(["../"])

from audio_slowfast.utils.metrics import topk_accuracies_slide, multitask_topk_accuracies_slide

In [2]:
# Relative path of the pickled results file analysed in this notebook.
res_path = "../results/res-2s.pkl"
# Display whether the file exists, so a wrong path is visible before loading.
os.path.exists(res_path)

True

In [3]:
# NOTE: unpickling can execute arbitrary code; only load result files from a trusted source.
df = pd.read_pickle(res_path)

# Convert the four stored arrays to tensors in one pass. `torch.from_numpy` only
# accepts numpy arrays, so each `df[...]` lookup is expected to yield an ndarray
# (presumably `df` is a dict-like object rather than a DataFrame -- TODO confirm).
verbs_probs, verbs_labels, nouns_probs, nouns_labels = (
    torch.from_numpy(df[key])
    for key in ("verb_output", "verb_labels", "noun_output", "noun_labels")
)

In [4]:
# Index of the maximum score along dim 1 for every row of the verb / noun outputs.
unique_pred_verb = verbs_probs.argmax(dim=1)
unique_pred_noun = nouns_probs.argmax(dim=1)

# Sanity check: the reduction must keep exactly one entry per input row.
assert len(unique_pred_verb) == len(verbs_probs)
assert len(unique_pred_noun) == len(nouns_probs)

In [5]:
# Show the distinct (sorted) verb indices that are predicted at least once.
unique_pred_verb.unique()

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 35, 36, 37,
        39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 51, 53, 56, 57, 58, 60, 67, 68,
        69, 70, 72, 74, 77])

## Create function to compute metrics

In [6]:
def compute_metrics(
    verbs_probs: torch.Tensor,
    verbs_labels: torch.Tensor,
    nouns_probs: torch.Tensor,
    nouns_labels: torch.Tensor,
    per_action_instance: bool = False,
) -> Tuple[float, float, float]:
    """Compute, print and return the action, verb and noun accuracies.

    Each accuracy is the first element returned by the corresponding
    ``*_topk_accuracies_slide`` helper when called with ``ks=(1,)``.

    Args:
        verbs_probs: Verb scores passed to the helpers.
        verbs_labels: Verb ground-truth labels passed to the helpers.
        nouns_probs: Noun scores passed to the helpers.
        nouns_labels: Noun ground-truth labels passed to the helpers.
        per_action_instance: Forwarded unchanged to every helper call.

    Returns:
        The tuple ``(action_acc, verb_acc, noun_acc)``.
    """
    # Keyword arguments common to all three helper calls.
    shared_kwargs = dict(ks=(1,), per_action_instance=per_action_instance)

    # Joint metric: verbs and nouns are handed over together as paired tuples.
    action_acc = multitask_topk_accuracies_slide(
        (verbs_probs, nouns_probs),
        (verbs_labels, nouns_labels),
        **shared_kwargs,
    )[0]

    # Single-task metrics, evaluated verb first, then noun.
    verb_acc, noun_acc = (
        topk_accuracies_slide(probs, labels, **shared_kwargs)[0]
        for probs, labels in (
            (verbs_probs, verbs_labels),
            (nouns_probs, nouns_labels),
        )
    )

    print(f"A: {action_acc:.2f}")
    print(f"V: {verb_acc:.2f}")
    print(f"N: {noun_acc:.2f}")

    return action_acc, verb_acc, noun_acc

## Get results for `WHOLE_VIDEO` mode

In [7]:
# Evaluate on every row, without discarding any labels.
a_wv, v_wv, n_wv = compute_metrics(
    verbs_probs, verbs_labels, nouns_probs, nouns_labels, per_action_instance=False
)

A: 6.10
V: 25.56
N: 10.63


## Get results for `IN_ACTION_BOUNDS` mode

In [8]:
# Find rows where all elements are -1
# (presumably -1 marks rows lying outside any annotated action -- TODO confirm).
labels_to_discard_verbs = torch.all(verbs_labels == -1, dim=1)
labels_to_discard_nouns = torch.all(nouns_labels == -1, dim=1)

# The two masks are computed independently and later applied to the verb and noun
# tensors separately, while the joint (action) metric receives verbs and nouns as a
# pair. Equal filtered shapes alone do not guarantee that the same rows were kept,
# so require the masks themselves to be identical.
assert torch.equal(
    labels_to_discard_verbs, labels_to_discard_nouns
), "verb and noun discard masks differ: filtered rows would be misaligned"
assert nouns_labels[~labels_to_discard_nouns].shape == verbs_labels[~labels_to_discard_verbs].shape

In [9]:
# Evaluate only on the rows that are not flagged for discarding (labels not all -1).
a_ab, v_ab, n_ab = compute_metrics(
    verbs_probs[~labels_to_discard_verbs],
    verbs_labels[~labels_to_discard_verbs],
    nouns_probs[~labels_to_discard_nouns],
    nouns_labels[~labels_to_discard_nouns],
    per_action_instance=False,
)

A: 8.79
V: 36.81
N: 15.31
