In [1]:
%pip install torch

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import os
import torch
from typing import Tuple

In [4]:
def load_data(path: str):
    df = pd.read_pickle(path)

    verbs_probs = torch.from_numpy(df["verb_output"])
    verbs_labels = torch.from_numpy(df["verb_labels"])
    nouns_probs = torch.from_numpy(df["noun_output"])
    nouns_labels = torch.from_numpy(df["noun_labels"])
    
    return verbs_probs, verbs_labels, nouns_probs, nouns_labels

## Create function to compute metrics

In [25]:
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

"""Functions for computing metrics."""

import torch
import numpy as np


def topks_correct(preds, labels, ks, inside_action_bounds=True, weight=None):
    """
    Given the predictions, labels, and a list of top-k values, compute the
    number of correct predictions for each top-k value.

    Args:
        preds (array): array of predictions. Dimension is batchsize
            N x ClassNum.
        labels (array): array of labels. Dimension is batchsize N.
        ks (list): list of top-k values. For example, ks = [1, 5] correspods
            to top-1 and top-5.

    Returns:
        topks_correct (list): list of numbers, where the `i`-th entry
            corresponds to the number of top-`ks[i]` correct predictions.
    """
    weight = torch.ones(preds.size(0))/preds.size(0) if weight==None else weight/torch.sum(weight)
    assert preds.size(0) == labels.size(
        0
    ), "Batch dim of predictions and labels must match"
    # Find the top max_k predictions for each sample
    _top_max_k_vals, top_max_k_inds = torch.topk(
        preds, max(ks), dim=1, largest=True, sorted=True
    )
    # (batch_size, max_k) -> (max_k, batch_size).
    top_max_k_inds = top_max_k_inds.t()
    # (batch_size, ) -> (max_k, batch_size).
    if inside_action_bounds:
        rep_max_k_labels = labels.view(1, -1).expand_as(top_max_k_inds.repeat(4,1))
        top_max_k_correct = top_max_k_inds.eq(rep_max_k_labels)
    else:
        top_max_k_correct = torch.zeros_like(top_max_k_inds)
        for label in labels.t():
            rep_max_k_labels = label.view(1, -1).expand_as(top_max_k_inds)
            top_max_k_correct_ = top_max_k_inds.eq(rep_max_k_labels)
            top_max_k_correct |= top_max_k_correct_
    # (i, j) = 1 if top i-th prediction for the j-th sample is correct.
    # Compute the number of topk correct predictions for each k.
    topks_correct = [
        (weight*top_max_k_correct[:k, :]).contiguous().view(-1).float().sum() for k in ks
    ]
    return topks_correct


def multitask_topks_correct(preds, labels, ks=(1,), inside_action_bounds=True, weight=None):
    """
    Args:
        preds: tuple(torch.FloatTensor), each tensor should be of shape
            [batch_size, class_count], class_count can vary on a per task basis, i.e.
            outputs[i].shape[1] can be different to outputs[j].shape[j].
        labels: tuple(torch.LongTensor), each tensor should be of shape [batch_size]
        ks: tuple(int), compute accuracy at top-k for the values of k specified
            in this parameter.
    Returns:
        tuple(float), same length at topk with the corresponding accuracy@k in.
    """
    weight = torch.ones(preds[0].size(0)) if weight is None else weight
    num_vids = torch.sum(weight)
    weight = weight/num_vids
    max_k = int(np.max(ks))
    task_count = len(preds)
    batch_size = labels[0].size(0)
    all_correct = torch.zeros(max_k, batch_size).type(torch.ByteTensor)
    #if torch.cuda.is_available():
    #    all_correct = all_correct.cuda()
    for output, label in zip(preds, labels):
        _, max_k_idx = output.topk(max_k, dim=1, largest=True, sorted=True)
        # Flip batch_size, class_count as .view doesn't work on non-contiguous
        max_k_idx = max_k_idx.t()
        if inside_action_bounds:
            correct_for_task = max_k_idx.repeat(4,1).eq(label.view(1, -1).expand_as(max_k_idx))
        else:
            correct_for_task = torch.zeros_like(max_k_idx)
            for l in label.t():
                correct_for_task_ = max_k_idx.eq(l.view(1, -1).expand_as(max_k_idx))
                correct_for_task |= correct_for_task_
        all_correct.add_(correct_for_task)
    multitask_topks_correct = [
        (weight*torch.ge((all_correct[:k]).float().sum(0), task_count)).float().sum(0) for k in ks
    ]

    return multitask_topks_correct


def topk_errors(preds, labels, ks):
    """
    Computes the top-k error for each k.
    Args:
        preds (array): array of predictions. Dimension is N.
        labels (array): array of labels. Dimension is N.
        ks (list): list of ks to calculate the top accuracies.
    """
    num_topks_correct = topks_correct(preds, labels, ks)
    return [(1.0 - x) * 100.0 for x in num_topks_correct]


def topk_accuracies(preds, labels, ks, inside_action_bounds=True, weight = None):
    """
    Computes the top-k accuracy for each k.
    Args:
        preds (array): array of predictions. Dimension is N.
        labels (array): array of labels. Dimension is N.
        ks (list): list of ks to calculate the top accuracies.
    """
    num_topks_correct = topks_correct(preds, labels, ks, inside_action_bounds, weight)
    return [x * 100.0 for x in num_topks_correct]


def multitask_topk_accuracies(preds, labels, ks, inside_action_bounds=True, weight = None):
    """
    Computes the top-k accuracy for each k.
    Args:
        preds (array): array of predictions. Dimension is N.
        labels (array): array of labels. Dimension is N.
        ks (list): list of ks to calculate the top accuracies.
   """
    num_multitask_topks_correct = multitask_topks_correct(preds, labels, ks, inside_action_bounds, weight)
    return [x * 100.0 for x in num_multitask_topks_correct]


def multi_top1_action_accuracy_per_action(verb_probs, noun_probs, verb_labels, noun_labels):
    """
    Computes the top1 accuracy for the multi-action task.
    """
    n_correct = 0
    n_classes = 0
    
    verb_preds = torch.argmax(verb_probs, dim=1)
    noun_preds = torch.argmax(noun_probs, dim=1)

    N = noun_preds.shape[0]

    for i in range(N):
        unique_labels = set(zip(verb_labels[i], noun_labels[i]))
        n_classes += len(unique_labels)

        for verb_label, noun_label in unique_labels:
            n_correct += int((verb_preds[i] == verb_label) and (noun_preds[i] == noun_label))

    return n_correct / n_classes * 100


def top1_accuracy_per_action(probs, labels):
    """
    Computes the top1 accuracy for the multi-action task.
    """
    n_correct = 0
    n_classes = 0
    
    preds = torch.argmax(probs, dim=1)

    N = preds.shape[0]

    for i in range(N):
        unique_labels = labels[i].unique()
        n_classes += unique_labels.shape[0]

        for label in unique_labels:
            n_correct += int(preds[i] == label)

    return n_correct / n_classes * 100

In [26]:
def compute_metrics(
    verbs_probs: torch.Tensor,
    verbs_labels: torch.Tensor,
    nouns_probs: torch.Tensor,
    nouns_labels: torch.Tensor,
    per_action_instance: bool = False,
) -> Tuple[float, float, float]:
    if per_action_instance:
        action_acc = multi_top1_action_accuracy_per_action(
            verb_probs=verbs_probs,
            noun_probs=nouns_probs,
            verb_labels=verbs_labels,
            noun_labels=nouns_labels,
        )
        verb_acc = top1_accuracy_per_action(
            verbs_probs, verbs_labels
        )
        noun_acc = top1_accuracy_per_action(
            nouns_probs, nouns_labels
        )
    else:
        action_acc = multitask_topk_accuracies(
            (verbs_probs, nouns_probs),
            (verbs_labels, nouns_labels),
            (1,),
            inside_action_bounds=False,
        )[0]
        verb_acc = topk_accuracies(
            verbs_probs,
            verbs_labels,
            (1,),
            per_action_instance,
        )[0]
        noun_acc = topk_accuracies(
            nouns_probs,
            nouns_labels,
            (1,),
            per_action_instance,
        )[0]

    return action_acc, verb_acc, noun_acc

## Get results for `WHOLE_VIDEO` mode

In [27]:
a_wv, v_wv, n_wv = compute_metrics(
    verbs_probs=verbs_probs,
    verbs_labels=verbs_labels,
    nouns_probs=nouns_probs,
    nouns_labels=nouns_labels,
    per_action_instance=False,
)

## Get results for `IN_ACTION_BOUNDS` mode

In [30]:
# Find rows where all elements are -1
labels_to_discard_verbs = torch.all(verbs_labels == -1, dim=1)
labels_to_discard_nouns = torch.all(nouns_labels == -1, dim=1)

assert nouns_labels[~labels_to_discard_nouns].shape == verbs_labels[~labels_to_discard_verbs].shape

In [31]:
a_ab, v_ab, n_ab = compute_metrics(
    verbs_probs=verbs_probs[~labels_to_discard_verbs],
    verbs_labels=verbs_labels[~labels_to_discard_verbs],
    nouns_probs=nouns_probs[~labels_to_discard_nouns],
    nouns_labels=nouns_labels[~labels_to_discard_nouns],
    per_action_instance=False,
)

In [30]:
res_path = "../runs/asf-original-slide/scores/{}/slide/validation.pkl"

print("=" * 100)
windows = [0.5, 1.0, 2.0, 4.0, 8.0]
for win in windows:
    assert os.path.exists(res_path.format(str(win)))
    print(f"Window: {win}")
    verbs_probs, verbs_labels, nouns_probs, nouns_labels = load_data(res_path.format(str(win)))
    
    # Compute results for whole video
    a_wv, v_wv, n_wv = compute_metrics(
        verbs_probs=verbs_probs,
        verbs_labels=verbs_labels,
        nouns_probs=nouns_probs,
        nouns_labels=nouns_labels,
        per_action_instance=False,
    )
    
    # Compute results for inside action bounds
    labels_to_discard_verbs = torch.all(verbs_labels == -1, dim=1)
    labels_to_discard_nouns = torch.all(nouns_labels == -1, dim=1)

    assert nouns_labels[~labels_to_discard_nouns].shape == verbs_labels[~labels_to_discard_verbs].shape
    
    a_ab, v_ab, n_ab = compute_metrics(
        verbs_probs=verbs_probs[~labels_to_discard_verbs],
        verbs_labels=verbs_labels[~labels_to_discard_verbs],
        nouns_probs=nouns_probs[~labels_to_discard_nouns],
        nouns_labels=nouns_labels[~labels_to_discard_nouns],
        per_action_instance=False,
    )
    
    # Compute per action
    a_pi, v_pi, n_pi = compute_metrics(
        verbs_probs=verbs_probs[~labels_to_discard_verbs],
        verbs_labels=verbs_labels[~labels_to_discard_verbs],
        nouns_probs=nouns_probs[~labels_to_discard_nouns],
        nouns_labels=nouns_labels[~labels_to_discard_nouns],
        per_action_instance=True,
    )
    
    print(f"\tSlide per action      - A: {a_pi:.2f}, V: {v_pi:.2f}, N: {n_pi:.2f}")    
    print(f"\tSlide labeled footage - A: {a_ab:.2f}, V: {v_ab:.2f}, N: {n_ab:.2f}")
    print(f"\tSlide full footage    - A: {a_wv:.2f}, V: {v_wv:.2f}, N: {n_wv:.2f}")
    print("=" * 100)

Window: 0.5
	Slide per action      - A: 2.38, V: 18.18, N: 6.38
	Slide labeled footage - A: 2.61, V: 18.93, N: 6.57
	Slide full footage    - A: 1.80, V: 13.09, N: 4.54
Window: 1.0
	Slide per action      - A: 5.00, V: 25.17, N: 10.32
	Slide labeled footage - A: 5.33, V: 26.19, N: 10.61
	Slide full footage    - A: 3.69, V: 18.12, N: 7.34
Window: 2.0
	Slide per action      - A: 8.40, V: 35.35, N: 14.88
	Slide labeled footage - A: 8.79, V: 36.81, N: 15.31
	Slide full footage    - A: 6.10, V: 25.56, N: 10.63
Window: 4.0
	Slide per action      - A: 4.97, V: 26.92, N: 10.56
	Slide labeled footage - A: 5.19, V: 28.03, N: 10.86
	Slide full footage    - A: 3.61, V: 19.50, N: 7.56
Window: 8.0
	Slide per action      - A: 1.51, V: 17.46, N: 5.57
	Slide labeled footage - A: 1.62, V: 18.18, N: 5.73
	Slide full footage    - A: 1.13, V: 12.69, N: 4.00
