# Action recognition course : LAB 1

> Author : BABIN-RIBY Hugo, See LICENSE FIle

This lab is meant to be completed by students. There is a set of challenges they need to overcome by asking questions to gain experience.

> Note that this notebook was meant to be executed in google colab but can easily be ported to other services with minimal work.

At the end of this notebook (~1h30-2h) you will :

- Have a better expertise in video data inference
- Have hands-on experience with
  - action recognition models
  - action recognition datasets
- Know the pros and cons of each type of architecture

## 1 : Gathering data

Before doing anything fancy, we need some video data. For this first lab, w'ell focus on the **kinetics 400** dataset to compare networks execution time and precisions.

Gathering data and pre-processing it is often a very challenging part in industry projects. In this lab, the data will be provided.

In [None]:
# Deps
%pip install torch
import torch
import urllib
import platform
%pip install -q yt-dlp pandas tqdm &> /dev/null
%pip install pytorchvideo &> /dev/null
%pip install --upgrade pytorchvideo &> /dev/null
%apt update &> /dev/null
%pip install torchvision
# Kinetics label file : Kinetics has a lot of classes (400), this file associate each ID with a class name
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
try: urllib.URLopener().retrieve(json_url, json_filename)
except: urllib.request.urlretrieve(json_url, json_filename)
# test by importing a model form pytorchvideo
test = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)

In [None]:

# Download kinetics 400 videos
import pandas as pd
import os
import subprocess
from tqdm import tqdm
import urllib.request
import tarfile

def download_kinetics_samples(num_samples=20, output_dir='kinetics_samples'):
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created directory: {output_dir}")

    metadata_file = "kinetics400/train.csv"
    if not os.path.exists(metadata_file):
        print("Downloading Kinetics metadata...")
        url = "https://storage.googleapis.com/deepmind-media/Datasets/kinetics400.tar.gz"
        urllib.request.urlretrieve(url, "kinetics400.tar.gz")

        print("Extracting metadata...")
        with tarfile.open("kinetics400.tar.gz", "r:gz") as tar:
            tar.extractall()
        os.remove("kinetics400.tar.gz")

    print("Reading metadata...")
    df = pd.read_csv(metadata_file)

    samples = df.sample(n=num_samples, random_state=42)
    print(f"Selected {num_samples} random samples")

    successful_downloads = 0
    for idx, row in tqdm(samples.iterrows(), total=num_samples, desc="Downloading videos"):
        video_id = row['youtube_id']
        start_time = row['time_start']
        end_time = row['time_end']
        label = row['label'].replace(' ', '_')

        output_path = os.path.join(output_dir, f"{label}-{video_id}-{start_time}-{end_time}.mp4")

        if not os.path.exists(output_path):
            try:
                command = [
                    'yt-dlp',
                    f'https://youtube.com/watch?v={video_id}',
                    '--quiet',
                    '--format', 'mp4',
                    '--output', output_path,
                    '--postprocessor-args',
                    f'-ss {start_time} -t {end_time-start_time}'
                ]
                result = subprocess.run(command, capture_output=True, text=True)

                if os.path.exists(output_path):
                    successful_downloads += 1
            except Exception as e:
                print(f"\nError downloading {video_id}: {str(e)}")

    print(f"Download complete to {output_dir}/")
    return successful_downloads

# this will later be used for testing
N = download_kinetics_samples(200, output_dir='kinetics_samples') - 1

### Prepare the data for inference

In [None]:
# Lets visualize the data..
import matplotlib.pyplot as plt
from pytorchvideo.data.encoded_video import EncodedVideo

In [None]:
import os
import random
import matplotlib.pyplot as pl

video_dir = "kinetics_samples"

videos = [f for f in os.listdir(video_dir) if f.endswith('.mp4')]

if not videos:
    raise Exception(f"No videos found in {video_dir}")

# Select random video
video_name = random.choice(videos)
video_path = os.path.join(video_dir, video_name)

print(f"Selected video: {video_name}")

# Read the video
video = EncodedVideo.from_path(video_path)
video_data = video.get_clip(start_sec=0, end_sec=1)
video_data = video_data["video"]

# imshow but in rgb
print(video_data.shape)
frame = video_data[:, 0, :, :].permute(1, 2, 0)
print(frame.shape)
plt.imshow(frame/255)
plt.axis("off")
plt.show()

In [None]:
# Create DATALOADERS builders for different models
from typing import Dict
import json
import urllib
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import shutil
import numpy
import torch

# CREATE ID <> CLASS NAME TRANSLATORS
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)

kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

classname_to_kinetics_id = {v: k for k, v in kinetics_id_to_classname.items()}


def get_transform(side_size, crop_size, num_frames) :
  """Returns a transform to pre-process video frames dpending on models specificities."""
  mean = [0.45, 0.45, 0.45]
  std = [0.225, 0.225, 0.225]
  transform =  ApplyTransformToKey(
      key="video",
      transform=Compose(
          [
              UniformTemporalSubsample(num_frames),
              Lambda(lambda x: x/255.0),
              NormalizeVideo(mean, std),
              ShortSideScale(
                  size=side_size
              ),
              CenterCropVideo(crop_size)
          ]
      ),
  )
  return transform

class VideoDatasetBuilder(Dataset):
    def __init__(self, video_folder, transform, clip_duration):
        self.video_folder = video_folder
        self.transform = transform
        self.clip_duration = clip_duration
        self.file_list = []
        for filename in os.listdir(video_folder):
          if os.path.isfile(os.path.join(video_folder, filename)):
            self.file_list.append(filename)

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        # Get video path and label a,d start timastamp
        video_path = self.file_list[idx]
        class_name = video_path.split('-')[0]
        class_name = class_name.replace("_", " ")
        class_id = classname_to_kinetics_id.get(class_name, None)
        label = [0] * 400
        label[class_id] = 1
        label = torch.tensor(numpy.array(label))

        start_sec = int(video_path.split('-')[-2])

        # Load video
        video = EncodedVideo.from_path("./kinetics_samples/"+video_path)
        video_data = video.get_clip(start_sec=start_sec, end_sec=start_sec+self.clip_duration)
        video_data = self.transform(video_data)['video']


        return video_data, label


# Test the data set builder
test_data_loader = VideoDatasetBuilder(video_folder="./"+video_dir, transform=get_transform(256,256,8), clip_duration=9)
test_sample = test_data_loader[5]
print(test_sample[0].shape)

## 2 : Using Slowfast

First we will use the SlowFast model :

1. Import & declare the pre-trained model
2. Run inference
3. Gather the following metrics
  - Accuracy
  - Execution time over 200 samples

In [None]:
# Load the model
slowfast_model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
slowfast_model = slowfast_model.to(device)

### TEST #1 : basic archery data

inpired from : https://pytorch.org/hub/facebookresearch_pytorchvideo_slowfast/

In [None]:
# get archery video
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
try: urllib.URLopener().retrieve(url_link, video_path)
except: urllib.request.urlretrieve(url_link, video_path)
clip_duration = 9

# CROP VIDEO FOR SLOWFAST

# Slowfasts's specific input metadata
side_size = 256
crop_size = 256
num_frames = 32
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
sampling_rate = 2
frames_per_second = 30
slowfast_alpha = 4
num_clips = 10
num_crops = 3
clip_duration = (num_frames * sampling_rate)/frames_per_second

video = EncodedVideo.from_path(video_path)
start_sec = 0
end_sec = start_sec + clip_duration
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

# DECLARE SLOWFAST SPECIFIC TRASNFORMATIONS
class PackPathway(torch.nn.Module):
    """
    Transform for converting video frames as a list of tensors.
    """
    def __init__(self):
        super().__init__()

    def forward(self, frames: torch.Tensor):
        fast_pathway = frames
        # Perform temporal sampling from the fast pathway.
        slow_pathway = torch.index_select(
            frames,
            1,
            torch.linspace(
                0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha
            ).long(),
        )
        frame_list = [slow_pathway, fast_pathway]
        return frame_list

sf_transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size),
            PackPathway()
        ]
    ),
)

video_data = sf_transform(video_data)

inputs = video_data["video"]
inputs = [i.to(device)[None, ...] for i in inputs]

preds= slowfast_model(inputs)

# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

# Check out what is going into the model
inputs = inputs[0]
print(inputs.shape)

### Test on actual kinetics400 data

In [None]:
import torch
import time

# Lists to store results
all_predictions = []
all_true_labels = []

# declare dataset based on kinetics 400
tsm_dataset = VideoDatasetBuilder(video_folder="./"+video_dir, transform=sf_transform, clip_duration=clip_duration)
start_time = time.time()

for i in range(N):
    sample = tsm_dataset[i]
    inputs = sample[0]
    true_labels = sample[1]
    inputs = [i.to(device)[None, ...] for i in inputs]

    # Get predictions
    preds = slowfast_model(inputs)
    post_act = torch.nn.Softmax(dim=1)
    preds = post_act(preds)

    # Get top 5 predictions
    top5_preds = preds.topk(k=5)
    pred_classes = top5_preds.indices[0]

    # Convert predicted classes to label names
    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]

    # Find the true class (index where label is 1)
    true_class_idx = torch.where(torch.tensor(true_labels) == 1)[0].item()
    true_class_name = kinetics_id_to_classname[true_class_idx]

    print(f"\nSample {i}:")
    print(f"True label: {true_class_name}")
    print(f"Top 5 predictions: {', '.join(pred_class_names)}")

    # Store results for accuracy computation
    all_predictions.append(pred_classes.cpu())
    all_true_labels.append(true_class_idx)

end_time=time.time()

# Convert to tensors for easier computation
all_predictions = torch.stack(all_predictions)
all_true_labels = torch.tensor(all_true_labels)

# Compute Top-1 accuracy
top1_correct = (all_predictions[:, 0] == all_true_labels).sum().item()
top1_accuracy = (top1_correct / N) * 100

# Compute Top-5 accuracy
top5_correct = sum(true_label in pred_classes for true_label, pred_classes in zip(all_true_labels, all_predictions))
top5_accuracy = (top5_correct / N) * 100

print(f"\nFinal Results:")
print(f"Top-1 Accuracy: {top1_accuracy:.2f}%")
print(f"Top-5 Accuracy: {top5_accuracy:.2f}%")
print(f"Execution time =", round(end_time - start_time,2))
print(f"Execution time per sample =", round((end_time - start_time)/N,2))
print(f"Sample duration =", round(clip_duration,2))
print(f"model I/O rtio" , round((end_time - start_time)/N/clip_duration,2))

## 3 : Using TSM

First we will use the **TSM** model :

1. Import & declare the pre-trained model
2. Run inference
3. Gather the following metrics
  - Accuracy
  - Execution time over 200 samples
4. As a bonus, you can also try the more precise TSM versions. See TSM repo for pretrained links :
  - https://github.com/mit-han-lab/temporal-shift-module?tab=readme-ov-file#kinetics-400

### Loading the model

In [None]:
!rm -rf temporal-shift-module
!git clone https://github.com/mit-han-lab/temporal-shift-module.git
%cd temporal-shift-module
import sys
sys.path.append('.')
from ops.models import TSN
%cd ..
# Get the weights, see TSM's github page. WATCH OUT ! all models does not yield same performance and does not ask for
# the same method for pre-processing the data !
!wget https://hanlab18.mit.edu/projects/tsm/models/TSM_kinetics_RGB_resnet50_shift8_blockres_avg_segment8_e50.pth

In [None]:
from collections import OrderedDict

def create_tsm_model(weights_path, num_class=400):
    # Model parameters
    num_segments = 8
    modality = 'RGB'
    base_model = 'resnet50'
    consensus_type = 'avg'
    dropout = 0.8
    img_feature_dim = 256

    # TSM specific parameters
    shift_div = 8  # Number of divisions for shift
    shift_place = 'blockres'  # Where to place shift operations

    # Initialize model
    model = TSN(num_class=num_class,
                num_segments=num_segments,
                modality=modality,
                base_model=base_model,
                consensus_type=consensus_type,
                dropout=dropout,
                img_feature_dim=img_feature_dim,
                partial_bn=False,
                is_shift=True,  # This makes it TSM
                shift_div=shift_div,
                shift_place=shift_place).to("cpu")

    checkpoint = torch.load(weights_path)
    state_dict = checkpoint['state_dict']

    # Remove 'module.' prefix if it exists
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        # Remove 'module.' prefix if exists
        if k.startswith('module.'):
            k = k[7:]

        # Fix the layer structure naming
        k = k.replace('.block.', '.')
        if '.nl.' in k:  # Skip non-local layers if they cause issues
            continue

        new_state_dict[k] = v

    # Load the weights
    model.load_state_dict(new_state_dict)

    # Set to evaluation mode and move to GPU if available
    model.eval()
    if torch.cuda.is_available():
        model = model.cuda()

    return model

# Example usage:
weights_path = "TSM_kinetics_RGB_resnet50_shift8_blockres_avg_segment8_e50.pth"
if os.path.exists(weights_path) == False:
  !wget https://hanlab18.mit.edu/projects/tsm/models/TSM_kinetics_RGB_resnet50_shift8_blockres_avg_segment8_e100_dense_nl.pth
  !wget https://hanlab18.mit.edu/projects/tsm/models/TSM_kinetics_RGB_resnet50_shift8_blockres_avg_segment8_e50.pth
  ...
tsm_model = create_tsm_model(weights_path)

In [None]:
print(tsm_model.input_size)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
tsm_model = tsm_model.to(device)

### Test #1 for TSM on example "archery" data

https://pytorch.org/hub/facebookresearch_pytorchvideo_slowfast/

In [None]:
# GET THE DATA
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
try: urllib.URLopener().retrieve(url_link, video_path)
except: urllib.request.urlretrieve(url_link, video_path)
# clip_duration = 9

# CROP VIDEO FOR TSM
video = EncodedVideo.from_path(video_path)
start_sec = 0
end_sec = start_sec + clip_duration
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
tsm_transform = get_transform(224,224,8)
video_data = tsm_transform(video_data)

inputs = video_data["video"].to(device)
preds= tsm_model(inputs)

# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

# Check out what is going into the model
print(inputs.shape)
fig, axes = plt.subplots(2, 4, figsize=(10, 8))
for i in range(8):
    col, row = divmod(i, 2)
    axes[row, col].imshow(inputs[0, i, :, :].cpu(), cmap="gray")
    axes[row, col].axis("off")
plt.tight_layout()
plt.show()

### Testing on actual kinetics400 data...

Let's try it on more elaborate data ...

In [None]:
import torch

# Lists to store results
all_predictions = []
all_true_labels = []

start_time = time.time()
# The clip duration metric is up to the appreciation and context (kinetics clips are 10 for exmaple)
# We can also optimise this metric but it will not be covered in this course
tsm_clip_duration = 0.6

# declare dataset based on kinetics 400
tsm_dataset = VideoDatasetBuilder(video_folder="./"+video_dir, transform=get_transform(256,256,8), clip_duration=tsm_clip_duration)

for i in range(N):
    sample = tsm_dataset[i]
    input = sample[0].to(device)
    true_labels = sample[1]

    # Get predictions
    preds = tsm_model(input)
    post_act = torch.nn.Softmax(dim=1)
    preds = post_act(preds)

    # Get top 5 predictions
    top5_preds = preds.topk(k=5)
    pred_classes = top5_preds.indices[0]

    # Convert predicted classes to label names
    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]

    # Find the true class (index where label is 1)
    true_class_idx = torch.where(torch.tensor(true_labels) == 1)[0].item()
    true_class_name = kinetics_id_to_classname[true_class_idx]

    print(f"\nSample {i}:")
    print(f"True label: {true_class_name}")
    print(f"Top 5 predictions: {', '.join(pred_class_names)}")

    # Store results for accuracy computation
    all_predictions.append(pred_classes.cpu())
    all_true_labels.append(true_class_idx)

end_time = time.time()

# Convert to tensors for easier computation
all_predictions = torch.stack(all_predictions)
all_true_labels = torch.tensor(all_true_labels)

# Compute Top-1 accuracy
top1_correct = (all_predictions[:, 0] == all_true_labels).sum().item()
top1_accuracy = (top1_correct / N) * 100

# Compute Top-5 accuracy
top5_correct = sum(true_label in pred_classes for true_label, pred_classes in zip(all_true_labels, all_predictions))
top5_accuracy = (top5_correct / N) * 100

print(f"\nFinal Results:")
print(f"Top-1 Accuracy: {top1_accuracy:.2f}%")
print(f"Top-5 Accuracy: {top5_accuracy:.2f}%")
print(f"Execution time =", round(end_time - start_time,2))
print(f"Execution time per sample =", round((end_time - start_time)/N,2))
print(f"Sample duration =", round(tsm_clip_duration,2))
print(f"model I/O rtio" , round((end_time - start_time)/N/tsm_clip_duration,2))

### Conclusion on TSM

#### 1 : Precision results

As we can see, the TSM usage on real data did not go as expected. Maybe this is due to data preprocessing not being exactly the same as the one proposed in the researcher's work (which is rather complex).

Solution ? Using a fine-tuning layer would be a great option !

#### 2 : Computing power

THis model has a better I/O ratio than slowfast



## 4 : Using TSN

Finally, we will test the TSN model :

1. Import & declare the pre-trained model
2. Run inference
3. Gather the following metrics
  - Accuracy
  - Execution time over N samples

TSN & TSM share the same base, We'll use the same methodology as TSM except we won't add any shifting, effectively creating a "native" TSN model.

Resource : https://github.com/mit-han-lab/temporal-shift-module?tab=readme-ov-file

In [None]:
# get the TSN weights
!wget https://hanlab18.mit.edu/projects/tsm/models/TSM_kinetics_RGB_resnet50_avg_segment5_e50.pth

# create the model
from collections import OrderedDict

def create_tsm_model(weights_path, num_class=400):
    # Model parameters
    num_segments = 8
    modality = 'RGB'
    base_model = 'resnet50'
    consensus_type = 'avg'
    dropout = 0.8
    img_feature_dim = 256

    # Initialize model & setting "is_shift" to False
    model = TSN(num_class=num_class,
                num_segments=num_segments,
                modality=modality,
                base_model=base_model,
                consensus_type=consensus_type,
                dropout=dropout,
                img_feature_dim=img_feature_dim,
                partial_bn=False,
                is_shift=False,).to("cpu")

    checkpoint = torch.load(weights_path)
    state_dict = checkpoint['state_dict']

    # Remove 'module.' prefix if it exists
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        # Remove 'module.' prefix if exists
        if k.startswith('module.'):
            k = k[7:]

        # Fix the layer structure naming
        k = k.replace('.block.', '.')
        if '.nl.' in k:  # Skip non-local layers if they cause issues
            continue

        new_state_dict[k] = v

    # Load the weights
    model.load_state_dict(new_state_dict)

    # Set to evaluation mode and move to GPU if available
    model.eval()
    if torch.cuda.is_available():
        model = model.cuda()

    return model

# Example usage:
weights_path = "TSM_kinetics_RGB_resnet50_avg_segment5_e50.pth"
tsn_model = create_tsm_model(weights_path)

## Test on basic archery data

In [None]:
# GET THE DATA
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
if not os.path.exists(video_path):
  try: urllib.URLopener().retrieve(url_link, video_path)
  except: urllib.request.urlretrieve(url_link, video_path)
# clip_duration = 10

# CROP VIDEO FOR TSN
video = EncodedVideo.from_path(video_path)
start_sec = 0
end_sec = start_sec + clip_duration
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
tsm_transform = get_transform(224,224,8)
video_data = tsm_transform(video_data)

inputs = video_data["video"].to(device)
preds= tsn_model(inputs)

# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

# Check out what is going into the model
print(inputs.shape)
fig, axes = plt.subplots(2, 4, figsize=(10, 8))
for i in range(8):
    col, row = divmod(i, 2)
    axes[row, col].imshow(inputs[0, i, :, :].cpu(), cmap="gray")
    axes[row, col].axis("off")
plt.tight_layout()
plt.show()

## Test on actual kinetics400 data

In [None]:
import torch

# Lists to store results
all_predictions = []
all_true_labels = []

start_time = time.time()
# clip_duration = 10

# declare dataset based on kinetics 400
tsn_dataset = VideoDatasetBuilder(video_folder="./"+video_dir, transform=get_transform(256,256,8), clip_duration=clip_duration)

for i in range(N):
    sample = tsn_dataset[i]
    input = sample[0].to(device)
    true_labels = sample[1]

    # Get predictions
    preds = tsn_model(input)
    post_act = torch.nn.Softmax(dim=1)
    preds = post_act(preds)

    # Get top 5 predictions
    top5_preds = preds.topk(k=5)
    pred_classes = top5_preds.indices[0]

    # Convert predicted classes to label names
    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]

    # Find the true class (index where label is 1)
    true_class_idx = torch.where(torch.tensor(true_labels) == 1)[0].item()
    true_class_name = kinetics_id_to_classname[true_class_idx]

    print(f"\nSample {i}:")
    print(f"True label: {true_class_name}")
    print(f"Top 5 predictions: {', '.join(pred_class_names)}")

    # Store results for accuracy computation
    all_predictions.append(pred_classes.cpu())
    all_true_labels.append(true_class_idx)

end_time = time.time()

# Convert to tensors for easier computation
all_predictions = torch.stack(all_predictions)
all_true_labels = torch.tensor(all_true_labels)

# Compute Top-1 accuracy
top1_correct = (all_predictions[:, 0] == all_true_labels).sum().item()
top1_accuracy = (top1_correct / N) * 100

# Compute Top-5 accuracy
top5_correct = sum(true_label in pred_classes for true_label, pred_classes in zip(all_true_labels, all_predictions))
top5_accuracy = (top5_correct / N) * 100

print(f"\nFinal Results:")
print(f"Top-1 Accuracy: {top1_accuracy:.2f}%")
print(f"Top-5 Accuracy: {top5_accuracy:.2f}%")
print(f"Execution time =", round(end_time - start_time,2))
print(f"Execution time per sample =", round((end_time - start_time)/N,2))
print(f"Sample duration =", round(clip_duration,2))
print(f"model I/O rtio" , round((end_time - start_time)/N/clip_duration,2))