In [4]:
# ===== INSTALL DEPENDENCIES =====
!pip install huggingface_hub
!pip install boto3 -q
!pip install opencv-python torch numpy torchvision tqdm



In [5]:
# Import the required libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from huggingface_hub import hf_hub_download
import boto3
from botocore import UNSIGNED
from botocore.config import Config
import os
import cv2
import numpy as np
from tqdm import tqdm
import time

# Please double, triple, quadruple check that the below code runs without errors before submitting.

## TODO 1 - Enter your HuggingFace username below:

In [6]:
hf_username = "EleftheriaK"

## TODO 2 - Define your model EXACTLY as you did in your training code (otherwise there will be errors, and, possibly, tears).

Note below the classname is 'YourModelArchitecture'. That's because it literally needs to be YOUR MODEL ARCHITECTURE. This class definition is later referred to below in the 'load_model_from_hub' method. The architecture must match here, or it will not be able to instantiate the model weights correctly once it downloads them from HuggingFace. Pay very close attention to getting this right, please.

Replace the below code, and replace the corresponding line in the 'load_model_from_hub' method.

In [7]:
# =============================================================================
# 1. MODEL DEFINITION (must match training)
# =============================================================================

import torch
import torch.nn as nn
from torchvision.models.video import r2plus1d_18, R2Plus1D_18_Weights

class YourModelArchitecture(nn.Module):
    def __init__(self, num_classes=7):
        super().__init__()
        weights = R2Plus1D_18_Weights.DEFAULT
        self.backbone = r2plus1d_18(weights=weights)
        in_features = self.backbone.fc.in_features
        self.backbone.fc = nn.Linear(in_features, num_classes)

    def forward(self, x):
        #(B, C, T, H, W)
        return self.backbone(x)


## Download the test data from s3, and create the corresponding dataset + dataloader.

There's no TODO for you here. This text is just here to explain to you what this code does.

In this instance, the test data IS the training data you were provided in the Model Training notebook. This is by design. You do not have access to the test data. This is a simple check to make sure the mechanics of this notebook work.

You should achieve the same accuracy here in this notebook, as you did in your previous notebook (random seed notwithstanding).

In [8]:
# =============================================================================
# DOWNLOAD TEST DATA FROM S3
# =============================================================================

def download_test_data(bucket_name='training-and-validation-data',download_dir='./test-data'):
    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

    bucket_name = 'prism-mvta'
    prefix = 'training-and-validation-data/'

    os.makedirs(download_dir, exist_ok=True)

    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

    video_names = []

    for page in pages:
        if 'Contents' not in page:
            print("No files found at the specified path!")
            break

        print("Downloading test data:\n")
        for obj in tqdm(page['Contents']):
            key = obj['Key']
            filename = os.path.basename(key)

            if not filename:
                continue

            video_names.append(filename)
            local_path = os.path.join(download_dir, filename)
            # print(f"Downloading: {filename}")
            s3.download_file(bucket_name, key, local_path)

    print(f"\nDownloaded {len(video_names)} test videos")
    return download_dir


# ============================================================================= # DATASET AND DATALOADER =============================================================================
weights = R2Plus1D_18_Weights.DEFAULT
preprocess = weights.transforms()

class VideoDataset(Dataset):
    """Dataset for loading videos from a folder. Labels from filename prefix."""

    def __init__(self, video_dir, frame_size=(112, 112), target_frames=16, augment=None, do_preprocess=True):
        self.video_dir = video_dir
        self.frame_size = frame_size
        self.target_frames = target_frames
        self.augment = augment
        self.do_preprocess = do_preprocess

        self.video_files = [f for f in os.listdir(video_dir) if f.endswith(('.mp4', '.avi', '.mov'))]
        self.labels = [int(f.split('_')[0]) - 1 for f in self.video_files] 

    def __len__(self):
        return len(self.video_files)

    def __getitem__(self, idx):
        video_path = os.path.join(self.video_dir, self.video_files[idx])
        frames = self._load_video(video_path)
        label = self.labels[idx]

        if self.augment:
            frames = self.augment(frames)
        
        if self.do_preprocess:
           x = frames.permute(1,0,2,3)
           x = preprocess(x)
           frames = x.permute(1,0,2,3)

        return frames, label

    def _load_video(self, path, target_frames=16):
        cap = cv2.VideoCapture(path)
        all_frames = []

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            all_frames.append(frame)

        cap.release()

        T = self.target_frames
        H, W = self.frame_size

        if len(all_frames) == 0:
            return torch.zeros(3, T, H, W)

        idxs = np.linspace(0, len(all_frames) - 1, T).astype(int)
        sampled = [cv2.resize(all_frames[i], (W, H)) for i in idxs]

        frames = torch.from_numpy(np.array(sampled)).permute(3, 0, 1, 2).float() / 255.0

        return frames


def collate_fn(batch):
    frames_list, labels = zip(*batch)
    frames = torch.stack(frames_list)
    frames = frames.permute(0, 2, 1, 3, 4)
    labels = torch.tensor(labels)

    return frames, labels

## TODO 3 - Download your model from HuggingFace and instantiate it

Replace line 8 of the below code. Line 8 is where you instantiate YOUR MODEL ARCHITECTURE (which you re-defined above) with the weights you download from HuggingFace. Make sure you get the class name, and the arguments to the __init__ method correct.


This code just downloads the same model which you uploaded in the last notebook.

In [12]:
# =============================================================================
# DOWNLOAD MODEL FROM HUGGING FACE
# =============================================================================

def load_model_from_hub(repo_id, num_classes=7):
    model_path = hf_hub_download(repo_id=repo_id, filename="model.pt")  # see note below if filename differs

    model = YourModelArchitecture(num_classes=num_classes)

    sd = torch.load(model_path, map_location="cpu")

    # If weights don't have "backbone." but model expects it, add it
    if not any(k.startswith("backbone.") for k in sd.keys()):
        sd = {f"backbone.{k}": v for k, v in sd.items()}

    model.load_state_dict(sd, strict=True)
    print(f"Model loaded from {repo_id}")
    return model

model = load_model_from_hub(f"{hf_username}/mv-final-assignment",num_classes=7)



Model loaded from EleftheriaK/mv-final-assignment


## TODO 4

Make sure the below code correctly evaluates your model performance on the given data!

This is your last chance to verify this before submission.

In [13]:
def evaluate(model, test_loader, dataset, device):
    model.eval()
    correct = 0
    total = 0

    all_preds = []
    all_labels = []
    all_times = []

    print("\n")

    with torch.no_grad():
        for idx, (frames, labels) in enumerate(test_loader):
            frames, labels = frames.to(device), labels.to(device)

            # Time the forward pass
            start_time = time.time()
            outputs = model(frames)
            if device.type == 'cuda':
                torch.cuda.synchronize()  # wait for GPU to finish
            end_time = time.time()

            inference_time = (end_time - start_time) * 1000  # ms
            all_times.append(inference_time)

            preds = outputs.argmax(dim=1)

            for i in range(labels.size(0)):
                batch_idx = idx * test_loader.batch_size + i
                video_name = dataset.video_files[batch_idx]
                pred = preds[i].item()
                true_label = labels[i].item()
                is_correct = "✓" if pred == true_label else "✗"

                print(f"{is_correct}  pred={pred}  true={true_label}  |  {inference_time:>7.1f}ms  |  {video_name}")

            correct += preds.eq(labels).sum().item()
            total += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = correct / total
    return accuracy, all_preds, all_labels, all_times


# =============================================================================
# RUN INFERENCE
# =============================================================================

def run_inference(model, bucket_name='training-and-validation-data'):
    device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu")
    
    print("Using device:", device)

    # Download test data
    test_dir = download_test_data(bucket_name, './test-data')

    model = model.to(device)

    # Create dataloader
    test_dataset = VideoDataset(test_dir, frame_size=(224, 224))
    test_loader = DataLoader(
        test_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_fn
    )

    print(f"\nRunning inference on {len(test_dataset)} test videos...")

    # Warmup (optional, helps get consistent GPU timings)
    if device.type == 'cuda':
        dummy = torch.randn(1, 3, 1000, 224, 224).to(device)
        with torch.no_grad():
            _ = model(dummy)
        torch.cuda.synchronize()

    total_start = time.time()
    accuracy, preds, labels, times = evaluate(model, test_loader, test_dataset, device)
    total_end = time.time()

    # Summary
    num_correct = sum(p == l for p, l in zip(preds, labels))
    num_wrong = len(preds) - num_correct

    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)
    print(f"Total videos:         {len(preds)}")
    print(f"Correct:              {num_correct}")
    print(f"Incorrect:                {num_wrong}")
    print(f"")
    print(f"ACCURACY:             {accuracy*100:.2f}%")
    print(f"")
    print(f"Total time:           {total_end - total_start:.2f}s")
    print(f"Avg per video:        {sum(times) / len(times):.1f}ms")
    print(f"Min latency:          {min(times):.1f}ms")
    print(f"Max latency:          {max(times):.1f}ms")
    print("="*50)
    return accuracy, preds, labels

_, _, _ = run_inference(model)

Using device: mps
Downloading test data:



100%|██████████| 77/77 [05:36<00:00,  4.37s/it]



Downloaded 77 test videos

Running inference on 77 test videos...


✓  pred=1  true=1  |    742.5ms  |  2_sadfasjldkfjaseifj.mp4
✓  pred=1  true=1  |      4.7ms  |  2_sdafkjaslkclaksdjkas.mp4
✓  pred=3  true=3  |      4.6ms  |  4_kling_20251206_Text_to_Video_Generate_a_28_0.mp4
✓  pred=2  true=2  |      4.1ms  |  3_kling_dskfseu.mp4
✓  pred=3  true=3  |      4.0ms  |  4_kling_20251209_Text_to_Video_Generate_a_190_0.mp4
✓  pred=2  true=2  |      3.9ms  |  3_kling_kdjflaskdjf.mp4
✓  pred=1  true=1  |      4.0ms  |  2_dsalkfjalwkenlke.mp4
✓  pred=2  true=2  |      4.2ms  |  3_dsjlaeijlksjdfie.mp4
✓  pred=2  true=2  |      4.1ms  |  3_kling_20251205_Text_to_Video_On_a_playg_5028_0.mp4
✓  pred=3  true=3  |      4.2ms  |  4_sadlfkjlknewkjejk.mp4
✓  pred=2  true=2  |      3.9ms  |  3_kling_20251206_Text_to_Video_Generate_a_315_2.mp4
✓  pred=3  true=3  |      3.9ms  |  4_kling_20251209_Text_to_Video_Generate_a_561_1.mp4
✓  pred=1  true=1  |      4.3ms  |  2_difficult_2.mp4
✓  pred=2  true=2  