In [5]:
!pip install boto3 -q
!pip install opencv-python torch numpy torchvision

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.5/14.5 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m


## Download the data

The data for this assignment has been made available and is downloadable to disk by running the below cell.

In [6]:
import boto3
from botocore import UNSIGNED
from botocore.config import Config
import os

# Connect to S3 without authentication (public bucket)
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

bucket_name = 'prism-mvta'
prefix = 'training-and-validation-data/'
download_dir = './video-data'

os.makedirs(download_dir, exist_ok=True)

# List all objects in the S3 path
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

video_names = []

for page in pages:
    if 'Contents' not in page:
        print("No files found at the specified path! Go and complain to the TAs!")
        break

    for obj in page['Contents']:
        key = obj['Key']
        filename = os.path.basename(key)

        # Keys that end in "/" (folder placeholders) have an empty basename.
        if not filename:
            continue

        video_names.append(filename)

        local_path = os.path.join(download_dir, filename)

        # Re-running this cell should be cheap: skip any file that is already
        # on disk with the same byte size as the S3 object. A missing or
        # partially written file has a different size and is fetched again.
        if os.path.isfile(local_path) and os.path.getsize(local_path) == obj['Size']:
            print(f"Already downloaded: {filename}")
            continue

        print(f"Downloading: {filename}")
        s3.download_file(bucket_name, key, local_path)

print("\n" + "="*50)
print("Downloaded videos:")
print("="*50)
for name in video_names:
    print(name)

print(f"\nTotal: {len(video_names)} files")

Downloading: 1_dksksjfwijf.mp4
Downloading: 2_dfsaeklnvvalkej.mp4
Downloading: 2_difficult_2.mp4
Downloading: 2_difficult_sdafkljsalkfj.mp4
Downloading: 2_dkdjwkndkfw.mp4
Downloading: 2_dkdmkejkeimdh.mp4
Downloading: 2_dkjd823kjf.mp4
Downloading: 2_dsalkfjalwkenlke.mp4
Downloading: 2_kling_20251205_Text_to_Video_On_a_sandy_4976_0.mp4
Downloading: 2_kling_20251206_Text_to_Video_Generate_a_71_1.mp4
Downloading: 2_sadfasjldkfjaseifj.mp4
Downloading: 2_sdafkjaslkclaksdjkas.mp4
Downloading: 2_sdfkjsaleijflaskdjf.mp4
Downloading: 2_sdjfhafsldkjhjk.mp4
Downloading: 2_sdkjdsflkjfwa.mp4
Downloading: 2_sdlfjlewlkjkj.mp4
Downloading: 2_sdlkjsaelijfksdjf.mp4
Downloading: 3_asldkfjalwieaskdfaskdf.mp4
Downloading: 3_dkk873lkjlksajdf.mp4
Downloading: 3_dsjlaeijlksjdfie.mp4
Downloading: 3_dsksdfjbvsdkj.mp4
Downloading: 3_dslkaldskjflakjs.mp4
Downloading: 3_ewdfkjwaeoihjlkasdjf.mp4
Downloading: 3_kling_20251205_Text_to_Video_In_a_grass_4697_0.mp4
Downloading: 3_kling_20251205_Text_to_Video_On_a_playg_5

These videos are now available in the folder "video-data". You can click on the folder icon on the left-hand-side of this screen to see the videos in a file explorer.

# Create your Datasets and Dataloaders

Some example code for approaching the first *two* TODOs is given below just to get you started. No starter code is given for the third TODO.

Note, the below code is very rough skeleton code. Make no assumptions as to the correct manner to architect your model based on the structure of this code.

Please feel free (indeed, you are encouraged) to change every single line of the code below so that it best suits the model architecture you choose in the next section.

### TODO 1 (This is mostly already done for you - Please see the v1 provided below)

Each video in the folder is prefixed by a number. That number corresponds to the number of distinct pushups visible in the video. Write code to iterate over each video in the folder, and extract the corresponding target associated with the video.

### TODO 2 (This is also mostly already done for you - Please see the v1 provided below)


Divide the data into training and validation sets.

Optionally, you can also create your own test set to assess your performance.

### TODO 3

Any preprocessing or augmentation of your data which you deem required, should (probably) go here. You are also free to include your data-augmentation code later, though doing it before creating your dataloaders is probably a good idea.

If you complete this TODO, to maintain experimental hygiene, feel free to modify the code which was provided for TODOs 1 and 2.

In [7]:
# Here is a basic implementation of the above two TODOs. You can assume the first TODO is completed correctly.

# Please modify this code to suit you best, as you decide on your preferred model architecture.

# For example, the code below samples 16 evenly spaced frames from every video. That may or may not be a good idea.


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import os
import cv2
import numpy as np
import random
import torch.nn.functional as F



class Augmentation:
  """
  Simple random augmentation for a video clip tensor.

  Each of the following is applied independently at random: horizontal flip,
  brightness scaling, additive Gaussian noise, cutout and a small translation.

  Args:
    p: probability of applying each of flip, brightness, noise and translation.
    cutout_p: probability of applying cutout.
    max_shift: largest translation, in pixels, along each spatial axis.
  """
  def __init__(self, p=0.3, cutout_p=0.2, max_shift=8):
    self.p = p
    self.cutout_p = cutout_p
    self.max_shift = max_shift

  def __call__(self, frames):
    """
    Augment one clip.

    Args:
      frames: tensor indexed as (C, T, H, W); brightness and noise clamp the
        values to [0, 1].

    Returns:
      A tensor of the same shape. The input tensor is never modified.
    """

    if random.random() < self.p:
      # Horizontal flip (reverse the W axis)
      frames = torch.flip(frames, dims=[3])

    if random.random() < self.p:
      # Random brightness: scale by a factor in [0.8, 1.2]
      brightness = 1.0 + random.uniform(-0.2, 0.2)
      frames = torch.clamp(frames * brightness, 0, 1)

    if random.random() < self.p:
      # Additive Gaussian noise
      noise = torch.randn_like(frames) * 0.02
      frames = torch.clamp(frames + noise, 0, 1)

    if random.random() < self.cutout_p:
      # Cutout: zero the same square patch in every channel and frame
      _, _, H, W = frames.shape
      cutout_size = H // 6
      top = random.randint(0, H - cutout_size)
      left = random.randint(0, W - cutout_size)

      # The assignment below is in place. When none of the branches above
      # fired, `frames` is still the caller's tensor, so copy it first.
      frames = frames.clone()
      frames[:, :, top:top + cutout_size, left:left + cutout_size] = 0

    if random.random() < self.p:
      # Random translation by up to `max_shift` pixels in each direction
      _, _, H, W = frames.shape
      max_shift = self.max_shift
      dx = random.randint(-max_shift, max_shift)
      dy = random.randint(-max_shift, max_shift)

      # Zero-pad every spatial border, then crop an H x W window offset by (dy, dx)
      frames = F.pad(frames, (max_shift, max_shift, max_shift, max_shift))
      frames = frames[:, :, (max_shift + dy):(max_shift + dy + H), (max_shift + dx):(max_shift + dx + W)]

    return frames


class VideoDataset(Dataset):
    """Dataset of video clips read from a folder, labelled from the filename prefix.

    A file named "<n>_<anything>.<ext>" gets the label n - 1.

    Args:
        video_dir: folder containing the video files.
        frame_size: (height, width) every sampled frame is resized to.
        transform: optional callable applied to the clip tensor of each item.
    """

    def __init__(self, video_dir, frame_size=(224, 224), transform=None):
        self.video_dir = video_dir
        self.frame_size = frame_size
        self.transform = transform

        # os.listdir returns entries in arbitrary order. Sort them so that a
        # given index always refers to the same video, which is what makes an
        # index-based, seeded train/val split reproducible.
        self.video_files = sorted(
            f for f in os.listdir(video_dir)
            if f.endswith(('.mp4', '.avi', '.mov'))
        )

        # Numeric filename prefix, shifted down by one to get 0-based class indices
        self.labels = [int(f.split('_')[0]) - 1 for f in self.video_files]

    def __len__(self):
        """Number of video files found in the folder."""
        return len(self.video_files)

    def __getitem__(self, idx):
        """Return (frames, label) for the idx-th video, with `transform` applied if set."""
        video_path = os.path.join(self.video_dir, self.video_files[idx])
        frames = self._load_video(video_path)
        label = self.labels[idx]

        if self.transform:
            frames = self.transform(frames)

        return frames, label

    def _load_video(self, path, target_frames=16):
        """Decode a video and return `target_frames` evenly spaced frames.

        Returns:
            Float tensor of shape (3, target_frames, H, W) with values in
            [0, 1], where (H, W) is `frame_size`. If no frame can be read
            from `path`, an all-zero tensor of that shape is returned.
        """
        cap = cv2.VideoCapture(path)
        all_frames = []

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes to BGR; convert to RGB
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            all_frames.append(frame)

        cap.release()

        if len(all_frames) == 0:
            return torch.zeros(3, target_frames, self.frame_size[0], self.frame_size[1])

        # Pick `target_frames` indices spread across the whole video (indices
        # repeat when the video has fewer frames than requested)
        idxs = np.linspace(0, len(all_frames) - 1, target_frames).astype(int)
        sampled = [all_frames[i] for i in idxs]
        # cv2.resize takes (width, height)
        sampled = [cv2.resize(f, (self.frame_size[1], self.frame_size[0])) for f in sampled]

        # (T, H, W, C) uint8 -> (C, T, H, W) float in [0, 1]
        frames = torch.from_numpy(np.array(sampled)).permute(3, 0, 1, 2).float() / 255.0

        return frames


def collate_fn(batch):
    """Merge a list of (frames, label) samples into one batch.

    Returns:
        A tuple of the clips stacked along a new leading batch axis and a
        tensor holding the labels.
    """
    clips = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]

    stacked_clips = torch.stack(clips)
    target_tensor = torch.tensor(targets)
    return stacked_clips, target_tensor


def get_dataloaders(video_dir, batch_size=4, val_split=0.2, frame_size=(112, 112), seed=42):
    """Create train and validation dataloaders over the videos in `video_dir`.

    The videos are split once by index. The training loader reads them with
    random augmentation, the validation loader reads them unmodified.

    Args:
        video_dir: folder containing the video files.
        batch_size: number of videos per batch, for both loaders.
        val_split: fraction of the videos held out for validation.
        frame_size: (height, width) every frame is resized to.
        seed: seed of the generator that draws the train/val split.

    Returns:
        (train_loader, val_loader)
    """

    train_dataset_full = VideoDataset(video_dir, frame_size=frame_size, transform=Augmentation(p=0.5))
    val_dataset_full   = VideoDataset(video_dir, frame_size=frame_size, transform=None)

    # The two datasets list the directory independently. Pin the validation
    # copy to the training copy's file order so a given index means the same
    # video in both; otherwise the index-based split below could put a
    # training video into the validation set.
    val_dataset_full.video_files = list(train_dataset_full.video_files)
    val_dataset_full.labels = list(train_dataset_full.labels)

    val_size = int(len(train_dataset_full) * val_split)
    train_size = len(train_dataset_full) - val_size

    # Split the indices (not the datasets) so each half can be taken from the
    # dataset object with the matching transform.
    gen = torch.Generator().manual_seed(seed)
    train_subset, val_subset = random_split(range(len(train_dataset_full)), [train_size, val_size], generator=gen)

    train_dataset = torch.utils.data.Subset(train_dataset_full, train_subset.indices)
    val_dataset   = torch.utils.data.Subset(val_dataset_full, val_subset.indices)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        collate_fn=collate_fn
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_fn
    )

    print(f"Train: {len(train_dataset)} videos, Val: {len(val_dataset)} videos\n")

    return train_loader, val_loader


# Build the loaders, then print the shape and labels of one training batch as a sanity check.
video_dir = './video-data'

train_loader, val_loader = get_dataloaders(video_dir, batch_size=4, val_split=0.2)

for frames, labels in train_loader:
    print(f"Frames shape: {frames.shape}")  # (B, C, T, H, W), with T = 16 sampled frames
    print(f"Labels: {labels}")
    break

Train: 62 videos, Val: 15 videos

Frames shape: torch.Size([4, 3, 16, 112, 112])
Labels: tensor([2, 2, 2, 2])


# Create a Model

For this assignment, we request you use PyTorch. Below is an example of how to instantiate a very basic PyTorch model.

Note, this model below needs a _lot_ of work.

Please include your code for creating your model below.

The only constraint here is that you define a Python object which inherits from a PyTorch nn.Module object. Beyond that, please feel free to implement anything you like: Transformer, Vision Transformer, MLP, CNN, etc.

### TODO 4

Create your model.

In [8]:
# Collect the 0-based label of every .mp4 file and report how many distinct labels occur.
labels = [
    int(video_name.split("_")[0]) - 1
    for video_name in os.listdir("./video-data")
    if video_name.endswith(".mp4")
]
distinct_labels = set(labels)
print("num_classes:", len(distinct_labels))

num_classes: 7


In [9]:
import torch
import torch.nn as nn
from torchvision.models.video import r2plus1d_18, R2Plus1D_18_Weights

# Download the pretrained R(2+1)D-18 video model
num_classes = 7
weights = R2Plus1D_18_Weights.DEFAULT
model = r2plus1d_18(weights=weights)

# Swap the final fully connected layer so it outputs `num_classes` (7) logits
in_features = model.fc.in_features
model.fc = nn.Linear(in_features, num_classes)

print(model.fc)

Downloading: "https://download.pytorch.org/models/r2plus1d_18-91a641e6.pth" to /root/.cache/torch/hub/checkpoints/r2plus1d_18-91a641e6.pth


100%|██████████| 120M/120M [00:00<00:00, 166MB/s]


Linear(in_features=512, out_features=7, bias=True)


In [13]:
# Prefer Apple MPS, then CUDA, then fall back to the CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
model = model.to(device)

# Shape sanity check on a single batch. It runs in eval mode and under
# no_grad so that the check itself cannot change the model (for example by
# updating normalisation running statistics) and does not build an autograd
# graph. train_epoch switches the model back to train mode before training.
frames, labels = next(iter(train_loader))
frames = frames.to(device)
model.eval()
with torch.no_grad():
    out = model(frames)

print(frames.shape)
print(out.shape)

torch.Size([4, 3, 16, 112, 112])
torch.Size([4, 7])


# Train your Model

### TODO 5

Training time! Please include your training code below.

As per above, please feel free (and encouraged) to rip out all of the below code and replace with your (much better) code.

The below should just be used as an example to get you started.

In [14]:
import torch.optim as optim
import torch.nn as nn

def train_epoch(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over `train_loader`.

    Args:
        model: network to train; it is put into train mode.
        train_loader: iterable of (frames, labels) batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as criterion(outputs, labels). It is assumed
            to return the mean over the batch (the default reduction).
        device: device the batches are moved to.

    Returns:
        (avg_loss, acc): the loss averaged per sample and the fraction of
        samples whose argmax prediction equals the label. Both are NaN when
        the loader yields no samples.
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    total = 0

    for frames, labels in train_loader:
        frames = frames.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(frames)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size: the last batch is usually
        # smaller, and a plain mean over batches would over-weight it.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += batch_size

    if total == 0:
        # Nothing to average over; report NaN rather than divide by zero.
        return float("nan"), float("nan")

    return loss_sum / total, correct / total


def evaluate(model, val_loader, criterion, device):
    """Compute loss and accuracy over `val_loader` without updating the model.

    Args:
        model: network to evaluate; it is put into eval mode.
        val_loader: iterable of (frames, labels) batches.
        criterion: loss called as criterion(outputs, labels). It is assumed
            to return the mean over the batch (the default reduction).
        device: device the batches are moved to.

    Returns:
        (avg_loss, acc): the loss averaged per sample and the fraction of
        samples whose argmax prediction equals the label. Both are NaN when
        the loader yields no samples (e.g. an empty validation split).
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for frames, labels in val_loader:
            frames = frames.to(device)
            labels = labels.to(device)

            outputs = model(frames)
            loss = criterion(outputs, labels)

            # Weight each batch mean by its size so a short final batch does
            # not count as much as a full one.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += batch_size

    if total == 0:
        # Nothing to average over; report NaN rather than divide by zero.
        return float("nan"), float("nan")

    return loss_sum / total, correct / total


def train_model(model, train_loader, val_loader, epochs=20, lr=1e-4, device=None):
    """Train `model` with Adam and cross-entropy, printing metrics every epoch.

    Args:
        model: network to train.
        train_loader: loader of (frames, labels) training batches.
        val_loader: loader of (frames, labels) validation batches.
        epochs: number of passes over the training data.
        lr: Adam learning rate.
        device: device to train on. When None, MPS is used if available,
            otherwise CUDA if available, otherwise the CPU.

    Returns:
        The trained model, moved to the training device.
    """
    if device is None:
        device = torch.device(
            "mps" if torch.backends.mps.is_available()
            else "cuda" if torch.cuda.is_available()
            else "cpu"
        )
    else:
        device = torch.device(device)
    print(f"Using device: {device}\n")

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(1, epochs + 1):
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)

        print(f"Epoch {epoch}/{epochs} | "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return model

In [None]:
# Seed every random number generator the pipeline can draw from, for
# reproducibility: Python's `random` drives the Augmentation choices, torch
# drives the noise tensor and the DataLoader shuffling, and numpy is seeded
# defensively.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
model = train_model(model, train_loader, val_loader, epochs=20, lr=1e-4)

Using device: cpu



# Evaluation

## TODO 6

Include any code which you feel is useful for evaluating your model performance below.

In [None]:
# YOUR CODE HERE
import numpy as np
import matplotlib.pyplot as plt

def evaluate(model, loader, device, num_classes=7):
  """
  Print and return the accuracy of `model` over `loader`.

  NOTE(review): this definition replaces the earlier
  `evaluate(model, val_loader, criterion, device)` used by `train_model`.
  Once this cell has run, calling `train_model` again fails until the
  training cell is re-run. Consider giving this function its own name.

  Args:
    model: network to evaluate; it is put into eval mode.
    loader: iterable of (frames, labels) batches.
    device: device the batches are moved to.
    num_classes: currently unused; kept so existing calls keep working.

  Returns:
    The fraction of samples whose argmax prediction equals the label, as a
    float. NaN when the loader yields no batches.
  """
  model.eval()
  all_preds = []
  all_labels = []

  with torch.no_grad():
    for frames, labels in loader:
      frames = frames.to(device)
      labels = labels.to(device)
      logits = model(frames)
      preds = logits.argmax(dim=1)
      all_preds.append(preds.cpu().numpy())
      all_labels.append(labels.cpu().numpy())

  if not all_preds:
    # np.concatenate raises on an empty list; report NaN instead.
    acc = float("nan")
  else:
    y_pred = np.concatenate(all_preds)
    y_true = np.concatenate(all_labels)
    acc = float((y_pred == y_true).mean())

  print(f"Val accuracy: {acc:.3f}")
  return acc

In [None]:
# Report the trained model's accuracy on the validation loader.
evaluate(model, val_loader, device)

# Hugging Face

It is a requirement of this assignment that you submit your trained model to a repo on Hugging Face, and make it publicly available. Below, we provide code which should help you do this.

## TODO 7

Upload your model to HuggingFace

Install the dependencies:

In [None]:
!pip install huggingface_hub

You'll now need to log in to Hugging Face via the command line. To do this, you'll need to generate a token on your Hugging Face account. To generate a token, run the below command, and click on the link which appears.

In [None]:
# Log in to Hugging Face from the command line, using the access token generated on your account.
!hf auth login

The below code will only run if you have already trained a model with variable name 'model'.

The below code will take your trained model, and upload it to a *public* HuggingFace repo in your account called "mv-final-assignment".

(Note - in this example, we have set 'private=False' in the upload_to_hub method. This makes your model public).

You should double-check that your model is in fact public. To do that, you can navigate (in an incognito tab, in a browser) to https://huggingface.co/YOUR_USERNAME/YOUR_MODEL_NAME and see if that page loads. If your model is public, it will. (Simply being able to run the below code will not guarantee that your model is in fact public, because, you have now authenticated yourself with the huggingface CLI).

In [None]:
# YOUR HUGGING FACE USERNAME BELOW
# It is used to build the repo id "<username>/mv-final-assignment" that the model is uploaded to.
hf_username = 'EleftheriaK'

In [None]:
import torch
import torch.nn as nn
from huggingface_hub import HfApi, hf_hub_download


def save_model(model, path="model.pt"):
    """Write the model's state_dict (weights only) to `path` and report the location."""
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Model saved to {path}")


def upload_to_hub(local_path="model.pt", repo_id=f"{hf_username}/mv-final-assignment"):
    """
    Publish a saved model file to a public Hugging Face Hub model repo.

    Args:
        local_path: Path of the saved model file on disk
        repo_id: Target repo, written as "username/model-name"
    """
    hub = HfApi()
    kind = "model"

    # Make sure the target repo exists. `exist_ok=True` avoids an error when
    # it already does, and `private=False` makes it public so TAs can access it.
    hub.create_repo(repo_id=repo_id, repo_type=kind, exist_ok=True, private=False)

    # Push the file; inside the repo it is always stored as "model.pt".
    hub.upload_file(
        path_or_fileobj=local_path,
        path_in_repo="model.pt",
        repo_id=repo_id,
        repo_type=kind,
    )

    print(f"Model uploaded to https://huggingface.co/{repo_id}")


# =============================================================================
# EXAMPLE USAGE
# =============================================================================

if __name__ == "__main__":

    # Save the trained weights to a local file first ...
    save_model(model, "mv-final-assignment.pt")

    # ... then upload that file to the "<hf_username>/mv-final-assignment" repo.
    upload_to_hub("mv-final-assignment.pt", f"{hf_username}/mv-final-assignment")
