In [None]:
# Mount Google Drive inside the Colab VM; the dataset archive is read from
# /content/drive/MyDrive in the next cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!unzip /content/drive/MyDrive/HMDB_simp.zip -d /content/HMDB_simp

In [None]:
!pip install transformers torch torchvision datasets evaluate torchmetrics

# Data

## Create dataset

In [1]:
# Action category name -> integer class label.
# The names are listed alphabetically and each label is simply the position
# of the name in that list (0..24).
CATEGORY_INDEX = {
    category_name: label
    for label, category_name in enumerate([
        "brush_hair",
        "cartwheel",
        "catch",
        "chew",
        "climb",
        "climb_stairs",
        "draw_sword",
        "eat",
        "fencing",
        "flic_flac",
        "golf",
        "handstand",
        "kiss",
        "pick",
        "pour",
        "pullup",
        "pushup",
        "ride_bike",
        "shoot_bow",
        "shoot_gun",
        "situp",
        "smile",
        "smoke",
        "throw",
        "wave",
    ])
}

In [5]:
import os
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm.notebook import trange, tqdm

# Per-frame preprocessing shared by every frame that load_sampled_frames reads:
# resize to 224x224, then convert the PIL image to a tensor.
# NOTE(review): no mean/std normalisation is applied here — confirm whether the
# pretrained checkpoint's own preprocessing (see `extractor` below) expects it.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

def load_sampled_frames(frame_dir, frame_rate=8):
    """
    Load every [frame_rate]-th frame from a directory and apply transformations.

    Args:
        frame_dir: Directory holding the extracted frames of one video.
        frame_rate: Sampling stride; entries 0, frame_rate, 2*frame_rate, ...
            of the sorted frame listing are kept.

    Returns:
        A tuple (sampled_frames, frame_metadata). sampled_frames holds the
        output of the module-level `transform` for each kept frame;
        frame_metadata holds one dict per kept frame with its 'index' in the
        sorted listing, a 'used_in_clip' flag (initialised to False) and its
        'file_path'.

    Raises:
        ValueError: If frame_rate is smaller than 1.
    """
    if frame_rate < 1:
        raise ValueError(f"frame_rate must be >= 1, got {frame_rate}")

    # Keep regular, non-hidden files only. Sub-directories and dot-entries
    # (e.g. .ipynb_checkpoints, .DS_Store) are not frames: they would either
    # crash Image.open or silently shift the sampling stride.
    frame_files = sorted(
        name for name in os.listdir(frame_dir)
        if not name.startswith(".") and os.path.isfile(os.path.join(frame_dir, name))
    )  # Sorted so frames are in order
    sampled_frames = []
    frame_metadata = []

    for i in range(0, len(frame_files), frame_rate):
        frame_path = os.path.join(frame_dir, frame_files[i])
        # The context manager releases the file handle as soon as the frame
        # has been converted, instead of leaving that to garbage collection.
        with Image.open(frame_path) as image:
            frame = transform(image.convert("RGB"))  # Convert to RGB, then transform
        sampled_frames.append(frame)
        frame_metadata.append({'index': i, 'used_in_clip': False, 'file_path': frame_path})

    return sampled_frames, frame_metadata

def create_clips(frames, frame_metadata, clip_size=8):
    """
    Group sampled frames into consecutive clips of [clip_size] frames each.

    The windows do not overlap: the start position advances by clip_size, so
    any trailing frames that do not fill a whole clip are dropped.

    Args:
        frames: List of frame tensors, in temporal order.
        frame_metadata: List parallel to `frames`; each entry is a dict with
            at least the keys 'index' and 'file_path'.
        clip_size: Number of frames per clip.

    Returns:
        A tuple (clips, clip_paths, clip_indices): the stacked clip tensors,
        the list of frame file paths for each clip, and the list of original
        frame indices for each clip. All three are empty when there are fewer
        than clip_size frames.
    """
    clips, clip_paths, clip_indices = [], [], []

    if len(frames) >= clip_size:
        last_start = len(frames) - clip_size
        for start in range(0, last_start + 1, clip_size):
            stop = start + clip_size
            # Stack into a single (clip_size, ...) tensor
            clips.append(torch.stack(frames[start:stop]))
            window = [frame_metadata[pos] for pos in range(start, stop)]
            clip_indices.append([entry['index'] for entry in window])
            clip_paths.append([entry['file_path'] for entry in window])

    return clips, clip_paths, clip_indices


In [6]:
# Root folder of the extracted dataset; split_sources expects one sub-folder
# per action category here, each holding one folder per source video.
DATASET_PATH = "/content/HMDB_simp/HMDB_simp"

import random

def split_sources(dataset_path, train_ratio=0.8, seed=42):
    """
    Splits source folders into train and val sets before processing clips.
    Ensures that all clips from a source video stay in the same set.

    Args:
        dataset_path: Root folder containing one sub-folder per action category.
        train_ratio: Fraction of each category's source folders assigned to
            the training set; the remainder goes to validation.
        seed: Seed for the shuffle, so the split is reproducible across
            kernel restarts. Pass None for a different split on every call.

    Returns:
        A tuple (train_sources, val_sources) of dicts mapping each category
        name to the list of source-folder names assigned to that set.
    """
    # A private generator keeps the split independent of (and harmless to)
    # the global `random` state.
    rng = random.Random(seed)
    train_sources = {}
    val_sources = {}

    # os.listdir order is arbitrary, so sort before shuffling; otherwise the
    # seed alone would not pin the split.
    for category in sorted(os.listdir(dataset_path)):  # Iterate over action categories
        category_path = os.path.join(dataset_path, category)
        if category.startswith(".") or not os.path.isdir(category_path):
            continue

        # Only real, non-hidden sub-folders are source videos; stray files
        # would otherwise be counted in the split and skew the ratio.
        instances = sorted(
            name for name in os.listdir(category_path)
            if not name.startswith(".") and os.path.isdir(os.path.join(category_path, name))
        )
        rng.shuffle(instances)  # Shuffle instances before splitting

        split_idx = int(len(instances) * train_ratio)
        train_sources[category] = instances[:split_idx]  # First part for training
        val_sources[category] = instances[split_idx:]  # Remainder for validation

    return train_sources, val_sources


def process_dataset(dataset_path, sources_dict):
    """
    Build the clip-level samples for the given category -> source-folder mapping.

    For every source folder the frames are sampled with load_sampled_frames
    and grouped with create_clips; each resulting clip is paired with the
    integer label of its category from CATEGORY_INDEX.

    Args:
        dataset_path: Root folder containing one sub-folder per category.
        sources_dict: Dict mapping category name to a list of source-folder
            names inside that category.

    Returns:
        A tuple (samples, sample_paths): a list of (clip, label) pairs and a
        parallel list holding, for each clip, the file paths of its frames.
    """
    samples = []
    sample_paths = []

    for category, instances in tqdm(sources_dict.items()):
        category_dir = os.path.join(dataset_path, category)

        for instance in instances:
            instance_dir = os.path.join(category_dir, instance)
            if os.path.isdir(instance_dir):  # Anything that is not a folder is skipped
                frames, frame_metadata = load_sampled_frames(instance_dir)
                clips, clip_paths, _clip_indices = create_clips(frames, frame_metadata)

                for clip, paths in zip(clips, clip_paths):
                    samples.append((clip, CATEGORY_INDEX[category]))
                    sample_paths.append(paths)

    return samples, sample_paths

class VideoDataset(Dataset):
    """
    Thin torch Dataset over an in-memory sequence of (clip, label) pairs.

    Indexing returns the clip unchanged together with the label converted to
    a long tensor.
    """

    def __init__(self, dataset):
        # Sequence of (clip, label) pairs; stored as-is, not copied.
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        frames, class_id = self.dataset[idx]
        label_tensor = torch.tensor(class_id, dtype=torch.long)
        return frames, label_tensor

from torch.utils.data import default_collate

class VideoDataCollator:
    """
    Batch collator for the video classifier.

    Takes a list of (clip, label) pairs and returns a dict with the stacked
    clips under "pixel_values" and the labels as a long tensor under "labels".
    """

    def __call__(self, features):
        # Transpose the list of pairs into one tuple of clips and one of labels.
        clips, labels = zip(*features)
        pixel_values = torch.stack(clips)
        label_tensor = torch.tensor(labels, dtype=torch.long)
        return {"pixel_values": pixel_values, "labels": label_tensor}


In [None]:
from collections import Counter

def count_classes(dataset):
    """
    Print and return how many clips each class label has.

    Args:
        dataset: Iterable of (clip, label) pairs.

    Returns:
        Dict mapping label -> number of clips, ordered by ascending label.
    """
    tally = Counter()
    for _clip, label in dataset:
        tally[label] += 1

    sorted_class_counts = {label: tally[label] for label in sorted(tally)}

    for class_label in sorted_class_counts:
        count = sorted_class_counts[class_label]
        print(f"Class {class_label}: {count} clips of 8")

    return sorted_class_counts

# Usage, once train_dataset has been built in the cell below: count_classes(train_dataset);


In [7]:
# Split source folders into train & val
train_sources, val_sources = split_sources(DATASET_PATH)

# Process train and val sets separately
train_dataset, train_metadata = process_dataset(DATASET_PATH, train_sources)
val_dataset, val_metadata = process_dataset(DATASET_PATH, val_sources)

dataset_size = len(train_dataset) + len(val_dataset)

print(f"Total clips: {dataset_size}, Train: {len(train_dataset)}, Val: {len(val_dataset)}")

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Total clips: 1627, Train: 1318, Val: 309


# Train

## TimeSformer

In [9]:
from transformers import AutoFeatureExtractor, AutoModelForVideoClassification


# Load the preprocessing config and the pretrained video classifier from the same checkpoint.
extractor = AutoFeatureExtractor.from_pretrained("facebook/timesformer-base-finetuned-k400")
model = AutoModelForVideoClassification.from_pretrained(
    "facebook/timesformer-base-finetuned-k400",
    num_labels=len(CATEGORY_INDEX),  # One output per category of our dataset
    # The checkpoint's classifier has a different number of outputs, so allow
    # that layer to be re-created with fresh weights instead of raising.
    ignore_mismatched_sizes=True,
)

# Send model to GPU if available, otherwise stay on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model on: {device}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([25, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([25]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model on: cuda


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchmetrics
import evaluate
import numpy as np
from transformers import AutoFeatureExtractor, AutoModelForVideoClassification, TrainingArguments, Trainer

# Metric objects used by compute_metrics: top-1 accuracy and F1 come from the
# `evaluate` library, top-5 accuracy from torchmetrics (placed on `device`).
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
top5_metric = torchmetrics.classification.Accuracy(top_k=5, task="multiclass", num_classes=len(CATEGORY_INDEX)).to(device)

# Compute Metrics Function
def compute_metrics(eval_pred):
    """
    Compute the evaluation metrics reported after each evaluation pass.

    Args:
        eval_pred: A (logits, labels) pair for the evaluated samples.

    Returns:
        Dict with "accuracy" (top-1), "top-5 accuracy" and "f1-score"
        (macro-averaged F1).
    """
    logits, labels = eval_pred
    logits_tensor = torch.tensor(logits)
    # Top-1 prediction per sample, as a NumPy array for the `evaluate` metrics
    predicted = logits_tensor.argmax(dim=1).numpy()

    scores = {}
    scores["accuracy"] = accuracy_metric.compute(
        predictions=predicted, references=labels
    )["accuracy"]
    # top5_metric lives on `device`, so its inputs are moved there as well
    scores["top-5 accuracy"] = top5_metric(
        logits_tensor.to(device), torch.tensor(labels).to(device)
    ).item()
    scores["f1-score"] = f1_metric.compute(
        predictions=predicted, references=labels, average="macro"
    )["f1"]
    return scores

In [11]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, and
# reload the checkpoint with the best validation accuracy when training ends.
training_args = TrainingArguments(
    output_dir="./timesformer_output",  # Save checkpoints
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    eval_strategy="epoch",  # Evaluate after every epoch
    save_strategy="epoch",  # Save model after each epoch
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",  # TensorBoard logs
    logging_steps=10,
    save_total_limit=2,  # Cap the number of checkpoints kept on disk
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # Key returned by compute_metrics
    push_to_hub=False
)

data_collator = VideoDataCollator()
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggers a FutureWarning when this cell runs.
    processing_class=extractor,  # Feature extractor
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

# Train Model
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkarkartem[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Top-5 accuracy,F1-score
1,0.508,0.645068,0.825243,0.970874,0.790915
2,0.0767,0.595146,0.84466,0.977346,0.82891
3,0.0015,0.537473,0.857605,0.980583,0.852075
4,0.0013,0.499854,0.873786,0.977346,0.857364
5,0.0008,0.493607,0.877023,0.977346,0.858724


TrainOutput(global_step=1650, training_loss=0.23152913113689105, metrics={'train_runtime': 3881.9868, 'train_samples_per_second': 1.698, 'train_steps_per_second': 0.425, 'total_flos': 5.77465293172949e+18, 'train_loss': 0.23152913113689105, 'epoch': 5.0})