In [1]:
# Run it once (in one session)
# !pip install decord
# !pip install einops
# !pip install icecream
# !pip install rarfile
# !pip install unrar

In [2]:
# Imports
import torch
from torch import nn, einsum
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter
import torchvision as tv
from torch.utils.data import random_split, DataLoader,Dataset
import time
import random
import math
import decord
import numpy as np
import gc
from einops import rearrange, repeat,reduce
from einops.layers.torch import Rearrange
from PIL import Image
from tqdm.notebook import tqdm
from icecream import ic
from torchvision.datasets import DatasetFolder
from torchvision.transforms import transforms
from torch.utils.data import DataLoader, random_split
import os
import rarfile
from torchvision.datasets import ImageFolder
from torchvision.transforms import transforms
from torch.utils.data import DataLoader, random_split
import torch.utils.data as data
from torchvision import transforms
from PIL import Image
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm
import shutil

import matplotlib.pyplot as plt
%matplotlib inline
import cv2
import operator


In [3]:
# set device
frames_per_clip = 8
writer = SummaryWriter()
device ='cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

# DOWNLOADING DATA FROM SOURCE WEBSITE

In [4]:
# # once per session/runtime
# !wget http://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/hmdb51_org.rar 

# DATA DIRECTORY SETTINGS

In [5]:
# # once per session/runtime

# rar_path = './hmdb51_org.rar'  
# extract_path = './dataset' 

# with rarfile.RarFile(rar_path, 'r') as rar:
#     rar.extractall(extract_path)

In [6]:
# # once per session/runtime

# direcs = os.listdir(extract_path)

# for i in direcs:
#   with rarfile.RarFile(f"dataset/{i}", 'r') as rar:
#     rar.extractall(f"data/{i.split('.')[0]}")

In [7]:
# once per session/runtime



In [8]:

class Frame:
    def __init__(self, id, frame, value):
        self.id = id
        self.frame = frame
        self.value = value

    def __lt__(self, other):
        if self.id == other.id:
            return self.id < other.id
        return self.id < other.id

    def __gt__(self, other):
        return other.__lt__(self)

    def __eq__(self, other):
        return self.id == other.id and self.id == other.id

    def __ne__(self, other):
        return not self.__eq__(other)

# DATA LOADING

In [9]:
# Dataset Class
class HMDB51Dataset(data.Dataset):
    def __init__(self, dataset_dir, frames_per_clip=16):
        super().__init__()
        self.dataset_dir = dataset_dir
        self.frames_per_clip = frames_per_clip
        self.video_list = []
        self.labels = []

        # Get the list of video directories
        video_dirs = sorted(os.listdir(dataset_dir))
        
        for label, video_dir in enumerate(video_dirs):
            video_files = os.listdir(os.path.join(dataset_dir, video_dir))
            self.video_list.extend([os.path.join(video_dir, video_file) for video_file in video_files])
            self.labels.extend([label] * len(video_files))

        self.transform = transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        return len(self.video_list)

    def __getitem__(self, idx):
        video_path = os.path.join(self.dataset_dir, self.video_list[idx])
        cap = cv2.VideoCapture(str(video_path))


        curr_frame = None
        prev_frame = None

        frame_diffs = []
        frames = []
        ret, frame = cap.read()
        i = 1

        while(ret):
            luv = cv2.cvtColor(frame, cv2.COLOR_BGR2LUV)
            curr_frame = luv
            if curr_frame is not None and prev_frame is not None:
                #logic here
                diff = cv2.absdiff(curr_frame, prev_frame)
                count = np.sum(diff)
                frame_diffs.append(count)
                frame = Frame(i, frame, count)
                frames.append(frame)
            prev_frame = curr_frame
            i = i + 1
            ret, frame = cap.read()
        """
            cv2.imshow('frame',luv)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
        """
        cap.release()
        # vid = decord.VideoReader(video_path, ctx=decord.cpu(0))
        # nframes = len(vid)

        # # If the number of frames in the video is less than frames_per_clip, repeat the frames
        # if nframes <= self.frames_per_clip:
        #     frame_idxs = torch.arange(0, self.frames_per_clip) % nframes
        # # Else, sample uniformly separated frames
        # else:
        #     frame_idxs = torch.linspace(0, nframes - 1, self.frames_per_clip).long()

        # frames = []
        # for frame_idx in frame_idxs:
        #     frame_idx = frame_idx.item()  # Convert to scalar value
        #     frame = Image.fromarray(vid[frame_idx].asnumpy())
        #     frame = self.transform(frame)
        #     frames.append(frame)
        frames.sort(key=operator.attrgetter("value"), reverse=True)
        keyframes = []
        for keyframe in frames[:frames_per_clip]:
            frame = Image.fromarray(keyframe.frame)
            frame = self.transform(frame)
            keyframes.append(frame)
        
        keyframes = torch.stack(keyframes)

        label = self.labels[idx]
        #print('Frame for key frame',keyframes.shape)
        return keyframes, label

In [10]:
# Directory of the HMDB51 dataset
dataset_dir = "./data"

# Instantiate the dataset
hmdb51_dataset = HMDB51Dataset(dataset_dir, frames_per_clip=frames_per_clip)

# TRAIN TEST SPLIT

In [11]:
# Split the dataset into train, validation, and test sets
train_len = int(0.7 * len(hmdb51_dataset))
test_len = len(hmdb51_dataset) - train_len - (len(hmdb51_dataset) - train_len)//2
train_data, val_data,test_data = torch.utils.data.random_split(hmdb51_dataset, [train_len,test_len, len(hmdb51_dataset) - test_len - train_len])

# Data loading parameters
batch_size = 32
test_batch_size = 1
num_workers = 0
pin_memory = True
num_classes = len(set(hmdb51_dataset.labels))



In [12]:
# Dataloaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=pin_memory)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=test_batch_size, num_workers=num_workers, pin_memory=pin_memory)

In [13]:
# Instantiate and create train-val-test split
# train_val_data = HMDB51Dataset(dataset_dir, frames_per_clip=frames_per_clip)
# train_len = int(0.7 * len(hmdb51_dataset))
# test_len = len(hmdb51_dataset) - train_len - (len(hmdb51_dataset) - train_len)//2
# train_val_split = [train_len,test_len, len(hmdb51_dataset) - test_len - train_len]
# train_data, val_data,test_data = random_split(train_val_data, train_val_split)
# test_data = HMDB51Dataset(dataset_dir, frames_per_clip=frames_per_clip)

# Print the number of samples in each split
print(f"Train samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

Train samples: 4736
Validation samples: 1015
Test samples: 1015


# DEFINING MODEL

In [14]:
class MLP(nn.Module):
    """
    Builds a simple feed forward network
    Args:
    - dim: (int) - inner dimension of embeddings
    - inner_dim: (int) - dimension of transformer head
    - n_class: (int) - number of output classes
    - encoder: the DinoVisionTransformer encoder
    """
    def __init__(self, dim, inner_dim, n_class, encoder):
        super().__init__()
        self.encoder = encoder
        self.mlp = nn.Sequential(
            nn.Linear(dim, n_class)
        )

    def forward(self, x):
        B, T, C, H, W = x.shape
        x = x.reshape(B*T, C, H, W)
        output = self.encoder(x)
        output = output.reshape(B, T, -1)
        avg = output.mean(dim=1)  # Average pooling over time
        return self.mlp(avg)

In [15]:
# Instantiate the model
dinov2_vits14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
dinov2_vits14.to(device)
for param in dinov2_vits14.parameters():
    param.requires_grad = False
for i in range(12):
    #dinov2_vits14.blocks[i].add_module('ln',nn.LayerNorm(384))
    dinov2_vits14.blocks[i].add_module('fc1',nn.Linear(384, 384, bias=True))
    dinov2_vits14.blocks[i].add_module('conv',nn.Conv3d(384, 384, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), groups=384))
    dinov2_vits14.blocks[i].add_module('fc2',nn.Linear(384, 384, bias=True))
model = MLP(384, 512, 51, dinov2_vits14)
model.to(device)

Using cache found in /home/z3qian/.cache/torch/hub/facebookresearch_dinov2_main


MLP(
  (encoder): DinoVisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 384, kernel_size=(14, 14), stride=(14, 14))
      (norm): Identity()
    )
    (blocks): ModuleList(
      (0-11): 12 x NestedTensorBlock(
        (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (attn): MemEffAttention(
          (qkv): Linear(in_features=384, out_features=1152, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): LayerScale()
        (drop_path1): Identity()
        (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
        (

## SPECIFYING HYPER-PARAMETERS

In [16]:
# Define the loss function and optimizer
lr=0.01
epochs = 10
decay_rate = 0.95
loss_criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=lr,momentum=0.9,weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=decay_rate)  # stepwise learning rate decay


# MODEL TRAINING & EVALUATION

In [17]:
# Training loop
val_best = 0
for epoch in range(1, epochs + 1):
    model.train()
    total_epoch_loss = 0
    for batch_id, (video_data, labels) in tqdm(enumerate(train_loader)):
        video_data, labels = video_data.to(device), labels.to(device)
        optimizer.zero_grad()
        prediction = model(video_data)
        loss = loss_criterion(prediction, labels)
        loss.backward()
        optimizer.step()
        total_epoch_loss += loss.item()
        
        corrects = (torch.argmax(prediction,dim=1)==labels).sum()
        bacc = corrects/batch_size
        del video_data
        del labels

        gc.collect()
        print(f"\n[Train Epoch]: {epoch} Train Loss: {loss.item()}, Batch Acc is {bacc.item()}")

        # Add any additional training metrics/logging you need
    scheduler.step()
    # Perform validation at the end of each epoch
    model.eval()
    total_loss = 0
    corrects = 0
    with torch.no_grad():
        for batch_id, (video_data, labels) in enumerate(val_loader):
            video_data, labels = video_data.to(device), labels.to(device)
            prediction = model(video_data)
            loss = loss_criterion(prediction, labels)
            total_loss += loss.item()
            corrects += (torch.argmax(prediction, dim=1) == labels).sum()
            del video_data
            del labels

            gc.collect()
    accuracy = corrects / (len(val_loader) * batch_size)
    writer.add_scalar('Loss', total_loss / len(val_loader), epoch)
    writer.add_scalar('Accuracy', accuracy, epoch)
    print(f"\n[Val Epoch]: {epoch} , Accuracy: {accuracy}, Valid Loss: {total_loss / len(val_loader)}")
    if accuracy > val_best:
        torch.save(model,'best_hmdb_model.pth')
        val_best = accuracy

0it [00:00, ?it/s]


[Train Epoch]: 1 Train Loss: 4.6902079582214355, Batch Acc is 0.0625


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Save the trained model
#torch.save(model, "hmdb_st_model.pth")
model = torch.load('best_hmdb_model.pth')
model.to(device)

MLP(
  (encoder): DinoVisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 384, kernel_size=(14, 14), stride=(14, 14))
      (norm): Identity()
    )
    (blocks): ModuleList(
      (0-11): 12 x NestedTensorBlock(
        (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (attn): MemEffAttention(
          (qkv): Linear(in_features=384, out_features=1152, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): LayerScale()
        (drop_path1): Identity()
        (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
        (

In [None]:
def test_model(loader):

    model.eval()
    corrects=0
    total_loss = 0
    with torch.no_grad():
        for batch_id, (input_data,labels) in enumerate(loader):
            
            input_data = input_data.to(device)
            
            labels = labels.to(device)
            prediction = model(input_data)
            loss = loss_criterion(prediction,labels)
            total_loss += loss.item()
            corrects+= (torch.argmax(prediction,dim=1)==labels).sum()
    
    accuracy = corrects/(len(loader)*test_batch_size)
    print(f"Test Accuracy: {accuracy}, Test Loss: {total_loss}")

    return accuracy

In [None]:
test_model(test_loader)

Test Accuracy: 0.6472906470298767, Test Loss: 1564.0274879383696


tensor(0.6473, device='cuda:0')