# Action recognition course : LAB 2

> Author : BABIN-RIBY Hugo, see LICENSE file

This lab is meant to be completed by students. It consists of a set of challenges that they are expected to work through, asking questions whenever they get stuck, in order to gain hands-on experience.

> Note that this notebook was written to run in Google Colab, but it can be ported to other services with minimal work.

At the end of this notebook (~1h30-2h) you will:

- Have more expertise in running inference on video data
- Have hands-on experience with
  - action recognition models
  - action recognition datasets
- Know the pros and cons of each type of architecture

In [None]:
# Install the Kaggle command-line client used below to fetch the dataset.
!pip install kaggle;
# Copy the API token (kaggle.json, expected in the working directory)
# into ~/.kaggle/ and make it readable/writable by its owner only.
!mkdir -p ~/.kaggle;
!cp kaggle.json ~/.kaggle/;
!chmod 600 ~/.kaggle/kaggle.json;
# Download the DCSASS dataset archive into /content/ and extract it there.
!kaggle datasets download -d mateohervas/dcsass-dataset -p /content/;
!unzip /content/dcsass-dataset.zip -d /content/;

In [None]:
import torch
# pytorchvideo supplies the transforms and the EncodedVideo reader imported in the next cell.
!pip install pytorchvideo
# Load the pretrained 'slowfast_r50' entry point from the pytorchvideo hub repository.
model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)

In [None]:
from typing import Dict
import json
import urllib
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)

In [None]:
# Preprocessing and sampling settings shared by the cells below.
side_size = 256  # size passed to ShortSideScale in `transform`
mean = [0.45, 0.45, 0.45]  # mean passed to NormalizeVideo
std = [0.225, 0.225, 0.225]  # std passed to NormalizeVideo
crop_size = 256  # size passed to CenterCropVideo
num_frames = 32  # frame count passed to UniformTemporalSubsample
sampling_rate = 2
frames_per_second = 30
slowfast_alpha = 4  # PackPathway keeps T // slowfast_alpha frames for the slow pathway
num_clips = 10  # not referenced elsewhere in the visible code
num_crops = 3  # not referenced elsewhere in the visible code
# Clip length in seconds: num_frames frames spaced sampling_rate apart at frames_per_second.
clip_duration = (num_frames * sampling_rate)/frames_per_second

class PackPathway(torch.nn.Module):
    """
    Split a clip tensor into the ``[slow, fast]`` input list of a SlowFast model.

    The fast pathway is the clip unchanged. The slow pathway keeps
    ``T // alpha`` frames taken at evenly spaced positions along dim 1,
    where ``T`` is ``frames.shape[1]``.

    Args:
        alpha: Ratio between the number of fast and slow frames. When ``None``
            (the default) the module-level ``slowfast_alpha`` is read at call
            time, so ``PackPathway()`` behaves as it always did.
    """
    def __init__(self, alpha=None):
        super().__init__()
        self.alpha = alpha

    def forward(self, frames: torch.Tensor):
        """
        Args:
            frames: Tensor whose dim 1 is the axis to subsample.

        Returns:
            ``[slow_pathway, fast_pathway]``.
        """
        alpha = slowfast_alpha if self.alpha is None else self.alpha
        total = frames.shape[1]

        # The positions are computed exactly as before (on the CPU) and then
        # moved next to the frames: index_select requires the index tensor to
        # live on the same device as its input, which used to fail for CUDA
        # frames.
        slow_idx = torch.linspace(0, total - 1, total // alpha).long().to(frames.device)
        slow_pathway = torch.index_select(frames, 1, slow_idx)

        return [slow_pathway, frames]

# Steps applied, in this order, to the "video" entry of a clip dictionary:
# temporal subsampling, division by 255, normalisation, short-side resize,
# centre crop and finally packing into the [slow, fast] pathway list.
_video_steps = [
    UniformTemporalSubsample(num_frames),
    Lambda(lambda clip: clip / 255.0),
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    CenterCropVideo(crop_size),
    PackPathway(),
]

transform = ApplyTransformToKey(key="video", transform=Compose(_video_steps))

In [None]:
import matplotlib.pyplot as plt

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(device)

start_sec = 0
end_sec = start_sec + clip_duration

!ls

video = EncodedVideo.from_path("./DCSASS Dataset/Shoplifting/Shoplifting001_x264.mp4/Shoplifting001_x264_19.mp4")
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
video_data = transform(video_data)
inputs = video_data["video"]

device = "cpu"

# Check a frame ...
print(inputs[0].shape)
print(inputs[1].shape)
plt.imshow(inputs[0][0][0].to("cpu"), cmap="gray")
plt.plot()

inputs = [i.to(device)[None, ...] for i in inputs]
model = model.to(device)

# Slowfast forward propagation
outputs = model(inputs)

print(outputs.shape)


In [None]:
import torch.nn as nn

class CustomClassifier(nn.Module):
    """
    Fully connected classification head.

    Every entry of ``hidden_dims`` adds a ``Linear -> ReLU -> Dropout(0.5)``
    stage; a last ``Linear`` layer maps to ``num_classes`` outputs. No
    activation is applied to those outputs.

    Args:
        input_dim: Size of the incoming feature vector.
        hidden_dims: Iterable with the width of each hidden stage (may be empty).
        num_classes: Number of outputs of the final layer.
    """
    def __init__(self, input_dim, hidden_dims, num_classes=2):
        super().__init__()

        # widths[k] -> widths[k + 1] describes the k-th hidden stage.
        widths = [input_dim, *hidden_dims]

        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.5)]
        stages.append(nn.Linear(widths[-1], num_classes))

        self.classifier = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the raw outputs."""
        return self.classifier(x)

In [None]:
# Two outputs per clip, in the order [shoplifting, NOT shoplifting].
num_classes = 2

# Output width of the last projection layer of the pretrained network;
# this is the size of the vector handed to the new classification head.
slowfast_out = model.blocks[-1].proj.out_features
print(slowfast_out)

fine_tune = CustomClassifier(
    input_dim=slowfast_out,
    hidden_dims=[512, 128],
    num_classes=num_classes,
)

In [None]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import matplotlib.pyplot as plt

# NOTE: `import torch.share` used to sit here. PyTorch ships no such module,
# so the statement raised ModuleNotFoundError and aborted the cell; nothing
# in this cell uses it.

# Clip window in seconds, measured from the start of a video.
start_sec = 0
end_sec = start_sec + clip_duration

class VideoDataset(Dataset):
    """
    Dataset yielding ``(video, one_hot_label)`` pairs for labelled clips.

    Args:
        video_folder: Root directory; each clip is looked up at
            ``<video_folder>/<first 19 chars of name>.mp4/<name>.mp4``.
        label_csv: Header-less CSV whose three columns are read as ``Name``,
            ``Type`` and ``Label``; ``Type`` is dropped.
        transform: Optional callable applied to the clip dictionary returned
            by ``EncodedVideo.get_clip``. With ``None`` the clip is left
            untransformed.
        clip_duration: Length, in seconds, of the clip read from the start of
            each video.
    """
    def __init__(self, video_folder, label_csv, transform=None, clip_duration=2):
        self.video_folder = video_folder
        self.data = pd.read_csv(label_csv, header=None, names=['Name', 'Type', 'Label'])
        self.data = self.data.drop(columns=['Type'])
        self.transform = transform
        self.clip_duration = clip_duration  # Duration of video clip in seconds

    def __len__(self):
        """Return the number of rows in the label file."""
        return len(self.data)

    def _video_path(self, name):
        """Return the path of the clip called ``name`` below ``video_folder``."""
        # The parent folder is the first 19 characters of the clip name plus
        # ".mp4", which matches names shaped like "Shoplifting001_x264_<n>".
        return os.path.join(self.video_folder, name[0:19] + ".mp4/" + name + ".mp4")

    @staticmethod
    def _encode_label(label):
        """Return ``[1, 0]`` for a truthy label and ``[0, 1]`` otherwise."""
        return torch.tensor([1, 0]) if label else torch.tensor([0, 1])

    def __getitem__(self, idx):
        """
        Load, cut and transform the clip at row ``idx``.

        Returns:
            ``(video, label)`` where ``video`` is the ``'video'`` entry of the
            (optionally transformed) clip and ``label`` is a 2-element one-hot
            tensor.
        """
        row = self.data.iloc[idx]
        label = self._encode_label(row['Label'])

        video = EncodedVideo.from_path(self._video_path(row['Name']))
        # Sample the clip from the start of the video. The instance's own
        # duration and transform are used, so the constructor arguments take
        # effect instead of whatever the module globals currently hold.
        video_data = video.get_clip(start_sec=0, end_sec=self.clip_duration)

        if self.transform is not None:
            video_data = self.transform(video_data)
        return video_data['video'], label

from torch.utils.data import random_split

dataset = VideoDataset("./DCSASS Dataset/Shoplifting", "./DCSASS Dataset/Labels/Shoplifting.csv", transform, clip_duration)

# Hold out 20% of the clips for evaluation so the test loader never serves a
# sample the classifier was trained on. The seeded generator keeps the split
# identical from one run to the next.
num_test = len(dataset) // 5
num_train = len(dataset) - num_test
train_set, test_set = random_split(
    dataset,
    [num_train, num_test],
    generator=torch.Generator().manual_seed(0),
)

train_loader = DataLoader(train_set, batch_size=10, shuffle=True)
test_loader = DataLoader(test_set, batch_size=1, shuffle=True)

# Number of batches served by each loader.
print(len(train_loader))
print(len(test_loader))

In [None]:
import torch.optim as optim
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fine_tune = fine_tune.to(device)
model = model.to(device)

# Only the new head is handed to the optimizer below, so the pretrained
# network is frozen: autograd then has no reason to track its parameters.
for backbone_param in model.parameters():
    backbone_param.requires_grad_(False)

criterion = CrossEntropyLoss()
optimizer = Adam(fine_tune.parameters(), lr=0.001)

In [None]:
torch.cuda.empty_cache()

# Average training loss of each epoch, plotted once training is over.
epoch_losses = []
num_epochs = 10
num_batches = len(train_loader)

for epoch in range(num_epochs):
    # The pretrained network stays in eval mode; only the head is trained.
    model.eval()
    fine_tune.train()
    running_loss = 0.0

    for i, (inputs, labels) in enumerate(train_loader, start=1):
        # Each batch carries the two pathways as a [slow, fast] pair.
        slow_pathway = inputs[0].to(device)
        fast_pathway = inputs[1].to(device)
        labels = labels.to(device)

        # The optimizer only holds the head's parameters, so no gradient is
        # needed through the pretrained network.
        with torch.no_grad():
            outputs = model([slow_pathway, fast_pathway])

        classification = fine_tune(outputs)

        optimizer.zero_grad()
        # The one-hot labels are passed to the loss as float class probabilities.
        loss = criterion(classification, torch.reshape(labels, classification.shape).float())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Progress is a real percentage and epochs are numbered from 1,
        # matching the end-of-epoch summary below.
        print(f"Epoch {epoch + 1}/{num_epochs} // Progress : {100 * i / num_batches:.1f}% // Loss : {loss.item()}")

    # Calculate the average loss for this epoch and store it
    epoch_loss = running_loss / num_batches
    epoch_losses.append(epoch_loss)

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}")

# Plot the average loss of every epoch after the training loop
plt.plot(range(1, num_epochs + 1), epoch_losses, marker='o')
plt.title('Training Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.show()

In [None]:
# torch.save(fine_tune.state_dict(), 'fine_tune_model.pth')
# fine_tune.load_state_dict(torch.load('fine_tune_model.pth'))

In [None]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
# device="cpu"

model.to(device)
fine_tune.to(device)

model.eval()
fine_tune.eval()

# Maximum number of test samples to evaluate.
num_tests = 100
# A clip is flagged as shoplifting when the raw score of output 0 exceeds
# this threshold (threshold technique).
score_threshold = -1

correct_positive = 0
correct_negative = 0
test = 0
false_positive = 0
false_negative = 0

# Evaluation only: no gradients are needed.
with torch.no_grad():
    for test, (inputs, labels) in enumerate(test_loader, start=1):
        slow_pathway = inputs[0].to(device)
        fast_pathway = inputs[1].to(device)
        labels = labels.to(device)
        outputs = model([slow_pathway, fast_pathway])
        classification = fine_tune(outputs)

        # The test loader uses batch_size=1, so only sample 0 is inspected.
        # Element 0 of the one-hot label is the "shoplifting" entry.
        shoplifting_label = bool(labels[0][0])
        classification_label = bool(classification[0][0] > score_threshold)

        correct_positive += shoplifting_label and classification_label
        correct_negative += not shoplifting_label and not classification_label
        false_positive += not shoplifting_label and classification_label
        false_negative += shoplifting_label and not classification_label

        print(test, shoplifting_label, classification_label, " // ACCURACY TRACKING : ", correct_positive, correct_negative, false_positive, false_negative)

        # Stop after exactly num_tests samples (the hard-coded check used to
        # stop one sample short, at 99).
        if test >= num_tests:
            break