In [1]:
import os
import warnings

import numpy as np
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torchvision.io as io # for reading video files
import torchvision.transforms as transforms

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.backends.cudnn.enabled = True #Enable cuDNN
    torch.backends.cudnn.benchmark = True #Enable cuDNN benchmark for the best performance
    warnings.filterwarnings("ignore") #I realy don't care that you broke your elbow

print("Using device: ", device)

Using device:  cpu


In [3]:
params = {
    "batch_size": 1,
    "epochs": 50,
    "lr": 0.001,
    "momentum": 0.9,
    "log_interval": 10,
    # "pin_memory": True,
    "patience": 20,
    "frame_size": (224, 224),
}

# Model Architecture

The goal here is to create an embedding for each frame, add a positional encoding and then pass it through a transformer encoder. The output of the transformer encoder is then passed through a linear layer to get the final output.

In [4]:
class VidModel(nn.Module):
    def __init__(self, frame_dims : tuple, embedding_size : int):
        super(VidModel, self).__init__()
        #Input shape is (b, t, h, w, c)

        #Setting up the frame embeddings
        self.embed = nn.Sequential(
            # nn.Conv2d(3, 16, 7, 2, 3, bias=False),
            nn.ConvTranspose2d(3, 16, 7, 2, 3, bias=False),
            nn.ReLU(),
            nn.MaxPool2d(3, 2, 1),
            # nn.Conv2d(16, 32, 5, 1, 2, bias=False),
            nn.ConvTranspose2d(16, 32, 5, 1, 2, bias=False),
            nn.ReLU(),
            nn.MaxPool2d(3, 2, 1),
            # nn.Conv2d(32, 64, 3, 1, 1, bias=False),
            nn.ConvTranspose2d(32, 64, 3, 1, 1, bias=False),
            nn.ReLU(),
            nn.MaxPool2d(3, 2, 1),
            # nn.Conv2d(64, 128, 3, 1, 1, bias=False),
            nn.ConvTranspose2d(64, 128, 3, 1, 1, bias=False),
            nn.ReLU(),
            nn.MaxPool2d(3, 2, 1),

            nn.Flatten(),

            nn.Linear(100352, embedding_size),
        )


        #Shape of the embeddings is (1, embedding_size)

        #Apply a positional encoding to the embeddings
        #TODO: Add positional encoding

        #Self attention layer for the embeddings
        self.self_attn = nn.TransformerEncoderLayer(d_model=embedding_size, nhead=1, dim_feedforward=2048, dropout=0.1, activation='relu', batch_first=True)
        
        #Fully connected layer to get the final output
        self.fc = nn.Linear(embedding_size, 4)


    def forward(self, x):
        # x is of shape (b, t, c, h, w)
        b = x.shape[0]
        t = x.shape[1]
        
        # Transpose it to (b*t, c, h, w)
        x = x.view(b*t, x.shape[2], x.shape[3], x.shape[4])
        
        x = self.embed(x)
        
        #Convert back to (b, t, embedding_size)
        x = x.view(b, t, -1)
        
        #Apply self attention
        x = self.self_attn(x)
        
        #Apply the final fully connected layer
        x = self.fc(x)
        
        return x

    
model = VidModel(params['frame_size'], 256)
print(model)

VidModel(
  (embed): Sequential(
    (0): ConvTranspose2d(3, 16, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (3): ConvTranspose2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), bias=False)
    (4): ReLU()
    (5): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (6): ConvTranspose2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (7): ReLU()
    (8): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (9): ConvTranspose2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (10): ReLU()
    (11): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (12): Flatten(start_dim=1, end_dim=-1)
    (13): Linear(in_features=100352, out_features=256, bias=True)
  )
  (self_attn): TransformerEncoderLayer(
    (self_attn): MultiheadAt

# Data Preprocessing

In [11]:
class CustomDataSet(Dataset):
    def __init__(self, frame_size):
        self.root_dir = '../data/videos/'
        # Get the list of all files in directory that end in .mp4
        self.files = [f for f in os.listdir(self.root_dir) if f.endswith('.mp4')]

        # Define the transforms
        self.transforms = transforms.Compose([
            transforms.Resize(frame_size),
        ])

        self.data = []

        for file in tqdm(self.files):
            # Read the video file
            video, audio, info = io.read_video(self.root_dir + file, pts_unit='sec', output_format='tchw')
            pin = [int(x) for x in file[0:4]] # Get the pin from the filename (first 4 characters)
            pin = torch.tensor(pin, dtype=torch.int)
            video = torch.tensor(video, dtype=torch.float32)
            
            #If the video is too short, skip it
            if video.shape[0] != 78:
                continue

            # Apply the transforms
            video = self.transforms(video)
            # Append the video to the data
            self.data.append((video, pin))      

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # Return the video and the pin
        return self.data[idx][0], self.data[idx][1]
    
# Create the datasets
dataset = CustomDataSet(params['frame_size'])
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

100%|██████████| 831/831 [02:43<00:00,  5.08it/s]


In [12]:
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=True)

# Training and Testing Loop


In [13]:
def train(model, dataloader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    total_correct = 0

    for videos, labels in tqdm(dataloader, desc="Training", leave=False):
        videos = videos.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # optimizer.zero_grad()
        #This is a bit faster than the above line
        for param in model.parameters():
            param.grad = None

        # Forward pass
        outputs = model(videos)

        # Calculate loss
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        total_correct += (outputs.argmax(1) == labels).type(torch.float).sum().item()
        running_loss += loss.item()

    avg_loss = running_loss / len(dataloader)
    return avg_loss

In [14]:
def validate(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    total_correct = 0

    with torch.no_grad():
        for videos, labels in tqdm(dataloader, desc="Validation", leave=False):
            
            imgs = imgs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            outputs = model(videos)
            loss = criterion(outputs, labels)

            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [15]:
def run_epoch(model, train_loader, val_loader, criterion, optimizer, history):
    best_val_loss = float("inf")

    for epoch in range(history["last_epoch"] + 1, params["epochs"] + 1):
        train_loss = train(
            model,
            train_loader,
            criterion,
            optimizer,
        )
        val_loss = validate(model, val_loader, criterion)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            history["best_weights"] = model.state_dict()
            patience = 0
        else:
            patience += 1

        if patience >= params["patience"]:
            print(f"Early stopping at epoch {epoch}")
            break

        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["last_epoch"] = epoch

        print(f'Epoch:              {epoch}/{params["epochs"]}')
        print(f"Training Loss:      {train_loss:.4f}")
        print(f"Validation Loss:    {val_loss:.4f}")
        print(f'Learning rate:      {optimizer.param_groups[0]["lr"]:.7f}')

    return (model, history)

In [18]:
model = model.to(device)
criterion = nn.MSELoss()
scaler = torch.cuda.amp.GradScaler()
optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])

best_val_loss = float("inf")
besst_weights = None

train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

try:
    model, history = run_epoch(
        model,
        train_loader,
        test_loader,
        criterion,
        optimizer,
        history={
            "train_loss": [],
            "val_loss": [],
            "val_accuracy": [],
            "last_epoch": 0,
            "best_weights": None,
        },
    )

except KeyboardInterrupt:
    print("Training interrupted")

Training:   0%|          | 0/230 [00:00<?, ?it/s]

                                                 

RuntimeError: shape '[16, -1, 256]' is invalid for input of size 39936