In [2]:
import os
import warnings

from vivit import ViViT

import numpy as np
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torchvision.io as io # for reading video files
import torchvision.transforms as transforms

In [3]:
# Run on the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if device.type == "cuda":
    # CUDA allocator / cuBLAS workspace settings, exported through the environment.
    os.environ.update({
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
    })
    # Turn cuDNN on and enable its benchmark mode.
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    # Silence every Python warning for the rest of the session (GPU runs only).
    warnings.filterwarnings("ignore")

print("Using device: ", device)

Using device:  cuda


In [4]:
# Shared hyper-parameters / settings looked up by key in the cells below.
params = dict(
    batch_size=1,           # samples per DataLoader batch
    epochs=50,              # upper bound on training epochs
    lr=0.001,               # optimizer learning rate
    momentum=0.9,           # not read by the code visible in this notebook section
    log_interval=10,        # not read by the code visible in this notebook section
    # pin_memory=True,
    patience=20,            # epochs without validation improvement before early stopping
    frame_size=(224, 224),  # (height, width) every frame is resized to
)

# Data Preprocessing

In [6]:
class CustomDataSet(Dataset):
    """In-memory dataset of fixed-length ``.mp4`` clips labelled with a 4-digit PIN.

    Every ``.mp4`` file in ``root_dir`` is decoded eagerly in ``__init__``. The
    label is taken from the first four characters of the file name, each parsed
    as one digit, and stored as a float tensor of length 4. Clips whose frame
    count differs from ``num_frames`` are skipped.

    Args:
        frame_size: Target size handed to ``transforms.Resize``.
        root_dir: Directory that is scanned for ``.mp4`` files.
        num_frames: Exact number of frames a clip must have to be kept.

    Raises:
        ValueError: If one of the first four characters of a file name is not
            a digit.
    """

    def __init__(self, frame_size, root_dir='data/videos/', num_frames=78):
        self.root_dir = root_dir
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # sample order (and therefore any seeded split) stable between runs.
        self.files = sorted(f for f in os.listdir(self.root_dir) if f.endswith('.mp4'))

        # Define the transforms
        self.transforms = transforms.Compose([
            transforms.Resize(frame_size),
        ])

        self.data = []

        for file in tqdm(self.files):
            # Read the video file; audio and metadata are not used.
            video, _audio, _info = io.read_video(
                os.path.join(self.root_dir, file), pts_unit='sec', output_format='tchw'
            )
            # Get the pin from the filename (first 4 characters, one digit each)
            pin = torch.tensor([int(x) for x in file[0:4]], dtype=torch.float)

            # Skip clips of the wrong length *before* the float conversion so
            # no work or memory is spent on data that is thrown away.
            if video.shape[0] != num_frames:
                continue

            # read_video already returns a tensor: convert the dtype instead of
            # copy-constructing with torch.tensor(), which warns and is discouraged.
            video = video.to(torch.float32)

            # Apply the transforms and keep the (video, pin) pair in memory
            video = self.transforms(video)
            self.data.append((video, pin))

    def __len__(self):
        """Return the number of clips that were kept."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(video, pin)`` pair stored at position ``idx``."""
        video, pin = self.data[idx]
        return video, pin
    
# Create the datasets: load everything once, then split 80 % / 20 %.
dataset = CustomDataSet(params['frame_size'])
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split so the same clips land in train/test on every run; without a
# generator random_split draws from the global RNG and the split changes each time.
split_generator = torch.Generator().manual_seed(params.get("seed", 42))
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=split_generator
)

100%|██████████| 831/831 [02:34<00:00,  5.36it/s]


In [7]:
# Shuffle only the training data. The evaluation loss is an average over all
# batches, so shuffling the held-out set adds nondeterminism without any benefit.
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)

# Training and Testing Loop


In [8]:
def train(model, dataloader, criterion, optimizer):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module to optimise; it is switched to training mode.
        dataloader: Iterable of ``(videos, labels)`` batches that supports ``len()``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that owns the model's parameters.

    Returns:
        float: Sum of the per-batch losses divided by ``len(dataloader)``.

    Note:
        No accuracy is computed here. Comparing ``outputs.argmax(1)`` (shape
        ``(B,)``) against multi-column labels (shape ``(B, 4)``) only broadcasts
        for ``B`` equal to 1 or 4 and raises a size-mismatch RuntimeError for
        any other batch size; the value was never returned either.
    """
    model.train()
    running_loss = 0.0

    for videos, labels in tqdm(dataloader, desc="Training", leave=False):
        videos = videos.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Drop old gradients by setting them to None instead of zero-filling.
        optimizer.zero_grad(set_to_none=True)

        # Forward pass
        outputs = model(videos)

        # Calculate loss
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(dataloader)
    return avg_loss

In [9]:
def validate(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` without gradients; return the mean batch loss.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        dataloader: Iterable of ``(videos, labels)`` batches that supports ``len()``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        float: Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        progress = tqdm(dataloader, desc="Validation", leave=False)
        for batch_videos, batch_labels in progress:
            # Move the batch onto the compute device.
            inputs = batch_videos.to(device, non_blocking=True)
            targets = batch_labels.to(device, non_blocking=True)

            predictions = model(inputs)
            batch_losses.append(criterion(predictions, targets).item())

    # Average over the number of batches, not the number of samples.
    return sum(batch_losses, 0.0) / len(dataloader)

In [10]:
def run_epoch(model, train_loader, val_loader, criterion, optimizer, history):
    """Train with early stopping and record per-epoch losses in ``history``.

    Epochs run from ``history["last_epoch"] + 1`` up to and including
    ``params["epochs"]``. Training stops early once the validation loss has
    failed to improve for ``params["patience"]`` consecutive epochs.

    Args:
        model: Module to train.
        train_loader: Batches passed to ``train``.
        val_loader: Batches passed to ``validate``.
        criterion: Loss callable forwarded to ``train`` and ``validate``.
        optimizer: Optimizer forwarded to ``train``.
        history: Dict with the keys ``"train_loss"`` and ``"val_loss"`` (lists),
            ``"last_epoch"`` (int) and ``"best_weights"``; mutated in place.

    Returns:
        tuple: ``(model, history)``. ``model`` holds the weights of the last
        epoch that ran; the best ones are in ``history["best_weights"]``.
    """
    best_val_loss = float("inf")
    # Initialised up front: if the very first validation loss is not an
    # improvement (e.g. NaN), incrementing an unassigned local would raise
    # UnboundLocalError.
    epochs_without_improvement = 0

    for epoch in range(history["last_epoch"] + 1, params["epochs"] + 1):
        train_loss = train(
            model,
            train_loader,
            criterion,
            optimizer,
        )
        val_loss = validate(model, val_loader, criterion)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back references to the live parameter tensors,
            # which later optimizer steps overwrite in place. Clone them so the
            # snapshot really is the best epoch, not simply the latest one.
            history["best_weights"] = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        # Record and report the epoch before the early-stopping check so the
        # final epoch is not missing from the history.
        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["last_epoch"] = epoch

        print(f'Epoch:              {epoch}/{params["epochs"]}')
        print(f"Training Loss:      {train_loss:.4f}")
        print(f"Validation Loss:    {val_loss:.4f}")
        print(f'Learning rate:      {optimizer.param_groups[0]["lr"]:.7f}')

        if epochs_without_improvement >= params["patience"]:
            print(f"Early stopping at epoch {epoch}")
            break

    return (model, history)

In [13]:
# Model, loss and optimizer. The labels are four PIN digits, so the network has
# four outputs and is trained as a regression with mean-squared error.
model = ViViT(image_size=params["frame_size"][0], patch_size=16, num_classes=4, num_frames=78).to(device)
criterion = nn.MSELoss()
scaler = torch.cuda.amp.GradScaler()  # NOTE(review): created but never passed to train()
optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])

# NOTE(review): the names below are not read by any code visible in this
# section (run_epoch keeps its own counters); kept in case later cells use them.
patience = 0

best_val_loss = float("inf")
besst_weights = None

train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

# Bind the history dict *before* the try block: run_epoch mutates it in place,
# so the losses and best weights recorded so far survive a KeyboardInterrupt.
history = {
    "train_loss": [],
    "val_loss": [],
    "val_accuracy": [],
    "last_epoch": 0,
    "best_weights": None,
}

try:
    model, history = run_epoch(
        model,
        train_loader,
        test_loader,
        criterion,
        optimizer,
        history=history,
    )

except KeyboardInterrupt:
    print("Training interrupted")

                                                 

RuntimeError: The size of tensor a (2) must match the size of tensor b (4) at non-singleton dimension 1