In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import shutil
import os

# Define the source directory (Google Drive) and destination directory (Colab session)
source_zip_path = '/content/drive/MyDrive/COMP411_Project_Datasets/Something-Something-V2/Top-10-Label-Dataset.zip'
destination_zip_path = '/content/Top-10-Label-Dataset.zip'

shutil.copy(source_zip_path, destination_zip_path)

'/content/Top-10-Label-Dataset.zip'

# USE WHEN NEEDED

In [3]:
import shutil
import os

# Define the source directory (Google Drive) and destination directory (Colab session)
source_zip_path = '/content/drive/MyDrive/COMP411_Project_Models/1-layer-models/best_model_epoch_2.pth'
destination_zip_path = '/content/epoch_2_model'

shutil.copy(source_zip_path, destination_zip_path)

'/content/epoch_2_model'

In [4]:
import zipfile
import os

def unzip_folder(zip_file_path, extract_to):
    """
    Unzips a folder to the specified directory.

    Args:
        zip_file_path (str): Path to the zip file.
        extract_to (str): Directory where the contents will be extracted.
    """
    # Ensure the output directory exists
    os.makedirs(extract_to, exist_ok=True)

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
        print(f"Extracted all files to {extract_to}")

In [5]:
# Example usage
zip_file_path1 = '/content/Top-10-Label-Dataset.zip'  # Path to your zip file
extract_to1 = '/content/dataset/'  # Directory to extract files into
unzip_folder(zip_file_path1, extract_to1)


zip_file_path2 = '/content/data_labels.zip'  # Path to your zip file
extract_to2 = '/content/data_labels/'  # Directory to extract files into
unzip_folder(zip_file_path2, extract_to2)

Extracted all files to /content/dataset/
Extracted all files to /content/data_labels/


# LIBRARIES

In [6]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
import time  # Import time module for tracking time.
import torch.multiprocessing as mp
import cv2 # For dividing the video into frames.
from tqdm import tqdm # For better visuals while training the model.
from torch import Tensor
from typing import Optional, Tuple
from PIL import Image

!pip install timm
import timm



# DATASET CLASS

In [7]:
import re
import torch
import numpy as np
import cv2
import os
from torch.utils.data import Dataset
from torchvision import transforms

class TwentyBillionSomethingDataset(Dataset):
    def __init__(self, video_dir, data_set, label_mapping, totalViews=3, target_frames=48,
                 target_size=(224, 224), is_test=False, test_answers_dict=None):
        self.video_dir = video_dir
        self.label_mapping = label_mapping
        self.totalViews = totalViews
        self.target_frames = target_frames
        self.target_size = target_size
        self.is_test = is_test

        # Define transforms for training mode
        self.train_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.RandomHorizontalFlip(p=0.7),  # Increased flip probability
            transforms.ColorJitter(brightness=0.6, contrast=0.6, saturation=0.6, hue=0.2), # Stronger color jitter with hue
            transforms.RandomErasing(p=0.7, scale=(0.05, 0.6)), # Higher chance of larger erasures
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])

        # Define transforms for test mode - only essential preprocessing
        self.test_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        # Get available video IDs
        available_ids = set(int(file.split('.')[0]) for file in os.listdir(video_dir))

        if not is_test:
            self.data_set = [entry for entry in data_set if int(entry["id"]) in available_ids]
        else:
            self.data_set = [
                {"id": video_id, "label": int(label)}
                for video_id, label in test_answers_dict.items()
                if int(video_id) in available_ids
            ]

    def __getitem__(self, idx):
        entry = self.data_set[idx]
        video_id = entry["id"]

        if self.is_test:
            label = entry["label"]
        else:
            template = entry["template"]
            template = re.sub(r'[\[\]]', '', template).strip()
            if template not in self.label_mapping:
                raise KeyError(f"Template '{template}' not found in label_mapping")
            label = int(self.label_mapping[template])

        # Load and process video
        video_path = os.path.join(self.video_dir, f"{video_id}.webm")
        video = self.load_and_transform_video(video_path)

        # Create three identical views
        views = self.create_three_views(video)

        return views, label

    def __len__(self):
        return len(self.data_set)

    def load_and_transform_video(self, video_path):
        """Load video frames and apply transforms frame by frame."""
        cap = cv2.VideoCapture(video_path)
        frames = []

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Preprocess frame
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, self.target_size)

            # Apply appropriate transform based on mode
            transform = self.test_transform if self.is_test else self.train_transform
            frame = transform(frame)  # Returns tensor of shape (C, H, W)
            frames.append(frame)

        cap.release()

        if not frames:
            # Create empty frame tensor with correct dimensions
            frames = torch.zeros((1, 3, *self.target_size), dtype=torch.float32)
        else:
            frames = torch.stack(frames)  # Shape: (T, C, H, W)

        # Handle frame count
        if len(frames) > self.target_frames:
            indices = np.linspace(0, len(frames)-1, self.target_frames, dtype=int)
            frames = frames[indices]
        elif len(frames) < self.target_frames:
            padding = torch.zeros((
                self.target_frames - len(frames),
                3,
                *self.target_size
            ), dtype=frames.dtype)
            frames = torch.cat([frames, padding], dim=0)

        return frames

    def create_three_views(self, video):
        """Create three identical views of the video."""
        views = []
        for _ in range(self.totalViews):  # Loop self.totalViews times to create the views
            views.append(video.clone())
        return views


# TOKENIZER FUNCTION

In [8]:
class VideoTokenizer(nn.Module):
    def __init__(self, tubelet_sizes, embed_dims, input_channels=3):
        super(VideoTokenizer, self).__init__()
        self.tubelet_sizes = tubelet_sizes
        self.embed_dims = embed_dims
        self.input_channels = input_channels

        # Initialize projection layers for each view
        self.view_projs = nn.ModuleList([
            nn.Linear(t * h * w * input_channels, embed_dim)
            for (t, h, w), embed_dim in zip(tubelet_sizes, embed_dims)
        ])

    def calculate_padding(self, dimension_size, tubelet_size):
        """Calculate padding to make the dimension divisible by the tubelet size."""
        if dimension_size % tubelet_size == 0:
            return 0
        return tubelet_size - (dimension_size % tubelet_size)

    def pad_tensor(self, tensor, padding):
        """Apply padding dynamically to time, height, and width dimensions."""
        t_pad, h_pad, w_pad = padding
        return nn.functional.pad(tensor, (0, 0, 0, w_pad, 0, h_pad, 0, t_pad))

    def create_positional_encoding(self, num_t, num_spatial_tokens, embed_dim):
        """Create separate positional encodings for temporal and spatial tokens."""
        # Temporal positional encoding
        temporal_pos = torch.arange(0, num_t, dtype=torch.float).unsqueeze(1)
        div_term_temporal = torch.exp(torch.arange(0, embed_dim // 2, dtype=torch.float) *
                                     (-torch.log(torch.tensor(10000.0)) / (embed_dim // 2)))
        pe_temporal = torch.zeros(num_t, embed_dim)
        pe_temporal[:, 0::2] = torch.sin(temporal_pos * div_term_temporal)
        pe_temporal[:, 1::2] = torch.cos(temporal_pos * div_term_temporal) # Corrected typo here

        # Spatial positional encoding
        spatial_pos = torch.arange(0, num_spatial_tokens, dtype=torch.float).unsqueeze(1)
        div_term_spatial = torch.exp(torch.arange(0, embed_dim // 2, dtype=torch.float) *
                                     (-torch.log(torch.tensor(10000.0)) / (embed_dim // 2)))
        pe_spatial = torch.zeros(num_spatial_tokens, embed_dim)
        pe_spatial[:, 0::2] = torch.sin(spatial_pos * div_term_spatial)
        pe_spatial[:, 1::2] = torch.cos(spatial_pos * div_term_spatial)

        return pe_temporal, pe_spatial


    def forward(self, views):
        """
        Args:
            views: List of tensors, each with shape (batch_size, frames, height, width, channels)

        Returns:
            all_tokens: List of tensors with shape (batch_size, num_t, num_spatial_tokens, embed_dim)
        """
        assert len(views) == len(self.tubelet_sizes)

        all_tokens = []
        for view, tubelet_size, embed_dim, view_proj in zip(views, self.tubelet_sizes, self.embed_dims, self.view_projs):
            t, h, w = tubelet_size
            view = view.float()
            # Reshape to get the correct dimensions in order.
            reshaped_view = view.permute(0, 1, 4, 3, 2)
            batch_size, frames, height, width, channels = reshaped_view.shape

            # Calculate and apply padding
            t_pad = self.calculate_padding(frames, t)
            h_pad = self.calculate_padding(height, h)
            w_pad = self.calculate_padding(width, w)
            padded_view = self.pad_tensor(reshaped_view, (t_pad, h_pad, w_pad))
            _, padded_frames, padded_height, padded_width, _ = padded_view.shape

            # Calculate dimensions
            num_t = padded_frames // t  # Temporal tokens
            num_spatial_tokens = (padded_height // h) * (padded_width // w)

            # Reshape for tubelet extraction
            tubelets = padded_view.reshape(
                batch_size,
                num_t, t,  # Temporal dimension
                padded_height // h, h,
                padded_width // w, w,
                channels
            )


            # Flatten spatial tubelets
            tubelets = tubelets.reshape(
                batch_size,
                num_t,
                num_spatial_tokens,  # Combine height and width tubelets
                t * h * w * channels  # Features per tubelet
            )

            #print("Before projection tubelets.shape: ", tubelets.shape)
            # Apply projection layer
            tokens = view_proj(tubelets)  # Shape: (batch_size, num_t, num_spatial_tokens, embed_dim)
            #print("After projection tubelets.shape: ", tokens.shape)

            # Create and add positional encodings
            pe_temporal, pe_spatial = self.create_positional_encoding(num_t, num_spatial_tokens, embed_dim)

            # Add positional encodings with correct broadcasting
            tokens = tokens + pe_temporal.unsqueeze(0).unsqueeze(2).to(tokens.device) + pe_spatial.unsqueeze(0).unsqueeze(1).to(tokens.device)

            all_tokens.append(tokens)

        return all_tokens

# LAYER ATTENTION

In [9]:
class LayerAttention(nn.Module):
    def __init__(
        self,
        embed_dim_i: int,
        num_heads: int = 8,
        dropout: float = 0.1,
        batch_first: bool = True
    ):
        super().__init__()

        assert embed_dim_i % num_heads == 0, f"embed_dim_i ({embed_dim_i}) must be divisible by num_heads ({num_heads})"

        self.embed_dim_i = embed_dim_i
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim_i // num_heads

        # Linear projections
        self.k_proj = nn.Linear(embed_dim_i, embed_dim_i)
        self.v_proj = nn.Linear(embed_dim_i, embed_dim_i)
        self.W_q_proj = nn.Linear(embed_dim_i, embed_dim_i)
        self.out_proj = nn.Linear(embed_dim_i, embed_dim_i)

    def forward(
        self,
        view_i: Tensor,
        key_padding_mask: Optional[Tensor] = None,
        need_weights: bool = False
    ) -> Tuple[Tensor, Optional[Tensor]]:

        batch_size, seq_len, _ = view_i.shape

        # Project queries, keys, and values
        q = self.W_q_proj(view_i)
        k = self.k_proj(view_i)
        v = self.v_proj(view_i)

        # Reshape for multi-head attention
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if key_padding_mask is not None:
            attn_scores = attn_scores.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2), float('-inf')
            )

        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)

        # Weighted sum of values
        attn_output = torch.matmul(attn_weights, v)

        # Reshape and project output
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            batch_size, seq_len, self.embed_dim_i
        )
        output = self.out_proj(attn_output)

        if need_weights:
            return output, attn_weights
        return output, None

# CROSS VIEW ATTENTION PART

In [10]:
# All cros-view attentions' dropout=0.1(Subject to change).
class CrossViewAttention(nn.Module):
    def __init__(
        self,
        embed_dim_i: int,
        embed_dim_i_plus_1: int,
        num_heads: int = 6,
        dropout: float = 0.1,
        batch_first: bool = True
    ):
        """
        Initialize Cross-View Attention module.

        Args:
            embed_dim_i (int): Embedding dimension of view i
            embed_dim_i_plus_1 (int): Embedding dimension of view i+1
            num_heads (int): Number of attention heads
            dropout (float): Dropout probability
            batch_first (bool): If True, input shape is (batch, seq, feature)
        """
        super().__init__()

        assert embed_dim_i % num_heads == 0, f"embed_dim_i ({embed_dim_i}) must be divisible by num_heads ({num_heads})"

        self.embed_dim_i = embed_dim_i
        self.embed_dim_i_plus_1 = embed_dim_i_plus_1
        self.num_heads = num_heads
        self.dropout = dropout
        self.batch_first = batch_first
        self.head_dim = embed_dim_i // num_heads
        self.scaling = self.head_dim ** -0.5

        # Linear projection for K and V
        self.k_proj = nn.Linear(embed_dim_i_plus_1, embed_dim_i)
        self.v_proj = nn.Linear(embed_dim_i_plus_1, embed_dim_i)

        # Linear projections
        self.W_q_proj = nn.Linear(embed_dim_i, embed_dim_i)
        self.W_k_proj = nn.Linear(embed_dim_i, embed_dim_i)
        self.W_v_proj = nn.Linear(embed_dim_i, embed_dim_i)
        self.out_proj = nn.Linear(embed_dim_i, embed_dim_i)

        # Dropout for attention weights
        self.dropout_layer = nn.Dropout(dropout)

        # Layer normalization for both views
        self.norm_i = nn.LayerNorm(embed_dim_i)
        self.norm_i_plus_1 = nn.LayerNorm(embed_dim_i_plus_1)

    def forward(
        self,
        view_i: Tensor,
        view_i_plus_1: Tensor,
        key_padding_mask: Optional[Tensor] = None,
        need_weights: bool = False
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """
        Apply cross-view attention between two views.

        Args:
            view_i (Tensor): Tokens from view i, shape (batch_size, seq_len_i, embed_dim_i)
            view_i_plus_1 (Tensor): Tokens from view i+1, shape (batch_size, seq_len_i_plus_1, embed_dim_i_plus_1)
            key_padding_mask (Optional[Tensor]): Mask for padding tokens
            need_weights (bool): If True, returns attention weights

        Returns:
            Tuple[Tensor, Optional[Tensor]]: (attended_tokens, attention_weights if need_weights else None)
        """
        view_i = self.norm_i(view_i)
        view_i_plus_1 = self.norm_i_plus_1(view_i_plus_1)

        batch_size, seq_len_i, _ = view_i.shape
        _, seq_len_i_plus_1, _ = view_i_plus_1.shape

        k_projected = self.k_proj(view_i_plus_1)
        v_projected = self.v_proj(view_i_plus_1)

        # Project to queries, keys, and values
        q = self.W_q_proj(view_i)
        k = self.W_k_proj(k_projected)
        v = self.W_v_proj(v_projected)

        # Reshape for multi-head attention
        q = q.view(batch_size, seq_len_i, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len_i_plus_1, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len_i_plus_1, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * self.scaling

        # Apply key padding mask if provided
        if key_padding_mask is not None:
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float('-inf')
            )

        attn_weights = torch.softmax(attn_weights, dim=-1)
        attn_weights = self.dropout_layer(attn_weights)

        # Apply attention weights to values
        attn_output = torch.matmul(attn_weights, v)

        # Reshape and project output
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            batch_size, seq_len_i, self.embed_dim_i
        )
        output = self.out_proj(attn_output)

        # Residual connection
        output = output + view_i

        if need_weights:
            return output, attn_weights
        return output, None

# VIEW LAYER

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
# Default dropout is 0.1 but given different dropout values for each view. Views have different complexities.
class ViewLayer(nn.Module):
    def __init__(self, d_model, dropout=0.1, cross_attention_module = None,
                 crossAttentionCalculationNeeded = False, MLP_dim = 256, attention_head_num=8):
        """
        Args:
            d_model: Input feature dimension
            dim_feedforward: Dimension of feedforward MLP
            dropout: Dropout probability
        """
        super(ViewLayer, self).__init__()

        self.dropout_value = dropout

        self.crossAttentionCalculationNeeded = crossAttentionCalculationNeeded
        self.self_attention = LayerAttention(embed_dim_i=d_model, dropout=self.dropout_value, num_heads=attention_head_num)  # Self-attention
        self.cross_attention = cross_attention_module

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)  # For FFN residual

        self.linear1 = nn.Linear(d_model, MLP_dim)  # First linear layer
        self.linear2 = nn.Linear(MLP_dim, d_model)  # Second linear layer

        self.dropout1 = nn.Dropout(self.dropout_value)
        self.dropout2 = nn.Dropout(self.dropout_value)

        self.mid_processed_tokens = None  # Store intermediate tokens


    def forward(self, src, mid_tokens_from_other_view=None, src_mask=None):
        """
        Args:
            src: Input tensor (batch_size, seq_len, d_model)
            src_mask: Optional attention mask
        """
        src2 = self.norm1(src)
        src2, _ = self.self_attention(src2) # Self attention. Only takes one input for self-attention.
        src = src + self.dropout1(src2)


        self.mid_processed_tokens = src # Store intermediate tokens

        # Cross-Attention (if needed)
        if self.crossAttentionCalculationNeeded and mid_tokens_from_other_view is not None:
            src2, _ = self.cross_attention(self.norm1(self.mid_processed_tokens), mid_tokens_from_other_view) # view_i and view_i_plus_one
            src = src + self.dropout1(src2)

        src2 = self.norm2(src)
        src3 = self.linear2(F.relu(self.linear1(src2)))
        src = src + self.dropout2(src3)

        return src

    def get_mid_processed_tokens(self):
      return self.mid_processed_tokens

    def load_pretrained_weights_for_layer(self, weights):
        """
        Load pretrained weights from a ViT model into the ViewLayer with comprehensive error checking.

        Args:
            weights (dict): Dictionary containing pretrained weights from ViT layer

        Raises:
            ValueError: If weights are missing or dimensions don't match
            RuntimeError: If tensor device or dtype mismatches occur
        """
        required_keys = {
            'self_attn.qkv.weight', 'self_attn.qkv.bias',
            'self_attn.proj.weight', 'self_attn.proj.bias',
            'norm1.weight', 'norm1.bias',
            'norm2.weight', 'norm2.bias',
            'mlp.fc1.weight', 'mlp.fc1.bias',
            'mlp.fc2.weight', 'mlp.fc2.bias'
        }

        # Check if all required weights are present
        missing_keys = required_keys - set(weights.keys())
        if missing_keys:
            raise ValueError(f"Missing required weights: {missing_keys}")

        # Get model dimensions for validation
        embed_dim = self.self_attention.embed_dim_i
        mlp_dim = self.linear1.out_features  # MLP hidden dimension

        try:
            # Validate QKV dimensions
            qkv_weight = weights['self_attn.qkv.weight']
            qkv_bias = weights['self_attn.qkv.bias']

            expected_qkv_weight_shape = (3 * embed_dim, embed_dim)
            expected_qkv_bias_shape = (3 * embed_dim,)

            if qkv_weight.shape != expected_qkv_weight_shape:
                raise ValueError(
                    f"QKV weight shape mismatch. Expected {expected_qkv_weight_shape}, "
                    f"got {qkv_weight.shape}"
                )
            if qkv_bias.shape != expected_qkv_bias_shape:
                raise ValueError(
                    f"QKV bias shape mismatch. Expected {expected_qkv_bias_shape}, "
                    f"got {qkv_bias.shape}"
                )

            # Split and validate Q, K, V shapes
            qkv_weight = qkv_weight.reshape(3, embed_dim, embed_dim)
            qkv_bias = qkv_bias.reshape(3, embed_dim)

            # Validate projection dimensions
            if weights['self_attn.proj.weight'].shape != (embed_dim, embed_dim):
                raise ValueError(
                    f"Projection weight shape mismatch. Expected ({embed_dim}, {embed_dim}), "
                    f"got {weights['self_attn.proj.weight'].shape}"
                )

            # Validate norm dimensions
            for norm_key in ['norm1.weight', 'norm1.bias', 'norm2.weight', 'norm2.bias']:
                if weights[norm_key].shape != (embed_dim,):
                    raise ValueError(
                        f"Norm weight/bias shape mismatch for {norm_key}. "
                        f"Expected ({embed_dim},), got {weights[norm_key].shape}"
                    )

            # Validate MLP dimensions
            if weights['mlp.fc1.weight'].shape != (mlp_dim, embed_dim):
                raise ValueError(
                    f"MLP fc1 weight shape mismatch. Expected ({mlp_dim}, {embed_dim}), "
                    f"got {weights['mlp.fc1.weight'].shape}"
                )
            if weights['mlp.fc2.weight'].shape != (embed_dim, mlp_dim):
                raise ValueError(
                    f"MLP fc2 weight shape mismatch. Expected ({embed_dim}, {mlp_dim}), "
                    f"got {weights['mlp.fc2.weight'].shape}"
                )

            # Check devices and dtypes
            device = self.self_attention.W_q_proj.weight.device
            dtype = self.self_attention.W_q_proj.weight.dtype

            # Load weights with device and dtype validation
            def load_tensor(tensor, target):
                if tensor.device != device:
                    tensor = tensor.to(device)
                if tensor.dtype != dtype:
                    tensor = tensor.to(dtype)
                target.data.copy_(tensor)

            # Load Q, K, V weights and biases
            load_tensor(qkv_weight[0], self.self_attention.W_q_proj.weight)
            load_tensor(qkv_weight[1], self.self_attention.k_proj.weight)
            load_tensor(qkv_weight[2], self.self_attention.v_proj.weight)

            load_tensor(qkv_bias[0], self.self_attention.W_q_proj.bias)
            load_tensor(qkv_bias[1], self.self_attention.k_proj.bias)
            load_tensor(qkv_bias[2], self.self_attention.v_proj.bias)

            # Load projection weights
            load_tensor(weights['self_attn.proj.weight'], self.self_attention.out_proj.weight)
            load_tensor(weights['self_attn.proj.bias'], self.self_attention.out_proj.bias)

            # Load normalization weights
            load_tensor(weights['norm1.weight'], self.norm1.weight)
            load_tensor(weights['norm1.bias'], self.norm1.bias)
            load_tensor(weights['norm2.weight'], self.norm2.weight)
            load_tensor(weights['norm2.bias'], self.norm2.bias)

            # Load MLP weights
            load_tensor(weights['mlp.fc1.weight'], self.linear1.weight)
            load_tensor(weights['mlp.fc1.bias'], self.linear1.bias)
            load_tensor(weights['mlp.fc2.weight'], self.linear2.weight)
            load_tensor(weights['mlp.fc2.bias'], self.linear2.bias)

        except RuntimeError as e:
            raise RuntimeError(f"Error during weight loading: {str(e)}")
        except Exception as e:
            raise ValueError(f"Unexpected error during weight loading: {str(e)}")

# GLOBAL ENCODER PART

In [12]:
import torch
import torch.nn as nn

class GlobalEncoder(nn.Module):
    def __init__(self, embed_dim, num_layers):
        super(GlobalEncoder, self).__init__()
        self.global_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=8, dim_feedforward=embed_dim*4, dropout=0.1)
            for _ in range(num_layers)
        ])

    def forward(self, tokens):
        """
        Parameters:
            tokens: Input tensor of shape (batch_size, seq_length, embed_dim) or (seq_length, batch_size, embed_dim)
        Returns:
            output: Tensor of same shape as input
        """
        # Check if input needs to be transposed
        if tokens.dim() == 3 and tokens.size(2) == self.global_layers[0].linear1.in_features:
            # Input is (batch_size, seq_length, embed_dim)
            # Transform to (seq_length, batch_size, embed_dim)
            tokens = tokens.transpose(0, 1)
            need_transpose = True
        else:
            need_transpose = False

        # Create attention mask for padding if needed
        # Assuming no padding mask for now, but you can add:
        # mask = torch.ones((tokens.size(0), tokens.size(0))).bool().to(tokens.device)

        # Process through transformer layers
        for layer in self.global_layers:
            tokens = layer(tokens)

        # Restore original shape if needed
        if need_transpose:
            tokens = tokens.transpose(0, 1)

        return tokens

    @staticmethod # NOT USED. IF NEEDED USE IT.
    def create_pad_mask(seq_length, valid_lens, device):
        """
        Create padding mask for sequences of different lengths

        Parameters:
            seq_length: Maximum sequence length
            valid_lens: Tensor of shape (batch_size,) containing valid lengths of each sequence
            device: Device to create mask on
        Returns:
            mask: Boolean mask tensor of shape (seq_length, seq_length)
        """
        mask = torch.ones((seq_length, seq_length), device=device)
        mask = torch.triu(mask) == 1
        return mask

# MULTIVIEW TRANSFORMER PART

In [13]:
class MultiviewTransformer(nn.Module):
    def __init__(self, device, embed_dims_per_view, MLP_dims_per_view, num_of_head_per_view,
                 view_dropout_list, num_layers_per_view, tubelet_sizes, global_embed_dim,
                 global_num_layers, num_classes, cross_view_connections):
        super(MultiviewTransformer, self).__init__()

        self.tubelet_sizes = tubelet_sizes
        self.embed_dims_per_view = embed_dims_per_view
        self.cross_view_connections = cross_view_connections
        self.num_layers_per_view = num_layers_per_view
        self.view_dropout_list = view_dropout_list

        # Tokenizer for video frames into tubelets
        self.tokenizer = VideoTokenizer(tubelet_sizes, embed_dims_per_view)
        """
        # Initialize transformers for each view (Base, Small, Tiny)
        self.view_transformers = nn.ModuleList([
            ViewTransformer(embed_dim=embed_dim, num_layers=num_layers)
            for embed_dim, num_layers in zip(embed_dims_per_view, num_layers_per_view)
        ])


        nn.TransformerEncoderLayer(d_model=embed_dims_per_view[view], nhead=8,
                                     dim_feedforward=512, dropout=0.1,
                                     batch_first=True)  # Set batch_first=True
        """
        # Initialize Cross-View Attention layers
        """
        self.cross_attention_layers = nn.ModuleList([
            CrossViewAttention(in_dim, out_dim).to(device)
            for in_dim, out_dim in embed_dims_pairs_for_cross_att
            ])
        """
        #print(self.cross_attention_layers)

        # list containing layers of different views as its elements. Do the crossAttentionCalculationNeeded check for cross attention calculations.
        self.total_view_num = len(self.embed_dims_per_view)
        """
        self.cls_token = nn.ParameterList([
            nn.Parameter(torch.zeros(1, 1, embed_dims_per_view[cls_num]))
            for cls_num in range(self.total_view_num)
            ])
        """
        # Better initialization(Kaiming/He)
        self.cls_tokens = nn.ParameterList([
            nn.Parameter(torch.nn.init.kaiming_normal_(
                torch.zeros(1, 1, 1, embed_dims_per_view[cls_num]),  # [1, 1, 1, E]
                mode='fan_out',
                nonlinearity='relu'
                ))
            for cls_num in range(len(embed_dims_per_view))
            ])

        self.view_list = nn.ModuleList()  # Use ModuleList to store layers for each view
        for view in range(self.total_view_num):  # Iterate over views
            layer_list = nn.ModuleList()  # Use ModuleList to store layers for a specific view
            for layer_num in range(num_layers_per_view[view]):
                if layer_num in cross_view_connections and view != (self.total_view_num - 1):
                    layer_list.append(
                        ViewLayer(
                            d_model=embed_dims_per_view[view], dropout=self.view_dropout_list[view], # Setting the dropout for each view layers.
                            cross_attention_module=CrossViewAttention(embed_dims_per_view[view], embed_dims_per_view[view + 1]), # Initialize the cross view.
                            crossAttentionCalculationNeeded=True, MLP_dim=MLP_dims_per_view[view], attention_head_num=num_of_head_per_view[view]
                        ).to(device)
                    )
                else:
                    layer_list.append(
                        ViewLayer(
                            d_model=embed_dims_per_view[view], dropout=self.view_dropout_list[view],
                            crossAttentionCalculationNeeded=False,
                            MLP_dim=MLP_dims_per_view[view], attention_head_num=num_of_head_per_view[view]
                        ).to(device)
                    )
            self.view_list.append(layer_list)  # Add the per-view ModuleList to the overall ModuleList

        # Linear projections to unify embedding dimensions
        self.projections = nn.ModuleList([
            nn.Linear(embed_dim, global_embed_dim) for embed_dim in embed_dims_per_view
        ])

        self.global_encoder = GlobalEncoder(embed_dim=global_embed_dim, num_layers=global_num_layers)

        self.classifier = nn.Sequential(
            nn.Linear(global_embed_dim, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        ).to(device)

    def forward(self, videos):
        # Tokenize video frames into tubelets for each view
        # Returns list of tokens with shape [B, T, HW, E] for each view
        video_tokens_per_view = self.tokenizer(videos)
        reversed_tokens = list(reversed(video_tokens_per_view))

        view_execution_order = list(reversed(self.view_list))

        # Prepare CLS tokens for each view
        concatenated_tokens = []
        for cls_token, tokens in zip(reversed(self.cls_tokens), reversed_tokens):
            B, T, HW, E = tokens.shape
            # Expand CLS token to batch and temporal dimensions
            expanded_cls = cls_token.expand(B, T, 1, E)  # [B, T, 1, E]
            # Concatenate with spatial tokens
            concat_tokens = torch.cat((expanded_cls, tokens), dim=2)  # [B, T, 1+HW, E]
            concatenated_tokens.append(concat_tokens)

        # Process through transformer layers
        tokens = []
        for layer in range(self.num_layers_per_view[0]):
            mid_tokens_view_i_plus_one = None
            layer_tokens = []
            for current_view, input_tokens in zip(view_execution_order, concatenated_tokens):
                # Reshape to sequence form for transformer processing
                B, T, N, E = input_tokens.shape
                reshaped_tokens = input_tokens.reshape(B, T * N, E)

                if layer in self.cross_view_connections:
                    processed_token = current_view[layer](reshaped_tokens, mid_tokens_view_i_plus_one)
                    mid_tokens_view_i_plus_one = current_view[layer].get_mid_processed_tokens()
                else:
                    processed_token = current_view[layer](reshaped_tokens)

                # Reshape back to temporal form
                processed_token = processed_token.reshape(B, T, N, E)
                layer_tokens.append(processed_token)
            tokens.append(layer_tokens)

        new_tokens = list(reversed(tokens[self.num_layers_per_view[0] - 1]))

        # Extract and process temporal CLS tokens
        processed_cls_tokens = []
        for i in range(self.total_view_num):
            # Extract CLS tokens (first token of each temporal step)
            cls_tokens = new_tokens[i][:, :, 0, :]  # Shape: [B, T, E]
            # Project the CLS tokens
            projected_cls = self.projections[i](cls_tokens)  # Shape: [B, T, global_embed_dim]
            processed_cls_tokens.append(projected_cls)

        # Combine temporal CLS tokens from all views
        combined_cls_tokens = torch.cat(processed_cls_tokens, dim=1)  # Shape: [B, T_total, global_embed_dim]

        # Process through global encoder and classifier
        global_encoder_output = self.global_encoder(combined_cls_tokens)
        pooled_output = torch.mean(global_encoder_output, dim=1)
        final_output = self.classifier(pooled_output)

        return final_output

    """
    pretrained_weights_list =
    """
    def usePretrainedWeights(self, pretrained_layer_list_for_each_view):
        for view_config in pretrained_layer_list_for_each_view:
            view_idx = view_config[0]
            layer_mappings = view_config[1]

            for our_layer_num, vit_layer_num in layer_mappings:
                print(f"Loading pretrained weights for view {view_idx}, layer {our_layer_num}")
                pretrained_weights = self.get_pretrained_weights_for_layer_from_ViT(view_idx, vit_layer_num)
                self.view_list[view_idx][our_layer_num].load_pretrained_weights_for_layer(pretrained_weights)

    def get_pretrained_weights_for_layer_from_ViT(self, view_idx, ViT_layer_num):
        """
        Get pretrained weights from the appropriate ViT model based on view size.

        Args:
            view_idx (int): Index of the view (0: Base/12 layers, 1: Small/8 layers, 2: Tiny/4 layers)
            ViT_layer_num (int): Layer number from the ViT model to fetch weights from

        Returns:
            dict: Pretrained weights for the specified layer
        """
        # Define model configurations
        vit_configs = {
            0: {'name': 'vit_base_patch16_224', 'num_layers': 12},  # Largest view uses ViT-Base
            1: {'name': 'vit_small_patch16_224', 'num_layers': 12},  # Middle view uses ViT-Small
            2: {'name': 'vit_tiny_patch16_224', 'num_layers': 12}    # Smallest view uses ViT-Tiny
        }

        if view_idx not in vit_configs:
            raise ValueError(f"Invalid view_idx: {view_idx}. Must be 0 (Base), 1 (Small), or 2 (Tiny)")

        config = vit_configs[view_idx]

        # Validate the requested layer number
        if ViT_layer_num >= config['num_layers']:
            raise ValueError(
                f"Invalid layer number {ViT_layer_num} for ViT-{config['name']}. "
                f"This model only has {config['num_layers']} layers (0-{config['num_layers']-1})"
            )

        # Load the appropriate model
        model = timm.create_model(config['name'], pretrained=True)
        encoder_layer = model.blocks[ViT_layer_num]

        # Extract weights
        weights = {
            'self_attn.qkv.weight': encoder_layer.attn.qkv.weight,
            'self_attn.qkv.bias': encoder_layer.attn.qkv.bias,
            'self_attn.proj.weight': encoder_layer.attn.proj.weight,
            'self_attn.proj.bias': encoder_layer.attn.proj.bias,
            'mlp.fc1.weight': encoder_layer.mlp.fc1.weight,
            'mlp.fc1.bias': encoder_layer.mlp.fc1.bias,
            'mlp.fc2.weight': encoder_layer.mlp.fc2.weight,
            'mlp.fc2.bias': encoder_layer.mlp.fc2.bias,
            'norm1.weight': encoder_layer.norm1.weight,
            'norm1.bias': encoder_layer.norm1.bias,
            'norm2.weight': encoder_layer.norm2.weight,
            'norm2.bias': encoder_layer.norm2.bias
        }

        return weights

# GET THE DATA LABELS

In [14]:
import csv
import json
import os

def load_json(file_path):
    """Utility to load a JSON file."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    with open(file_path, "r") as f:
        return json.load(f)

def load_csv(file_path, delimiter=";"):
    """Utility to load a CSV file."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    with open(file_path, "r") as f:
        reader = csv.reader(f, delimiter=delimiter)
        return list(reader)

def map_test_answers(file_path, label_mapping):
    """Maps test answers from CSV to numerical class IDs."""
    test_answers = {}
    rows = load_csv(file_path)
    for row in rows:
        if len(row) == 2:  # Ensure the row has exactly two elements (video_id and label)
            video_id, label = row
            if label in label_mapping:  # Ensure label exists in the mapping
                test_answers[video_id] = label_mapping[label]
            else:
                print(f"Warning: Label '{label}' not found in label_mapping")
        else:
            print(f"Skipping row due to incorrect format: {row}")
    return test_answers

# Paths to files
base_path = "/content/data_labels/20bn-data-labels/labels"
label_file = os.path.join(base_path, "labels.json")
train_file = os.path.join(base_path, "train.json")
val_file = os.path.join(base_path, "validation.json")
test_file = os.path.join(base_path, "test.json")
test_answers_file = os.path.join(base_path, "test-answers.csv")

# Load label mapping and datasets
label_mapping = load_json(label_file)
train_data = load_json(train_file)
val_data = load_json(val_file)
test_data = load_json(test_file)

# Map test answers to class IDs
test_answers = map_test_answers(test_answers_file, label_mapping)

# Summarize datasets
def summarize_dataset(dataset, keys, num_samples=5):
    """Creates a summary of the first few entries of a dataset."""
    return [{key: entry.get(key, None) for key in keys} for entry in dataset[:num_samples]]

train_summary = summarize_dataset(train_data, ["id", "label", "template"])
val_summary = summarize_dataset(val_data, ["id", "label", "template"])
test_summary = summarize_dataset(test_data, ["id"])
test_answers_summary = {k: test_answers[k] for k in list(test_answers)[:5]}

# Display summaries
print("Train Summary:", train_summary)
print("Validation Summary:", val_summary)
print("Test Summary:", test_summary)
print("Test Answers Sample:", test_answers_summary)

Train Summary: [{'id': '78687', 'label': 'holding potato next to vicks vaporub bottle', 'template': 'Holding [something] next to [something]'}, {'id': '42326', 'label': 'spreading margarine onto bread', 'template': 'Spreading [something] onto [something]'}, {'id': '100904', 'label': 'putting pen on a surface', 'template': 'Putting [something] on a surface'}, {'id': '80715', 'label': 'lifting up one end of bottle, then letting it drop down', 'template': 'Lifting up one end of [something], then letting it drop down'}, {'id': '34899', 'label': 'holding bulb', 'template': 'Holding [something]'}]
Validation Summary: [{'id': '74225', 'label': 'spinning cube that quickly stops spinning', 'template': 'Spinning [something] that quickly stops spinning'}, {'id': '116154', 'label': 'showing clay box on top of wallet', 'template': 'Showing [something] on top of [something]'}, {'id': '198186', 'label': 'wiping words off of a paper', 'template': 'Wiping [something] off of [something]'}, {'id': '13787

# TRAIN CODE

In [15]:
import torch
import torch.nn as nn
import time
from tqdm import tqdm

def train_model(model, dataloader, criterion, optimizer, device,
                current_epoch_num, save_dir, total_num_epochs,
                label_mapping, scheduler=None, accumulation_steps=1):
    """
    Train function with gradient accumulation (normal precision).
    """

    model.to(device)
    model.train()

    epoch_start_time = time.time()
    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    optimizer.zero_grad()  # Move this outside the loop for accumulation

    total_batches = len(dataloader)

    with tqdm(enumerate(dataloader), total=total_batches, desc=f"Epoch {current_epoch_num + 1}/{total_num_epochs}") as pbar:
        for batch_idx, (inputs, targets) in pbar:
            # Move inputs and targets to the device
            inputs = [video.to(device) if isinstance(video, torch.Tensor) else torch.tensor(video).to(device) for video in inputs]
            targets = targets.to(device)

            try:
                # Forward pass and loss computation
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss = loss / accumulation_steps  # Normalize loss by accumulation steps
            except Exception as e:
                print(f"Error in forward pass: {e}")
                raise

            # Perform backward pass
            loss.backward()

            # Clip gradients for stability
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Accumulate gradients for `accumulation_steps` before updating weights
            if (batch_idx + 1) % accumulation_steps == 0 or (batch_idx + 1) == len(dataloader):
                optimizer.step()  # Update weights
                optimizer.zero_grad()  # Reset gradients after step

            # Update running loss
            running_loss += loss.item() * accumulation_steps  # Scale back to original loss

            # Calculate training accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_samples += targets.size(0)

            # Update tqdm progress bar
            pbar.set_postfix({
                'loss': running_loss / (batch_idx + 1),
                'accuracy': (correct_predictions / total_samples) * 100
            })

    # Calculate average loss and accuracy for the epoch
    avg_loss = running_loss / len(dataloader)
    epoch_accuracy = (correct_predictions / total_samples) * 100
    epoch_time = time.time() - epoch_start_time

    print(f"################## Train Epoch Summary [{current_epoch_num + 1}/{total_num_epochs}]##################:")
    print(f"Train Average Loss: {avg_loss:.4f}")
    print(f"Accuracy: {epoch_accuracy:.2f}%")
    print(f"Time: {epoch_time:.2f}s")

    return avg_loss, epoch_accuracy

# TEST CODE

In [16]:
# Test code
from tqdm import tqdm

def test_model(model, dataloader, device, label_mapping=None, criterion=None):
    """
    Evaluate the model's accuracy on the test set (normal precision).

    Args:
        model: The trained PyTorch model.
        dataloader: DataLoader for the test set.
        device: Device to run the model on (e.g., 'cpu' or 'cuda').
        label_mapping (optional): A mapping of class indices to labels.
        criterion (optional): A loss function to compute average loss during evaluation.

    Returns:
        avg_loss: The average loss over the test set.
        accuracy: The overall test accuracy as a percentage.
    """
    model.to(device)
    model.eval()

    correct = 0
    total = 0
    running_loss = 0.0  # Initialize running loss

    total_batches = len(dataloader)

    with torch.no_grad():
        with tqdm(enumerate(dataloader), total=total_batches, desc="Testing") as pbar:
            for batch_idx, (inputs, targets) in pbar:

                # Move inputs and targets to the appropriate device
                if isinstance(inputs, list):  # For multi-view inputs
                    inputs = [video.to(device) for video in inputs]
                else:
                    inputs = inputs.to(device)

                targets = targets.to(device)

                try:
                    # Forward pass
                    outputs = model(inputs)
                except Exception as e:
                    print(f"Error in forward pass: {e}")
                    raise

                # Get class indices of max scores
                _, predicted = torch.max(outputs, dim=1)
                total += targets.size(0)  # Accumulate total samples
                correct += (predicted == targets).sum().item()  # Accumulate correct predictions

                if criterion:
                    loss = criterion(outputs, targets)
                    running_loss += loss.item()  # Accumulate loss

                # Update tqdm progress bar
                pbar.set_postfix({
                    'loss': running_loss / (batch_idx + 1) if criterion else "N/A",
                    'accuracy': (correct / total) * 100
                })

    # Calculate overall accuracy and average loss
    accuracy = correct / total if total > 0 else 0.0
    avg_loss = running_loss / len(dataloader) if len(dataloader) > 0 else 0.0

    # Print results
    print(f'\nTest Results for Epoch:')
    print(f'Total Samples: {total}')
    print(f'Correct Predictions: {correct}')
    print(f'Test Average Loss: {avg_loss:.4f}')
    print(f'Accuracy: {accuracy * 100:.2f}%')

    return avg_loss, accuracy * 100

# TRAIN-TEST TOGETHER FUNCTION

In [17]:
import os
import torch
import time

def train_then_test_for_each_epoch(
    model, train_loader, criterion, optimizer, device, total_num_epochs,
    test_loader, save_dir, label_mapping, scheduler, save_per_epoch,
    start_epoch, limit_epoch, early_stopping_patience=None, use_pretrained_weights=False
):
    """
    pretrained_layer_list_for_each_view = [
            # View 0 (Base) - All 4 layers from ViT-Base
            [0, [[0,0], [1,1], [2,3], [3,5], [4,7], [5,8]]],
            # View 1 (Small) - All 4 layers from ViT-Small
            [1, [[0,0], [1,1], [2,3], [3,5], [4,7], [5,8]]],
            # View 2 (Tiny) - All 4 layers from ViT-Tiny
            [2, [[0,0], [1,1], [2,3], [3,5], [4,7], [5,8]]]
        ]
    """
    if use_pretrained_weights:
        print("Using pretrained weights...")
        pretrained_layer_list_for_each_view = [
            # View 0 (Base) - All 4 layers from ViT-Base
            [0, [[0,0]]],
            # View 1 (Small) - All 4 layers from ViT-Small
            [1, [[0,0]]],
            # View 2 (Tiny) - All 4 layers from ViT-Tiny
            [2, [[0,0]]]
        ]
        model.usePretrainedWeights(pretrained_layer_list_for_each_view)

    train_epoch_accuracies = []
    test_epoch_accuracies = []
    best_test_accuracy = -float('inf')
    patience_counter = 0
    best_model_path = None
    total_start_time = time.time()

    for epoch in range(start_epoch, total_num_epochs):
        if (epoch + 1) <= limit_epoch:
            epoch_start_time = time.time()
            current_lr = optimizer.param_groups[0]['lr']
            print(f"\nEpoch {epoch + 1}/{total_num_epochs}  Current Learning Rate: {current_lr:.6f}")

            print("Training...")
            train_epoch_loss, train_epoch_accuracy = train_model(
                model, train_loader, criterion, optimizer, device, (epoch),
                save_dir, total_num_epochs, label_mapping, scheduler, accumulation_steps=16
            )
            train_epoch_accuracies.append(train_epoch_accuracy)

            # CosineAnnealingWarmRestarts scheduler.
            scheduler.step()

            print("Testing...")
            test_loss, test_accuracy = test_model(model, test_loader, device, label_mapping, criterion)
            test_epoch_accuracies.append(test_accuracy)

            epoch_time = time.time() - epoch_start_time
            total_time = time.time() - total_start_time

            print(f"End of Epoch {epoch + 1} - Training Accuracy: {train_epoch_accuracy:.4f}, Training Loss: {train_epoch_loss:.4f}, Test Accuracy: {test_accuracy}, Test Loss: {test_loss:.4f}")
            print(f"Epoch Time: {epoch_time:.2f}s, Total Time: {total_time:.2f}s")

            if early_stopping_patience is not None:
                if test_accuracy > best_test_accuracy:
                    best_test_accuracy = test_accuracy

                    os.makedirs(save_dir, exist_ok=True)
                    new_best_model_path = os.path.join(save_dir, f"best_model_epoch_{epoch + 1}.pth")
                    torch.save({
                        'epoch': epoch + 1,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'scheduler_state_dict': scheduler.state_dict(),
                        'loss': train_epoch_loss,
                        'test_accuracy': test_accuracy,
                    }, new_best_model_path)
                    print(f"New best model saved at {new_best_model_path}")

                    if best_model_path and best_model_path != new_best_model_path:
                        os.remove(best_model_path)
                        print(f"Deleted previous best model at {best_model_path}")
                    best_model_path = new_best_model_path

                    patience_counter = 0
                else:
                    patience_counter += 1
                    print(f"No improvement. Patience counter: {patience_counter}/{early_stopping_patience}")

                if patience_counter >= early_stopping_patience:
                    final_time = time.time() - total_start_time
                    print(f"Early stopping triggered at epoch {epoch + 1}. Best test accuracy: {best_test_accuracy}%")
                    print(f"Total training time: {final_time:.2f}s")
                    patience_limit_model_path = os.path.join(save_dir, f"patience_limit_model_epoch_{epoch + 1}.pth")
                    torch.save({
                        'epoch': epoch + 1,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'scheduler_state_dict': scheduler.state_dict(),
                        'loss': train_epoch_loss,
                        'test_accuracy': test_accuracy,
                    }, patience_limit_model_path)
                    print(f"Limit model saved at {patience_limit_model_path}")
                    break
        else:
            final_time = time.time() - total_start_time
            print(f"Total training time: {final_time:.2f}s")
            epoch_limit_model_path = os.path.join(save_dir, f"limit_model_epoch_{epoch}.pth")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'loss': train_epoch_loss,
                'test_accuracy': test_accuracy,
            }, epoch_limit_model_path)
            print(f"Epoch limit model saved at {epoch_limit_model_path}")
            break

    return train_epoch_accuracies, test_epoch_accuracies

# LOAD THE SELECTED MODEL

In [18]:
def load_checkpoint(model, optimizer, scheduler, checkpoint_path, device):
    """Loads a model and optimizer checkpoint from the given path."""
    if os.path.exists(checkpoint_path):
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        start_epoch = checkpoint['epoch']
        loss = checkpoint['loss']
        # test_accuracy = checkpoint['test_accuracy'] USE THIS IN LATER RUNS...
        print(f"Loaded checkpoint from {checkpoint_path} (epoch {start_epoch}, loss {loss:.4f})")
        return start_epoch, loss, False
        #print(f"Loaded checkpoint from {checkpoint_path} (epoch {start_epoch}, loss {loss:.4f},
        #      test accuracy {test_accuracy:.4f})")
        # return start_epoch, loss, test_accuracy USE THIS IN LATER RUNS...
    else:
        print("No checkpoint found, starting from scratch.")
        return 0, None, True  # Start from epoch 0

# USE THIS IF MEMORY IS NOT ENOUGH

In [None]:
# Configuration settings to add at the start of your training script
"""
def configure_memory_settings():
    # Enable memory efficient attention
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Set memory allocator settings
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

    # Enable gradient checkpointing for the model
    def enable_gradient_checkpointing(model):
        if hasattr(model, 'gradient_checkpointing_enable'):
            model.gradient_checkpointing_enable()

    return enable_gradient_checkpointing
"""

# ViT Parameter Check

In [None]:
"""
import timm
import torch

def check_vit_layers(model_name):
    """"""
    Check the number of transformer layers and other architecture details in a ViT model.

    Args:
        model_name (str): Name of the ViT model from timm (e.g., 'vit_tiny_patch16_224')

    Returns:
        dict: Dictionary containing model architecture details
    """"""
    try:
        # Load model
        model = timm.create_model(model_name, pretrained=False)

        # Get the number of heads from the first attention block
        num_heads = model.blocks[0].attn.num_heads

        # Get architecture details
        details = {
            'num_layers': len(model.blocks),
            'embed_dim': model.embed_dim,
            'num_heads': num_heads,
            'mlp_dim': model.blocks[0].mlp.fc1.out_features,  # Size of first MLP layer
            'mlp_ratio': model.blocks[0].mlp.fc1.out_features / model.embed_dim  # Calculate MLP ratio
        }

        return details

    except Exception as e:
        raise RuntimeError(f"Error checking model {model_name}: {str(e)}")

# Check different ViT variants
vit_variants = [
    'vit_tiny_patch16_224',
    'vit_small_patch16_224',
    'vit_base_patch16_224'
]

for variant in vit_variants:
    try:
        details = check_vit_layers(variant)
        print(f"\n{variant}:")
        for key, value in details.items():
            print(f"  {key}: {value}")
    except RuntimeError as e:
        print(f"\n{variant}: {str(e)}")
"""


vit_tiny_patch16_224:
  num_layers: 12
  embed_dim: 192
  num_heads: 3
  mlp_dim: 768
  mlp_ratio: 4.0

vit_small_patch16_224:
  num_layers: 12
  embed_dim: 384
  num_heads: 6
  mlp_dim: 1536
  mlp_ratio: 4.0

vit_base_patch16_224:
  num_layers: 12
  embed_dim: 768
  num_heads: 12
  mlp_dim: 3072
  mlp_ratio: 4.0


# RUNNER

In [None]:
# Learning Rate Scheduling
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR
from transformers import get_cosine_schedule_with_warmup
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import torchvision.transforms as transforms


if __name__ == "__main__":
    print("Background training running...")
    import os
    from pathlib import Path

    # Check for CUDA/device availability
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Cuda is available: {torch.cuda.is_available()}")
    print(f"Using the device: {device}")
    """
    # Simpler architecture
    tubelet_sizes = [(16, 16, 16), (32, 16, 16), (64, 16, 16)]
    embed_dims_per_view = [256, 128, 64]
    MLP_dims_per_view = [1024, 512, 256]
    num_layers_per_view = [3, 3, 3]
    view_dropout_list = [0.3, 0.2, 0.1]
    cross_view_connections = [1,2]
    global_embed_dim = 72
    global_num_layers = 1
    num_classes = 174
    """
    # [(16, 16, 16), (32, 16, 16), (64, 16, 16)] Later [(4, 16, 16), (8, 16, 16), (16, 16, 16)]
    # Hyperparameters # Use at least 3 views, less views decreases models capacity a lot. [(64, 64, 64), (32, 32, 32), (16, 16, 16)]
    tubelet_sizes = [(4, 16, 16), (8, 16, 16), (12, 16, 16)] # (Temporal, Height, Width) sizes oldest=[(8, 8, 8), (4, 4, 4), (2, 2, 2)] old=[(32, 32, 32), (16, 16, 16)] ver_3=[(64, 64, 64), (32, 32, 32), (16, 16, 16)]
    embed_dims_per_view = [768, 384, 192]       # Embedding dimensions for each view (Base, Small, Tiny) [768, 512, 256] [384, 256] [64, 32, 16]
    MLP_dims_per_view = [768*4, 384*4, 192*4]       # For view MLP dims. [256, 128, 64]
    num_of_head_per_view = [12, 6, 3]
    #embed_dims_pairs_for_cross_att = [(512,256),(512,256),(256,128),(256,128)]  # [768, 512, 256] [384, 256] [512, 256, 128]
    num_layers_per_view = [1, 1, 1]   # Number of transformer layers per view ver_1=[12, 8, 4]. ver_2=[12, 8]. ver_3=[4, 2] ver_4=[2, 1] ver_5=[4, 2, 1] ver_6=[4, 4, 4] [1, 1, 1]
    view_dropout_list = [0.2, 0.15, 0.1] # view0, view1, view2 ... Bigger views might memorize more than smaller ones. [0.2, 0.1, 0.05] [0.4, 0.3, 0.25]
    #(1, 2) means connect the 2nd layer of view
    #X to the 2nd layer of view X+1.
    #(2, 2) means connect the 3rd layer of view
    #X to the 2nd layer of view X+1. 0,1
    cross_view_connections = [0]     # Cross-view attention connections. The first view layer is named "0" for better readability. [0, 1]
    global_embed_dim = 192             # Global encoder dimension. ver_0=256, ver_1=192 ver_2=144
    global_num_layers = 1              # Number of global transformer blocks. Normally it was 4.
    num_classes = 174                  # Total classes in Something-Something V2. CHANGE THIS TO 10 FOR MAINDATASET OR IT CAN REMAIN 174. ver_0=174

    """
    tubelet_sizes = [(64, 64, 64), (32, 32, 32), (16, 16, 16)]
    embed_dims_per_view = [512, 384, 256]
    embed_dims_per_cross_att = [512, 384, 256]
    num_layers_per_view = [3,2,1]
    global_embed_dim = 512
    global_num_layers = 2
    num_classes = 174
    """

    # Initialize model. Add gradient checkpointing to your model initialization.
    # Use the commented if memory is not enough.
    #enable_gradient_checkpointing = configure_memory_settings()
    model = MultiviewTransformer(device,
                                 embed_dims_per_view,
                                 MLP_dims_per_view,
                                 num_of_head_per_view,
                                 view_dropout_list,
                                 num_layers_per_view,
                                 tubelet_sizes,
                                 global_embed_dim,
                                 global_num_layers,
                                 num_classes,
                                 cross_view_connections).to(device)
    #enable_gradient_checkpointing(model)
    """
    for name, param in model.named_parameters():
      print(f"Parameter: {name} | Device: {param.device}")
    """
    """
    for name, module in model.named_modules():
      print(f"Module: {name} | Device: {next(module.parameters()).device}")
    """
    """
    for name, module in model.named_modules():
      print(f"Module: {name} | Device: {next(module.parameters()).device}")
    """

    # Dataset and video directory paths
    # video_dir = "/content/drive/MyDrive/COMP411_Project_Datasets/Something-Something-V2/20bn-something-something-v2"
    video_dir_train = "/content/dataset/Top-10-Label-Dataset/train"
    video_dir_test = "/content/dataset/Top-10-Label-Dataset/test"
    #video_dir = "/kaggle/input/test-10/test10"

    # Dataset transformations (if any)
    """
    video_transform = transforms.Compose([
        #transforms.ToPILImage(),  # Convert frame to PIL Image (needed for transformations)
        #transforms.Resize((224, 224)),  # Resize each frame to the desired size
        #transforms.RandomHorizontalFlip(),  # Apply random horizontal flip for data augmentation
        transforms.ToTensor(),  # Convert frame to tensor (required for PyTorch models)
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet stats
    ])
    """
    """
    video_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(0.2, 0.2, 0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    ])
    """
    """
    train_frame_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(0.2, 0.2, 0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    test_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    """
    # Initialize datasets
    # train_data and test_data are created in "GET THE DATA LABELS" part.
    totalViews = len(embed_dims_per_view)
    train_dataset = TwentyBillionSomethingDataset(
        video_dir=video_dir_train,
        data_set=train_data,
        label_mapping=label_mapping,
        totalViews=totalViews,
        target_frames=48,  # Adjust based on your needs
        target_size=(224, 224),
        is_test=False,
        test_answers_dict=None
        )
    test_dataset = TwentyBillionSomethingDataset(
        video_dir=video_dir_test,
        data_set=test_data,
        label_mapping=label_mapping,
        totalViews=totalViews,
        target_frames=48,  # Adjust based on your needs
        target_size=(224, 224),
        is_test=True,
        test_answers_dict=test_answers
        )

    """
    def get_optimal_batch_size(base_batch_size):
      if(torch.cuda.is_available()):
        total_memory = torch.cuda.get_device_properties(0).total_memory
        if total_memory > 16*1024*1024*1024:  # >16GB
            return base_batch_size * 2
        elif total_memory > 8*1024*1024*1024:  # >8GB
            return base_batch_size
        else:
            return base_batch_size // 2
      else:
        return base_batch_size
    """
    #batch_size = get_optimal_batch_size(16)
    batch_size = 4
    print("Batch size: " + str(batch_size))

    # DataLoader configurations
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
    """
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True,  # Add this for faster GPU transfer
        prefetch_factor=2  # Each worker prefetches 2 batches
        )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        prefetch_factor=2
        )

    # Model save directory.
    # Write here the model save folder. Background-Runner-Test-1
    #model_save_dir = "/kaggle/temp/COMP411_Project_Models/ActionRecognizerModels_20k"
    model_save_dir = "/content/drive/MyDrive/COMP411_Project_Models/1-layer-models"

    # USE THIS IF THERE IS A SAVED MODEL.
    #SELECT_SAVED_MODEL_TO_USE_EPOCH = 5

    # Create save directory for models if it doesn't exist.
    Path(model_save_dir).mkdir(parents=True, exist_ok=True)

    # Model checkpoint file.
    # CHANGE THE MODEL FILE NAME TO CONTINUE FROM THAT MODEL. best_model_epoch_5.pth
    #checkpoint_path = os.path.join(model_save_dir, ("best_model_epoch_" + str(SELECT_SAVED_MODEL_TO_USE_EPOCH) + ".pth"))
    checkpoint_path = "/content/epoch_2_model"

    # Training parameters
    total_num_epochs = 100
    current_model_epoch = 0 # SET THIS EVERY TIME YOU RUN THE RUNNER.##############################
    limit_epoch = 10 # IF CREDIT FOR GPU NOT ENOUGH, DO THE RUN FOR A SMALLER TIME.
    # save_per_epoch = 5 # CHANGE THIS TO 5 OR Something.
    early_stopping_patience = 20

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    # Modify optimizer. USE 1e-3 for weight decay it is better for transformers.
    """
    optimizer = optim.AdamW(model.parameters(),
                            lr=1e-4,  # Increased learning rate. ver_0=1e-3 ver_1=5e-4 ver_2=2e-4
                            weight_decay=1e-3,  # Make it different(1e-3) than 0 to generalize to data. ver_0=1e-4
                            betas=(0.9, 0.999))  # Default AdamW betas
    """

    optimizer = optim.AdamW(
    model.parameters(),
    lr=1e-4,  # Slightly higher
    weight_decay=0.001,
    betas=(0.9, 0.999)
    )
    scheduler = CosineAnnealingLR(optimizer, T_max=total_num_epochs, eta_min=1e-7)
    """
    # Warmup scheduler
    num_warmup_steps = len(train_loader) * 5  # 5 epochs warmup
    total_training_steps = len(train_loader) * total_num_epochs
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_training_steps
        )
    """
    #scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)
    # 0-5 6-15 16-35 36-75

    # MIGHT RETURN TO COSINE ANNEALING. IT WORKS GOOD TOO.
    """
    scheduler = OneCycleLR(
        optimizer,
        max_lr=2e-4,
        total_steps=limit_epoch * len(train_loader),
        anneal_strategy='cos',
        div_factor=10,
        final_div_factor=1e4,
        pct_start=0.2
    )
    """
    """
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=1e-5)
    """
    """
    scheduler = OneCycleLR(
    optimizer,
    max_lr=5e-4,  # Lower than single-view due to multiple parallel paths
    total_steps=(limit_epoch - current_model_epoch) * len(train_loader), # Update the current epoch if you continue training from earlier.
    anneal_strategy='cos',
    div_factor=5,
    final_div_factor=1e3,
    pct_start=0.15  # Slightly longer warmup for multi-view stability
    )
    """
    """
    # This is for 100 epochs.
    scheduler = OneCycleLR(
    optimizer,
    max_lr=5e-4,  # Lower than single-view due to multiple parallel paths
    total_steps=total_num_epochs * len(train_loader), # Update the current epoch if you continue training from earlier.
    anneal_strategy='cos',
    div_factor=5,
    final_div_factor=1e3,
    pct_start=0.15  # Slightly longer warmup for multi-view stability
    )
    """
    """ FIND THE BEST HYPERPARAMETERS FOR THIS AND USE THIS.
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_training_steps
        )
    """
    """
    # More aggressive learning rate scheduling
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=5e-3,  # Higher peak learning rate
        steps_per_epoch=len(train_loader),
        epochs=total_num_epochs
        )
    """
    # LOAD MODEL CHECKPOINT
    start_epoch, _, use_pretrained_weights = load_checkpoint(model, optimizer,
                                                             scheduler, checkpoint_path,
                                                             device) # get last trained epoch and start from the next epoch

    """
    start_epoch, _, last_best_test_accuracy = load_checkpoint(model, optimizer,
                                                              scheduler, checkpoint_path,
                                                              device) # get last trained epoch and start from the next epoch
    """

    # Train and test the model
    print("Train data set size: " + str(len(train_loader.dataset)))
    print("Test data set size: " + str(len(test_loader.dataset)))

    train_acc, test_acc = train_then_test_for_each_epoch(
        model, train_loader, criterion, optimizer, device,
        total_num_epochs, test_loader, model_save_dir, label_mapping,
        scheduler, None, start_epoch, limit_epoch, early_stopping_patience,
        use_pretrained_weights
    )
    """ USE THIS AFTER THE FIRST RUN.
    train_acc, test_acc = train_then_test_for_each_epoch(
        model, train_loader, criterion, optimizer, device,
        total_num_epochs, test_loader, model_save_dir, label_mapping,
        scheduler, None, start_epoch, limit_epoch, last_best_test_accuracy,
        early_stopping_patience
    )
    """
    """
    train_acc, test_acc = train_then_test_for_each_epoch(
        model, train_loader, criterion, optimizer, device,
        total_num_epochs, test_loader, model_save_dir, label_mapping,
        scheduler, save_per_epoch, start_epoch, early_stopping_patience
    )
    """

    # Print final accuracies
    print("Train Accuracies:", train_acc)
    print("Test Accuracies:", test_acc)

Background training running...
Cuda is available: True
Using the device: cuda
Batch size: 4


  checkpoint = torch.load(checkpoint_path, map_location=device)


Loaded checkpoint from /content/epoch_2_model (epoch 2, loss 2.8653)
Train data set size: 15518
Test data set size: 1666

Epoch 3/100  Current Learning Rate: 0.000100
Training...


Epoch 3/100: 100%|██████████| 3880/3880 [47:16<00:00,  1.37it/s, loss=2.86, accuracy=13.6]

################## Train Epoch Summary [3/100]##################:
Train Average Loss: 2.8565
Accuracy: 13.57%
Time: 2836.32s
Testing...



Testing: 100%|██████████| 417/417 [01:55<00:00,  3.62it/s, loss=2.89, accuracy=12.5]



Test Results for Epoch:
Total Samples: 1666
Correct Predictions: 208
Test Average Loss: 2.8852
Accuracy: 12.48%
End of Epoch 3 - Training Accuracy: 13.5713, Training Loss: 2.8565, Test Accuracy: 12.48499399759904, Test Loss: 2.8852
Epoch Time: 2951.60s, Total Time: 2951.60s
New best model saved at /content/drive/MyDrive/COMP411_Project_Models/1-layer-models/best_model_epoch_3.pth

Epoch 4/100  Current Learning Rate: 0.000100
Training...


Epoch 4/100: 100%|██████████| 3880/3880 [46:19<00:00,  1.40it/s, loss=2.84, accuracy=14.4]

################## Train Epoch Summary [4/100]##################:
Train Average Loss: 2.8432
Accuracy: 14.43%
Time: 2779.41s
Testing...



Testing: 100%|██████████| 417/417 [01:51<00:00,  3.73it/s, loss=2.87, accuracy=17.1]



Test Results for Epoch:
Total Samples: 1666
Correct Predictions: 285
Test Average Loss: 2.8683
Accuracy: 17.11%
End of Epoch 4 - Training Accuracy: 14.4348, Training Loss: 2.8432, Test Accuracy: 17.106842737094837, Test Loss: 2.8683
Epoch Time: 2891.46s, Total Time: 5843.68s
New best model saved at /content/drive/MyDrive/COMP411_Project_Models/1-layer-models/best_model_epoch_4.pth
Deleted previous best model at /content/drive/MyDrive/COMP411_Project_Models/1-layer-models/best_model_epoch_3.pth

Epoch 5/100  Current Learning Rate: 0.000100
Training...


Epoch 5/100: 100%|██████████| 3880/3880 [46:14<00:00,  1.40it/s, loss=2.82, accuracy=15.8]

################## Train Epoch Summary [5/100]##################:
Train Average Loss: 2.8245
Accuracy: 15.84%
Time: 2774.56s
Testing...



Testing: 100%|██████████| 417/417 [01:51<00:00,  3.75it/s, loss=2.89, accuracy=13.4]


Test Results for Epoch:
Total Samples: 1666
Correct Predictions: 223
Test Average Loss: 2.8934
Accuracy: 13.39%
End of Epoch 5 - Training Accuracy: 15.8397, Training Loss: 2.8245, Test Accuracy: 13.385354141656663, Test Loss: 2.8934
Epoch Time: 2885.96s, Total Time: 8730.23s
No improvement. Patience counter: 1/20

Epoch 6/100  Current Learning Rate: 0.000099
Training...



Epoch 6/100: 100%|██████████| 3880/3880 [46:04<00:00,  1.40it/s, loss=2.82, accuracy=16.2]

################## Train Epoch Summary [6/100]##################:
Train Average Loss: 2.8171
Accuracy: 16.21%
Time: 2764.71s
Testing...



Testing: 100%|██████████| 417/417 [01:54<00:00,  3.63it/s, loss=2.92, accuracy=12.8]


Test Results for Epoch:
Total Samples: 1666
Correct Predictions: 213
Test Average Loss: 2.9196
Accuracy: 12.79%
End of Epoch 6 - Training Accuracy: 16.2134, Training Loss: 2.8171, Test Accuracy: 12.785114045618249, Test Loss: 2.9196
Epoch Time: 2879.74s, Total Time: 11609.97s
No improvement. Patience counter: 2/20

Epoch 7/100  Current Learning Rate: 0.000099
Training...



Epoch 7/100: 100%|██████████| 3880/3880 [46:28<00:00,  1.39it/s, loss=2.8, accuracy=17.4]

################## Train Epoch Summary [7/100]##################:
Train Average Loss: 2.8011
Accuracy: 17.36%
Time: 2788.94s
Testing...



Testing: 100%|██████████| 417/417 [01:50<00:00,  3.76it/s, loss=2.89, accuracy=14.8]


Test Results for Epoch:
Total Samples: 1666
Correct Predictions: 247
Test Average Loss: 2.8902
Accuracy: 14.83%
End of Epoch 7 - Training Accuracy: 17.3605, Training Loss: 2.8011, Test Accuracy: 14.82593037214886, Test Loss: 2.8902
Epoch Time: 2900.02s, Total Time: 14509.99s
No improvement. Patience counter: 3/20

Epoch 8/100  Current Learning Rate: 0.000099
Training...



Epoch 8/100: 100%|██████████| 3880/3880 [47:30<00:00,  1.36it/s, loss=2.79, accuracy=17.8]

################## Train Epoch Summary [8/100]##################:
Train Average Loss: 2.7866
Accuracy: 17.84%
Time: 2850.17s
Testing...



Testing: 100%|██████████| 417/417 [01:55<00:00,  3.60it/s, loss=2.93, accuracy=12.8]


Test Results for Epoch:
Total Samples: 1666
Correct Predictions: 213
Test Average Loss: 2.9264
Accuracy: 12.79%
End of Epoch 8 - Training Accuracy: 17.8438, Training Loss: 2.7866, Test Accuracy: 12.785114045618249, Test Loss: 2.9264
Epoch Time: 2966.25s, Total Time: 17476.24s
No improvement. Patience counter: 4/20

Epoch 9/100  Current Learning Rate: 0.000098
Training...



Epoch 9/100: 100%|██████████| 3880/3880 [47:16<00:00,  1.37it/s, loss=2.77, accuracy=18.9]

################## Train Epoch Summary [9/100]##################:
Train Average Loss: 2.7670
Accuracy: 18.91%
Time: 2836.46s
Testing...



Testing: 100%|██████████| 417/417 [01:51<00:00,  3.75it/s, loss=2.86, accuracy=14.8]


Test Results for Epoch:
Total Samples: 1666
Correct Predictions: 246
Test Average Loss: 2.8555
Accuracy: 14.77%
End of Epoch 9 - Training Accuracy: 18.9071, Training Loss: 2.7670, Test Accuracy: 14.765906362545017, Test Loss: 2.8555
Epoch Time: 2947.86s, Total Time: 20424.10s
No improvement. Patience counter: 5/20

Epoch 10/100  Current Learning Rate: 0.000098
Training...



Epoch 10/100: 100%|██████████| 3880/3880 [47:30<00:00,  1.36it/s, loss=2.76, accuracy=18.9]

################## Train Epoch Summary [10/100]##################:
Train Average Loss: 2.7608
Accuracy: 18.89%
Time: 2850.31s
Testing...



Testing: 100%|██████████| 417/417 [01:51<00:00,  3.75it/s, loss=2.93, accuracy=12.5]



Test Results for Epoch:
Total Samples: 1666
Correct Predictions: 208
Test Average Loss: 2.9260
Accuracy: 12.48%
End of Epoch 10 - Training Accuracy: 18.8942, Training Loss: 2.7608, Test Accuracy: 12.48499399759904, Test Loss: 2.9260
Epoch Time: 2961.46s, Total Time: 23385.56s
No improvement. Patience counter: 6/20
Total training time: 23385.56s
Epoch limit model saved at /content/drive/MyDrive/COMP411_Project_Models/1-layer-models/limit_model_epoch_10.pth
Train Accuracies: [13.57133651243717, 14.434849851785023, 15.839670060574818, 16.21342956566568, 17.360484598530736, 17.84379430338961, 18.907075654079133, 18.894187395282895]
Test Accuracies: [12.48499399759904, 17.106842737094837, 13.385354141656663, 12.785114045618249, 14.82593037214886, 12.785114045618249, 14.765906362545017, 12.48499399759904]


# LIBRARIES(AGAIN)

In [None]:
"""
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
import time  # Import time module for tracking time.
import torch.multiprocessing as mp
import cv2 # For dividing the video into frames.
from tqdm.notebook import tqdm # For better visuals while training the model.
from torch import Tensor
from typing import Optional, Tuple
"""

# CLEAR RAM MEMORY

In [None]:
"""
import psutil
import os

def clear_system_memory():
    # On Unix-like systems (Linux/macOS)
    if os.name == 'posix':
        os.system('sync; echo 3 > /proc/sys/vm/drop_caches')

    # Print memory usage
    memory = psutil.virtual_memory()
    print(f"Total Memory: {memory.total / (1024**3):.2f} GB")
    print(f"Available Memory: {memory.available / (1024**3):.2f} GB")
    print(f"Used Memory: {memory.used / (1024**3):.2f} GB")


# Call this function to clear memory
clear_system_memory()
"""

In [None]:
"""
import psutil
import os

def clear_ram():
    # Force Python to release memory
    import ctypes
    libc = ctypes.CDLL('libc.so.6')
    libc.malloc_trim(0)

    # Print memory before and after
    print("Memory before clearing:", psutil.virtual_memory().percent, "%")

    # Garbage collection
    import gc
    gc.collect()

    print("Memory after clearing:", psutil.virtual_memory().percent, "%")

# Call the function
clear_ram()
"""

# MAX TRAIN AND TEST ACCURACIES

In [None]:
if train_acc:
  print("Max Train Accuracy: " + str(max(train_acc)) + " at Epoch: " + str(train_acc.index(max(train_acc)) + 1))
if test_acc:
  print("Max Test Accuracy: " + str(max(test_acc)) + " at Epoch: " + str(test_acc.index(max(test_acc)) + 1))

# PLOT THE ACCURACIES

In [None]:
"""
import matplotlib.pyplot as plt

epochs = range(1, len(train_acc) + 1)

plt.plot(epochs, train_acc, label='Training Accuracy')
#plt.plot(epochs, test_acc, label='Test Accuracy')  # Uncomment if you have test accuracy data

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training Accuracy over Epochs')
plt.legend()
plt.grid(True)
plt.show()
"""

#CLEAR GPU MEMORY

In [None]:
#from IPython import get_ipython

# Clear all variables
#get_ipython().magic('reset -sf')

In [None]:
#torch.cuda.empty_cache()

In [None]:
#print(torch.cuda.memory_summary())