### IMPORT AND SETUP

In [36]:
#!pip install torch torchvision torchaudio

In [37]:
#!pip install opencv-python

In [38]:
# !pip install albumentations

In [39]:
# !pip install --user opencv-python albumentations

In [40]:
# conda install -c conda-forge opencv albumentations

In [41]:
import torch

# Check if CUDA is available
is_available = torch.cuda.is_available()
print(f"Is CUDA available? {is_available}")

if is_available:
    # Get the number of GPUs
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    # Get the name of the current GPU
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

Is CUDA available? False


In [42]:
# Install PyTorch Geometric dependencies
import torch

def format_pytorch_version(version):
  return version.split('+')[0]

TORCH_version = torch.__version__
PYTORCH_VERSION = format_pytorch_version(TORCH_version)
CUDA_VERSION = torch.version.cuda.replace('.','') if torch.cuda.is_available() else 'cpu'

!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-{PYTORCH_VERSION}+{CUDA_VERSION}.html
!pip install torch-geometric

# --- Now, you can successfully import the GNN components ---
from torch_geometric.nn import GATConv
from torch_geometric.data import Data, Batch
import torch.nn.functional as F

print("PyTorch Geometric installed and imported successfully!")

Looking in links: https://data.pyg.org/whl/torch-2.8.0+cpu.html
PyTorch Geometric installed and imported successfully!


In [43]:
import timm 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [44]:
# For data handling and processing
import cv2
import numpy as np
import pandas as pd
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
import os
import warnings

In [45]:
# For model backbones
try:
    import timm
except ImportError:
    print("timm not found. Installing...")
    !pip install timm==0.9.12
    import timm

# For Graph Neural Network components
try:
    from torch_geometric.nn import GATConv
    from torch_geometric.data import Data, Batch
except ImportError:
    print("PyTorch Geometric not found. Installing...")
    # Handle installation based on PyTorch and CUDA versions
    pytorch_version = torch.__version__
    cuda_version = torch.version.cuda
    if cuda_version:
        cuda_version_str = "cu" + cuda_version.replace('.', '')
    else:
        cuda_version_str = "cpu"
    !pip install torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-{pytorch_version}+{cuda_version_str}.html
    !pip install torch-geometric
    from torch_geometric.nn import GATConv
    from torch_geometric.data import Data, Batch

warnings.filterwarnings("ignore", category=UserWarning) 

### DUMMY DATA GENERATION

In [46]:
# def generate_dummy_data():
#     """
#     Creates dummy video files and annotation CSVs to make the script runnable.
#     """
#     print("Generating dummy data for demonstration...")
#     base_dir = 'video-anomaly-detection'
#     data_dir = os.path.join(base_dir, 'data')
#     video_dir = os.path.join(data_dir, 'dummy_videos')
    
#     os.makedirs(video_dir, exist_ok=True)
#     os.makedirs(os.path.join(base_dir, 'weights'), exist_ok=True)

#     dummy_annotations = {'video_path': [], 'label': []}
#     for i in range(20): # Create 20 dummy videos for a reasonable dataset size
#         path = f'dummy_videos/video_{i}.avi'
#         dummy_annotations['video_path'].append(path)
#         dummy_annotations['label'].append(np.random.choice(["Normal", "Panic", "Violent"]))
        
#         # Create a short, random-noise video file
#         video_full_path = os.path.join(data_dir, path)
#         fourcc = cv2.VideoWriter_fourcc(*'XVID')
#         out = cv2.VideoWriter(video_full_path, fourcc, 25.0, (256, 256))
#         for _ in range(125): # 5 seconds at 25 fps
#             frame = np.random.randint(0, 255, (256, 256, 3), dtype=np.uint8)
#             out.write(frame)
#         out.release()

#     annotations_df = pd.DataFrame(dummy_annotations)
    
#     # Split into train and val and save CSVs
#     train_df = annotations_df.sample(frac=0.8, random_state=42)
#     val_df = annotations_df.drop(train_df.index)
    
#     train_df.to_csv(os.path.join(data_dir, 'train_annotations.csv'), index=False)
#     val_df.to_csv(os.path.join(data_dir, 'val_annotations.csv'), index=False)
    
#     print("Dummy data generation complete.")
#     print(f"Train set size: {len(train_df)}, Validation set size: {len(val_df)}")


In [55]:
# Path to the folder you extracted from movies.rar
YOUR_DATASET_PATH = "C://Users//srish//Downloads//movies" 


In [56]:
YOUR_DATASET_PATH

'C://Users//srish//Downloads//movies'

In [57]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

def create_annotations_from_nested_folders(dataset_root_path):
    """
    Scans a directory with a nested structure like '1/Violence/*.avi',
    creates video annotations, and splits them into train/val CSVs.
    """
    print(f"Scanning for videos in nested folders under: {dataset_root_path}")

    # --- Setup output directories ---
    base_dir = 'video-anomaly-detection'
    data_dir = os.path.join(base_dir, 'data')
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'weights'), exist_ok=True)

    video_annotations = {'video_path': [], 'label': []}
    
    # --- Walk through the nested directory structure ---
    if not os.path.isdir(dataset_root_path):
        raise ValueError(f"The provided path is not a directory: {dataset_root_path}")

    for numbered_folder in sorted(os.listdir(dataset_root_path)):
        numbered_folder_path = os.path.join(dataset_root_path, numbered_folder)
        
        if os.path.isdir(numbered_folder_path):
            for label_folder in os.listdir(numbered_folder_path):
                label_folder_path = os.path.join(numbered_folder_path, label_folder)
                
                if os.path.isdir(label_folder_path):
                    for video_file in os.listdir(label_folder_path):
                        if video_file.lower().endswith(('.avi', '.mp4', '.mov')):
                            relative_video_path = os.path.join(numbered_folder, label_folder, video_file)
                            video_annotations['video_path'].append(relative_video_path)
                            video_annotations['label'].append(label_folder.lower())

    if not video_annotations['video_path']:
        raise ValueError(f"No videos found in the expected nested structure under {dataset_root_path}.")

    annotations_df = pd.DataFrame(video_annotations)
    annotations_df = annotations_df.sample(frac=1, random_state=42).reset_index(drop=True)

    train_df, val_df = train_test_split(annotations_df, test_size=0.2, random_state=42, stratify=annotations_df['label'])

    train_csv_path = os.path.join(data_dir, 'train_annotations.csv')
    val_csv_path = os.path.join(data_dir, 'val_annotations.csv')
    
    train_df.to_csv(train_csv_path, index=False)
    val_df.to_csv(val_csv_path, index=False)
    
    print("Annotation files created successfully from nested folders.")
    print(f"Total videos found: {len(annotations_df)}")
    print(f"Training set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")

### DATASET CLASS DEFINITION

In [58]:
class VideoDataset(Dataset):
    """
    PyTorch Dataset for loading, sampling, and transforming video clips.
    """
    def __init__(self, annotations_file, data_root, num_frames=16, frame_size=(224, 224), fps=16, is_train=True):
        self.annotations = pd.read_csv(annotations_file)
        self.data_root = data_root
        self.num_frames = num_frames
        self.frame_size = frame_size
        self.target_fps = fps
        self.labels_map = {"Normal": 0, "Panic": 1, "Violent": 2}
        self.is_train = is_train

        # Define separate augmentations for training and validation
        if is_train:
            self.transform =A.Compose([
                A.RandomResizedCrop(size=self.frame_size, scale=(0.8, 1.0)),
                A.HorizontalFlip(p=0.5),
                A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.8),
                A.GaussNoise(p=0.2),
                A.CoarseDropout(max_holes=8, max_height=24, max_width=24, p=0.5),
                A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ToTensorV2(),
            ])
        else:
            self.transform = A.Compose([
                A.Resize(height=self.frame_size[0], width=self.frame_size[1]),
                A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ToTensorV2(),
            ])

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        video_path = os.path.join(self.data_root, self.annotations.iloc[idx, 0])
        label_str = self.annotations.iloc[idx, 1]
        label = self.labels_map[label_str]

        frames, frame_diffs = self._load_video_clip(video_path)

        # Apply augmentations to all frames
        transformed_frames = torch.stack([self.transform(image=frame)["image"] for frame in frames])
        
        # Normalize frame differences separately
        frame_diffs = torch.from_numpy(frame_diffs).float().permute(0, 3, 1, 2) # T,H,W,C -> T,C,H,W
        # Simple normalization for diffs
        frame_diffs = frame_diffs / 255.0

        return transformed_frames, frame_diffs, torch.tensor(label, dtype=torch.long)

    def _load_video_clip(self, video_path):
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise IOError(f"Cannot open video file: {video_path}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        
        # Uniformly sample indices across the video
        frame_indices = np.linspace(0, total_frames - 1, self.num_frames, dtype=int)

        frames, frame_diffs = [], []
        last_frame_gray = None

        for i in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                # If reading fails, duplicate the last valid frame
                if frames:
                    frame = frames[-1].copy()
                else: # Very rare case of video with 0 frames
                    frame = np.zeros((self.frame_size[0], self.frame_size[1], 3), dtype=np.uint8)
            
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (self.frame_size[1], self.frame_size[0]))
            frames.append(frame)

            # Compute frame difference (simple motion cue)
            current_frame_gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
            if last_frame_gray is not None:
                diff = cv2.absdiff(current_frame_gray, last_frame_gray)
                frame_diffs.append(np.stack([diff]*3, axis=-1)) # Make it 3-channel
            last_frame_gray = current_frame_gray
        
        cap.release()

        # Ensure frame_diffs list has the correct length
        if not frame_diffs: # If only one frame was loaded
            frame_diffs.append(np.zeros_like(frames[0]))
        while len(frame_diffs) < self.num_frames:
            frame_diffs.append(np.zeros_like(frame_diffs[-1])) # Pad with zero motion

        return np.array(frames), np.array(frame_diffs)


### MODEL ARCHITECTURE DEFINITION

In [51]:
class SpatioTemporalGNN(nn.Module):
    def __init__(self, num_frames=16, num_classes=3, feature_dim=256, gnn_layers=2, num_heads=4, dropout=0.3):
        super().__init__()
        self.num_frames = num_frames
        self.feature_dim = feature_dim

        # --- Appearance Stream ---
        self.cnn_backbone = timm.create_model('convnext_tiny', pretrained=True, features_only=True, out_indices=[2])
        # ConvNeXt-T stage 2 has 384 channels, feature map size 14x14 for 224 input
        self.cnn_feature_proj = nn.Conv2d(384, feature_dim, kernel_size=1)

        # --- Motion Stream ---
        self.motion_backbone = timm.create_model('mobilenetv3_small_050', pretrained=True, features_only=True, out_indices=[2])
        # MobileNetV3-S stage 2 has 16 channels, feature map size 14x14
        self.motion_feature_proj = nn.Conv2d(16, feature_dim, kernel_size=1)

        # --- Graph Network ---
        self.num_nodes_per_frame = 14 * 14 
        # GATConv input is concatenated features (appearance + motion)
        gnn_input_dim = feature_dim * 2
        self.gnn_layers = nn.ModuleList([
            GATConv(gnn_input_dim, gnn_input_dim, heads=num_heads, dropout=dropout, concat=False)
            for _ in range(gnn_layers)
        ])
        self.gnn_norm = nn.LayerNorm(gnn_input_dim)
        
        # --- Temporal Attention ---
        self.temporal_attention = nn.MultiheadAttention(embed_dim=gnn_input_dim, num_heads=num_heads, dropout=dropout, batch_first=True)

        # --- Classifier Head ---
        self.fusion_norm = nn.LayerNorm(gnn_input_dim)
        self.classifier = nn.Sequential(
            nn.Linear(gnn_input_dim, gnn_input_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(gnn_input_dim // 2, num_classes)
        )

    def forward(self, rgb_clips, motion_clips):
        batch_size, _, _, height, width = rgb_clips.shape
        
        # Reshape for batch processing: [B, T, C, H, W] -> [B*T, C, H, W]
        rgb_reshaped = rgb_clips.view(-1, 3, height, width)
        motion_reshaped = motion_clips.view(-1, 3, height, width)

        # --- Feature Extraction ---
        appearance_feat = self.cnn_backbone(rgb_reshaped)[0]
        appearance_feat = self.cnn_feature_proj(appearance_feat)
        
        motion_feat = self.motion_backbone(motion_reshaped)[0]
        motion_feat = self.motion_feature_proj(motion_feat)

        # --- Graph Construction ---
        combined_feat = torch.cat([appearance_feat, motion_feat], dim=1)
        
        _, d_combined, h_feat, w_feat = combined_feat.shape
        # Reshape for GNN: [B*T, 2D, H', W'] -> [B, T, N, 2D] where N=H'*W'
        graph_nodes = combined_feat.view(batch_size, self.num_frames, d_combined, -1).permute(0, 1, 3, 2)

        # Create a fully-connected graph edge structure for each frame
        edge_index = self._create_fully_connected_edges(h_feat * w_feat, graph_nodes.device)
        
        # Process each time step through the GNN
        gnn_outputs = []
        for t in range(self.num_frames):
            frame_nodes = graph_nodes[:, t, :, :].reshape(-1, d_combined) # [B*N, 2D]
            
            # Create a batch of graphs for torch_geometric
            batch_indices = torch.arange(batch_size, device=frame_nodes.device).repeat_interleave(h_feat * w_feat)
            
            x = frame_nodes
            for layer in self.gnn_layers:
                x = F.elu(layer(x, edge_index))
            
            x = self.gnn_norm(x)
            x = x.view(batch_size, h_feat * w_feat, d_combined) # Reshape back to [B, N, 2D]
            gnn_outputs.append(x)

        # Stack GNN outputs along the time dimension: [B, T, N, 2D]
        gnn_outputs = torch.stack(gnn_outputs, dim=1)

        # --- Temporal Aggregation ---
        # Pool nodes spatially (average pooling) before temporal attention
        spatially_pooled_features = gnn_outputs.mean(dim=2) # [B, T, 2D]

        # Apply temporal attention
        temporal_features, _ = self.temporal_attention(spatially_pooled_features, spatially_pooled_features, spatially_pooled_features)
        
        # Aggregate across time (average pooling) to get a final clip representation
        final_representation = temporal_features.mean(dim=1) # [B, 2D]
        
        # --- Classification ---
        final_representation = self.fusion_norm(final_representation)
        logits = self.classifier(final_representation)

        return logits

    def _create_fully_connected_edges(self, num_nodes, device):
        # Cache edge_index to avoid re-computation
        if not hasattr(self, '_edge_index') or self._edge_index.device != device:
            edge_list = torch.combinations(torch.arange(num_nodes, device=device), r=2).t()
            # Make it undirected by adding reverse edges
            self._edge_index = torch.cat([edge_list, edge_list.flip(0)], dim=1)
        return self._edge_index


### TRAINING/VALIDATION LOOP FUNCTION

In [52]:
def run_epoch(model, dataloader, optimizer, criterion, device, is_training=True):
    if is_training:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    progress_bar = tqdm(dataloader, desc="Training" if is_training else "Validation")
    
    for rgb_frames, motion_frames, labels in progress_bar:
        rgb_frames, motion_frames, labels = rgb_frames.to(device), motion_frames.to(device), labels.to(device)

        if is_training:
            optimizer.zero_grad()

        with torch.set_grad_enabled(is_training):
            logits = model(rgb_frames, motion_frames)
            loss = criterion(logits, labels)

            if is_training:
                loss.backward()
                optimizer.step()

        total_loss += loss.item() * rgb_frames.size(0)
        _, preds = torch.max(logits, 1)
        correct_predictions += torch.sum(preds == labels.data)
        total_samples += rgb_frames.size(0)
        
        acc = correct_predictions.double() / total_samples
        progress_bar.set_postfix(loss=f"{loss.item():.4f}", acc=f"{acc:.4f}")

    epoch_loss = total_loss / total_samples
    epoch_acc = correct_predictions.double() / total_samples
    return epoch_loss, epoch_acc



### MAIN EXECUTION BLOCK

In [53]:
# # ==============================================================================
# # 6. MAIN EXECUTION BLOCK
# # ==============================================================================
# if __name__ == '__main__':
#     # --- Configuration ---
    
# config = {
#     'data': {
#         'root': 'video-anomaly-detection/data',
#         'train_annotations': 'video-anomaly-detection/data/train_annotations.csv',
#         'val_annotations': 'video-anomaly-detection/data/val_annotations.csv',
#     },
#     'model': {
#         'num_frames': 16,
#         'num_classes': 2, # Make sure this is 2 for your dataset
#         'feature_dim': 128, 
#     },
#     'train': {
#         'batch_size': 4, 
#         'warmup_epochs': 3,
#         'epochs': 10,
#         'lr_heads': 3e-4,
#         'lr_backbone': 1e-5,
#         'weight_decay': 1e-2,
#         'checkpoint_dir': 'video-anomaly-detection/weights'
#     }
# }
#     # --- CHANGE THIS PART ---
#     # OLD CODE:
#     # generate_dummy_data() 
#     # OR
#     # create_annotations_from_csvs('path/to/folder')

#     # --- TO THIS ---
#     # Define the path to the folder containing '1', '2', '3', etc.
#     YOUR_DATASET_PATH = "C://Users//srish//Downloads//movies" # <--- CHANGE THIS PATH
#     create_annotations_from_nested_folders(YOUR_DATASET_PATH)


#     # --- AND CHANGE THIS PART ---
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     print(f"\nUsing device: {device}")

#     # --- DataLoaders ---
#     # OLD CODE:
#     # train_dataset = VideoDataset(annotations_file=config['data']['train_annotations'], data_root=config['data']['root'], ...)
    
#     # --- TO THIS ---
#     # Make sure data_root points to your main dataset folder
#     train_dataset = VideoDataset(annotations_file=config['data']['train_annotations'], data_root=YOUR_DATASET_PATH, is_train=True)
#     val_dataset = VideoDataset(annotations_file=config['data']['val_annotations'], data_root=YOUR_DATASET_PATH, is_train=False)

#     # ... (the rest of the script stays the same) ...

In [54]:
if __name__ == '__main__':
# --- Configuration --- 
    config = {
    'data': {
        'root': 'video-anomaly-detection/data',
        'train_annotations': 'video-anomaly-detection/data/train_annotations.csv',
        'val_annotations': 'video-anomaly-detection/data/val_annotations.csv',
    },
    'model': {
        'num_frames': 16,
        'num_classes': 3,
        'feature_dim': 128, # Reduced for faster training on Colab
    },
    'train': {
        'batch_size': 4, 
        'warmup_epochs': 3,
        'epochs': 10,
        'lr_heads': 3e-4,
        'lr_backbone': 1e-5, # Lower LR for fine-tuning
        'weight_decay': 1e-2,
        'checkpoint_dir': 'video-anomaly-detection/weights'
    }
 }
 # --- TO THIS ---
    # Define the path to the folder containing '1', '2', '3', etc.
    YOUR_DATASET_PATH = "C://Users//srish//Downloads//movies" # <--- CHANGE THIS PATH
    create_annotations_from_nested_folders(YOUR_DATASET_PATH)

# Make sure data_root points to your main dataset folder
    train_dataset = VideoDataset(annotations_file=config['data']['train_annotations'], data_root=YOUR_DATASET_PATH, is_train=True)
    val_dataset = VideoDataset(annotations_file=config['data']['val_annotations'], data_root=YOUR_DATASET_PATH, is_train=False)


# --- Main Logic ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_dataset = VideoDataset(annotations_file=config['data']['train_annotations'], data_root=config['data']['root'])
val_dataset = VideoDataset(annotations_file=config['data']['val_annotations'], data_root=config['data']['root'])
train_loader = DataLoader(train_dataset, batch_size=config['train']['batch_size'], shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=config['train']['batch_size'], shuffle=False, num_workers=2, pin_memory=True)

model = SpatioTemporalGNN(
    num_frames=config['model']['num_frames'],
    num_classes=config['model']['num_classes'],
    feature_dim=config['model']['feature_dim']
).to(device)

criterion = nn.CrossEntropyLoss()
best_val_acc = 0.0

# Stage B: Warmup
print("\n--- Stage B: Warmup (Training Classifier Heads) ---")
for param in model.cnn_backbone.parameters(): param.requires_grad = False
for param in model.motion_backbone.parameters(): param.requires_grad = False
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=config['train']['lr_heads'])

for epoch in range(config['train']['warmup_epochs']):
    print(f"\nWarmup Epoch {epoch+1}/{config['train']['warmup_epochs']}")
    train_loss, train_acc = run_epoch(model, train_loader, optimizer, criterion, device, is_training=True)
    print(f"  -> Warmup Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    val_loss, val_acc = run_epoch(model, val_loader, None, criterion, device, is_training=False)
    print(f"  -> Warmup Val Loss  : {val_loss:.4f}, Accuracy: {val_acc:.4f}")

# Stage C: Fine-tuning
print("\n--- Stage C: Fine-tuning (End-to-End) ---")
for param in model.parameters(): param.requires_grad = True
optimizer = optim.AdamW(model.parameters(), lr=config['train']['lr_backbone'], weight_decay=config['train']['weight_decay'])
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config['train']['epochs'])

for epoch in range(config['train']['epochs']):
    print(f"\nEpoch {epoch+1}/{config['train']['epochs']}")
    train_loss, train_acc = run_epoch(model, train_loader, optimizer, criterion, device, is_training=True)
    print(f"  -> Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    val_loss, val_acc = run_epoch(model, val_loader, None, criterion, device, is_training=False)
    print(f"  -> Val Loss  : {val_loss:.4f}, Accuracy: {val_acc:.4f}")
    scheduler.step()

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), os.path.join(config['train']['checkpoint_dir'], 'best_model.pth'))
        print(f"  -> New best model saved with validation accuracy: {best_val_acc:.4f}")

print("\nTraining complete.")
print(f"Best validation accuracy achieved: {best_val_acc:.4f}")

Scanning for videos in nested folders under: C://Users//srish//Downloads//movies
Annotation files created successfully from nested folders.
Total videos found: 246
Training set size: 196
Validation set size: 50
Using device: cpu

--- Stage B: Warmup (Training Classifier Heads) ---

Warmup Epoch 1/3


Training:   0%|                                                                                 | 0/49 [00:05<?, ?it/s]


RuntimeError: DataLoader worker (pid(s) 22388, 21036) exited unexpectedly

In [None]:
# class VideoDataset(Dataset):
#     def __init__(self, annotations_file, data_root, num_frames=16, frame_size=(224, 224), fps=16):
#         self.annotations = pd.read_csv(annotations_file)
#         self.data_root = data_root
#         self.num_frames = num_frames
#         self.frame_size = frame_size
#         self.target_fps = fps
#         self.labels_map = {"Normal": 0, "Panic": 1, "Violent": 2}

#         self.transform = A.Compose([
#             A.RandomResizedCrop(height=self.frame_size[0], width=self.frame_size[1], scale=(0.8, 1.0)),
#             A.HorizontalFlip(p=0.5),
#             A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.8),
#             A.GaussNoise(p=0.2),
#             A.CoarseDropout(max_holes=8, max_height=16, max_width=16, min_holes=1, min_height=8, min_width=8, p=0.5),
#             A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
#             ToTensorV2(),
#         ])

#     def __len__(self):
#         return len(self.annotations)

#     def __getitem__(self, idx):
#         video_path = os.path.join(self.data_root, self.annotations.iloc[idx, 0])
#         label_str = self.annotations.iloc[idx, 1]
#         label = self.labels_map[label_str]

#         frames, frame_diffs = self._load_video_clip(video_path)

#         transformed_frames = torch.stack([self.transform(image=frame)["image"] for frame in frames])
        
#         frame_diffs = torch.from_numpy(frame_diffs).float().permute(0, 3, 1, 2) # T, H, W, C -> T, C, H, W
#         frame_diffs_normalized = torch.stack([A.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])(image=d.permute(1,2,0).numpy())['image'] for d in frame_diffs])
#         frame_diffs_tensor = ToTensorV2()(image=frame_diffs_normalized.permute(1,2,3,0))['image'].squeeze(0)

#         return transformed_frames, frame_diffs_tensor.permute(3,0,1,2), torch.tensor(label, dtype=torch.long)

#     def _load_video_clip(self, video_path):
#         cap = cv2.VideoCapture(video_path)
#         if not cap.isOpened():
#             raise Exception(f"Could not open video file: {video_path}")

#         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        
#         frame_indices = np.linspace(0, total_frames - 1, self.num_frames, dtype=int)

#         frames = []
#         frame_diffs = []
#         last_frame_gray = None

#         loaded_frames = 0
#         for i in range(total_frames):
#             ret, frame = cap.read()
#             if not ret:
#                 break
            
#             if i in frame_indices:
#                 frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#                 frame = cv2.resize(frame, (self.frame_size[1], self.frame_size[0]))
#                 frames.append(frame)
#                 loaded_frames += 1

#                 current_frame_gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
#                 if last_frame_gray is not None:
#                     diff = cv2.absdiff(current_frame_gray, last_frame_gray)
#                     frame_diffs.append(np.stack([diff]*3, axis=-1))
#                 last_frame_gray = current_frame_gray
        
#         cap.release()

#         while len(frames) < self.num_frames:
#             frames.append(frames[-1].copy())
        
#         if not frame_diffs:
#             frame_diffs.append(np.zeros_like(frames[0]))

#         while len(frame_diffs) < self.num_frames:
#             frame_diffs.append(np.zeros_like(frame_diffs[-1]))

#         return np.array(frames), np.array(frame_diffs)

In [None]:
# class SpatioTemporalGNN(nn.Module):
#     def __init__(self, num_frames=16, num_classes=3, feature_dim=256, gnn_layers=2, num_heads=4, dropout=0.3):
#         super().__init__()
#         self.num_frames = num_frames
#         self.feature_dim = feature_dim

#         self.cnn_backbone = timm.create_model('convnext_tiny', pretrained=True, features_only=True, out_indices=[2])
#         self.cnn_feature_proj = nn.Conv2d(384, feature_dim, kernel_size=1)

#         self.motion_backbone = timm.create_model('mobilenetv3_small_050', pretrained=True, features_only=True, out_indices=[2])
#         self.motion_feature_proj = nn.Conv2d(16, feature_dim, kernel_size=1)

#         self.num_nodes_per_frame = 14 * 14 
#         self.gnn_layers = nn.ModuleList([
#             GATConv(feature_dim * 2, feature_dim * 2, heads=num_heads, dropout=dropout, concat=False)
#             for _ in range(gnn_layers)
#         ])
        
#         self.temporal_attention = nn.MultiheadAttention(embed_dim=feature_dim * 2, num_heads=num_heads, dropout=dropout, batch_first=True)

#         self.fusion_norm = nn.LayerNorm(feature_dim * 2)
#         self.classifier = nn.Sequential(
#             nn.Linear(feature_dim * 2, feature_dim),
#             nn.ReLU(),
#             nn.Dropout(dropout),
#             nn.Linear(feature_dim, num_classes)
#         )

#     def forward(self, rgb_clips, motion_clips):
#         batch_size, _, _, height, width = rgb_clips.shape
        
#         rgb_clips_reshaped = rgb_clips.view(batch_size * self.num_frames, 3, height, width)
#         motion_clips_reshaped = motion_clips.view(batch_size * self.num_frames, 3, height, width)

#         appearance_features = self.cnn_backbone(rgb_clips_reshaped)[0]
#         appearance_features = self.cnn_feature_proj(appearance_features)
        
#         motion_features = self.motion_backbone(motion_clips_reshaped)[0]
#         motion_features = self.motion_feature_proj(motion_features)

#         combined_features = torch.cat([appearance_features, motion_features], dim=1)
        
#         _, d_combined, h_feat, w_feat = combined_features.shape
#         self.num_nodes_per_frame = h_feat * w_feat
#         graph_nodes = combined_features.view(batch_size * self.num_frames, d_combined, -1).permute(0, 2, 1)
#         graph_nodes = graph_nodes.view(batch_size, self.num_frames, self.num_nodes_per_frame, d_combined)

#         edge_index = self._create_fully_connected_edges(self.num_nodes_per_frame, graph_nodes.device)
        
#         gnn_outputs = []
#         for t in range(self.num_frames):
#             frame_nodes = graph_nodes[:, t, :, :]
            
#             pyg_data_list = [Data(x=frame_nodes[b], edge_index=edge_index) for b in range(batch_size)]
#             pyg_batch = Batch.from_data_list(pyg_data_list)
            
#             x, edge_idx = pyg_batch.x, pyg_batch.edge_index
#             for layer in self.gnn_layers:
#                 x = F.elu(layer(x, edge_idx))
            
#             x = x.view(batch_size, self.num_nodes_per_frame, -1)
#             gnn_outputs.append(x)

#         gnn_outputs = torch.stack(gnn_outputs, dim=1)

#         spatially_pooled_features = gnn_outputs.mean(dim=2)

#         temporal_features, _ = self.temporal_attention(spatially_pooled_features, spatially_pooled_features, spatially_pooled_features)
        
#         final_representation = temporal_features.mean(dim=1)
        
#         final_representation = self.fusion_norm(final_representation)
#         logits = self.classifier(final_representation)

#         return logits

#     def _create_fully_connected_edges(self, num_nodes, device):
#         # This can be pre-computed and stored
#         if not hasattr(self, '_edge_index') or self._edge_index.device != device or self._edge_index_num_nodes != num_nodes:
#             edge_list = torch.combinations(torch.arange(num_nodes, device=device), r=2).t()
#             self._edge_index = torch.cat([edge_list, edge_list.flip(0)], dim=1)
#             self._edge_index_num_nodes = num_nodes
#         return self._edge_index

In [None]:
# def run_epoch(model, dataloader, optimizer, criterion, device, is_training=True):
#     if is_training:
#         model.train()
#     else:
#         model.eval()

#     total_loss = 0.0
#     correct_predictions = 0
#     total_samples = 0

#     progress_bar = tqdm(dataloader, desc="Training" if is_training else "Validation", leave=False)
    
#     for rgb_frames, motion_frames, labels in progress_bar:
#         rgb_frames = rgb_frames.to(device)
#         motion_frames = motion_frames.to(device)
#         labels = labels.to(device)

#         if is_training:
#             optimizer.zero_grad()

#         with torch.set_grad_enabled(is_training):
#             logits = model(rgb_frames, motion_frames)
#             loss = criterion(logits, labels)

#             if is_training:
#                 loss.backward()
#                 optimizer.step()

#         total_loss += loss.item() * rgb_frames.size(0)
#         _, preds = torch.max(logits, 1)
#         correct_predictions += torch.sum(preds == labels.data)
#         total_samples += rgb_frames.size(0)
        
#         acc = correct_predictions.double() / total_samples
#         progress_bar.set_postfix(loss=f"{loss.item():.4f}", acc=f"{acc:.4f}")

#     epoch_loss = total_loss / total_samples
#     epoch_acc = correct_predictions.double() / total_samples
#     return epoch_loss, epoch_acc

In [None]:
# # --- Configuration --- 
# config = {
#     'data': {
#         'root': 'video-anomaly-detection/data',
#         'train_annotations': 'video-anomaly-detection/data/train_annotations.csv',
#         'val_annotations': 'video-anomaly-detection/data/val_annotations.csv',
#     },
#     'model': {
#         'num_frames': 16,
#         'num_classes': 3,
#         'feature_dim': 128, # Reduced for faster training on Colab
#     },
#     'train': {
#         'batch_size': 4, 
#         'warmup_epochs': 3,
#         'epochs': 10,
#         'lr_heads': 3e-4,
#         'lr_backbone': 1e-5, # Lower LR for fine-tuning
#         'weight_decay': 1e-2,
#         'checkpoint_dir': 'video-anomaly-detection/weights'
#     }
# }

# # --- Main Logic ---
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# train_dataset = VideoDataset(annotations_file=config['data']['train_annotations'], data_root=config['data']['root'])
# val_dataset = VideoDataset(annotations_file=config['data']['val_annotations'], data_root=config['data']['root'])
# train_loader = DataLoader(train_dataset, batch_size=config['train']['batch_size'], shuffle=True, num_workers=2, pin_memory=True)
# val_loader = DataLoader(val_dataset, batch_size=config['train']['batch_size'], shuffle=False, num_workers=2, pin_memory=True)

# model = SpatioTemporalGNN(
#     num_frames=config['model']['num_frames'],
#     num_classes=config['model']['num_classes'],
#     feature_dim=config['model']['feature_dim']
# ).to(device)

# criterion = nn.CrossEntropyLoss()
# best_val_acc = 0.0

# # Stage B: Warmup
# print("\n--- Stage B: Warmup (Training Classifier Heads) ---")
# for param in model.cnn_backbone.parameters(): param.requires_grad = False
# for param in model.motion_backbone.parameters(): param.requires_grad = False
# optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=config['train']['lr_heads'])

# for epoch in range(config['train']['warmup_epochs']):
#     print(f"\nWarmup Epoch {epoch+1}/{config['train']['warmup_epochs']}")
#     train_loss, train_acc = run_epoch(model, train_loader, optimizer, criterion, device, is_training=True)
#     print(f"  -> Warmup Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
#     val_loss, val_acc = run_epoch(model, val_loader, None, criterion, device, is_training=False)
#     print(f"  -> Warmup Val Loss  : {val_loss:.4f}, Accuracy: {val_acc:.4f}")

# # Stage C: Fine-tuning
# print("\n--- Stage C: Fine-tuning (End-to-End) ---")
# for param in model.parameters(): param.requires_grad = True
# optimizer = optim.AdamW(model.parameters(), lr=config['train']['lr_backbone'], weight_decay=config['train']['weight_decay'])
# scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config['train']['epochs'])

# for epoch in range(config['train']['epochs']):
#     print(f"\nEpoch {epoch+1}/{config['train']['epochs']}")
#     train_loss, train_acc = run_epoch(model, train_loader, optimizer, criterion, device, is_training=True)
#     print(f"  -> Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
#     val_loss, val_acc = run_epoch(model, val_loader, None, criterion, device, is_training=False)
#     print(f"  -> Val Loss  : {val_loss:.4f}, Accuracy: {val_acc:.4f}")
#     scheduler.step()

#     if val_acc > best_val_acc:
#         best_val_acc = val_acc
#         torch.save(model.state_dict(), os.path.join(config['train']['checkpoint_dir'], 'best_model.pth'))
#         print(f"  -> New best model saved with validation accuracy: {best_val_acc:.4f}")

# print("\nTraining complete.")
# print(f"Best validation accuracy achieved: {best_val_acc:.4f}")