# Efficient Binary Video Classification on a Local GPU-Enabled PC Using PyTorch

### Import packages

In [20]:
import sys
sys.path.append('pytorchvideo')
from pytorchvideo.data import LabeledVideoDataset

In [21]:
import torch
from glob import glob
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

print("CUDA available:", torch.cuda.is_available())
print("GPU count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))


CUDA available: True
GPU count: 1
GPU name: NVIDIA GeForce RTX 2050


### Data Loading

In [22]:
non = glob('dataset/NonViolence/*')
vio = glob('dataset/Violence/*')
label = [0]*len(non) + [1]*len(vio)
df = pd.DataFrame(zip(non+vio, label), columns=['video', 'label'])

print("Number of videos:", len(df))
print("Number of non-violence videos:", len(non))
print("Number of violence videos:", len(vio))

df.head()

Number of videos: 1000
Number of non-violence videos: 500
Number of violence videos: 500


Unnamed: 0,video,label
0,dataset/NonViolence\NV_1.mp4,0
1,dataset/NonViolence\NV_10.mp4,0
2,dataset/NonViolence\NV_103.mp4,0
3,dataset/NonViolence\NV_104.mp4,0
4,dataset/NonViolence\NV_106.mp4,0


In [23]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, shuffle=True)
len(train_df), len(val_df)

(800, 200)

### Data augmentation process

In [24]:
from pytorchvideo.data import LabeledVideoDataset, Kinetics, make_clip_sampler, labeled_video_dataset

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    UniformTemporalSubsample,
    Permute
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)

from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo
)

In [25]:
video_transform = Compose([
    ApplyTransformToKey(
        key='video',
        transform=Compose([
            UniformTemporalSubsample(20),
            Lambda(lambda x: x / 255.0),
            Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
            RandomShortSideScale(min_size=248, max_size=256),
            CenterCropVideo(224),
            RandomHorizontalFlip(p=0.5),
        ])
    ),
])

In [26]:
train_df

Unnamed: 0,video,label
687,dataset/Violence\V_409.mp4,1
500,dataset/Violence\V_1.mp4,1
332,dataset/NonViolence\NV_703.mp4,0
979,dataset/Violence\V_953.mp4,1
817,dataset/Violence\V_635.mp4,1
...,...,...
835,dataset/Violence\V_669.mp4,1
192,dataset/NonViolence\NV_426.mp4,0
629,dataset/Violence\V_305.mp4,1
559,dataset/Violence\V_201.mp4,1


### Testing data loading function for errors

In [27]:
from pytorchvideo.data import LabeledVideoDataset
from pytorchvideo.data.clip_sampling import make_clip_sampler
from torch.utils.data import DataLoader, IterableDataset

# Convert DataFrame to the required format
video_paths = [
    (row["video"], {"label": row["label"]}) for _, row in train_df.iterrows()
]

video_paths_2 = [
    (row["video"], {"label": row["label"]}) for _, row in val_df.iterrows()
]

clip_sampler = make_clip_sampler("random", 2.0)

# Base dataset
base_train_dataset = LabeledVideoDataset(
    labeled_video_paths=video_paths,
    clip_sampler=clip_sampler,
    transform=video_transform,
    decode_audio=False,
    decoder="pyav"
)

# Iterable wrapper to reshape the 'label' and keep all keys
class ReshapeLabelDataset(IterableDataset):
    def __init__(self, base_dataset):
        self.base_dataset = base_dataset

    def __iter__(self):
        for sample in self.base_dataset:
            new_sample = dict(sample)  # Make a shallow copy
            label = new_sample["label"]
            # Ensure label is a tensor with shape [1]
            if isinstance(label, torch.Tensor):
                new_sample["label"] = label.view(1)
            else:
                new_sample["label"] = torch.tensor([label], dtype=torch.long)
            yield new_sample

# Wrap the dataset
train_dataset = ReshapeLabelDataset(base_train_dataset)

# DataLoader
loader = DataLoader(train_dataset, batch_size=5, num_workers=0, pin_memory=True)

# Check output
batch = next(iter(loader))
print(batch.keys())               # Should show all desired keys
print(batch["video"].shape)       # torch.Size([5, 3, 20, 224, 224])
print(batch["label"].shape)       # torch.Size([5, 1])


dict_keys(['video', 'video_name', 'video_index', 'clip_index', 'aug_index', 'label'])
torch.Size([5, 3, 20, 224, 224])
torch.Size([5, 1])


In [28]:
batch = next(iter(loader))

In [29]:
batch.keys()

dict_keys(['video', 'video_name', 'video_index', 'clip_index', 'aug_index', 'label'])

In [30]:
batch['video'].shape, batch['label'].shape

(torch.Size([5, 3, 20, 224, 224]), torch.Size([5, 1]))

### Model Architecture

In [31]:
import torch.nn as nn
import torch
from pytorch_lightning import LightningModule, seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import classification_report
import torchmetrics

In [32]:
video_model = torch.hub.load('facebookresearch/pytorchvideo', 'efficient_x3d_xs', pretrained=True)

Using cache found in C:\Users\bimsa/.cache\torch\hub\facebookresearch_pytorchvideo_main


In [33]:
class OurModel(LightningModule):
    def __init__(self):
        super().__init__()
        # model architecture
        self.model = torch.hub.load('facebookresearch/pytorchvideo', 'efficient_x3d_xs', pretrained=True)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(400, 1)

        self.lr = 1e-3
        self.batch_size = 4
        self.numworkers = 0
        self.metric = torchmetrics.Accuracy(task='binary')
        self.citerion = nn.BCEWithLogitsLoss()

        self.count = 1
        self.validation_step_outputs = []
        self.train_step_outputs = []

    def forward(self, x):
        x = self.model(x)
        x = self.relu(x)
        x = self.linear(x)
        return x
    
    def configure_optimizers(self):
        opt = torch.optim.Adam(params=self.parameters(), lr=self.lr)
        scheduler = CosineAnnealingLR(opt, T_max=10, eta_min=1e-6, last_epoch=-1)
        return {'optimizer': opt, 'lr_scheduler':scheduler}
    
    def train_dataloader(self):
        base_train_dataset = LabeledVideoDataset(
            labeled_video_paths=video_paths,
            clip_sampler=clip_sampler,
            transform=video_transform,  # your transform function
            decode_audio=False,
            decoder="pyav"
        )
        dataset = ReshapeLabelDataset(base_train_dataset)
        loader = DataLoader(dataset, batch_size=self.batch_size, num_workers=self.numworkers, pin_memory=True)
        return loader
    
    def training_step(self, batch, batch_idx):
        video, label = batch['video'], batch['label']
        label = label.float()
        out = self(video)
        loss = self.citerion(out, label)

        metric = self.metric(out, label.to(torch.int64))
        print("batch passed !!!")
        self.train_step_outputs.append({'loss': loss.detach(), 'metric': metric.detach()})
        
        return {'loss': loss, 'metric': metric.detach()}
    
    def on_train_epoch_end(self):
        loss = torch.stack([x['loss'] for x in self.train_step_outputs]).mean().cpu().numpy().round(2)
        metric = torch.stack([x['metric'] for x in self.train_step_outputs]).mean().cpu().numpy().round(2)
        self.log('training_loss', loss)
        self.log('training_metric', metric)
        print("Epoch:", self.count, " ended with loss:", loss, " and metric:", metric)
        self.count += 1

    def val_dataloader(self):
        base_train_dataset = LabeledVideoDataset(
            labeled_video_paths=video_paths_2,
            clip_sampler=clip_sampler,
            transform=video_transform,  # your transform function
            decode_audio=False,
            decoder="pyav"
        )

        dataset = ReshapeLabelDataset(base_train_dataset)
        loader = DataLoader(dataset, batch_size=self.batch_size, num_workers=self.numworkers, pin_memory=True)
        return loader
    
    def validation_step(self, batch, batch_idx):
        video, label = batch['video'], batch['label']
        label = label.float()
        out = self(video)
        loss = self.citerion(out, label)
        metric = self.metric(out, label.to(torch.int64))

        self.validation_step_outputs.append({'loss': loss.detach(), 'metric': metric.detach()})
        return {'loss': loss, 'metric': metric.detach()}
    
    def on_validation_epoch_end(self):
        loss = torch.stack([x['loss'] for x in self.validation_step_outputs]).mean().cpu().numpy().round(2)
        metric = torch.stack([x['metric'] for x in self.validation_step_outputs]).mean().cpu().numpy().round(2)
        self.log('validation_loss', loss)
        self.log('validation_metric', metric)

    def test_dataloader(self):
        base_train_dataset = LabeledVideoDataset(
            labeled_video_paths=video_paths_2,
            clip_sampler=clip_sampler,
            transform=video_transform,  # your transform function
            decode_audio=False,
            decoder="pyav"
        )

        dataset = ReshapeLabelDataset(base_train_dataset)
        loader = DataLoader(dataset, batch_size=self.batch_size, num_workers=self.numworkers, pin_memory=True)
        print("DataLoader done")
        return loader
    
    def on_test_start(self):
        self.test_outputs = []

    def test_step(self, batch, batch_idx):
        video, label = batch['video'], batch['label']
        label = label.float()
        out = self(video)

        # Save outputs to instance attribute
        self.test_outputs.append({'label': label.detach(), 'pred': out.detach()})

        return None  # No need to return anything now

    def on_test_epoch_end(self):
        # Use self.test_outputs instead of `outputs` argument
        label = torch.cat([x['label'] for x in self.test_outputs]).cpu().numpy()
        pred = torch.cat([x['pred'] for x in self.test_outputs]).cpu().numpy()
        pred = np.where(pred > 0.5, 1, 0)

        print(classification_report(label, pred, target_names=['NonViolence', 'Violence']))


### Model training

In [34]:
checkpoint_callback = ModelCheckpoint(monitor="validation_loss", dirpath="checkpoints", filename="file", save_last=True)
lr_monitor = LearningRateMonitor(logging_interval='epoch')

In [35]:
model = OurModel()
seed_everything(0)
trainer = Trainer(
    max_epochs=8,
    accelerator="gpu",
    devices=1,
    precision=16,
    accumulate_grad_batches=4,
    enable_progress_bar=False,
    num_sanity_val_steps=0,
    callbacks=[lr_monitor, checkpoint_callback],
)

Using cache found in C:\Users\bimsa/.cache\torch\hub\facebookresearch_pytorchvideo_main
Seed set to 0
c:\Users\bimsa\.conda\envs\nlp\lib\site-packages\lightning_fabric\connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [36]:
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [37]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type              | Params | Mode 
-------------------------------------------------------
0 | model    | EfficientX3d      | 3.8 M  | train
1 | relu     | ReLU              | 0      | train
2 | linear   | Linear            | 401    | train
3 | metric   | BinaryAccuracy    | 0      | train
4 | citerion | BCEWithLogitsLoss | 0      | train
-------------------------------------------------------
3.8 M     Trainable params
0         Non-trainable params
3.8 M     Total params
15.179    Total estimated model params size (MB)
845       Modules in train mode
0         Modules in eval mode
c:\Users\bimsa\.conda\envs\nlp\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
c:\Users\bimsa\.conda\envs\nlp\lib\site-p

batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !

mb_type 104 in P slice too large at 98 31
error while decoding MB 98 31


batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !

mb_type 104 in P slice too large at 98 31
error while decoding MB 98 31


batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !

mb_type 104 in P slice too large at 98 31
error while decoding MB 98 31


batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
Epoch: 7  ended with loss: 0.16  and metric: 0.94
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!!
batch passed !!

`Trainer.fit` stopped: `max_epochs=8` reached.


### Model validation

In [41]:
val_results = trainer.validate(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


c:\Users\bimsa\.conda\envs\nlp\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


In [39]:
trainer.test(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\bimsa\.conda\envs\nlp\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


DataLoader done
              precision    recall  f1-score   support

 NonViolence       0.96      0.99      0.97        98
    Violence       0.99      0.96      0.98       102

    accuracy                           0.97       200
   macro avg       0.98      0.98      0.97       200
weighted avg       0.98      0.97      0.98       200



[{}]

### Save fine-tuned model locally

In [55]:
# Save the trained model weights
model_path = "trained_model/efficient_x3d_xs_finetuned.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to trained_model/efficient_x3d_xs_finetuned.pth


### Model testing on single unseen video

Main function

In [50]:
from pytorchvideo.data import LabeledVideoDataset
from torch.utils.data import DataLoader
from IPython.display import HTML, display
from base64 import b64encode
import torch

def predict_and_show_video(video_path):
    print("Video path:", video_path)

    # Helper: Display video in notebook
    def show_video(video_path, width=400):
        mp4 = open(video_path, 'rb').read()
        data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
        return HTML(f"""
        <video width="{width}" controls>
            <source src="{data_url}" type="video/mp4">
        </video>
        """)

    # Prepare sample with dummy label
    sample = [(video_path, {"label": 0})]

    # Dataset and loader
    single_video_dataset = LabeledVideoDataset(
        labeled_video_paths=sample,
        clip_sampler=clip_sampler,
        transform=video_transform,
        decode_audio=False,
        decoder="pyav"
    )
    single_loader = DataLoader(single_video_dataset, batch_size=1, num_workers=0)

    # Get batch
    single_batch = next(iter(single_loader))
    video_tensor = single_batch['video']  # shape: [1, 3, T, H, W]

    # Inference
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    video_tensor = video_tensor.to(device)

    with torch.no_grad():
        logits = model(video_tensor)
        prob = torch.sigmoid(logits)
        pred = (prob > 0.5).long().item()

    # Output
    class_names = ['NonViolence', 'Violence']
    print(f"Predicted class: {class_names[pred]}")

    # Display video
    display(show_video(video_path))


In [53]:
predict_and_show_video("unseen_testing_videos/t1.mp4")

Video path: unseen_testing_videos/t1.mp4
Predicted class: Violence
