In [73]:
import cv2
import os
import shutil

# --- Configuration for Frame Extraction ---
# ==============================================================================
# IMPORTANT: SET YOUR PATHS HERE!
# 1. The path to your dataset containing the 'Violence' and 'NonViolence' video folders.
VIDEO_DATASET_PATH = r"/media/rajendraprasath-m/New Volume/Projects/Final Year Project/Data/Video/Dataset/train"
# 2. The path to the folder where the extracted frames will be saved.
#    WARNING: if this folder already exists it is deleted and recreated below.
FRAME_OUTPUT_PATH = r"/media/rajendraprasath-m/New Volume/Projects/Final Year Project/Data/Video/Extracted_Frames/train"
# ==============================================================================

# Define the names of your class folders
CLASSES = ["Violence", "NonViolence"]

# Lower-case file extensions that are treated as video files
VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov', '.mkv')

# Start from a clean slate so frames from an earlier run cannot mix with new ones
if os.path.exists(FRAME_OUTPUT_PATH):
    shutil.rmtree(FRAME_OUTPUT_PATH)
    print(f"Removed existing frames directory: {FRAME_OUTPUT_PATH}")

os.makedirs(FRAME_OUTPUT_PATH, exist_ok=True)
print(f"Created main frames directory: {FRAME_OUTPUT_PATH}")

# Loop through each class (e.g., 'Violence', then 'NonViolence')
for category in CLASSES:
    video_class_path = os.path.join(VIDEO_DATASET_PATH, category)
    frame_class_path = os.path.join(FRAME_OUTPUT_PATH, category)

    # Create the output subfolder (e.g., .../Extracted_Frames/train/Violence)
    os.makedirs(frame_class_path, exist_ok=True)

    # Skip classes whose source folder is missing (or is not a directory)
    if not os.path.isdir(video_class_path):
        print(f"[WARNING] Source directory not found: {video_class_path}. Skipping.")
        continue

    # Sorted so the processing order (and the log) is the same on every run
    for video_name in sorted(os.listdir(video_class_path)):
        # Ensure we are processing a video file
        if not video_name.lower().endswith(VIDEO_EXTENSIONS):
            continue

        video_path = os.path.join(video_class_path, video_name)
        print(f"[INFO] Processing video: {video_name}...")

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            # Corrupt file or missing codec: report it instead of silently
            # logging "Extracted 0 frames".
            print(f"[WARNING] Could not open video: {video_path}. Skipping.")
            cap.release()
            continue

        # Frame files are named <video stem>_frame_<index>.jpg, e.g. V_7_frame_0.jpg
        base_video_name = os.path.splitext(video_name)[0]
        frame_count = 0

        try:
            while True:
                # Read one frame from the video stream
                success, frame = cap.read()

                # 'success' is False once no further frame can be read
                if not success:
                    break

                output_filename = f"{base_video_name}_frame_{frame_count}.jpg"
                output_filepath = os.path.join(frame_class_path, output_filename)

                # cv2.imwrite reports failure through its return value; stop
                # instead of leaving silent gaps in the dataset.
                if not cv2.imwrite(output_filepath, frame):
                    raise IOError(f"Failed to write frame: {output_filepath}")

                frame_count += 1
        finally:
            # Always free the decoder, even if reading or writing a frame failed
            cap.release()

        print(f"  -> Extracted {frame_count} frames.")

print("\n[SUCCESS] All videos have been processed and frames have been extracted.")

Removed existing frames directory: /media/rajendraprasath-m/New Volume/Projects/Final Year Project/Data/Video/Extracted_Frames/train
Created main frames directory: /media/rajendraprasath-m/New Volume/Projects/Final Year Project/Data/Video/Extracted_Frames/train
[INFO] Processing video: V_1.mp4...
  -> Extracted 103 frames.
[INFO] Processing video: V_10.mp4...
  -> Extracted 96 frames.
[INFO] Processing video: V_100.mp4...
  -> Extracted 153 frames.
[INFO] Processing video: V_101.mp4...
  -> Extracted 108 frames.
[INFO] Processing video: V_102.mp4...
  -> Extracted 153 frames.
[INFO] Processing video: V_103.mp4...
  -> Extracted 150 frames.
[INFO] Processing video: V_104.mp4...
  -> Extracted 150 frames.
[INFO] Processing video: V_105.mp4...
  -> Extracted 153 frames.
[INFO] Processing video: V_106.mp4...
  -> Extracted 150 frames.
[INFO] Processing video: V_107.mp4...
  -> Extracted 150 frames.
[INFO] Processing video: V_108.mp4...
  -> Extracted 120 frames.
[INFO] Processing video: V_

In [74]:
import cv2
import os
import shutil

# --- Configuration for Frame Extraction ---
# ==============================================================================
# IMPORTANT: SET YOUR PATHS HERE!
# 1. The path to your dataset containing the 'Violence' and 'NonViolence' video folders.
VIDEO_DATASET_PATH = r"/media/rajendraprasath-m/New Volume/Projects/Final Year Project/Data/Video/Dataset/test"
# 2. The path to the folder where the extracted frames will be saved.
#    WARNING: if this folder already exists it is deleted and recreated below.
FRAME_OUTPUT_PATH = r"/media/rajendraprasath-m/New Volume/Projects/Final Year Project/Data/Video/Extracted_Frames/test"
# ==============================================================================

# Define the names of your class folders
CLASSES = ["Violence", "NonViolence"]

# Lower-case file extensions that are treated as video files
VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov', '.mkv')

# Start from a clean slate so frames from an earlier run cannot mix with new ones
if os.path.exists(FRAME_OUTPUT_PATH):
    shutil.rmtree(FRAME_OUTPUT_PATH)
    print(f"Removed existing frames directory: {FRAME_OUTPUT_PATH}")

os.makedirs(FRAME_OUTPUT_PATH, exist_ok=True)
print(f"Created main frames directory: {FRAME_OUTPUT_PATH}")

# Loop through each class (e.g., 'Violence', then 'NonViolence')
for category in CLASSES:
    video_class_path = os.path.join(VIDEO_DATASET_PATH, category)
    frame_class_path = os.path.join(FRAME_OUTPUT_PATH, category)

    # Create the output subfolder (e.g., .../Extracted_Frames/test/Violence)
    os.makedirs(frame_class_path, exist_ok=True)

    # Skip classes whose source folder is missing (or is not a directory)
    if not os.path.isdir(video_class_path):
        print(f"[WARNING] Source directory not found: {video_class_path}. Skipping.")
        continue

    # Sorted so the processing order (and the log) is the same on every run
    for video_name in sorted(os.listdir(video_class_path)):
        # Ensure we are processing a video file
        if not video_name.lower().endswith(VIDEO_EXTENSIONS):
            continue

        video_path = os.path.join(video_class_path, video_name)
        print(f"[INFO] Processing video: {video_name}...")

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            # Corrupt file or missing codec: report it instead of silently
            # logging "Extracted 0 frames".
            print(f"[WARNING] Could not open video: {video_path}. Skipping.")
            cap.release()
            continue

        # Frame files are named <video stem>_frame_<index>.jpg, e.g. V_7_frame_0.jpg
        base_video_name = os.path.splitext(video_name)[0]
        frame_count = 0

        try:
            while True:
                # Read one frame from the video stream
                success, frame = cap.read()

                # 'success' is False once no further frame can be read
                if not success:
                    break

                output_filename = f"{base_video_name}_frame_{frame_count}.jpg"
                output_filepath = os.path.join(frame_class_path, output_filename)

                # cv2.imwrite reports failure through its return value; stop
                # instead of leaving silent gaps in the dataset.
                if not cv2.imwrite(output_filepath, frame):
                    raise IOError(f"Failed to write frame: {output_filepath}")

                frame_count += 1
        finally:
            # Always free the decoder, even if reading or writing a frame failed
            cap.release()

        print(f"  -> Extracted {frame_count} frames.")

print("\n[SUCCESS] All videos have been processed and frames have been extracted.")

Removed existing frames directory: /media/rajendraprasath-m/New Volume/Projects/Final Year Project/Data/Video/Extracted_Frames/test
Created main frames directory: /media/rajendraprasath-m/New Volume/Projects/Final Year Project/Data/Video/Extracted_Frames/test
[INFO] Processing video: V_401.mp4...
  -> Extracted 126 frames.
[INFO] Processing video: V_402.mp4...
  -> Extracted 111 frames.
[INFO] Processing video: V_403.mp4...
  -> Extracted 102 frames.
[INFO] Processing video: V_404.mp4...
  -> Extracted 126 frames.
[INFO] Processing video: V_405.mp4...
  -> Extracted 129 frames.
[INFO] Processing video: V_406.mp4...
  -> Extracted 114 frames.
[INFO] Processing video: V_407.mp4...
  -> Extracted 138 frames.
[INFO] Processing video: V_408.mp4...
  -> Extracted 105 frames.
[INFO] Processing video: V_409.mp4...
  -> Extracted 147 frames.
[INFO] Processing video: V_451.mp4...
  -> Extracted 120 frames.
[INFO] Processing video: V_452.mp4...
  -> Extracted 141 frames.
[INFO] Processing video: 

In [None]:
import os
import torch
import numpy as np
from datasets import load_dataset, Image
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)
import evaluate


# --- Configuration ---
# Pretrained checkpoint to fine-tune: a standard Google ViT model.
# Both names are kept so existing references to either one keep working.
BASE_MODEL_NAME = "google/vit-base-patch16-224-in21k"
MODEL_NAME = BASE_MODEL_NAME

# Root folder of the extracted frames (contains the 'train' and 'test' subfolders)
DATASET_PATH = "/media/rajendraprasath-m/New Volume/Projects/Final Year Project/Data/Video/Extracted_Frames"

# Output directory for the fine-tuning run.
# This cell used to assign FINETUNED_MODEL_PATH twice with different values; only
# the last assignment ever took effect, so it is now the single definition.
FINETUNED_MODEL_PATH = os.path.expanduser("/media/rajendraprasath-m/New Volume/Projects/Final Year Project/Model")

# --- GPU Setup ---
# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"[INFO] Using device: {device}")

[INFO] Using device: cuda:0


In [76]:
# Load the extracted frames with the generic "imagefolder" builder.
# The code below relies on the result having a "train" split with a "label" class feature.
dataset = load_dataset("imagefolder", data_dir=DATASET_PATH)

# Class names in the order of their integer ids
labels = dataset["train"].features["label"].names
label2id = {label: i for i, label in enumerate(labels)}
# id2label is the inverse of label2id: integer id -> class name.
# (label2id.items() yields (name, id) pairs, so the pair must be unpacked in
# that order; unpacking it as (id, name) just reproduces label2id.)
id2label = {i: label for label, i in label2id.items()}

# Preprocessing (resize / normalisation) matching the pretrained checkpoint
image_processor = AutoImageProcessor.from_pretrained(MODEL_NAME)


def transform(example_batch):
    """Turn a batch of dataset rows into model-ready inputs.

    Args:
        example_batch: mapping with an "image" list (objects exposing
            ``.convert``, e.g. PIL images) and a parallel "label" list.

    Returns:
        The output of ``image_processor`` for the RGB-converted images
        (requested as PyTorch tensors), with the batch labels attached
        under the "labels" key.
    """
    rgb_images = []
    for picture in example_batch["image"]:
        # Force three channels so every frame has the same layout
        rgb_images.append(picture.convert("RGB"))

    encoded = image_processor(rgb_images, return_tensors="pt")
    encoded["labels"] = example_batch["label"]
    return encoded



# IMPORTANT: set_transform is applied in place; do not assign its result back.
dataset.set_transform(transform)

data_collator = DefaultDataCollator()

# Work on a reproducible (fixed-seed) random subset of each split to keep
# training and evaluation fast. The requested size is clamped to the split
# size so that a smaller dataset does not make select() fail.
TRAIN_SUBSET_SIZE = 2000
TEST_SUBSET_SIZE = 500

dataset["train"] = dataset["train"].shuffle(seed=42).select(
    range(min(TRAIN_SUBSET_SIZE, len(dataset["train"])))
)
dataset["test"] = dataset["test"].shuffle(seed=42).select(
    range(min(TEST_SUBSET_SIZE, len(dataset["test"])))
)



Downloading data: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 54351/54351 [00:03<00:00, 17077.51files/s]
Downloading data: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 21937/21937 [00:01<00:00, 16851.73files/s]
Generating train split: 54351 examples [00:02, 20769.84 examples/s]
Generating test split: 21937 examples [00:01, 20264.19 examples/s]
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [None]:
# --- Define Metrics Function ---
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Whatever ``accuracy_metric.compute`` returns for these predictions
        and references.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy_metric.compute(predictions=predicted_ids, references=references)

# --- Load the Standard Google ViT Model for Fine-Tuning ---
# The label maps are handed to the model so its config carries our class names;
# the model is then moved onto the selected device (Module.to returns the module itself).
model = AutoModelForImageClassification.from_pretrained(
    MODEL_NAME, num_labels=len(labels), label2id=label2id, id2label=id2label
).to(device)

# --- Define Training Arguments ---
training_args = TrainingArguments(
    output_dir=FINETUNED_MODEL_PATH,
    # Evaluate and write a checkpoint at the end of every epoch
    eval_strategy="epoch",
    save_strategy="epoch",

    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    num_train_epochs=10,
    learning_rate=1e-5,

    # Presumably required because the on-the-fly transform reads the raw
    # "image"/"label" columns, which would otherwise be dropped - TODO confirm.
    remove_unused_columns=False,
    # Mixed precision gives a large speed-up on GPU, but requesting fp16 on a
    # CPU-only machine is rejected, so enable it only when CUDA is available
    # (this notebook explicitly supports a CPU fallback for `device`).
    fp16=torch.cuda.is_available(),
    report_to="none",
)


# --- Initialize the Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# NOTE: nothing is saved in this cell on purpose. No training has happened yet
# at this point, so calling trainer.save_model() here would write the untrained
# model to disk and report a misleading "training complete" message. Save the
# model only after trainer.train() has finished.
print("[INFO] Trainer is set up and ready.")

Downloading builder script: 4.20kB [00:00, 161kB/s]
Loading weights: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 198/198 [00:00<00:00, 633.59it/s, Materializing param=vit.layernorm.weight]                                 
[1mViTForImageClassification LOAD REPORT[0m from: google/vit-base-patch16-224-in21k
Key                 | Status     | 
--------------------+------------+-
pooler.dense.bias   | UNEXPECTED | 
pooler.dense.weight | UNEXPECTED | 
classifier.weight   | MISSING    | 
classifier.bias     | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


[INFO] Trainer is set up and ready.


In [78]:
# Freeze the whole ViT backbone first ...
for backbone_weight in model.vit.parameters():
    backbone_weight.requires_grad_(False)

# ... then re-enable gradients for the last encoder layer only.
# Parameters outside `model.vit` are not touched by this cell.
for top_layer_weight in model.vit.encoder.layer[-1].parameters():
    top_layer_weight.requires_grad_(True)


In [83]:
# Run the fine-tuning loop configured on `trainer`. The result is kept as the
# cell's last expression so the notebook displays the training summary.
print("[INFO] Starting GPU fine-tuning...")
train_output = trainer.train()
train_output



[INFO] Starting GPU fine-tuning...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.456881,0.816
2,No log,0.425943,0.832
3,No log,0.400782,0.836
4,No log,0.381054,0.838
5,No log,0.365963,0.838
6,No log,0.355486,0.842
7,No log,0.348466,0.842
8,0.179144,0.343824,0.844
9,0.179144,0.341501,0.844
10,0.179144,0.340715,0.844


Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:02<00:00,  2.06s/it]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:01<00:00,  1.95s/it]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:01<00:00,  1.86s/it]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:01<00:00,  1.82s/it]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:02<00:00,  2.07s/it]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:01<00:00,  1.86s/it]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:02<00:00,  2.09s/it]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:01<00:00,  1.91s/it]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:02<00:00,  2.34s/it]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:02<00:00,  2.01s/it]


TrainOutput(global_step=630, training_loss=0.16366966187007845, metrics={'train_runtime': 305.5882, 'train_samples_per_second': 65.448, 'train_steps_per_second': 2.062, 'total_flos': 1.54983979229184e+18, 'train_loss': 0.16366966187007845, 'epoch': 10.0})

In [None]:
# trainer.save_model(FINETUNED_MODEL_PATH)
# print(f"\n[SUCCESS] GPU Training complete. Model saved to: {FINETUNED_MODEL_PATH}")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:02<00:00,  2.15s/it]


[SUCCESS] GPU Training complete. Model saved to: /media/rajendraprasath-m/New Volume/Projects/Final Year Project/Model





In [84]:
# Export the model and its image processor into the same folder so the pair can
# be reloaded together with from_pretrained(). The path is relative, i.e. it is
# resolved against the notebook's current working directory.
EXPORT_DIR = "vit_final_model"

trainer.save_model(EXPORT_DIR)
image_processor.save_pretrained(EXPORT_DIR)


Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:02<00:00,  2.15s/it]


['vit_final_model/preprocessor_config.json']

In [90]:
from transformers import AutoModelForImageClassification, AutoImageProcessor

MODEL_PATH = "vit_final_model"

# Round-trip check: reload the exported processor and model from disk (in that
# order) and print the id -> class-name mapping stored in the model config.
processor, model = (
    AutoImageProcessor.from_pretrained(MODEL_PATH),
    AutoModelForImageClassification.from_pretrained(MODEL_PATH),
)

print(model.config.id2label)


Loading weights: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 200/200 [00:00<00:00, 536.45it/s, Materializing param=vit.layernorm.weight]                                 


{0: 'NonViolence', 1: 'Violence'}


In [96]:
import cv2
import torch
import numpy as np
from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image

# ---------------- CONFIG ----------------
MODEL_PATH = "/media/rajendraprasath-m/New Volume/Projects/Final Year Project/vit_final_model"
VIDEO_PATH = "/media/rajendraprasath-m/New Volume/Projects/Final Year Project/Data/Video/Violence/V_206.mp4"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
FRAME_INTERVAL = 30   # classify every 30th frame (~1 frame per second for 30 FPS video)
# ---------------------------------------

# Load model & processor; eval() switches the model to inference mode
processor = AutoImageProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForImageClassification.from_pretrained(MODEL_PATH)
model.to(DEVICE)
model.eval()

# Label mapping stored in the model config
id2label = model.config.id2label
label2id = model.config.label2id
print("Labels:", id2label)

# Per-class list of probabilities, one entry per analysed frame
prob_storage = {label: [] for label in id2label.values()}

# Read video
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    # Without this check a bad path falls through to averaging empty lists,
    # which yields NaN scores and an arbitrary "final prediction".
    cap.release()
    raise IOError(f"Could not open video: {VIDEO_PATH}")

frame_count = 0
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % FRAME_INTERVAL == 0:
            # OpenCV decodes frames as BGR; convert BGR -> RGB -> PIL image
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(frame_rgb)

            # Preprocess
            inputs = processor(image, return_tensors="pt").to(DEVICE)

            with torch.no_grad():
                outputs = model(**inputs)
                # Softmax over the class axis; [0] selects the single image in the batch
                probs = torch.softmax(outputs.logits, dim=1)[0].cpu().numpy()

            # Store probabilities using label names
            for idx, prob in enumerate(probs):
                label = id2label[idx]
                prob_storage[label].append(prob)

        frame_count += 1
finally:
    # Release the decoder even if preprocessing or inference raised
    cap.release()

# Frame 0 is always sampled, so "no frames read" means "nothing was classified"
if frame_count == 0:
    raise RuntimeError(f"No frames could be read from video: {VIDEO_PATH}")

# ---------------- RESULTS ----------------
# Mean probability per class over all analysed frames, as a percentage
avg_probs = {
    label: np.mean(values) * 100
    for label, values in prob_storage.items()
}

# The video-level prediction is the class with the highest mean probability
final_label = max(avg_probs, key=avg_probs.get)

print("\nðŸŽ¥ VIDEO ANALYSIS RESULT")
for label, prob in avg_probs.items():
    print(f"{label:15}: {prob:.2f}%")

print(f"\nâœ… FINAL PREDICTION: {final_label.upper()}")


Loading weights: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 200/200 [00:00<00:00, 669.21it/s, Materializing param=vit.layernorm.weight]                                 


Labels: {0: 'NonViolence', 1: 'Violence'}

ðŸŽ¥ VIDEO ANALYSIS RESULT
NonViolence    : 9.72%
Violence       : 90.28%

âœ… FINAL PREDICTION: VIOLENCE
