### Importing Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import cv2
import os
import numpy as np

ModuleNotFoundError: No module named 'torchvision'

In [2]:
# VRAM Clear Karne Ke Liye Ye Commands Run Kar
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)



### Global Variables

In [3]:
DATASET = "dataset"
NON_VIOLENCE = "NonViolence"
VIOLENCE = "Violence"
BATCH_SIZE = 1
NUM_WORKERS = 0
LEARNING_RATE = 5e-5

### Dataset Loader (Video to Frames)

In [4]:
class VideoDataset(torch.utils.data.Dataset):
    def __init__(self, root_dir, sequence_length=20, transform=None):
        self.root_dir = root_dir
        self.sequence_length = sequence_length
        self.transform = transform
        self.video_paths = []
        self.labels = []
        
        # Get all video file paths and labels
        for label, category in enumerate([NON_VIOLENCE, VIOLENCE]):
            folder_path = os.path.join(root_dir, category, "RLVS")
            for file in os.listdir(folder_path):
                if file.endswith(".mp4"):
                    self.video_paths.append(os.path.join(folder_path, file))
                    self.labels.append(label)  # 0 = NonViolence, 1 = Violence

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]

        cap = cv2.VideoCapture(video_path)
        frames = []
        frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, (224, 224))  # Resize
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert to RGB
            if self.transform:
                frame = self.transform(frame)
            frames.append(frame)
            frame_count += 1

        cap.release()

        # Fix sequence length
        if len(frames) > self.sequence_length:
            frames = frames[:self.sequence_length]
        else:
            while len(frames) < self.sequence_length:
                frames.append(frames[-1])

        frames = torch.stack(frames)  # Convert list to tensor
        label = torch.tensor(label, dtype=torch.long)

        return frames, label


### Transformations & Dataloader

In [5]:
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])  # Normalize for CNN/LSTM
])

dataset = VideoDataset(root_dir=DATASET, sequence_length=20, transform=transform)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)


### LSTM Model

In [6]:
# class LSTMModel(nn.Module):
#     def __init__(self, hidden_size=64, num_layers=2):
#         super(LSTMModel, self).__init__()
        
#         self.conv = nn.Sequential(
#             nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(2, 2),  # Output: 112x112 -> 56x56
            
#             nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(2, 2),  # Output: 56x56 -> 28x28

#             nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(2, 2),  # Output: 28x28 -> 14x14
#         )
        
#         self.lstm = nn.LSTM(128 * 14 * 14, hidden_size, num_layers, batch_first=True)
        
#         self.fc = nn.Sequential(
#             nn.Linear(hidden_size, 32),
#             nn.ReLU(),
#             nn.Linear(32, 1)  # Last layer
#         )

#     def forward(self, x):
#         batch_size, seq_len, c, h, w = x.shape
#         x = x.view(batch_size * seq_len, c, h, w)
#         x = self.conv(x)
#         x = x.view(batch_size, seq_len, -1)  
#         x, _ = self.lstm(x)
#         x = self.fc(x[:, -1, :])
#         return x  # No sigmoid here, use BCEWithLogitsLoss

# model = LSTMModel().to("cuda")


In [7]:
# class LSTMModel(nn.Module):
#     def __init__(self, hidden_size=64, num_layers=2):
#         super(LSTMModel, self).__init__()
#         self.conv = nn.Sequential(
#             nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),  
#             nn.ReLU(),
#             nn.MaxPool2d(2, 2),  # 112x112 -> 56x56
            
#             nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(2, 2)  # 56x56 -> 28x28
#         )
        
#         self.lstm_input_size = 128 * 28 * 28  # CNN output size
#         self.lstm = nn.LSTM(self.lstm_input_size, hidden_size, num_layers, batch_first=True)
        
#         self.fc = nn.Linear(hidden_size, 1)
#         self.sigmoid = nn.Sigmoid()

#     def forward(self, x):
#         batch_size, seq_len, c, h, w = x.shape  

#         x = x.view(batch_size * seq_len, c, h, w)  # Reshape for CNN
#         x = self.conv(x)  # CNN Output: (batch_size * seq_len, 128, 28, 28)
        
#         x = x.view(batch_size, seq_len, -1)  # Reshape for LSTM: (batch, seq, 128*28*28)
#         x, _ = self.lstm(x)  
        
#         x = self.fc(x[:, -1, :])  # Last timestep ka output
#         return self.sigmoid(x)

# # Model initialization
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = LSTMModel().to(device)

# # Loss function & optimizer
# criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Dummy input test
# dummy_input = torch.randn(1, 8, 3, 112, 112).to(device)
# output = model(dummy_input)
# print(output.shape)  # Expected: torch.Size([1, 1])


In [8]:
# -------------------------------------------------------------------WORKING-----------------------------------


class LSTMModel(nn.Module):
    def __init__(self):
        super(LSTMModel, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.BatchNorm2d(128)
        )

        self.lstm_input_size = 128 * 28 * 28  # ✅ Corrected LSTM input size
        self.lstm_hidden_size = 256

        self.lstm = nn.LSTM(input_size=self.lstm_input_size, hidden_size=self.lstm_hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.3)  # ✅ Added Dropout
        self.fc = nn.Linear(self.lstm_hidden_size, 1)  

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        batch_size, seq_len, C, H, W = x.shape  
        x = x.view(batch_size * seq_len, C, H, W)  

        x = self.conv(x)  
        # print(f"CNN output shape: {x.shape}")  # 🛠️ Debugging output

        x = x.view(batch_size, seq_len, -1)  
        # print(f"LSTM input shape: {x.shape}")  # 🛠️ Debugging output

        x, _ = self.lstm(x)  
        x = self.dropout(x) 
        x = self.fc(x[:, -1, :])  
        return self.sigmoid(x)


In [9]:

# class LSTMModel(nn.Module):
#     def __init__(self):
#         super(LSTMModel, self).__init__()
#         self.conv = nn.Sequential(
#             nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(2, 2),  # Output: (batch*seq, 64, 56, 56)
#             nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(2, 2)   # Output: (batch*seq, 128, 28, 28)
#         )
        
#         self.lstm_input_size = 128 * 28 * 28
#         self.lstm = nn.LSTM(input_size=self.lstm_input_size, hidden_size=512, num_layers=2, batch_first=True)

#         self.fc = nn.Linear(512, 1)  
#         # self.sigmoid = nn.Sigmoid()  # Remove this if using BCEWithLogitsLoss
        
#     def forward(self, x):
#         batch_size, seq_len, C, H, W = x.shape
#         x = x.view(batch_size * seq_len, C, H, W)  # Flatten for CNN
#         x = self.conv(x)  # Output: (batch*seq, 128, 28, 28)
#         x = x.view(batch_size, seq_len, -1)  # Reshape for LSTM
#         x, _ = self.lstm(x)  
#         x = self.fc(x[:, -1, :])  # Last timestep output
#         return x  # Return logits directly

In [10]:
# class LSTMModel(nn.Module):
#     def __init__(self):
#         super(LSTMModel, self).__init__()
#         self.conv = nn.Sequential(
#             nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(2, 2),  # Output: (batch*seq, 64, 56, 56)
#             nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(2, 2),  # Output: (batch*seq, 128, 28, 28)
#             nn.BatchNorm2d(128)   # ADDED BATCHNORM
#         )
        
#         self.lstm_input_size = 128 * 28 * 28
#         self.lstm = nn.LSTM(input_size=self.lstm_input_size, hidden_size=512, num_layers=2, batch_first=True)

#         self.fc = nn.Linear(512, 1)
#         # self.sigmoid = nn.Sigmoid()  # Remove this if using BCEWithLogitsLoss
        
#     def forward(self, x):
#         batch_size, seq_len, C, H, W = x.shape
#         x = x.view(batch_size * seq_len, C, H, W)  # Flatten for CNN
#         x = self.conv(x)  # Output: (batch*seq, 128, 28, 28)
#         x = x.view(batch_size, seq_len, -1)  # Reshape for LSTM
#         x, _ = self.lstm(x)  
#         x = self.fc(x[:, -1, :])  # Last timestep output
#         return x  # Return logits directly

# # Xavier Initialization
# def init_weights(m):
#     if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
#         nn.init.xavier_uniform_(m.weight)

### Training Loop

In [None]:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = LSTMModel().to(device)

# criterion = nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# for epoch in range(5):
#     for batch_idx, (videos, labels) in enumerate(dataloader):
#         videos, labels = videos.to(device), labels.float().to(device)

#         optimizer.zero_grad()
#         outputs = model(videos).squeeze(1)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         if batch_idx % 10 == 0:
#             print(f"Epoch [{epoch+1}/5], Batch [{batch_idx}/{len(dataloader)}], Loss: {loss.item():.4f}")




# Initialize model, loss, optimizer
device = "cuda" if torch.cuda.is_available() else "cpu"
model = LSTMModel().to(device)
criterion = nn.BCEWithLogitsLoss()  # Since sigmoid is removed from model
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(2):
    for batch_idx, (videos, labels) in enumerate(dataloader):
        videos, labels = videos.to(device), labels.float().to(device)

        optimizer.zero_grad()
        outputs = model(videos).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print(f"Epoch [{epoch+1}/5], Batch [{batch_idx}/{len(dataloader)}], Loss: {loss.item():.4f}")





# Initialize model, loss, optimizer
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = LSTMModel().to(device)
# model.apply(init_weights)  # Apply weight initialization

# criterion = nn.BCEWithLogitsLoss()  # Since sigmoid is removed from model
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# # Training loop
# for epoch in range(5):
#     for batch_idx, (videos, labels) in enumerate(dataloader):
#         videos, labels = videos.to(device), labels.float().to(device)

#         optimizer.zero_grad()
#         outputs = model(videos).squeeze(1)
        
#         # DEBUGGING: Print few outputs
#         if batch_idx % 50 == 0:
#             print(f"Raw Model Output (Logits): {outputs[:5].detach().cpu().numpy()}")
        
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         if batch_idx % 10 == 0:
#             print(f"Epoch [{epoch+1}/5], Batch [{batch_idx}/{len(dataloader)}], Loss: {loss.item():.4f}")

Epoch [1/5], Batch [0/2951], Loss: 0.5370
Epoch [1/5], Batch [10/2951], Loss: 0.4592
Epoch [1/5], Batch [20/2951], Loss: 0.9592
Epoch [1/5], Batch [30/2951], Loss: 0.9850
Epoch [1/5], Batch [40/2951], Loss: 1.0767
Epoch [1/5], Batch [50/2951], Loss: 0.9535
Epoch [1/5], Batch [60/2951], Loss: 1.0542
Epoch [1/5], Batch [70/2951], Loss: 0.9852
Epoch [1/5], Batch [80/2951], Loss: 0.9027
Epoch [1/5], Batch [90/2951], Loss: 0.4489
Epoch [1/5], Batch [100/2951], Loss: 0.9773
Epoch [1/5], Batch [110/2951], Loss: 0.9938
Epoch [1/5], Batch [120/2951], Loss: 0.5563
Epoch [1/5], Batch [130/2951], Loss: 0.4749
Epoch [1/5], Batch [140/2951], Loss: 0.8351
Epoch [1/5], Batch [150/2951], Loss: 0.8665
Epoch [1/5], Batch [160/2951], Loss: 0.9533
Epoch [1/5], Batch [170/2951], Loss: 0.4897
Epoch [1/5], Batch [180/2951], Loss: 0.5045
Epoch [1/5], Batch [190/2951], Loss: 0.4783
Epoch [1/5], Batch [200/2951], Loss: 1.0369
Epoch [1/5], Batch [210/2951], Loss: 0.4579
Epoch [1/5], Batch [220/2951], Loss: 0.8973

[h264 @ 0x565371d15bc0] mb_type 104 in P slice too large at 98 31
[h264 @ 0x565371d15bc0] error while decoding MB 98 31


Epoch [1/5], Batch [1330/2951], Loss: 0.9655
Epoch [1/5], Batch [1340/2951], Loss: 0.5363
Epoch [1/5], Batch [1350/2951], Loss: 0.4182
Epoch [1/5], Batch [1360/2951], Loss: 0.8977
Epoch [1/5], Batch [1370/2951], Loss: 0.3976
Epoch [1/5], Batch [1380/2951], Loss: 0.4649
Epoch [1/5], Batch [1390/2951], Loss: 0.5093
Epoch [1/5], Batch [1400/2951], Loss: 0.7994
Epoch [1/5], Batch [1410/2951], Loss: 0.4649
Epoch [1/5], Batch [1420/2951], Loss: 0.5070
Epoch [1/5], Batch [1430/2951], Loss: 0.5733
Epoch [1/5], Batch [1440/2951], Loss: 0.7801
Epoch [1/5], Batch [1450/2951], Loss: 0.4585
Epoch [1/5], Batch [1460/2951], Loss: 0.8105
Epoch [1/5], Batch [1470/2951], Loss: 0.9014
Epoch [1/5], Batch [1480/2951], Loss: 0.7522
Epoch [1/5], Batch [1490/2951], Loss: 0.4563
Epoch [1/5], Batch [1500/2951], Loss: 0.4760
Epoch [1/5], Batch [1510/2951], Loss: 0.9267
Epoch [1/5], Batch [1520/2951], Loss: 0.9792
Epoch [1/5], Batch [1530/2951], Loss: 0.9574
Epoch [1/5], Batch [1540/2951], Loss: 0.8660
Epoch [1/5

### Model Evaluation (Validation Accuracy)

In [None]:
# Model Evaluation on Validation Set
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for videos, labels in val_dataloader:  # Use validation dataloader
        videos, labels = videos.to(device), labels.to(device)
        outputs = model(videos).squeeze(1)
        preds = (torch.sigmoid(outputs) > 0.5).float()  # Convert logits to 0/1
        correct += (preds == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total * 100
print(f"Validation Accuracy: {accuracy:.2f}%")


### Testing on New Videos

In [None]:
def predict_video(video_tensor):
    model.eval()
    video_tensor = video_tensor.to(device)
    with torch.no_grad():
        output = model(video_tensor.unsqueeze(0)).squeeze(1)  # Add batch dim
        prob = torch.sigmoid(output).item()  # Convert logits to probability
        print(f"Violence Probability: {prob:.4f}")
        return "Violence Detected" if prob > 0.5 else "No Violence"

# Example Test (Ek video tensor input de)
video_sample, _ = next(iter(test_dataloader))
print(predict_video(video_sample[0]))  # First sample ka prediction
