In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import cv2
import os
import numpy as np

# Define the ConvLSTM model
class ConvLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(ConvLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.conv_lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)

    def forward(self, x, hidden):
        h0, c0 = hidden
        out, _ = self.conv_lstm(x, (h0, c0))
        return out[:, -1]


# Load pre-trained VGG19 model
vgg19 = models.vgg19(pretrained=True)
vgg19_features = vgg19.features


# Freeze the parameters in VGG19
for param in vgg19_features.parameters():
    param.requires_grad = False

# Combine VGG19 and ConvLSTM
class ViolenceDetectionModel(nn.Module):
    def __init__(self, vgg_features, hidden_dim, num_layers, num_classes):
        super(ViolenceDetectionModel, self).__init__()
        self.vgg_features = vgg_features
        self.conv_lstm = ConvLSTM(512, hidden_dim, num_layers)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        features = self.vgg_features(x)
        features = torch.reshape(features, (features.size(0), -1, features.size(1)))
        h0 = torch.zeros(self.conv_lstm.num_layers, features.size(0), self.conv_lstm.hidden_dim).to(x.device)
        c0 = torch.zeros(self.conv_lstm.num_layers, features.size(0), self.conv_lstm.hidden_dim).to(x.device)
        convlstm_out = self.conv_lstm(features, (h0, c0))  # Pass the hidden tuple
        out = self.fc(convlstm_out)
        return out


# Set hyperparameters
input_dim = 512  # Dimensionality of input features from VGG19
hidden_dim = 128  # Hidden dimension of ConvLSTM
num_layers = 1  # Number of ConvLSTM layers
num_classes = 2  # Number of output classes (violence, non-violence)
learning_rate = 0.001
num_epochs = 10
batch_size = 8

# Instantiate the model
model = ViolenceDetectionModel(vgg19_features, hidden_dim, num_layers, num_classes)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Dataset directory containing video clips
dataset_dir = 'C:/APIIT BENG SE Degree/Year 3/Final Year Project/Hockey'

# List video files in the dataset directory
violent_files = [f for f in os.listdir(os.path.join(dataset_dir, 'violent')) if f.endswith('.avi')]
non_violent_files = [f for f in os.listdir(os.path.join(dataset_dir, 'non-violent')) if f.endswith('.avi')]

# Combine the lists
video_files = violent_files + non_violent_files

# Create labels for the videos
labels = [1] * len(violent_files) + [0] * len(non_violent_files)

# Training loop
for epoch in range(num_epochs):
    for i, video_file in enumerate(video_files):
        # Determine the label for the video
        label = labels[i]

        # Open video file
        if label == 1:
            video_path = os.path.join(dataset_dir, 'violent', video_file)
        else:
            video_path = os.path.join(dataset_dir, 'non-violent', video_file)

        video = cv2.VideoCapture(video_path)

        frames = []
        while True:
            ret, frame = video.read()
            if not ret:
                break
            # Preprocess frame (resize, normalize, etc.)
            frame = cv2.resize(frame, (224, 224))  # Adjust size as per VGG19 input requirements
            frame = frame.astype(np.float32) / 255.0  # Normalize pixel values between 0 and 1
            frame = torch.from_numpy(frame).permute(2, 0, 1).unsqueeze(0).to(device)  # Convert to tensor

            frames.append(frame)

        video.release()

        # Stack frames to create a tensor with shape (num_frames, channels, height, width)
        frames = torch.cat(frames, dim=0)

        # Forward pass
        outputs = model(frames)

        # Generate target labels
        target = torch.tensor([label] * outputs.size(0), dtype=torch.long).to(device)

        # Compute loss
        loss = criterion(outputs, target)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print training progress
        if (i + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(video_files)}], Loss: {loss.item():.4f}')

# Save the trained model
torch.save(model.state_dict(), 'violence_detection_model_v2.pth')




Epoch [1/10], Step [10/800], Loss: 0.0571
Epoch [1/10], Step [20/800], Loss: 0.0013
Epoch [1/10], Step [30/800], Loss: 0.0004
Epoch [1/10], Step [40/800], Loss: 0.0002
Epoch [1/10], Step [50/800], Loss: 0.0002
Epoch [1/10], Step [60/800], Loss: 0.0001
Epoch [1/10], Step [70/800], Loss: 0.0001
Epoch [1/10], Step [80/800], Loss: 0.0001
Epoch [1/10], Step [90/800], Loss: 0.0001
Epoch [1/10], Step [100/800], Loss: 0.0001
Epoch [1/10], Step [110/800], Loss: 0.0001
Epoch [1/10], Step [120/800], Loss: 0.0001
Epoch [1/10], Step [130/800], Loss: 0.0001
Epoch [1/10], Step [140/800], Loss: 0.0001
Epoch [1/10], Step [150/800], Loss: 0.0001
Epoch [1/10], Step [160/800], Loss: 0.0001
Epoch [1/10], Step [170/800], Loss: 0.0001
Epoch [1/10], Step [180/800], Loss: 0.0001
Epoch [1/10], Step [190/800], Loss: 0.0001
Epoch [1/10], Step [200/800], Loss: 0.0001
Epoch [1/10], Step [210/800], Loss: 0.0001
Epoch [1/10], Step [220/800], Loss: 0.0001
Epoch [1/10], Step [230/800], Loss: 0.0001
Epoch [1/10], Step [