In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import cv2
import os
import numpy as np

# Define the ConvLSTM model
class ConvLSTM(nn.Module):
    """Wrap ``nn.LSTM`` and expose only the output of the last time step.

    Despite its name, this module performs no convolution: it is a standard
    batch-first LSTM applied to a sequence of feature vectors.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Size of the LSTM hidden state (and of the returned vector).
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        # Kept as attributes so callers can size the initial state tensors.
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.conv_lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x, hidden):
        """Run the LSTM over ``x`` and return the last step's output.

        Args:
            x: Batch-first input sequence, ``(batch, seq_len, input_dim)``.
            hidden: Pair ``(h0, c0)`` of initial hidden and cell states.

        Returns:
            The LSTM output at the final sequence position, one row per
            batch element.
        """
        initial_h, initial_c = hidden
        sequence_out, _final_state = self.conv_lstm(x, (initial_h, initial_c))
        return sequence_out[:, -1]

# Build VGG19 with its pre-trained weights; only the `.features` sub-module
# of the network is used as the frame feature extractor.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of `weights=` -- confirm the installed version before switching.
vgg19 = models.vgg19(pretrained=True)
vgg19_features = vgg19.features

# Freeze the extractor: none of its parameters should receive gradients.
vgg19_features.requires_grad_(False)
    
# Combine VGG19 and ConvLSTM
class ViolenceDetectionModel(nn.Module):
    """Feature extractor -> LSTM -> linear classification head.

    Args:
        vgg_features: Module that maps an image batch to feature maps. Its
            channel dimension (dim 1) must be 512, because that is the input
            size the LSTM is built with.
        hidden_dim: Hidden size of the LSTM and input size of the linear head.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits per input image.
    """

    def __init__(self, vgg_features, hidden_dim, num_layers, num_classes):
        super().__init__()
        self.vgg_features = vgg_features
        self.conv_lstm = ConvLSTM(512, hidden_dim, num_layers)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return one row of class logits per element of the batch ``x``."""
        feature_maps = self.vgg_features(x)
        batch_size = feature_maps.size(0)
        channels = feature_maps.size(1)

        # NOTE(review): this reshape only reinterprets the memory layout as
        # (batch, -1, channels); it does not move the channel axis last, so
        # (assuming channel-first feature maps) each "step" is not the
        # channel vector of one spatial location. It is left untouched
        # because saved weights were presumably trained with exactly this
        # layout -- confirm before changing.
        sequence = feature_maps.reshape(batch_size, -1, channels)

        # Zero initial hidden/cell states, allocated directly on x's device.
        state_shape = (self.conv_lstm.num_layers, batch_size, self.conv_lstm.hidden_dim)
        h0 = torch.zeros(state_shape, device=x.device)
        c0 = torch.zeros(state_shape, device=x.device)

        last_step = self.conv_lstm(sequence, (h0, c0))
        return self.fc(last_step)

# Set hyperparameters
# NOTE: input_dim is informational only -- it is not passed to the model
# constructor below, which fixes the LSTM input size to 512 internally.
input_dim = 512  # Dimensionality of input features from VGG19
hidden_dim = 128  # Hidden dimension of ConvLSTM
num_layers = 1  # Number of ConvLSTM layers
num_classes = 2  # Number of output classes (violence, non-violence)

# Instantiate the model on the GPU when one is available, otherwise on the CPU
model = ViolenceDetectionModel(vgg19_features, hidden_dim, num_layers, num_classes)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Load the trained model.
# map_location remaps the stored tensors onto `device`; without it a
# checkpoint saved from a CUDA model cannot be loaded on a CPU-only machine,
# which would defeat the CPU fallback chosen above.
# SECURITY: torch.load unpickles the file -- only load checkpoints that come
# from a trusted source.
state_dict = torch.load('violence_detection_model_v2.pth', map_location=device)
model.load_state_dict(state_dict)

# Inference mode: disables dropout / batch-norm updates
model.eval()

# Root of the test split; clips are read from its 'violent' and
# 'non-violent' sub-directories.
testing_dir = 'C:/APIIT BENG SE Degree/Year 3/Final Year Project/Hockey Testing'


def _list_avi_files(class_dir):
    """Return the names of the entries in *class_dir* that end with '.avi'."""
    return [name for name in os.listdir(class_dir) if name.endswith('.avi')]


violent_files = _list_avi_files(os.path.join(testing_dir, 'violent'))
non_violent_files = _list_avi_files(os.path.join(testing_dir, 'non-violent'))

# Violent clips come first, then non-violent ones; `labels` is built in the
# same order so that labels[i] is the ground truth of video_files[i]
# (1 = violent, 0 = non-violent).
video_files = violent_files + non_violent_files
labels = [1] * len(violent_files) + [0] * len(non_violent_files)

# Testing loop
def _read_video_frames(video_path):
    """Decode a video into a float32 tensor of shape (num_frames, 3, 224, 224).

    Every frame is resized to 224x224 and scaled to [0, 1]. The channel order
    is whatever OpenCV delivers; no colour conversion or mean/std
    normalisation is applied here.

    Args:
        video_path: Path of the video file to decode.

    Returns:
        A CPU tensor holding all frames, or ``None`` when no frame could be
        read (missing file, unsupported codec, empty clip).
    """
    capture = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = capture.read()
            if not ret:
                break
            # Preprocess frame: resize to the expected input size, scale to [0, 1]
            frame = cv2.resize(frame, (224, 224))
            frames.append(frame.astype(np.float32) / 255.0)
    finally:
        # Release the decoder even if preprocessing raised.
        capture.release()

    if not frames:
        return None

    # (num_frames, H, W, C) -> (num_frames, C, H, W), built once on the CPU so
    # the caller needs a single host-to-device copy per video.
    stacked = torch.from_numpy(np.stack(frames))
    return stacked.permute(0, 3, 1, 2).contiguous()


total_videos = len(video_files)
correct_predictions = 0
evaluated_videos = 0  # videos that actually produced a prediction

for i, (video_file, label) in enumerate(zip(video_files, labels)):
    # The ground-truth label decides which sub-directory holds the clip
    class_dir = 'violent' if label == 1 else 'non-violent'
    video_path = os.path.join(testing_dir, class_dir, video_file)

    frames = _read_video_frames(video_path)
    if frames is None:
        # Nothing to classify; skip instead of crashing on an empty batch.
        print(f'Video: {video_file}')
        print('Skipped: no frames could be read')
        print(f'Testing Progress: [{i + 1}/{total_videos}]')
        continue

    # Forward pass: every frame is one element of the batch
    with torch.no_grad():
        outputs = model(frames.to(device))

    # Per-frame predicted class
    _, predicted_labels = torch.max(outputs, 1)

    # A clip is labelled violent (1) as soon as any frame is predicted as class 1
    predicted_label = 1 if torch.any(predicted_labels == 1) else 0

    # Print predicted label and ground truth label
    print(f'Video: {video_file}')
    print(f'Predicted Label: {predicted_label}')
    print(f'Ground Truth Label: {label}')

    # Compare predicted label with ground truth label
    evaluated_videos += 1
    if predicted_label == label:
        correct_predictions += 1

    # Print testing progress
    print(f'Testing Progress: [{i + 1}/{total_videos}]')

# Calculate accuracy over the videos that were actually evaluated
if evaluated_videos:
    accuracy = correct_predictions / evaluated_videos * 100
    print(f'Test Accuracy: {accuracy:.2f}%')
else:
    # Empty dataset or every clip unreadable: avoid dividing by zero.
    accuracy = float('nan')
    print('Test Accuracy: n/a (no videos were evaluated)')

  warn(


Video: fi401_xvid.avi
Predicted Label: 1
Ground Truth Label: 1
Testing Progress: [1/200]
Video: fi402_xvid.avi
Predicted Label: 1
Ground Truth Label: 1
Testing Progress: [2/200]
Video: fi403_xvid.avi
Predicted Label: 0
Ground Truth Label: 1
Testing Progress: [3/200]
Video: fi404_xvid.avi
Predicted Label: 0
Ground Truth Label: 1
Testing Progress: [4/200]
Video: fi405_xvid.avi
Predicted Label: 0
Ground Truth Label: 1
Testing Progress: [5/200]
Video: fi406_xvid.avi
Predicted Label: 0
Ground Truth Label: 1
Testing Progress: [6/200]
Video: fi407_xvid.avi
Predicted Label: 0
Ground Truth Label: 1
Testing Progress: [7/200]
Video: fi408_xvid.avi
Predicted Label: 0
Ground Truth Label: 1
Testing Progress: [8/200]
Video: fi409_xvid.avi
Predicted Label: 0
Ground Truth Label: 1
Testing Progress: [9/200]
Video: fi410_xvid.avi
Predicted Label: 1
Ground Truth Label: 1
Testing Progress: [10/200]
Video: fi411_xvid.avi
Predicted Label: 1
Ground Truth Label: 1
Testing Progress: [11/200]
Video: fi412_xvid.a