In [1]:
import torch
import torch.nn as nn

# Load the 'slowfast_r50' entry point (pretrained=True) from the
# facebookresearch/pytorchvideo torch.hub repository.
slowfast = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)

# Setup fine tuning layer
class CustomClassifier(nn.Module):
    """Fully connected head applied to a feature vector.

    Each width in ``hidden_dims`` contributes a Linear -> ReLU -> Dropout(0.5)
    group; a final Linear layer maps the last width to ``num_classes`` outputs.

    Args:
        input_dim: size of the incoming feature vector.
        hidden_dims: iterable of hidden layer widths (may be empty).
        num_classes: number of output units (default 2).
    """

    def __init__(self, input_dim, hidden_dims, num_classes=2):
        super().__init__()

        # Consecutive pairs of this list are the (in, out) sizes of the hidden layers.
        widths = [input_dim, *hidden_dims]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(0.5))

        # Output projection from the last width.
        stack.append(nn.Linear(widths[-1], num_classes))

        # The attribute name and layer order determine the state_dict keys.
        self.classifier = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        out = self.classifier(x)
        return out

# Import weights
# Head dimensions: 400 inputs -> 512 -> 128 -> 1 output.
fine_tune = CustomClassifier(400, [512, 128], 1)
weights_path = "../fine_tune_400_512-128_1.pth"
# map_location="cpu" lets the checkpoint deserialize on machines that lack the
# device it was saved from; load_state_dict then copies the values into the
# module's own parameters. weights_only=True restricts what may be unpickled.
fine_tune.load_state_dict(torch.load(weights_path, map_location="cpu", weights_only=True))

Using cache found in C:\Users\babin/.cache\torch\hub\facebookresearch_pytorchvideo_main


<All keys matched successfully>

In [2]:
# Declare a master model

# Chain the SlowFast backbone with the fine-tuned classifier head
class CrimeDetector(nn.Module):
    """Two-stage module: runs ``slowfast`` first, then ``fine_tuned`` on its output.

    Args:
        slowfast: module applied to the raw input.
        fine_tuned: module applied to the output of ``slowfast``.
    """

    def __init__(self, slowfast, fine_tuned):
        super().__init__()
        # Sub-modules are registered under these exact attribute names.
        self.slowfast = slowfast
        self.fine_tuned = fine_tuned

    def forward(self, x):
        """Feed ``x`` to the first stage and pass its output to the second."""
        backbone_out = self.slowfast(x)
        head_out = self.fine_tuned(backbone_out)
        return head_out
    
# End-to-end model: the hub-loaded SlowFast network followed by the classifier
# head whose weights were loaded above.
crime_detector = CrimeDetector(slowfast, fine_tune)

## Set up the data

To set up the data, we need two things:

- A transform to scale and crop our video (PyTorchVideo utilities)
- A transform to pack the frames into the two inputs (slow and fast pathways) that SlowFast expects

In [3]:
# Imports

from typing import Dict
import json
import urllib
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)



In [7]:
# Target size passed to ShortSideScale.
side_size = 256
# Per-channel mean / std handed to NormalizeVideo (applied after the /255 scaling).
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
# Size passed to CenterCropVideo.
crop_size = 256
# Frame count passed to UniformTemporalSubsample; also used as the webcam buffer size.
num_frames = 32
# sampling_rate and frames_per_second are only used to compute clip_duration.
sampling_rate = 2
frames_per_second = 30
# PackPathway keeps frames.shape[1] // slowfast_alpha frames for the slow pathway.
slowfast_alpha = 4
# NOTE(review): num_clips and num_crops are not referenced in the visible notebook.
num_clips = 10
num_crops = 3

class PackPathway(torch.nn.Module):
    """
    Transform that packs a clip into the two-tensor list fed to SlowFast.

    Args:
        alpha: ratio between the number of frames in the fast and slow
            pathways. When left as ``None`` the module-level
            ``slowfast_alpha`` is read at call time (the original behaviour).
    """
    def __init__(self, alpha=None):
        super().__init__()
        self.alpha = alpha

    def forward(self, frames: torch.Tensor):
        """
        Args:
            frames: tensor whose dimension 1 is subsampled for the slow
                pathway (the visible caller passes [C, T, H, W]).

        Returns:
            ``[slow_pathway, fast_pathway]``; each tensor gets a new leading
            dimension of size 1.
        """
        alpha = slowfast_alpha if self.alpha is None else self.alpha
        fast_pathway = frames
        # Perform temporal sampling from the fast pathway.
        num_total = frames.shape[1]
        # Build the index on the same device as `frames`; index_select rejects
        # an index tensor that lives on a different device than its input.
        slow_idx = torch.linspace(
            0, num_total - 1, num_total // alpha, device=frames.device
        ).long()
        slow_pathway = torch.index_select(frames, 1, slow_idx)
        frame_list = [slow_pathway.unsqueeze(0), fast_pathway.unsqueeze(0)]
        return frame_list

# Preprocessing pipeline applied to a clip tensor before inference; the steps
# run in the order listed.
transform=Compose(
        [
            # Temporal subsampling, parameterised by num_frames
            UniformTemporalSubsample(num_frames),
            # Scale values by 1/255 (also turns integer input into floats)
            Lambda(lambda x: x/255.0),
            # Normalisation with the per-channel mean / std defined above
            NormalizeVideo(mean, std),
            # Spatial rescale, parameterised by side_size
            ShortSideScale(
                size=side_size
            ),
            # Centre crop, parameterised by crop_size
            CenterCropVideo(crop_size),
            # Produce the [slow, fast] tensor list
            PackPathway()
        ]
    )

# The duration of the input clip is also specific to the model.
# Computed in seconds as (num_frames * sampling_rate) / frames_per_second.
clip_duration = (num_frames * sampling_rate)/frames_per_second

## Test inference of the classifier

Using snippets from the "live demo".

In [4]:
import cv2
import torch

# Quick webcam check: show the live feed and print each frame as a tensor.
# Press 'q' in the preview window to stop.
cap = cv2.VideoCapture(1)
try:
    if not cap.isOpened():
        # Nothing to read from; skip the loop instead of failing on the first read.
        print("Error: Could not open webcam")
    else:
        while True:
            ret, frame = cap.read()
            if not ret:
                # A failed grab yields no usable frame; torch.tensor / imshow would raise on it.
                print("Error: Could not read frame from webcam")
                break
            print(torch.tensor(frame))
            cv2.imshow('Webcam Feed', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the preview window.
    cap.release()
    cv2.destroyAllWindows()

tensor([[[ 84,  89, 110],
         [ 91,  97, 119],
         [ 84,  93, 117],
         ...,
         [118, 133, 143],
         [117, 134, 142],
         [119, 137, 144]],

        [[ 84,  93, 116],
         [ 86,  96, 120],
         [ 81,  92, 118],
         ...,
         [118, 133, 143],
         [118, 134, 143],
         [119, 136, 143]],

        [[ 76,  94, 121],
         [ 75,  92, 119],
         [ 72,  88, 117],
         ...,
         [119, 132, 143],
         [118, 133, 142],
         [119, 134, 143]],

        ...,

        [[ 98,  97, 110],
         [ 97,  98, 110],
         [ 93,  95, 106],
         ...,
         [216, 232, 239],
         [214, 231, 238],
         [213, 230, 237]],

        [[ 94,  93, 106],
         [ 94,  94, 107],
         [ 93,  95, 107],
         ...,
         [218, 233, 240],
         [213, 229, 236],
         [211, 227, 234]],

        [[ 94,  93, 107],
         [ 92,  92, 105],
         [ 94,  96, 108],
         ...,
         [217, 231, 239],
        

In [14]:
import cv2
import torch
import time
from torchvision import transforms
from collections import deque
import numpy as np

# Decision threshold: the alarm flag shown on screen is True when the model's
# scalar output is strictly greater than this value.
# NOTE(review): looks hand-tuned against raw model outputs — confirm its origin.
THRESHOLD = -1.075

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

class WebcamProcessor:
    """Buffer webcam frames and run the model each time the buffer fills.

    Args:
        model: callable taking the list produced by ``transform``.
        device: device the transformed tensors are moved to before inference.
        transform: callable mapping a [C, T, H, W] tensor to a list of tensors.
        buffer_size: number of frames collected before each inference.
        frame_stride: keep one captured frame out of every ``frame_stride``
            (1 keeps every frame); raise it on slower machines.
    """

    def __init__(self, model, device, transform, buffer_size=32, frame_stride=1):
        self.model = model
        self.device = device
        self.transform = transform
        self.buffer_size = buffer_size
        # Guard against 0 / negative strides, which would break the modulo test.
        self.frame_stride = max(1, int(frame_stride))
        self.frame_buffer = deque(maxlen=buffer_size)
        self.pred_text = ""

    def process_frame(self, frame):
        """Wrap one OpenCV frame in a tensor; layout and values are unchanged."""
        # NOTE(review): OpenCV frames are BGR and no channel reordering happens
        # here — confirm this matches the channel order used for training.
        return torch.tensor(frame)

    def process_buffer(self):
        """Run the model on the buffered frames, then reset the buffer.

        Returns:
            Whatever ``self.model`` returns for the transformed clip.
        """
        frames = torch.stack(list(self.frame_buffer))
        # [T, H, W, C] -> [C, T, H, W]
        frames = frames.permute((3, 0, 1, 2))
        inputs = self.transform(frames)
        # Use the device given to this instance rather than a notebook global.
        inputs = [i.to(self.device) for i in inputs]
        with torch.no_grad():
            predictions = self.model(inputs)
        # Start a fresh buffer for the next clip.
        self.frame_buffer = deque(maxlen=self.buffer_size)
        return predictions

    def create_text_display(self, text, width=500, height=50):
        """Return a black ``height`` x ``width`` BGR strip with ``text`` centred in white."""
        text_display = np.zeros((height, width, 3), np.uint8)

        font = cv2.FONT_HERSHEY_COMPLEX
        font_scale = 0.5
        font_thickness = 1
        font_color = (255, 255, 255)  # White text

        # Centre the text using its rendered size.
        (text_width, text_height), _ = cv2.getTextSize(text, font, font_scale, font_thickness)
        text_x = (width - text_width) // 2
        text_y = (height + text_height) // 2

        cv2.putText(text_display, text, (text_x, text_y), font, font_scale, font_color, font_thickness)
        return text_display

    def start_capture(self, camera_index=1):
        """Show the live feed with the latest prediction until 'q' is pressed.

        Args:
            camera_index: index passed to ``cv2.VideoCapture``.
        """
        cap = cv2.VideoCapture(camera_index)

        try:
            if not cap.isOpened():
                print("Error: Could not open webcam")
                return

            frame_counter = 0

            while True:
                ret, frame = cap.read()
                if not ret:
                    # No frame was delivered; stop instead of crashing on None.
                    print("Error: Could not read frame from webcam")
                    break

                if frame_counter % self.frame_stride == 0:
                    self.frame_buffer.append(self.process_frame(frame))

                    # Process buffer if we have enough frames
                    if len(self.frame_buffer) == self.frame_buffer.maxlen:
                        predictions = self.process_buffer()
                        score = predictions[0].item()
                        self.pred_text = ("Stealing ? : " + str(score) + "   FINAL ALARM = " + str(score > THRESHOLD))

                # Size the strip from the actual frame so np.vstack always lines up.
                text_display = self.create_text_display(self.pred_text, width=frame.shape[1])

                # Combine frame and text display vertically
                combined_display = np.vstack((frame, text_display))

                cv2.imshow('Webcam Feed with Predictions', combined_display)
                frame_counter += 1

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        finally:
            # Always free the camera and close the preview window.
            cap.release()
            cv2.destroyAllWindows()

print(device)

# WebcamProcessor moves the transformed inputs to `device` before inference,
# so the model has to live on the same device or the forward pass fails with a
# device mismatch. nn.Module.to moves the parameters in place.
crime_detector.to(device)
# Inference mode: disables dropout in the classifier head.
crime_detector.eval()

processor = WebcamProcessor(
    model=crime_detector,
    device=device,
    transform=transform,
    buffer_size=num_frames
)

# Start processing
processor.start_capture()

cuda
cuda
