In [2]:
import cv2
import numpy as np
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
import os

In [5]:
def load_yolo_model(cfg_path, weights_path, names_path):
    net = cv2.dnn.readNet(weights_path, cfg_path)
    with open(names_path, 'r') as f:
        classes = f.read().strip().split('\n')
    return net, classes

# Paths to YOLO files
cfg_path = 'yolov3.cfg'
weights_path = 'yolov3.weights'
names_path = 'coco.names'

# Load YOLO model
yolo_model, classes = load_yolo_model(cfg_path, weights_path, names_path)
print("YOLO model loaded.")

YOLO model loaded.


In [6]:
def detect_objects(net, classes, frame, conf_threshold=0.5, nms_threshold=0.4):
    blob = cv2.dnn.blobFromImage(frame, 1/255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)
    layer_names = net.getLayerNames()
    output_layers = [layer_names[i - 1] for i in net.getUnconnectedOutLayers()]
    detections = net.forward(output_layers)

    boxes = []
    confidences = []
    class_ids = []
    h, w = frame.shape[:2]

    for output in detections:
        for detection in output:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > conf_threshold:
                center_x = int(detection[0] * w)
                center_y = int(detection[1] * h)
                width = int(detection[2] * w)
                height = int(detection[3] * h)
                x = int(center_x - width / 2)
                y = int(center_y - height / 2)
                boxes.append([x, y, x + width, y + height])
                confidences.append(float(confidence))
                class_ids.append(class_id)

    indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)
    filtered_boxes = []
    if len(indices) > 0:
        for i in indices.flatten():
            box = boxes[i]
            filtered_boxes.append(box + [confidences[i], class_ids[i]])

    return filtered_boxes

In [7]:
class I3DFeatureExtractor(nn.Module):
    def __init__(self):
        super(I3DFeatureExtractor, self).__init__()
        self.i3d = models.video.r3d_18(pretrained=True)
        self.i3d.fc = nn.Identity()  # Remove the classification layer

    def forward(self, x):
        return self.i3d(x)

# Initialize I3D model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
i3d_model = I3DFeatureExtractor().to(device)
i3d_model.eval()
print("I3D model loaded.")

Downloading: "https://download.pytorch.org/models/r3d_18-b3b3357e.pth" to /Users/punsisikiridana/.cache/torch/hub/checkpoints/r3d_18-b3b3357e.pth
100%|██████████| 127M/127M [08:39<00:00, 257kB/s]    


I3D model loaded.


In [12]:
def process_video(video_path, yolo_model, classes, i3d_model, output_dir, target_fps=20):
    cap = cv2.VideoCapture(video_path)
    frame_features = []
    temporal_edges = []

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    frame_count = 0
    original_fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(original_fps / target_fps)
    prev_boxes = []

    while cap.isOpened() and frame_count < 50:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % frame_interval == 0:
            # Extract global features using I3D
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            img_tensor = transform(img).unsqueeze(0).to(device)
            img_tensor = img_tensor.unsqueeze(2).repeat(1, 1, 16, 1, 1)  # Create a fake video clip with 16 frames
            with torch.no_grad():
                feature = i3d_model(img_tensor).cpu().numpy().flatten()
            frame_features.append(feature)

            # Detect objects
            boxes = detect_objects(yolo_model, classes, frame)

            # Calculate temporal edges between objects
            if prev_boxes:
                frame_temporal_edges = []
                for box1 in prev_boxes:
                    for box2 in boxes:
                        edge = calculate_edge(box1, box2)
                        frame_temporal_edges.append(edge)
                temporal_edges.append(frame_temporal_edges)
            prev_boxes = boxes

        frame_count += 1

    cap.release()

    # Ensure we have exactly 50 frames
    while len(frame_features) < 50:
        frame_features.append(np.zeros(2048))
        temporal_edges.append(np.zeros((19, 19)))

    frame_features = np.array(frame_features)
    temporal_edges = np.array(temporal_edges)

    np.save(os.path.join(output_dir, 'frame_features.npy'), frame_features)
    np.save(os.path.join(output_dir, 'temporal_edges.npy'), temporal_edges)
    print("NPY files successfully created.")

def calculate_edge(box1, box2):
    # Calculate the distance between the centers of two bounding boxes
    x1_center = (box1[0] + box1[2]) / 2
    y1_center = (box1[1] + box1[3]) / 2
    x2_center = (box2[0] + box2[2]) / 2
    y2_center = (box2[1] + box2[3]) / 2
    distance = np.sqrt((x1_center - x2_center) ** 2 + (y1_center - y2_center) ** 2)
    return distance

In [13]:
# Directory to save frames with bounding boxes
output_dir = 'output_features'
os.makedirs(output_dir, exist_ok=True)

In [14]:
# Process the video
video_path = 'input_video.mp4'
process_video(video_path, yolo_model, classes, i3d_model, output_dir, target_fps=20)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (40,) + inhomogeneous part.

In [1]:
import torch
import torchvision
from torchvision import models

# Assuming you have a pre-trained I3D model. Here, we use a placeholder for simplicity.
class InceptionI3d(torch.nn.Module):
    def __init__(self, num_classes=400, in_channels=3):
        super(InceptionI3d, self).__init__()
        self.model = models.video.r3d_18(pretrained=True)  # Placeholder for actual I3D

    def forward(self, x):
        return self.model(x)

i3d_model = InceptionI3d()
i3d_model.eval()




InceptionI3d(
  (model): VideoResNet(
    (stem): BasicStem(
      (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
      (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Sequential(
          (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
          (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (conv2): Sequential(
          (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
          (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (relu): ReLU(inplace=True)
      )
      (1): BasicBlock(
        (conv1): Sequential(
          (0): Conv3DSimple(64, 64, kernel_size=(3, 3

In [5]:
import cv2
import numpy as np

def preprocess_video(video_path, num_frames=100, frame_size=(224, 224)):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while len(frames) < num_frames:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, frame_size)
        frame = frame / 255.0  # Normalize to [0, 1]
        frames.append(frame)

    cap.release()

    # Convert list of frames to numpy array and transpose to (C, T, H, W)
    frames = np.array(frames)
    frames = frames.transpose(3, 0, 1, 2)  # (T, H, W, C) -> (C, T, H, W)
    return torch.tensor(frames, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

video_tensor = preprocess_video('./input_clips/positive/clip_0014.mp4')


In [6]:
with torch.no_grad():
    features = i3d_model(video_tensor)
    features = features.squeeze(0).cpu().numpy()  # Remove batch dimension and convert to numpy array


In [7]:
np.save('video_features.npy', features)


In [11]:
import torch
from pytorchvideo.models.hub import i3d_r50

# Load the pre-trained I3D model from PyTorchVideo
model = i3d_r50(pretrained=True)
model.eval()

Downloading: "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/I3D_8x8_R50.pyth" to /Users/sacithrangana/.cache/torch/hub/checkpoints/I3D_8x8_R50.pyth
100%|██████████| 214M/214M [02:52<00:00, 1.30MB/s] 


Net(
  (blocks): ModuleList(
    (0): ResNetBasicStem(
      (conv): Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
      (norm): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): ReLU()
      (pool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=[0, 1, 1], dilation=1, ceil_mode=False)
    )
    (1): ResStage(
      (res_blocks): ModuleList(
        (0): ResBlock(
          (branch1_conv): Conv3d(64, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (branch1_norm): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (branch2): BottleneckBlock(
            (conv_a): Conv3d(64, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
            (norm_a): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act_a): ReLU()
            (conv_b): Conv3d(64, 64, kernel_size=(1,

In [24]:
import cv2
import numpy as np

def preprocess_video(video_path, num_frames=100, frame_size=(224, 224)):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step = max(1, frame_count // num_frames)

    for i in range(0, frame_count, step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, frame_size)
        frame = frame / 255.0  # Normalize to [0, 1]
        frames.append(frame)

    cap.release()

    # Ensure we have exactly num_frames by padding or trimming
    while len(frames) < num_frames:
        frames.append(frames[-1])  # Repeat last frame if necessary
    frames = frames[:num_frames]

    # Convert list of frames to numpy array and transpose to (C, T, H, W)
    frames = np.array(frames)
    frames = frames.transpose(3, 0, 1, 2)  # (T, H, W, C) -> (C, T, H, W)
    return torch.tensor(frames, dtype=torch.float32).unsqueeze(0)  # Add batch dimension


video_tensor = preprocess_video('./input_clips/positive/clip_0014.mp4')


In [25]:
with torch.no_grad():
    features = model(video_tensor)
    print("Model output shape:", features.shape)


Model output shape: torch.Size([1, 400])


In [27]:
import torch
import numpy as np
import cv2
from pytorchvideo.models.hub import i3d_r50

# Load the pre-trained I3D model
model = i3d_r50(pretrained=True)
model.eval()

def preprocess_video(video_path, num_frames=100, frame_size=(224, 224)):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step = max(1, frame_count // num_frames)

    for i in range(0, frame_count, step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, frame_size)
        frame = frame / 255.0  # Normalize to [0, 1]
        frames.append(frame)

    cap.release()

    while len(frames) < num_frames:
        frames.append(frames[-1])  # Repeat last frame if necessary
    frames = frames[:num_frames]

    frames = np.array(frames)
    frames = frames.transpose(3, 0, 1, 2)  # (T, H, W, C) -> (C, T, H, W)
    return torch.tensor(frames, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

# Load and preprocess the video
video_tensor = preprocess_video('./input_clips/positive/clip_0014.mp4')

# Extract features
with torch.no_grad():
    features = model(video_tensor)
    print("Model output shape:", features.shape)

    # Convert features to NumPy array and save
    features_np = features.cpu().numpy().squeeze(0)  # Remove the batch dimension
    np.save('video_features.npy', features_np)


Model output shape: torch.Size([1, 400])
