In [None]:
%pip install torch torchvision opencv-python pillow numpy matplotlib deep-sort-realtime

In [None]:
import os
import cv2
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
import numpy as np
from PIL import Image

In [None]:
# Clip name and the paths derived from it. Paths are assembled with os.path so the
# separator matches the host OS (the previous raw "\" literals only resolved on Windows).
VIDEO_NAME = 'DSC_2411.MOV'
DATA_DIR = "tracking_rukomet"
video_path = os.path.join(DATA_DIR, VIDEO_NAME)
# splitext strips whatever extension the clip has (".MOV", ".mov", ".mp4", ...), so the
# prediction file can never end up sharing the video's own filename.
output_txt_path = os.path.join(
    DATA_DIR, "predictions", os.path.splitext(VIDEO_NAME)[0] + "_siamese.txt"
)

# Define the Siamese embedding network (ResNet-18 backbone used as a feature extractor)
class SiameseNetwork(nn.Module):
    """Embedding branch of the tracker: an ImageNet-pretrained ResNet-18 without its classifier.

    The final fully-connected layer is replaced by ``nn.Identity`` so the forward
    pass returns the pooled backbone features instead of class logits.
    """

    def __init__(self):
        super().__init__()
        # torchvision >= 0.13 deprecates `pretrained=True` in favour of the `weights`
        # enum; IMAGENET1K_V1 is the checkpoint `pretrained=True` used to load.
        # Older torchvision releases have no enum, so fall back to the legacy flag.
        weights_enum = getattr(models, "ResNet18_Weights", None)
        if weights_enum is not None:
            backbone = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
        else:
            backbone = models.resnet18(pretrained=True)
        backbone.fc = nn.Identity()  # Remove classification layer
        self.feature_extractor = backbone

    def forward(self, x):
        """Return the backbone features for a batch of preprocessed images ``x``."""
        return self.feature_extractor(x)

# Run on the GPU when CUDA is usable, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the network, move it to the chosen device and switch it to inference mode.
snn = SiameseNetwork()
snn = snn.to(device)
snn = snn.eval()

In [None]:


# `device` and `snn` are already created in the model cell above; instantiating
# SiameseNetwork again here only rebuilt the ResNet-18 and reloaded its weights,
# so the duplicate has been dropped.

# Define preprocessing: ndarray/tensor -> PIL image -> 224x224 -> float tensor in [0, 1]
# -> per-channel normalisation with the ImageNet mean/std (given in RGB order).
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to get feature embedding
def get_embedding(image, bgr=True):
    """Return the Siamese network's embedding for a single image patch.

    Args:
        image: Patch accepted by ``transform``. In this notebook it is an
            H x W x 3 ndarray cropped from an OpenCV frame.
        bgr: When True (default), a 3-channel ndarray is treated as OpenCV BGR
            and reordered to RGB before preprocessing, because the
            normalisation constants in ``transform`` are in RGB order.
            Pass False for patches that are already RGB.

    Returns:
        numpy.ndarray: the model output moved to the CPU, with a leading batch
        dimension of 1.

    Raises:
        ValueError: if ``image`` is an empty ndarray (e.g. a zero-area crop).
    """
    if isinstance(image, np.ndarray):
        if image.size == 0:
            raise ValueError("get_embedding received an empty image patch")
        if bgr and image.ndim == 3 and image.shape[2] == 3:
            # Reverse the channel axis (BGR -> RGB); copy so PIL gets contiguous memory.
            image = np.ascontiguousarray(image[:, :, ::-1])

    batch = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        embedding = snn(batch).cpu().numpy()
    return embedding

# Initialize video capture
# NOTE(review): this opens the default camera (index 0); `video_path` from the
# configuration cell is never used here — confirm which source is intended.
cap = cv2.VideoCapture(0)

try:
    # Select object to track on the first frame
    ret, frame = cap.read()
    if not ret:
        # Without this check `frame` is None and selectROI fails with an opaque OpenCV error.
        raise RuntimeError("Could not read the first frame from the video source")
    bbox = cv2.selectROI("Select Object", frame, fromCenter=False, showCrosshair=True)
    cv2.destroyWindow("Select Object")

    # Get reference embedding of selected object
    x, y, w, h = map(int, bbox)
    if w <= 0 or h <= 0:
        # selectROI returns an all-zero box when the selection is cancelled.
        raise RuntimeError("No object selected: the ROI has zero width or height")
    reference_patch = frame[y:y+h, x:x+w]
    reference_embedding = get_embedding(reference_patch)

    margin = 20  # pixels added on every side of the previous box to form the search window
    step = 10    # stride, in pixels, between candidate positions inside the window

    # Main tracking loop
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Define search region (expand around previous bbox, clamped to the top/left edge)
        search_x, search_y = max(0, x - margin), max(0, y - margin)
        search_w, search_h = w + 2 * margin, h + 2 * margin
        search_patch = frame[search_y:search_y+search_h, search_x:search_x+search_w]

        # Slide a w x h window over the search area and keep the candidate whose
        # embedding is closest (L2 distance) to the reference embedding.
        best_match, best_score = None, float("inf")
        # "+ 1" makes the last offset (2 * margin) reachable; without it the window
        # could only move up to +10 px right/down but 20 px left/up.
        for i in range(0, search_w - w + 1, step):
            for j in range(0, search_h - h + 1, step):
                candidate_patch = search_patch[j:j+h, i:i+w]
                # Slices that run past the frame border come back truncated; skip them.
                if candidate_patch.shape[0] != h or candidate_patch.shape[1] != w:
                    continue

                candidate_embedding = get_embedding(candidate_patch)
                score = np.linalg.norm(reference_embedding - candidate_embedding)

                if score < best_score:
                    best_score = score
                    best_match = (search_x + i, search_y + j, w, h)

        # Update tracking box (kept unchanged when no full-size candidate fit in the frame)
        if best_match is not None:
            x, y, w, h = best_match
            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)

        # Display frame
        cv2.imshow("Siamese Tracker", frame)

        # Exit on 'q' key
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture device and close windows, even if an error occurred above.
    cap.release()
    cv2.destroyAllWindows()
