In [3]:
import os
import json
import torch
from PIL import Image
from tqdm import tqdm
from torchvision.transforms import functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# === 1. –ó–∞–≥—Ä—É–∑–∫–∞ –∞–Ω–Ω–æ—Ç–∞—Ü–∏–π ===
with open("eccv_18_annotation_files/train_annotations.json") as f:
    data = json.load(f)

# === 2. –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ —Å–æ–ø–æ—Å—Ç–∞–≤–ª–µ–Ω–∏—è category_id ‚Üí class_index ===
# üîπ –í—ã–±–∏—Ä–∞–µ–º –Ω—É–∂–Ω—ã–µ –∫–ª–∞—Å—Å—ã
selected_class_names = ["deer", "raccoon", "coyote", "dog"]

# üîπ –°–æ–ø–æ—Å—Ç–∞–≤–ª–µ–Ω–∏–µ –Ω–∞–∑–≤–∞–Ω–∏–π –∫–ª–∞—Å—Å–æ–≤ —Å –∏—Ö –∏—Å—Ö–æ–¥–Ω—ã–º–∏ id
all_categories = data["categories"]
name_to_id = {c["name"]: c["id"] for c in all_categories}
selected_class_ids = [name_to_id[name] for name in selected_class_names]

# üîπ –ù–æ–≤—ã–π –∫–æ–º–ø–∞–∫—Ç–Ω—ã–π –º–∞–ø–ø–∏–Ω–≥: old_id ‚Üí new_index
id_to_idx = {cid: idx for idx, cid in enumerate(selected_class_ids)}

# üîπ –§–∏–ª—å—Ç—Ä–∞—Ü–∏—è –∞–Ω–Ω–æ—Ç–∞—Ü–∏–π –ø–æ –Ω—É–∂–Ω—ã–º –∫–ª–∞—Å—Å–∞–º
filtered_annotations = [ann for ann in data["annotations"] if ann["category_id"] in selected_class_ids]

# üîπ –û—Ç–±–æ—Ä –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π, –≥–¥–µ –µ—Å—Ç—å —ç—Ç–∏ –∞–Ω–Ω–æ—Ç–∞—Ü–∏–∏
used_image_ids = set(ann["image_id"] for ann in filtered_annotations)
filtered_images = [img for img in data["images"] if img["id"] in used_image_ids]

# üîπ –û—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ 100 –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π
filtered_images = filtered_images[:100]

# üîπ –ü–µ—Ä–µ—Ñ–∏–ª—å—Ç—Ä—É–µ–º –∞–Ω–Ω–æ—Ç–∞—Ü–∏–∏ –ø–æ —ç—Ç–∏–º –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è–º
selected_image_ids = set(img["id"] for img in filtered_images)
filtered_annotations = [ann for ann in filtered_annotations if ann["image_id"] in selected_image_ids]


In [4]:
# === 3. –ö–ª–∞—Å—Å Dataset ===
class AnimalDataset(Dataset):
    def __init__(self, images, annotations, image_dir, category_id_mapping, transforms=None):
        self.images = images
        self.transforms = transforms
        self.image_dir = image_dir
        self.category_id_mapping = category_id_mapping

        self.image_id_to_annotations = {}
        for ann in annotations:
            image_id = ann.get("image_id")
            if image_id is not None:
                self.image_id_to_annotations.setdefault(image_id, []).append(ann)

    def __getitem__(self, idx):
        image_info = self.images[idx]
        image_id = image_info["id"]
        file_name = image_info["file_name"]
        image_path = os.path.join(self.image_dir, file_name)

        try:
            image = Image.open(image_path).convert("RGB")
        except Exception as e:
            raise RuntimeError(f"–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–µ: {image_path}") from e

        boxes = []
        labels = []
        for ann in self.image_id_to_annotations.get(image_id, []):
            if "bbox" not in ann or "category_id" not in ann:
                continue
            x, y, w, h = ann["bbox"]
            if w <= 0 or h <= 0:
                continue
            class_id = self.category_id_mapping.get(ann["category_id"])
            if class_id is None:
                continue
            boxes.append([x, y, x + w, y + h])
            labels.append(class_id)

        if len(boxes) == 0:
            boxes = [[0, 0, 1, 1]]
            labels = [0]

        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([idx])
        }

        if self.transforms:
            image = self.transforms(image)
        else:
            image = F.to_tensor(image)

        return image, target

    def __len__(self):
        return len(self.images)



In [7]:
# === 4. –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏ –º–æ–¥–µ–ª–∏ ===
image_dir = "eccv_18_all_images_sm/eccv_18_all_images_sm"  # –ø—É—Ç—å –∫ –ø–∞–ø–∫–µ —Å –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è–º–∏
dataset = AnimalDataset(
    filtered_images,
    filtered_annotations,
    image_dir=image_dir,
    category_id_mapping=id_to_idx
)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = fasterrcnn_resnet50_fpn(pretrained=True)
in_features = model.roi_heads.box_predictor.cls_score.in_features
num_classes = len(id_to_idx) + 1  # +1 –¥–ª—è background
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.0, momentum=0.9)

# === 5. –û–±—É—á–µ–Ω–∏–µ —Å tqdm ===
model.train()
num_epochs = 5

for epoch in range(num_epochs):
    epoch_loss = 0.0
    print(f"\nüß† Epoch {epoch + 1}/{num_epochs}")

    for images, targets in tqdm(data_loader, desc=f"Epoch {epoch + 1}"):
        images = list(img.to(device) for img in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        epoch_loss += losses.item()

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    avg_loss = epoch_loss / len(data_loader)
    print(f"‚úÖ Epoch {epoch + 1} –∑–∞–≤–µ—Ä—à–µ–Ω–∞. –°—Ä–µ–¥–Ω–∏–π Loss: {avg_loss:.4f}")


üß† Epoch 1/5


Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:44<00:00,  2.08s/it]


‚úÖ Epoch 1 –∑–∞–≤–µ—Ä—à–µ–Ω–∞. –°—Ä–µ–¥–Ω–∏–π Loss: 16.8335

üß† Epoch 2/5


Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:43<00:00,  2.06s/it]


‚úÖ Epoch 2 –∑–∞–≤–µ—Ä—à–µ–Ω–∞. –°—Ä–µ–¥–Ω–∏–π Loss: 18.1189

üß† Epoch 3/5


Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:43<00:00,  2.08s/it]


‚úÖ Epoch 3 –∑–∞–≤–µ—Ä—à–µ–Ω–∞. –°—Ä–µ–¥–Ω–∏–π Loss: 16.8346

üß† Epoch 4/5


Epoch 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:45<00:00,  2.12s/it]


‚úÖ Epoch 4 –∑–∞–≤–µ—Ä—à–µ–Ω–∞. –°—Ä–µ–¥–Ω–∏–π Loss: 17.0745

üß† Epoch 5/5


Epoch 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:45<00:00,  2.12s/it]

‚úÖ Epoch 5 –∑–∞–≤–µ—Ä—à–µ–Ω–∞. –°—Ä–µ–¥–Ω–∏–π Loss: 17.2626





In [None]:
import cv2
from torchvision.transforms import functional as F
from PIL import Image

# === idx_to_name: —Å–ª–æ–≤–∞—Ä—å –∏–Ω–¥–µ–∫—Å–æ–≤ –≤ –∏–º–µ–Ω–∞ –∫–ª–∞—Å—Å–æ–≤ (—Å–æ–∑–¥–∞–Ω —Ä–∞–Ω–µ–µ) ===
# –ü—Ä–∏–º–µ—Ä: idx_to_name = {0: "deer", 1: "raccoon", 2: "coyote", 3: "dog"}

def detect_on_video(video_path, model, device, idx_to_name, output_path="output.avi", conf_thresh=0.5):
    model.eval()
    cap = cv2.VideoCapture(video_path)
    
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    with torch.no_grad():
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º –∫–∞–¥—Ä –≤ PIL ‚Üí Tensor
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            image_tensor = F.to_tensor(image).unsqueeze(0).to(device)

            predictions = model(image_tensor)[0]

            for box, label, score in zip(predictions['boxes'], predictions['labels'], predictions['scores']):
                if score < conf_thresh:
                    continue
                x1, y1, x2, y2 = map(int, box.tolist())
                class_name = idx_to_name.get(label.item(), "unknown")
                color = (0, 255, 0)
                cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                text = f"{class_name}: {score:.2f}"
                cv2.putText(frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

            out.write(frame)

    cap.release()
    out.release()
    print(f"üé¨ –û–±—Ä–∞–±–æ—Ç–∫–∞ –∑–∞–≤–µ—Ä—à–µ–Ω–∞. –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤: {output_path}")


In [None]:
detect_on_video(
    video_path="your_input_video.mp4",
    model=model,
    device=device,
    idx_to_name=idx_to_name,
    output_path="detection_output.avi",
    conf_thresh=0.5
)
