In [15]:
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm
import shutil

# ================= CONFIG =================
# Root of the original dataset (e.g., one sub-folder per class).
input_root = "dataset"
# Destination folder that receives the non-duplicate videos.
output_root = "new_dataset"
# Number of frames sampled from each video when building its embedding.
num_sample_frames = 8
# Two videos whose embedding similarity reaches this value count as duplicates.
similarity_threshold = 0.9999
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# ==========================================

# Pretrained ResNet18 feature extractor.
# Newer torchvision releases deprecate ``pretrained=True`` in favour of the
# ``weights=`` enum; use the enum when it exists and fall back to the legacy
# flag otherwise. ``IMAGENET1K_V1`` is the checkpoint the legacy flag loads.
_resnet18_weights = getattr(models, "ResNet18_Weights", None)
if _resnet18_weights is not None:
    resnet = models.resnet18(weights=_resnet18_weights.IMAGENET1K_V1)
else:
    resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()  # remove final classification layer
resnet = resnet.to(device)
resnet.eval()  # inference mode for layers such as batch norm / dropout

# Per-channel statistics used to normalise frames; these are the values
# torchvision documents for its ImageNet-pretrained models.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Frame preprocessing: resize to 224x224, convert to a tensor, normalise.
_resize_step = transforms.Resize((224, 224))
_to_tensor_step = transforms.ToTensor()
_normalize_step = transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD)
transform = transforms.Compose([_resize_step, _to_tensor_step, _normalize_step])

def get_video_embedding(video_path, num_frames=8):
    """Return the mean feature vector over frames sampled from a video.

    ``num_frames`` frame indices are spread evenly between the first and the
    last frame. Each frame that decodes is converted from BGR to RGB, passed
    through the module-level ``transform`` and ``resnet``, and the resulting
    per-frame features are averaged.

    Args:
        video_path: Path of the video file to open with OpenCV.
        num_frames: Number of frame positions to sample.

    Returns:
        A CPU tensor holding the averaged per-frame features, or ``None`` when
        the video cannot be opened, reports no frames, or none of the sampled
        frames can be decoded.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return None

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            return None

        indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        feats = []

        with torch.no_grad():
            for idx in indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                ret, frame = cap.read()
                if not ret or frame is None:
                    # Frame could not be decoded; skip it and keep sampling.
                    continue
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                img_t = transform(Image.fromarray(rgb)).unsqueeze(0).to(device)
                feats.append(resnet(img_t).squeeze().cpu())
    finally:
        # Release the capture handle even if decoding or inference raised.
        cap.release()

    if not feats:
        return None
    return torch.stack(feats).mean(dim=0)

def ensure_dir(path):
    """Create directory ``path`` (and any missing parents) if it is absent.

    Uses ``exist_ok=True`` rather than a separate existence check, so there is
    no window between the check and the creation in which another process can
    create the directory and make the call fail.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

# Collect all videos and embeddings
video_extensions = ('.mp4', '.avi', '.mov')
video_info = []  # [(video_path, class_name, emb)]
skipped_videos = []  # paths for which no embedding could be computed

# Sort the listings: os.listdir order is arbitrary, and the later duplicate
# check keeps whichever copy comes first, so a fixed order makes the result
# reproducible between runs and machines.
classes = sorted(
    d for d in os.listdir(input_root)
    if os.path.isdir(os.path.join(input_root, d))
)

for cls in classes:
    class_dir = os.path.join(input_root, cls)
    # Compare extensions case-insensitively so files such as "clip.MP4" are kept.
    videos = sorted(
        v for v in os.listdir(class_dir)
        if v.lower().endswith(video_extensions)
    )
    for vid in tqdm(videos, desc=f"Embedding {cls}"):
        vpath = os.path.join(class_dir, vid)
        emb = get_video_embedding(vpath, num_frames=num_sample_frames)
        if emb is None:
            skipped_videos.append(vpath)
            continue
        video_info.append((vpath, cls, emb))

if skipped_videos:
    # Surface unreadable files instead of dropping them silently.
    print(f"\n⚠️ Skipped {len(skipped_videos)} videos that could not be embedded.")

print(f"\n✅ Collected embeddings for {len(video_info)} videos.")

# Greedy duplicate filter: walk the videos in order and keep one only when its
# embedding stays below the cosine-similarity threshold against every earlier
# video that has not itself been removed.
unique_videos = []
removed_videos = set()

for pos, (cand_path, cand_cls, cand_emb) in enumerate(
        tqdm(video_info, desc="Checking duplicates")):
    if cand_path in removed_videos:
        continue
    for prev_path, _prev_cls, prev_emb in video_info[:pos]:
        if prev_path in removed_videos:
            continue
        score = F.cosine_similarity(cand_emb, prev_emb, dim=0).item()
        if score >= similarity_threshold:
            # First match is enough: mark the candidate and stop comparing.
            removed_videos.add(cand_path)
            break
    else:
        # Inner loop finished without a match, so the candidate is unique.
        unique_videos.append((cand_path, cand_cls))

print(f"\n✅ Unique videos kept: {len(unique_videos)} / {len(video_info)}")

# Write every kept video into <output_root>/<class name>/, creating the class
# folder on demand; copy2 also carries over the file metadata.
for src_path, label in tqdm(unique_videos, desc="Copying unique videos"):
    target_dir = os.path.join(output_root, label)
    ensure_dir(target_dir)
    shutil.copy2(src=src_path, dst=target_dir)

print(f"\n🎉 Finished! Unique videos saved in: {output_root}")


Embedding no_theft: 100%|██████████| 531/531 [01:07<00:00,  7.92it/s]
Embedding store_theft: 100%|██████████| 324/324 [00:56<00:00,  5.78it/s]



✅ Collected embeddings for 855 videos.


Checking duplicates: 100%|██████████| 855/855 [00:02<00:00, 400.55it/s] 



✅ Unique videos kept: 637 / 855


Copying unique videos: 100%|██████████| 637/637 [00:00<00:00, 806.37it/s]


🎉 Finished! Unique videos saved in: new_dataset



