### Imports

In [13]:
import os
import re
import cv2
import numpy as np
import torch
import faiss
from PIL import Image
from transformers import AutoProcessor, AutoModel
import av
import math
from utils import *
import pandas as pd

# Filesystem locations shared by the cells below.
# Input: directory that is scanned and processed for video clips.
VIDEO_DIR = "input_videos"

# Outputs of the indexing cell: embedding rows, their ids, and the FAISS index.
EMB_FILE = "embeddings.npy"
ID_FILE = "embedding_ids.npy"
INDEX_PATH = "clip_index.faiss"

### Select compute device (GPU if available)

In [6]:
# Pick the compute device for the model.
# The second GPU ("cuda:1") is preferred, but it only exists when at least two
# CUDA devices are visible; on a single-GPU host fall back to "cuda:0" instead
# of naming a device that is not there. Without CUDA, run on the CPU.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = f"cuda:{gpu_index}"
    print(f"running on GPU: {device}")
else:
    device = "cpu"
    print("running on CPU")

running on GPU: cuda:1


### Infer, index, store 

In [3]:
# --- Model configuration -----------------------------------------------------
CKPT = "microsoft/xclip-base-patch32"
processor = AutoProcessor.from_pretrained(CKPT)
model = AutoModel.from_pretrained(CKPT)
model = model.to(device)
dim = 512    # column count of the embedding store and dimension of the FAISS index
BATCH = 256  # forwarded to process_video_directory (presumably clips per batch)

# --- Backing arrays for embeddings and their ids -------------------------------
# One row is reserved per segment reported by scan_dir; unused rows keep the
# init_value they were created with (0.0 for embeddings, -1 for ids).
max_rows = scan_dir(VIDEO_DIR)
print(f"number of total segments in directory: {max_rows}")

emb_memmap = create_memap(
    file_path=EMB_FILE, dtype=np.float32, shape=(max_rows, dim), init_value=0.0
)
id_memmap = create_memap(
    file_path=ID_FILE, dtype=np.int64, shape=(max_rows,), init_value=-1
)

# --- FAISS index: flat L2 index wrapped so vectors can carry explicit ids ------
index = faiss.IndexIDMap2(faiss.IndexFlatL2(dim))

# --- Run inference and fill the index / backing arrays -------------------------
total_clips, final_write_ptr = process_video_directory(
    VIDEO_DIR,
    processor,
    model,
    index,
    emb_memmap,
    id_memmap,
    BATCH,
)

last_row = final_write_ptr - 1
print(f"Successfully processed {total_clips} video clips")
print(f"Data written to positions 0-{last_row} in memory arrays")
# The write pointer must equal the clip count before the index is persisted.
assert final_write_ptr == total_clips, "Mismatch between clips and write position!"

# --- Persist the index ----------------------------------------------------------
faiss.write_index(index, INDEX_PATH)
print("FAISS index saved to disk.")

number of total segments in directory: 1737
[Batch 1] Inference on 256 IDs: 10 … 1626 → writing at rows 0 … 255
[Batch 1] added to FAISS (index size now: 256)
[Batch 2] Inference on 256 IDs: 1627 … 227 → writing at rows 256 … 511
[Batch 2] added to FAISS (index size now: 512)
[Batch 3] Inference on 256 IDs: 228 … 2796 → writing at rows 512 … 767
[Batch 3] added to FAISS (index size now: 768)
[Batch 4] Inference on 256 IDs: 2797 … 314 → writing at rows 768 … 1023
[Batch 4] added to FAISS (index size now: 1024)
[Batch 5] Inference on 256 IDs: 315 … 394 → writing at rows 1024 … 1279
[Batch 5] added to FAISS (index size now: 1280)
[Batch 6] Inference on 256 IDs: 395 … 643 → writing at rows 1280 … 1535
[Batch 6] added to FAISS (index size now: 1536)
[Batch 7] Inference on 200 IDs: 644 … 897 → writing at rows 1536 … 1735
[Batch 7] added to FAISS (index size now: 1736)
Total segments written: 1736
FAISS index size: 1736
Successfully processed 1736 video clips
Data written to positions 0-1735 

### Find duplicates

In [17]:
# Load the FAISS index and the embeddings / ids written by the indexing cell.
# The path constants from the config cell are reused so this cell cannot drift
# from the files the indexing cell actually wrote.
index = faiss.read_index(INDEX_PATH)
emb_matrix = np.load(EMB_FILE)  # float32, shape=(N, D)
id_array = np.load(ID_FILE)     # int64, shape=(N,)
assert emb_matrix.shape[0] == id_array.shape[0], "Embedding/ID row count mismatch!"

# The backing arrays are allocated for every scanned segment, but rows that were
# never written keep the id sentinel -1 (and an all-zero embedding). Those rows
# are not real clips, so they are excluded from the query set.
query_matrix = emb_matrix[id_array != -1]

# Brute-force range query: every stored embedding is searched against the index.
# NOTE(review): the index was built as IndexFlatL2, which reports squared L2
# distances, so `radius` bounds the squared distance — confirm this is intended.
radius = 0.001
lim, distance_matrix, identity_matrix = index.range_search(query_matrix, radius)

pairs = find_duplicates(lim, distance_matrix, identity_matrix)

# Reconstruct filenames: map each clip id back to (file name, clip index).
clip_id_lookup = build_clip_id_lookup()

for a, b, distance in pairs:
    # Ids missing from the lookup are reported as UNKNOWN rather than raising.
    file_a, idx_a = clip_id_lookup.get(a, ("UNKNOWN", -1))
    file_b, idx_b = clip_id_lookup.get(b, ("UNKNOWN", -1))
    print(f"{file_a} [clip {idx_a}] <-> {file_b} [clip {idx_b}] (distance: {distance:.4f})")

TNS_0024_V.mp4 [clip 0] <-> TNS_0025_V.mp4 [clip 0] (distance: 0.0000)
TNS_0024_V.mp4 [clip 1] <-> TNS_0025_V.mp4 [clip 1] (distance: 0.0000)
TNS_0024_V.mp4 [clip 2] <-> TNS_0025_V.mp4 [clip 2] (distance: 0.0000)
TNS_0024_V.mp4 [clip 3] <-> TNS_0025_V.mp4 [clip 3] (distance: 0.0001)
TNS_0024_V.mp4 [clip 4] <-> TNS_0025_V.mp4 [clip 4] (distance: 0.0000)
TNS_0024_V.mp4 [clip 5] <-> TNS_0025_V.mp4 [clip 5] (distance: 0.0004)
TNS_0024_V.mp4 [clip 6] <-> TNS_0025_V.mp4 [clip 6] (distance: 0.0000)
TNS_0024_V.mp4 [clip 7] <-> TNS_0025_V.mp4 [clip 7] (distance: 0.0000)
TNS_0024_V.mp4 [clip 8] <-> TNS_0025_V.mp4 [clip 8] (distance: 0.0000)
TNS_0024_V.mp4 [clip 9] <-> TNS_0025_V.mp4 [clip 9] (distance: 0.0001)
TNS_0024_V.mp4 [clip 10] <-> TNS_0025_V.mp4 [clip 10] (distance: 0.0001)
TNS_0024_V.mp4 [clip 11] <-> TNS_0025_V.mp4 [clip 11] (distance: 0.0000)
TNS_0024_V.mp4 [clip 12] <-> TNS_0025_V.mp4 [clip 12] (distance: 0.0001)
TNS_0024_V.mp4 [clip 13] <-> TNS_0025_V.mp4 [clip 13] (distance: 0.0000