### Imports

In [1]:
import os
import re
import cv2
import numpy as np
import torch
import faiss
from PIL import Image
from transformers import AutoProcessor, AutoModel
import av
import math
from utils import *
import pandas as pd
from collections import defaultdict


VIDEO_DIR = "input_videos"
EMB_FILE  = "embeddings.npy"
ID_FILE   = "embedding_ids.npy"
INDEX_PATH = "clip_index.faiss"

  from .autonotebook import tqdm as notebook_tqdm


### Load GPU

In [2]:
device = "cuda:1" if torch.cuda.is_available() else "cpu"
if device == "cuda:1":
    print(f"running on GPU: {device}")
else:
    print("running on CPU")

running on GPU: cuda:1


### Infer, index, store 

In [3]:
# Model config 
CKPT      = "microsoft/xclip-base-patch32"
processor = AutoProcessor.from_pretrained(CKPT)
model     = AutoModel.from_pretrained(CKPT).to(device)
dim = 512
BATCH = 256

# Create memmaps
max_rows = scan_dir(VIDEO_DIR)
print(f"number of total segments in directory: {max_rows}")
emb_memmap = create_memap(file_path=EMB_FILE, 
                          dtype=np.float32,
                          shape=(max_rows, dim),
                          init_value=0.0)

id_memmap = create_memap(file_path=ID_FILE,
                        dtype=np.int64,
                        shape=(max_rows,),
                        init_value=-1)

# Create index 
index = faiss.IndexFlatL2(dim)
index = faiss.IndexIDMap2(index)

# Run infernece and index 
total_clips, final_write_ptr = process_video_directory(
    VIDEO_DIR, processor, model, index, emb_memmap, id_memmap, BATCH
)

print(f"Successfully processed {total_clips} video clips")
print(f"Data written to positions 0-{final_write_ptr-1} in memory arrays")
assert final_write_ptr == total_clips, "Mismatch between clips and write position!"

faiss.write_index(index, INDEX_PATH)
print("FAISS index saved to disk.")

number of total segments in directory: 1737
[Batch 1] Inference on 256 IDs: 10 … 1626 → writing at rows 0 … 255
[Batch 1] added to FAISS (index size now: 256)
[Batch 2] Inference on 256 IDs: 1627 … 227 → writing at rows 256 … 511
[Batch 2] added to FAISS (index size now: 512)
[Batch 3] Inference on 256 IDs: 228 … 2796 → writing at rows 512 … 767
[Batch 3] added to FAISS (index size now: 768)
[Batch 4] Inference on 256 IDs: 2797 … 314 → writing at rows 768 … 1023
[Batch 4] added to FAISS (index size now: 1024)
[Batch 5] Inference on 256 IDs: 315 … 394 → writing at rows 1024 … 1279
[Batch 5] added to FAISS (index size now: 1280)
[Batch 6] Inference on 256 IDs: 395 … 643 → writing at rows 1280 … 1535
[Batch 6] added to FAISS (index size now: 1536)
[Batch 7] Inference on 200 IDs: 644 … 897 → writing at rows 1536 … 1735
[Batch 7] added to FAISS (index size now: 1736)
Total segments written: 1736
FAISS index size: 1736
Successfully processed 1736 video clips
Data written to positions 0-1735 

### Find duplicates

In [2]:
# Load FAISS index & embedddings 
index = faiss.read_index("clip_index.faiss")
emb_matrix = np.load("embeddings.npy")      # float32, shape=(N, D)
id_array   = np.load("embedding_ids.npy")    # int64 or int32, shape=(N,)

# Brute force query for entire embedding matrix
radius = 0.1 
print(f"radius: {radius}")
lim, distance_matrix, identity_matrix = index.range_search(emb_matrix, radius)

from scipy.spatial.distance import cdist
all_distances = cdist(emb_matrix, emb_matrix, metric='sqeuclidean') # Compute all pairwise squared L2 distances
np.fill_diagonal(all_distances, np.nan) # Ignore self-distances (diagonal)
min_dist = np.nanmin(all_distances)
max_dist = np.nanmax(all_distances)
print("Global min distance:", min_dist)
print("Global max distance:", max_dist)

# Find duplicates
pairs = find_duplicates(lim, distance_matrix, identity_matrix, id_array)

# Reconstruct filenames 
clip_id_lookup = clip_id_lookup(VIDEO_DIR)
print(clip_id_lookup)
# Find length of each video
video_lengths = {}
for fn in os.listdir(VIDEO_DIR):
    if not fn.lower().endswith((".mp4", ".mov", ".avi")):
        continue
    file_path = os.path.join(VIDEO_DIR, fn)
    try:
        container = av.open(file_path)
        duration = container.duration
        if duration is not None:
            video_lengths[fn] = duration / 1e6  # seconds
        else:
            video_lengths[fn] = 0
        container.close()
    except Exception:
        video_lengths[fn] = 0

# Dictionary to count duplicate clips between file pairs
file_pair_counts = defaultdict(int)

for a, b, distance in pairs:
    file_a, idx_a = clip_id_lookup.get(a, ("unknown", -1)) # unknown is fallback
    file_b, idx_b = clip_id_lookup.get(b, ("unknown", -1))
    if file_a == file_b:
        continue  # Skip self-matches
    #print(f"{file_a} [clip {idx_a}] <-> {file_b} [clip {idx_b}] (distance: {distance:.4f})")
    
    # Sort file names to avoid (A,B) and (B,A) being counted separately
    file_pair = tuple(sorted([file_a, file_b]))
    file_pair_counts[file_pair] += 1
    
print("Total unique file pairs with duplicate clips:", len(file_pair_counts))
# Print out duplicate file pairs and the number of duplicate clips
for (file1, file2), dup_count in file_pair_counts.items():
    len1 = video_lengths.get(file1, "unknown")
    len2 = video_lengths.get(file2, "unknown")
    print(f"{file1} (len: {len1:.1f}s) <-> {file2} (len: {len2:.1f}s): {dup_count} duplicate clips")


radius: 0.1
Global min distance: 0.0
Global max distance: 1019.1456805885576
{10: ('TNS_0001_V.mp4', 0), 11: ('TNS_0001_V.mp4', 1), 12: ('TNS_0001_V.mp4', 2), 13: ('TNS_0001_V.mp4', 3), 14: ('TNS_0001_V.mp4', 4), 15: ('TNS_0001_V.mp4', 5), 16: ('TNS_0001_V.mp4', 6), 17: ('TNS_0001_V.mp4', 7), 18: ('TNS_0001_V.mp4', 8), 19: ('TNS_0001_V.mp4', 9), 110: ('TNS_0011_V.MP4', 0), 111: ('TNS_0011_V.MP4', 1), 112: ('TNS_0011_V.MP4', 2), 113: ('TNS_0011_V.MP4', 3), 114: ('TNS_0011_V.MP4', 4), 115: ('TNS_0011_V.MP4', 5), 116: ('TNS_0011_V.MP4', 6), 117: ('TNS_0001_V.mp4', 17), 118: ('TNS_0001_V.mp4', 18), 119: ('TNS_0001_V.mp4', 19), 120: ('TNS_0012_V.MP4', 0), 121: ('TNS_0012_V.MP4', 1), 122: ('TNS_0012_V.MP4', 2), 123: ('TNS_0012_V.MP4', 3), 124: ('TNS_0012_V.MP4', 4), 125: ('TNS_0012_V.MP4', 5), 126: ('TNS_0012_V.MP4', 6), 127: ('TNS_0001_V.mp4', 27), 128: ('TNS_0001_V.mp4', 28), 129: ('TNS_0001_V.mp4', 29), 130: ('TNS_0013_V.mp4', 0), 131: ('TNS_0013_V.mp4', 1), 132: ('TNS_0013_V.mp4', 2), 13

In [4]:
from torch.utils.tensorboard import SummaryWriter

# Load data
embeddings = np.load("embeddings.npy")
ids = np.load("embedding_ids.npy")

# Create a TensorBoard writer
log_dir = "runs/embeds" 
writer = SummaryWriter(log_dir)

# Get a mapping from embedding ID to (filename, clip_idx)
id_to_filename = {}
for k, v in clip_id_lookup.items():
    id_to_filename[k] = v[0]  # v[0] is the filename

# Build metadata: for each embedding id, get the filename
metadata = [id_to_filename.get(i, "unknown") for i in ids]

writer.add_embedding(
    embeddings,
    metadata=metadata,
    tag="my_embeddings"
)

writer.close()

print(f"Done. Run:\n  tensorboard --logdir={log_dir}\nThen open http://localhost:6006/#projector")

Done. Run:
  tensorboard --logdir=runs/embeds
Then open http://localhost:6006/#projector
