### Imports

In [1]:
import os
import re
import cv2
import numpy as np
import torch
import faiss
from PIL import Image
from transformers import AutoProcessor, AutoModel
import av
import math
from utils import *
import pandas as pd
from collections import defaultdict
from torch.utils.tensorboard import SummaryWriter
from scipy.spatial.distance import cdist
import pickle




# --- Paths and artifact locations -------------------------------------------
# Directory scanned for input videos. To point the notebook at another folder
# (e.g. a local "input_videos" sample) export VIDEO_DIR before starting the
# kernel instead of editing this cell; an unset or empty variable falls back
# to the original external-volume location.
VIDEO_DIR = os.environ.get("VIDEO_DIR") or "/Volumes/Sudan/sudan_stuff"
EMB_FILE  = "embeddings.npy"       # embedding array file passed to create_memap
ID_FILE   = "embedding_ids.npy"    # clip-ID array file passed to create_memap
INDEX_PATH = "clip_index.faiss"    # target of faiss.write_index / read_index

  from .autonotebook import tqdm as notebook_tqdm


### Select compute device

In [2]:
# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
# `device` is assigned on every path because later cells call `.to(device)`.
if torch.cuda.is_available():
    # Keep the original preference for GPU 1, but fall back to GPU 0 on
    # single-GPU machines where "cuda:1" is not a valid device.
    cuda_idx = 1 if torch.cuda.device_count() > 1 else 0
    device = f"cuda:{cuda_idx}"
    print(f"device:{device}")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print(f"device:{device}")
else:
    # Previously this branch left `device` undefined, causing a NameError later.
    device = torch.device("cpu")
    print(f"Plain ol' CPU")

device:mps


### Infer, index, store 

In [None]:
# --- X-CLIP checkpoint and batching parameters ---
CKPT = "microsoft/xclip-base-patch32"
processor = AutoProcessor.from_pretrained(CKPT)
model = AutoModel.from_pretrained(CKPT).to(device)
dim = 512        # row width of the embedding array and of the FAISS index
BATCH = 256      # forwarded to process_video_directory
clip_time = 10   # segment length, in seconds

# --- Disk-backed output arrays, sized from a scan of the video directory ---
max_rows = scan_dir(VIDEO_DIR, clip_time)
print(f"number of total segments in directory: {max_rows}")
emb_memmap = create_memap(
    file_path=EMB_FILE, dtype=np.float32, shape=(max_rows, dim), init_value=0.0
)
id_memmap = create_memap(
    file_path=ID_FILE, dtype=np.int64, shape=(max_rows,), init_value=-1
)

# --- Flat L2 index wrapped in an ID map so vectors can carry explicit IDs ---
index = faiss.IndexIDMap2(faiss.IndexFlatL2(dim))

# --- Run the directory through the helper ---
# Presumably this embeds each clip and fills the arrays and the index
# (see utils.process_video_directory to confirm).
name_dict, total_clips, final_write_ptr = process_video_directory(
    VIDEO_DIR,
    processor,
    model,
    index,
    emb_memmap,
    id_memmap,
    BATCH,
    clip_time,
)

print(f"Successfully processed {total_clips} video clips")
print(f"Data written to positions 0-{final_write_ptr-1} in memory arrays")
# The write pointer must have advanced exactly once per processed clip.
assert final_write_ptr == total_clips, "Mismatch between clips and write position!"

# --- Persist the populated index ---
faiss.write_index(index, INDEX_PATH)
print("FAISS index saved to disk.")

number of total segments in directory: 46555
[Batch 1] Inference on 256 IDs: 0 … 255 → writing at rows 0 … 255
[Batch 1] added to FAISS (index size now: 256)
[Batch 2] Inference on 256 IDs: 256 … 511 → writing at rows 256 … 511
[Batch 2] added to FAISS (index size now: 512)
[Batch 3] Inference on 256 IDs: 512 … 767 → writing at rows 512 … 767
[Batch 3] added to FAISS (index size now: 768)
[Batch 4] Inference on 256 IDs: 768 … 1023 → writing at rows 768 … 1023
[Batch 4] added to FAISS (index size now: 1024)
[Batch 5] Inference on 256 IDs: 1024 … 1279 → writing at rows 1024 … 1279
[Batch 5] added to FAISS (index size now: 1280)
[Batch 6] Inference on 256 IDs: 1280 … 1535 → writing at rows 1280 … 1535


### Find duplicates

In [None]:
def _fmt_len(seconds):
    """Format a video length for display; non-numeric placeholders are shown as-is."""
    if isinstance(seconds, (int, float)):
        return f"{seconds:.1f}s"
    return str(seconds)


# Load the FAISS index and the embedding / ID arrays through the shared path
# constants, so this cell always reads the files the indexing cell wrote.
index = faiss.read_index(INDEX_PATH)
emb_matrix = np.load(EMB_FILE)    # float32, shape=(N, D)
id_array   = np.load(ID_FILE)     # int64, shape=(N,)

# Drop rows that were never filled (the ID array is initialised to -1).
valid      = id_array >= 0
emb_matrix = emb_matrix[valid]
id_array   = id_array[valid]

# Brute-force range query of every embedding against the index.
# NOTE: the index wraps IndexFlatL2, which reports squared L2 distances, so
# `radius` is a threshold on the squared distance.
radius = 0.001
print(f"radius: {radius}")
lim, distance_matrix, identity_matrix = index.range_search(emb_matrix, radius)

# Find duplicates
pairs = find_duplicates(lim, distance_matrix, identity_matrix, id_array)

# Persist the clip-ID lookup produced by the indexing cell.
with open("clip_name_dict.pkl", "wb") as f:
    pickle.dump(name_dict, f)
print(f"Saved clip‐ID lookup to clip_name_dict.pkl ({len(name_dict)} entries)")

# Length of each video in seconds; 0 marks files whose duration is unavailable.
video_lengths = {}
for fn in os.listdir(VIDEO_DIR):
    if not fn.lower().endswith((".mp4", ".mov", ".avi")):
        continue
    file_path = os.path.join(VIDEO_DIR, fn)
    try:
        # The context manager closes the container even if reading fails.
        with av.open(file_path) as container:
            duration = container.duration
    except Exception as exc:
        # Best effort: keep scanning, but report the unreadable file.
        print(f"Warning: could not read duration of {fn}: {exc}")
        video_lengths[fn] = 0
        continue
    # container.duration is in microseconds; convert to seconds.
    video_lengths[fn] = duration / 1e6 if duration is not None else 0

# Count duplicate clips per unordered pair of files.
file_pair_counts = defaultdict(int)

for a, b, distance in pairs:
    file_a, idx_a = name_dict.get(a, ("unknown", -1))  # "unknown" is the fallback
    file_b, idx_b = name_dict.get(b, ("unknown", -1))
    if file_a == file_b:
        continue  # Skip matches within the same file

    # Sort file names so (A, B) and (B, A) share one counter.
    file_pair = tuple(sorted([file_a, file_b]))
    file_pair_counts[file_pair] += 1

print("Total unique file pairs with duplicate clips:", len(file_pair_counts))
# Report each file pair with its number of duplicate clips. Lengths go through
# _fmt_len because the "unknown" fallback is a string and cannot take ":.1f".
for (file1, file2), dup_count in file_pair_counts.items():
    len1 = video_lengths.get(file1, "unknown")
    len2 = video_lengths.get(file2, "unknown")
    print(f"{file1} (len: {_fmt_len(len1)}) <-> {file2} (len: {_fmt_len(len2)}): {dup_count} duplicate clips")


NameError: name 'faiss' is not defined

In [None]:
# Load the embedding and ID arrays through the shared path constants, so this
# cell reads the same files the indexing cell wrote.
embeddings = np.load(EMB_FILE)    # shape = (max_rows, dim)
ids        = np.load(ID_FILE)     # shape = (max_rows,)

# Filter out unused slots (the ID array is initialised to -1).
valid_mask = ids >= 0
embeddings = embeddings[valid_mask]
ids        = ids[valid_mask]

# Human-readable label per clip ID, formatted as "<file name>[<clip index>]".
id_to_label = {
    cid: f"{fn}[{idx}]"
    for cid, (fn, idx) in name_dict.items()
}

# Metadata: exactly one label per valid embedding row.
metadata = [id_to_label.get(i, "unknown") for i in ids]

# Sanity check before any TensorBoard files are created.
assert embeddings.shape[0] == len(metadata), (
    f"Embeddings rows {embeddings.shape[0]} != metadata lines {len(metadata)}"
)

# Write the projector data; the writer is closed even if the export fails.
log_dir = "runs/embeds"
writer  = SummaryWriter(log_dir)
try:
    writer.add_embedding(
        embeddings,
        metadata=metadata,
        tag="my_embeddings")
finally:
    writer.close()

print(f"Done. Run:\n  tensorboard --logdir={log_dir}\nThen open http://localhost:6006/#projector")