### Imports

In [1]:
import os
import re
import cv2
import numpy as np
import torch
import faiss
from PIL import Image
from transformers import AutoProcessor, AutoModel
import av
import math
from utils import *
import pandas as pd
from collections import defaultdict
from torch.utils.tensorboard import SummaryWriter
from scipy.spatial.distance import cdist
import pickle




# Paths shared by the cells below.
#VIDEO_DIR = "/Volumes/Sudan/sudan_stuff"
# Directory scanned for input videos (passed to scan_dir / process_video_directory
# and listed with os.listdir in the duplicate-finding cell).
VIDEO_DIR = "input_videos"
# File backing the embedding array: created through create_memap and read back
# with np.load in the later cells.
EMB_FILE  = "embeddings.npy"
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether utils still uses it or whether it can be dropped.
ID_FILE   = "embedding_ids.npy"
# Location the FAISS index is written to (faiss.write_index) and reloaded from
# (faiss.read_index).
INDEX_PATH = "clip_index.faiss"

  from .autonotebook import tqdm as notebook_tqdm


### Select compute device (CUDA, MPS, or CPU)

In [2]:
# Choose the device the model will run on: CUDA first, then Apple MPS, then CPU.
# `device` is always assigned so that the later `.to(device)` call works on
# every machine.
if torch.cuda.is_available():
    # Keep the original preference for the second GPU, but fall back to the
    # first one on single-GPU hosts where "cuda:1" is not a valid ordinal.
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
    print(f"device:{device}")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print(f"device:{device}")
else:
    # Previously this branch only printed, leaving `device` undefined and
    # causing a NameError in the next cell on CPU-only machines.
    device = torch.device("cpu")
    print(f"Plain ol' CPU")

device:mps


### Infer, index, store 

In [3]:
# Embed every clip found under VIDEO_DIR, add the vectors to a flat L2 FAISS
# index, and persist the clip-name list and the index to disk.

# Model config
CKPT      = "microsoft/xclip-base-patch32"
processor = AutoProcessor.from_pretrained(CKPT)
model     = AutoModel.from_pretrained(CKPT).to(device)
dim = 512        # vector width shared by the embedding array and the FAISS index
BATCH = 256      # handed to process_video_directory — presumably clips per batch; see utils
clip_time = 30 # In seconds

# Create memmaps
max_rows = scan_dir(VIDEO_DIR, clip_time)
print(f"number of total segments in directory: {max_rows}")

emb_memmap = create_memap(file_path=EMB_FILE,
                          dtype=np.float32,
                          shape=(max_rows, dim),
                          init_value=0.0)

# Create index
index = faiss.IndexFlatL2(dim)

name_dict, total_clips, final_write_ptr = process_video_directory(
    VIDEO_DIR, processor,
    model, index, emb_memmap,
    BATCH, clip_time)

# Validate BEFORE persisting anything, so a bookkeeping mismatch cannot leave
# an inconsistent pickle/index pair on disk. Raised explicitly (same exception
# type as the original `assert`) so the check also runs under `python -O`.
if final_write_ptr != total_clips:
    raise AssertionError("Mismatch between clips and write position!")

# create_memap lives in utils and its return type is not visible here; if it
# is an np.memmap, flush() pushes pending writes to EMB_FILE before later
# cells read the file back with np.load.
_flush = getattr(emb_memmap, "flush", None)
if callable(_flush):
    _flush()

with open("clip_name_list.pkl", "wb") as f:
    pickle.dump(name_dict, f)

print(f"Successfully processed {total_clips} video clips")
print(f"Data written to positions 0-{final_write_ptr-1} in memory arrays")

faiss.write_index(index, INDEX_PATH)
print("FAISS index saved to disk.")

number of total segments in directory: 605
Start process_video_directory
    - embedding batch 1: adding 256 clips (total after this: 256)
Indexed batch #1, total vectors indexed: 256
    - embedding batch 2: adding 256 clips (total after this: 512)
Indexed batch #2, total vectors indexed: 512
    ‑- embedding batch 3 (final)
✅ done: 604 clips added
    final index.ntotal = 604
Successfully processed 604 video clips
Data written to positions 0-603 in memory arrays
FAISS index saved to disk.


**Sanity check**

In [4]:
# Cross-check the three persisted artifacts against each other: the clip-name
# list, the embedding file and the FAISS index must all describe the same
# number of clips.
with open("clip_name_list.pkl", "rb") as name_file:
    name_dict = pickle.load(name_file)

emb = np.load(EMB_FILE)   # shape = (N, D)

print(f"names: {len(name_dict)}, embeds: {emb.shape[0]}")

# Fewer embeddings than names cannot be repaired here, so bail out first.
if emb.shape[0] < len(name_dict):
    raise ValueError(f"ⓧ Too few embeddings: {emb.shape[0]} vs {len(name_dict)} names")

# Surplus trailing rows have no name attached; report and discard them.
if emb.shape[0] > len(name_dict):
    extra = list(range(len(name_dict), emb.shape[0]))
    print(f"⚠️ Dropping orphan embedding index(es): {extra}")
    emb = emb[:len(name_dict)]

# After trimming, both sides must agree exactly.
assert len(name_dict) == emb.shape[0], (
    f"❌ Length mismatch: {len(name_dict)} names vs {emb.shape[0]} embeddings"
)
print(f"✅ name_list and embeddings both have {len(name_dict)} entries")

# Show the first, middle and last mapping as a spot check.
sample_ids = (0, len(name_dict) // 2, len(name_dict) - 1)
for idx in sample_ids:
    fn, clip_idx = name_dict[idx]
    print(f"ID {idx:4d} → file: {fn!r}, clip index: {clip_idx}")


# The index on disk must hold exactly one vector per kept embedding.
index = faiss.read_index(INDEX_PATH)
print(f"Index.ntotal = {index.ntotal}, embeddings loaded = {emb.shape[0]}")
assert index.ntotal == emb.shape[0], (
    f"❌ Index contains {index.ntotal} vectors but we have {emb.shape[0]} embeddings"
)

names: 604, embeds: 605
⚠️ Dropping orphan embedding index(es): [604]
✅ name_list and embeddings both have 604 entries
ID    0 → file: 'TNS_0001_V.mp4', clip index: 0
ID  302 → file: 'TNS_0030_V.mp4', clip index: 25
ID  603 → file: 'TNS_0089_V.mp4', clip index: 2
Index.ntotal = 604, embeddings loaded = 604


### Find duplicates

In [5]:
# Find near-duplicate clips with a FAISS radius search, then summarise how
# many duplicate clips each pair of distinct video files shares.
with open("clip_name_list.pkl", "rb") as f:
    name_dict = pickle.load(f)
print(f"Loaded name_list ({len(name_dict)} entries)")

# Load FAISS index & embeddings (only the rows that have a name)
index      = faiss.read_index(INDEX_PATH)
emb_matrix = np.load(EMB_FILE)[:len(name_dict)]      # float32, shape=(N, D)

# Radius search
radius = 0.01
print(f"Searching for pairs in a radius of: {radius}")
lim, distance_matrix, identity_matrix = index.range_search(emb_matrix, radius)

# Find duplicate pairs (IDs are positions in the index)
query_ids = np.arange(emb_matrix.shape[0])               # one ID per embedding
pairs     = find_duplicates(
    lim,
    distance_matrix,
    identity_matrix,
    query_ids
)


def _probe_video(file_path):
    """Return ``(duration_seconds, laplacian_variance)`` for one video file.

    Best effort, mirroring the original cell's fallbacks:
    * the file cannot be opened      -> ``(0, None)``
    * the first frame cannot be read -> ``(duration, None)``

    The container is opened once and always closed, even on errors.
    """
    try:
        with av.open(file_path) as container:
            # duration may be None; divide by 1e6 to express it in seconds.
            duration_s = (container.duration or 0) / 1e6
            try:
                # Decode only the first frame of the first video stream.
                frame = next(container.decode(video=0))
                gray = cv2.cvtColor(frame.to_ndarray(format="bgr24"),
                                    cv2.COLOR_BGR2GRAY)
                # Variance of the Laplacian is used as a sharpness score.
                sharpness = float(cv2.Laplacian(gray, cv2.CV_64F).var())
            except Exception as exc:
                print(f"⚠️ Could not measure sharpness of {file_path}: {exc!r}")
                sharpness = None
            return duration_s, sharpness
    except Exception as exc:
        print(f"⚠️ Could not open {file_path}: {exc!r}")
        return 0, None


# Gather duration and first-frame sharpness for every video in one pass.
video_lengths = {}
video_variance = {}
for fn in os.listdir(VIDEO_DIR):
    if not fn.lower().endswith((".mp4", ".mov", ".avi")):
        continue
    video_lengths[fn], video_variance[fn] = _probe_video(os.path.join(VIDEO_DIR, fn))

# Count duplicates per file‐pair
file_pair_counts = defaultdict(int)
for a, b, distance in pairs:
    fn_a, idx_a = name_dict[a]
    fn_b, idx_b = name_dict[b]
    if fn_a == fn_b:
        # Matches inside the same file are not cross-file duplicates.
        continue
    # Sort so (A, B) and (B, A) are counted under the same key.
    pair = tuple(sorted([fn_a, fn_b]))
    file_pair_counts[pair] += 1

# Print summary
print("Total unique file pairs with duplicate clips:", len(file_pair_counts))
for (file1, file2), dup_count in file_pair_counts.items():
    len1 = video_lengths.get(file1, 0)
    len2 = video_lengths.get(file2, 0)
    # Sharpness scores are looked up per file; they are not part of the
    # printed summary. (qual2 previously read file1's score by mistake.)
    qual1 = video_variance.get(file1, 0)
    qual2 = video_variance.get(file2, 0)
    print(f"{file1} (len: {len1:.1f}s) <-> {file2} (len: {len2:.1f}s): {dup_count} duplicate clips")

Loaded name_list (604 entries)
Searching for pairs in a radius of: 0.01
Total unique file pairs with duplicate clips: 2
TNS_0024_V.mp4 (len: 594.6s) <-> TNS_0025_V.mp4 (len: 594.6s): 20 duplicate clips
TNS_0030_V.mp4 (len: 2052.6s) <-> TNS_0031_V.mp4 (len: 2052.6s): 69 duplicate clips


In [6]:
# Export the named embeddings to TensorBoard's embedding projector.

# Reload the positional clip-name list written by the indexing cell.
with open("clip_name_list.pkl", "rb") as name_file:
    name_dict = pickle.load(name_file)

# Keep only the leading rows that have a name attached.
all_rows = np.load(EMB_FILE)
emb_matrix = all_rows[:len(name_dict)]

# One label per embedding row, formatted as "<file>[<clip index>]".
metadata = [ f"{fn}[{idx}]" for fn, idx in name_dict ]

# Labels and rows must line up one-to-one before they are written out.
assert emb_matrix.shape[0] == len(metadata), (
    f"❌ {emb_matrix.shape[0]} embeddings vs {len(metadata)} metadata entries"
)

# Write to TensorBoard
writer = SummaryWriter(log_dir="runs/embeds")
writer.add_embedding(emb_matrix, metadata=metadata, tag="my_embeddings")
writer.close()

print("Done. Run:\n  tensorboard --logdir=runs/embeds\nThen open http://localhost:6006/#projector")

Done. Run:
  tensorboard --logdir=runs/embeds
Then open http://localhost:6006/#projector
