## Import & Config

In [3]:
import os
import json
import numpy as np
import torch
import av
import cv2
from transformers import AutoModel, AutoTokenizer
import faiss
import pickle
from torch.utils.tensorboard import SummaryWriter
from modeling import VideoCLIP_XL

# ==== CONFIGURATION ====
VIDEO_DIR       = "../og_ds"
OUTPUT_DIR      = "ViClipXLv2_output"
FRAMES_PER_CLIP = 8
CLIP_DURATION   = 30
BATCH_SIZE      = 128

# ==== DEVICE SETUP ====
if torch.cuda.is_available():
    device = "cuda:0"
    print(f"device: {device}")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print(f"device: {device}")
else:
    device = torch.device("cpu")
    print("Plain ol' CPU")

# ==== LOAD VideoCLIP-XL-v2 ====
model = VideoCLIP_XL()  
state_dict = torch.load(
    "./VideoCLIP-XL-v2.bin",
    map_location=device
)
model.load_state_dict(state_dict)
model = model.to(device).eval()

device: cuda:0


  state_dict = torch.load(


## Create embeddings

In [None]:
# ==== FRAME EXTRACTOR ====
def read_video_clip(container, indices, fps):
    frames = []
    for idx in indices:
        ts = int((idx / fps) * 1e6)
        container.seek(ts, any_frame=False, backward=True)
        for frame in container.decode(video=0):
            frames.append(frame.to_ndarray(format="rgb24"))
            break
    return frames

# ==== FRAME NORMALIZATION + TENSOR CONVERSION ====
v_mean = np.array([0.485, 0.456, 0.406]).reshape(1, 1, 3)
v_std  = np.array([0.229, 0.224, 0.225]).reshape(1, 1, 3)

def normalize(data):
    return (data / 255.0 - v_mean) / v_std

def frames2tensor(vid_list, fnum=FRAMES_PER_CLIP, target_size=(224, 224), device=device):
    assert len(vid_list) >= fnum
    step = len(vid_list) // fnum
    sampled = vid_list[::step][:fnum]
    resized = [cv2.resize(frame[:, :, ::-1], target_size) for frame in sampled]
    normalized = [np.expand_dims(normalize(frame), axis=0) for frame in resized]  # (1, H, W, C)
    tube = np.stack(normalized, axis=1)  # (1, T, H, W, C)
    tube = np.transpose(tube, (0, 1, 4, 2, 3))  # (1, T, C, H, W)
    return torch.from_numpy(tube).to(device, non_blocking=True).float()

# ==== BATCH PROCESSING FUNCTION ====
def process_directory_to_embeddings(
    video_dir: str,
    clip_len: int = FRAMES_PER_CLIP,
    batch_size: int = BATCH_SIZE,
    OUTPUT_DIR: str = OUTPUT_DIR,
):
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    embedding_batches = []
    lookup_dict = {}
    global_clip_id = 0
    current_batch = []
    current_meta  = []

    exts = ('.mp4', '.mov', '.avi', '.webm', '.mkv', '.flv', '.wmv')

    for filename in sorted(os.listdir(video_dir)):
        if not filename.lower().endswith(exts):
            continue

        video_path = os.path.join(video_dir, filename)
        container  = av.open(video_path)
        stream     = container.streams.video[0]
        total_frames = stream.frames
        fps = float(stream.average_rate) if stream.average_rate else 1.0

        clip_index = 0
        window_size = int(CLIP_DURATION * fps)

        for start in range(0, total_frames - window_size + 1, window_size):
            indices = np.linspace(start, start + window_size, num=clip_len, endpoint=False, dtype=np.int64)
            frames  = read_video_clip(container, indices, fps)
            if len(frames) < clip_len:
                continue

            current_batch.append(frames)
            current_meta.append({
                "clip_id":        global_clip_id,
                "clip_index":     clip_index,
                "video_file":     filename,
                "start_time_sec": int(start / fps),
            })

            clip_index += 1
            global_clip_id += 1

            if len(current_batch) == batch_size:
                print(f"Running inference on batch: IDs {current_meta[0]['clip_id']}–{current_meta[-1]['clip_id']}")
                batch_tensor = torch.cat([
                    frames2tensor(frames, fnum=clip_len, device=device)
                    for frames in current_batch
                ], dim=0)
                with torch.no_grad():
                    embeds = model.vision_model.get_vid_features(batch_tensor).cpu().numpy()
                embedding_batches.append(embeds)
                for m in current_meta:
                    lookup_dict[str(m["clip_id"])] = m

                current_batch = []
                current_meta  = []

        container.close()

    # Final batch
    if current_batch:
        print(f"Running inference on final batch: IDs {current_meta[0]['clip_id']}–{current_meta[-1]['clip_id']}")
        batch_tensor = torch.cat([
            frames2tensor(frames, fnum=clip_len, device=device)
            for frames in current_batch
        ], dim=0)
        with torch.no_grad():
            embeds = model.vision_model.get_vid_features(batch_tensor).cpu().numpy()
        embedding_batches.append(embeds)
        for m in current_meta:
            lookup_dict[str(m["clip_id"])] = m

    # Save outputs
    all_embeddings = np.vstack(embedding_batches)
    np.save(os.path.join(OUTPUT_DIR, "video_embeddings.npy"), all_embeddings)
    with open(os.path.join(OUTPUT_DIR, "embedding_lookup.json"), "w") as f:
        json.dump(lookup_dict, f, indent=2)

    print(f"✅ Saved embeddings to {OUTPUT_DIR}/video_embeddings.npy")
    print(f"✅ Saved lookup to    {OUTPUT_DIR}/embedding_lookup.json")

# ==== MAIN ENTRY ====
if __name__ == "__main__":
    process_directory_to_embeddings(VIDEO_DIR)

Running inference on batch: IDs 0–127


  return fn(*args, **kwargs)


Running inference on batch: IDs 128–255
Running inference on batch: IDs 256–383
Running inference on batch: IDs 384–511
Running inference on batch: IDs 512–639
Running inference on batch: IDs 640–767
Running inference on batch: IDs 768–895
Running inference on batch: IDs 896–1023
Running inference on batch: IDs 1024–1151
Running inference on batch: IDs 1152–1279
Running inference on batch: IDs 1280–1407
Running inference on batch: IDs 1408–1535
Running inference on batch: IDs 1536–1663
Running inference on batch: IDs 1664–1791
Running inference on batch: IDs 1792–1919
Running inference on batch: IDs 1920–2047
Running inference on batch: IDs 2048–2175
Running inference on batch: IDs 2176–2303
Running inference on batch: IDs 2304–2431
Running inference on batch: IDs 2432–2559
Running inference on batch: IDs 2560–2687
Running inference on batch: IDs 2688–2815
Running inference on batch: IDs 2816–2943
Running inference on batch: IDs 2944–3071
Running inference on batch: IDs 3072–3199
Runni

Could not find ref with POC 92
Could not find ref with POC 92
Could not find ref with POC 188
Could not find ref with POC 188
Could not find ref with POC 316
Could not find ref with POC 316
Could not find ref with POC 412
Could not find ref with POC 412
Could not find ref with POC 540
Could not find ref with POC 540
Could not find ref with POC 636
Could not find ref with POC 636
Could not find ref with POC 764
Could not find ref with POC 764
Could not find ref with POC 860
Could not find ref with POC 860
Could not find ref with POC 988
Could not find ref with POC 988
Could not find ref with POC 1084
Could not find ref with POC 1084
Could not find ref with POC 1212
Could not find ref with POC 1212
Could not find ref with POC 1308
Could not find ref with POC 1308
Could not find ref with POC 1436
Could not find ref with POC 1436
Could not find ref with POC 1532
Could not find ref with POC 1532
Could not find ref with POC 1660
Could not find ref with POC 1660
Could not find ref with POC 17

Running inference on batch: IDs 3584–3711
Running inference on batch: IDs 3712–3839
Running inference on batch: IDs 3840–3967
Running inference on batch: IDs 3968–4095
Running inference on batch: IDs 4096–4223
Running inference on batch: IDs 4224–4351
Running inference on batch: IDs 4352–4479


Could not find ref with POC 5276
Could not find ref with POC 92
Could not find ref with POC 92
Could not find ref with POC 188
Could not find ref with POC 188
Could not find ref with POC 316
Could not find ref with POC 316
Could not find ref with POC 444
Could not find ref with POC 444
Could not find ref with POC 540
Could not find ref with POC 540
Could not find ref with POC 668
Could not find ref with POC 668
Could not find ref with POC 764
Could not find ref with POC 764
Could not find ref with POC 92
Could not find ref with POC 92
Could not find ref with POC 188
Could not find ref with POC 188
Could not find ref with POC 316
Could not find ref with POC 316
Could not find ref with POC 444
Could not find ref with POC 444
Could not find ref with POC 540
Could not find ref with POC 540
Could not find ref with POC 668
Could not find ref with POC 668
Could not find ref with POC 764
Could not find ref with POC 764
Could not find ref with POC 92
Could not find ref with POC 92
Could not fin

Running inference on batch: IDs 4480–4607
Running inference on batch: IDs 4608–4735
Running inference on batch: IDs 4736–4863
Running inference on batch: IDs 4864–4991
Running inference on batch: IDs 4992–5119
Running inference on batch: IDs 5120–5247
Running inference on batch: IDs 5248–5375
Running inference on batch: IDs 5376–5503
Running inference on batch: IDs 5504–5631
Running inference on batch: IDs 5632–5759
Running inference on batch: IDs 5760–5887
Running inference on batch: IDs 5888–6015


Could not find ref with POC 764
Could not find ref with POC 92
Could not find ref with POC 92
Could not find ref with POC 188
Could not find ref with POC 188
Could not find ref with POC 316
Could not find ref with POC 316
Could not find ref with POC 412
Could not find ref with POC 412
Could not find ref with POC 540
Could not find ref with POC 540
Could not find ref with POC 636
Could not find ref with POC 636
Could not find ref with POC 764
Could not find ref with POC 764
Could not find ref with POC 860
Could not find ref with POC 860
Could not find ref with POC 988
Could not find ref with POC 988
Could not find ref with POC 1084
Could not find ref with POC 1084
Could not find ref with POC 1212
Could not find ref with POC 1212
Could not find ref with POC 1308
Could not find ref with POC 1308
Could not find ref with POC 1436
Could not find ref with POC 1436
Could not find ref with POC 1532
Could not find ref with POC 1532
Could not find ref with POC 1660
Could not find ref with POC 166

Running inference on batch: IDs 6016–6143
Running inference on batch: IDs 6144–6271
Running inference on batch: IDs 6272–6399
Running inference on batch: IDs 6400–6527
Running inference on batch: IDs 6528–6655
Running inference on batch: IDs 6656–6783
Running inference on batch: IDs 6784–6911
Running inference on batch: IDs 6912–7039
Running inference on batch: IDs 7040–7167
Running inference on batch: IDs 7168–7295
Running inference on batch: IDs 7296–7423
Running inference on batch: IDs 7424–7551
Running inference on batch: IDs 7552–7679
Running inference on batch: IDs 7680–7807


## Visualize embeddings

In [None]:
# Load lookup metadata
with open("ViClipXLv2_output/embedding_lookup.json", "r") as f:
    lookup = json.load(f)

# Ensure entries are sorted by clip_id (assumed to match row index)
sorted_lookup = sorted(lookup.values(), key=lambda x: int(x["clip_id"]))

# Create metadata strings like: "video.mp4[3] @ 90s"
metadata = [
    f'{entry["video_file"]}[{entry["clip_index"]}] @ {entry["start_time_sec"]}s'
    for entry in sorted_lookup
]

# Load embedding matrix
emb_matrix = np.load("ViClipXLv2_output/video_embeddings.npy")

# Ensure alignment
assert emb_matrix.shape[0] == len(metadata), (
    f"❌ {emb_matrix.shape[0]} embeddings vs {len(metadata)} metadata entries"
)

# Write to TensorBoard
writer = SummaryWriter(log_dir="runs/embeds")
writer.add_embedding(
    emb_matrix,
    metadata=metadata,
    tag="my_embeddings"
)
writer.close()

print("✅ Done. Run:\n  tensorboard --logdir=runs/embeds\nThen open http://localhost:6006/#projector")

✅ Done. Run:
  tensorboard --logdir=runs/embeds
Then open http://localhost:6006/#projector


## Index in FAISS

In [None]:
EMB_PATH    = "ViClipXLv2_output/video_embeddings.npy"
LOOKUP_PATH = "ViClipXLv2_output/embedding_lookup.json"
INDEX_PATH  = "ViClipXLv2_output/video_embeddings.index"

# Load embeddings and lookup dict
embeddings = np.load(EMB_PATH)  
with open(LOOKUP_PATH, "r") as f:
    lookup = json.load(f)

# Build FAISS index (inner-product) and add IDs
faiss.normalize_L2(embeddings)
dim   = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index = faiss.IndexIDMap(index)

ids = np.arange(embeddings.shape[0], dtype="int64")
index.add_with_ids(embeddings, ids)

# Save the index for later
faiss.write_index(index, INDEX_PATH)

## Find duplicates

In [None]:
from collections import defaultdict
import numpy as np
import json
import faiss

def count_similar_clip_pairs(radius: float):
    """
    For each unique pair of video files, count how many clip pairs are similar
    (i.e., within the given radius), and record the average distance.
    Returns: list of (file1, file2, count, avg_distance)
    """
    embeddings = np.load(EMB_PATH)
    with open(LOOKUP_PATH, "r") as f:
        lookup = json.load(f)

    index = faiss.read_index(INDEX_PATH)
    faiss.normalize_L2(embeddings)

    lims, distances, labels = index.range_search(embeddings, radius)

    # Map (file1, file2) to list of distances
    pair_stats = defaultdict(list)

    for q in range(len(embeddings)):
        start, end = lims[q], lims[q+1]
        for i in range(start, end):
            idx = labels[i]
            if idx <= q:
                continue  # skip self or repeated
            f1 = lookup[str(q)]["video_file"]
            f2 = lookup[str(idx)]["video_file"]
            if f1 != f2:
                key = tuple(sorted((f1, f2)))
                pair_stats[key].append(distances[i])

    results = []
    for (f1, f2), dists in pair_stats.items():
        avg_dist = sum(dists) / len(dists)
        results.append((f1, f2, len(dists), avg_dist))

    return results

# Choose your similarity radius (e.g., 0.8 for cosine similarity >= 0.8)
radius = 0.999
pairs = count_similar_clip_pairs(radius)

for f1, f2, count, avg_dist in sorted(pairs, key=lambda x: -x[2]):
    print(f"{f1} <--> {f2} | similar clips: {count} | avg distance: {avg_dist:.4f}")

TNS_0030_V.mp4 <--> TNS_0031_V.mp4 | similar clips: 68 | avg distance: 1.0000
TNS_0024_V.mp4 <--> TNS_0025_V.mp4 | similar clips: 19 | avg distance: 1.0000


## Evaluate duplicates

In [None]:
import os
import re
import csv
import json
import av
import cv2
import numpy as np
import faiss
import networkx as nx

# === Configuration ===
VIDEO_DIR         = "../videos"
GROUNDTRUTH_CSV   = "../dup_groundtruth.csv"
LOOKUP_JSON       = "embedding_lookup.json"
EMB_PATH          = "embeddings.npy"
INDEX_PATH        = "index.index"
RADII             = [0.92, 0.94, 0.96, 0.98, 0.990, 0.995, 0.999]
OUTPUT_CSV        = "radius_results.csv"

# === 1. Compute sharpness and duration metrics ===
def compute_video_quality_metrics(video_dir):
    video_variance = {}
    video_duration = {}
    for fn in os.listdir(video_dir):
        if not fn.lower().endswith(('.mp4', '.mov', '.avi', '.webm', '.mkv', '.flv', '.wmv')):
            continue
        path = os.path.join(video_dir, fn)
        try:
            container = av.open(path)
            vs = container.streams.video[0]
            vs.codec_context.skip_frame = "NONKEY"
            frame = next(container.decode(vs), None)
            if frame is None:
                raise RuntimeError("no frame decoded")
            img = frame.to_ndarray(format="bgr24")
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            lap = cv2.Laplacian(gray, cv2.CV_64F)
            video_variance[fn] = float(lap.var())
            duration = (vs.duration * vs.time_base) if vs.duration and vs.time_base else 0.0
            video_duration[fn] = float(duration)
            container.close()
        except Exception as e:
            print(f"❌ {fn} failed: {e}")
            video_variance[fn] = None
            video_duration[fn] = 0.0
    return video_variance, video_duration

video_variance, video_duration = compute_video_quality_metrics(VIDEO_DIR)

# === 2. Load ground truth ===
with open(GROUNDTRUTH_CSV, newline="") as f:
    reader = csv.reader(f)
    next(reader, None)
    ground_truth = {row[0]: row[1] for row in reader}

# === 3. Load lookup, embeddings, and index ===
with open(LOOKUP_JSON) as f:
    lookup = json.load(f)
embeddings = np.load(EMB_PATH)
index = faiss.read_index(INDEX_PATH)
faiss.normalize_L2(embeddings)

# === 4. Utility functions ===
def extract_id(fn):
    m = re.search(r'TNS_(\d+)', fn)
    return m.group(1) if m else None

def build_clusters_from_faiss(labels, lims):
    G = nx.Graph()
    for q in range(len(lims) - 1):
        for i in range(lims[q], lims[q + 1]):
            k = labels[i]
            if k != q:
                G.add_edge(q, k)
    return list(nx.connected_components(G))

def assign_roles_by_heuristics(cluster):
    def quality_key(cid):
        fn = lookup[str(cid)]["video_file"]
        var = video_variance.get(fn) or 0.0
        dur = video_duration.get(fn) or 0.0
        return (var, dur, fn)
    best = max(cluster, key=quality_key)
    principal = lookup[str(best)]["video_file"]
    duplicates = [lookup[str(c)]["video_file"] for c in cluster if c != best]
    return principal, duplicates

def evaluate_clusters(clusters):
    gt_principals = {fn for fn, r in ground_truth.items() if r.upper() == "PRINCIPAL"}
    gt_duplicates = {fn for fn, r in ground_truth.items() if r.upper() == "DUPLICATE"}
    tp, fp, fn_set = set(), set(), set()

    for cluster in clusters:
        if len(cluster) <= 1:
            continue
        p_fn, d_list = assign_roles_by_heuristics(cluster)
        for d_fn in d_list:
            pid, did = extract_id(p_fn), extract_id(d_fn)
            pair = (pid, did)
            if d_fn in gt_duplicates and p_fn in gt_principals:
                tp.add(pair)
            else:
                fp.add(pair)

    for d_fn in gt_duplicates:
        for p_fn in gt_principals:
            pid, did = extract_id(p_fn), extract_id(d_fn)
            if pid and did and (pid, did) not in tp:
                fn_set.add((pid, did))

    recall = len(tp) / (len(tp) + len(fn_set)) if (len(tp) + len(fn_set)) > 0 else 0.0
    return {"tp": len(tp), "fp": len(fp), "fn": len(fn_set), "recall": recall}

# === 5. Sweep radii ===
results = {}
for r in RADII:
    lims, distances, labels = index.range_search(embeddings, r)
    clusters = build_clusters_from_faiss(labels, lims)
    res = evaluate_clusters(clusters)
    results[r] = res
    print(f"--- Radius {r:.3f} → TP:{res['tp']} FP:{res['fp']} FN:{res['fn']} Recall:{res['recall']:.2%}")

# === 6. Summary ===
print("\nSummary:")
print("Radius |  TP   FP   FN   Recall")
for r, m in results.items():
    print(f"{r:6.3f} | {m['tp']:4d} {m['fp']:4d} {m['fn']:4d}  {m['recall']:.1%}")

# === 7. Export results to CSV ===
with open(OUTPUT_CSV, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["radius", "tp", "fp", "fn", "recall"])
    for r in sorted(results):
        m = results[r]
        writer.writerow([r, m["tp"], m["fp"], m["fn"], f"{m['recall']:.4f}"])

print(f"\nResults exported to {OUTPUT_CSV}")

✅ True Positives (correctly detected): 2
❌ False Positives (wrongly flagged):   0
🔍 False Negatives (missed duplicates): 4685308
📈 Recall: 0.00%


## Find duplicates and optimal radius

In [None]:
import os
import re
import csv
import json
import av
import cv2
import numpy as np
import faiss
import networkx as nx

# === Configuration ===
VIDEO_DIR         = "../videos"
GROUNDTRUTH_CSV   = "../dup_groundtruth.csv"
LOOKUP_JSON       = "embedding_lookup.json"
EMB_PATH          = "embeddings.npy"
INDEX_PATH        = "index.index"
RADII             = [0.92, 0.94, 0.96, 0.98, 0.990, 0.995, 0.999]
OUTPUT_CSV        = "radius_results.csv"

# === 1. Compute sharpness and duration metrics ===
def compute_video_quality_metrics(video_dir):
    video_variance = {}
    video_duration = {}
    for fn in os.listdir(video_dir):
        if not fn.lower().endswith(('.mp4', '.mov', '.avi', '.webm', '.mkv', '.flv', '.wmv')):
            continue
        path = os.path.join(video_dir, fn)
        try:
            container = av.open(path)
            vs = container.streams.video[0]
            vs.codec_context.skip_frame = "NONKEY"
            frame = next(container.decode(vs), None)
            if frame is None:
                raise RuntimeError("no frame decoded")
            img = frame.to_ndarray(format="bgr24")
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            lap = cv2.Laplacian(gray, cv2.CV_64F)
            video_variance[fn] = float(lap.var())
            duration = (vs.duration * vs.time_base) if vs.duration and vs.time_base else 0.0
            video_duration[fn] = float(duration)
            container.close()
        except Exception as e:
            print(f"❌ {fn} failed: {e}")
            video_variance[fn] = None
            video_duration[fn] = 0.0
    return video_variance, video_duration

video_variance, video_duration = compute_video_quality_metrics(VIDEO_DIR)

# === 2. Load ground truth ===
with open(GROUNDTRUTH_CSV, newline="") as f:
    reader = csv.reader(f)
    next(reader, None)
    ground_truth = {row[0]: row[1] for row in reader}

# === 3. Load lookup, embeddings, and index ===
with open(LOOKUP_JSON) as f:
    lookup = json.load(f)
embeddings = np.load(EMB_PATH)
index = faiss.read_index(INDEX_PATH)
faiss.normalize_L2(embeddings)

# === 4. Utility functions ===
def extract_id(fn):
    m = re.search(r'TNS_(\d+)', fn)
    return m.group(1) if m else None

def build_clusters_from_faiss(labels, lims):
    G = nx.Graph()
    for q in range(len(lims) - 1):
        for i in range(lims[q], lims[q + 1]):
            k = labels[i]
            if k != q:
                G.add_edge(q, k)
    return list(nx.connected_components(G))

def assign_roles_by_heuristics(cluster):
    def quality_key(cid):
        fn = lookup[str(cid)]["video_file"]
        var = video_variance.get(fn) or 0.0
        dur = video_duration.get(fn) or 0.0
        return (var, dur, fn)
    best = max(cluster, key=quality_key)
    principal = lookup[str(best)]["video_file"]
    duplicates = [lookup[str(c)]["video_file"] for c in cluster if c != best]
    return principal, duplicates

def evaluate_clusters(clusters):
    gt_principals = {fn for fn, r in ground_truth.items() if r.upper() == "PRINCIPAL"}
    gt_duplicates = {fn for fn, r in ground_truth.items() if r.upper() == "DUPLICATE"}
    tp, fp, fn_set = set(), set(), set()

    for cluster in clusters:
        if len(cluster) <= 1:
            continue
        p_fn, d_list = assign_roles_by_heuristics(cluster)
        for d_fn in d_list:
            pid, did = extract_id(p_fn), extract_id(d_fn)
            pair = (pid, did)
            if d_fn in gt_duplicates and p_fn in gt_principals:
                tp.add(pair)
            else:
                fp.add(pair)

    for d_fn in gt_duplicates:
        for p_fn in gt_principals:
            pid, did = extract_id(p_fn), extract_id(d_fn)
            if pid and did and (pid, did) not in tp:
                fn_set.add((pid, did))

    recall = len(tp) / (len(tp) + len(fn_set)) if (len(tp) + len(fn_set)) > 0 else 0.0
    return {"tp": len(tp), "fp": len(fp), "fn": len(fn_set), "recall": recall}

# === 5. Sweep radii ===
results = {}
for r in RADII:
    lims, distances, labels = index.range_search(embeddings, r)
    clusters = build_clusters_from_faiss(labels, lims)
    res = evaluate_clusters(clusters)
    results[r] = res
    print(f"--- Radius {r:.3f} → TP:{res['tp']} FP:{res['fp']} FN:{res['fn']} Recall:{res['recall']:.2%}")

# === 6. Summary ===
print("\nSummary:")
print("Radius |  TP   FP   FN   Recall")
for r, m in results.items():
    print(f"{r:6.3f} | {m['tp']:4d} {m['fp']:4d} {m['fn']:4d}  {m['recall']:.1%}")

# === 7. Export results to CSV ===
with open(OUTPUT_CSV, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["radius", "tp", "fp", "fn", "recall"])
    for r in sorted(results):
        m = results[r]
        writer.writerow([r, m["tp"], m["fp"], m["fn"], f"{m['recall']:.4f}"])

print(f"\nResults exported to {OUTPUT_CSV}")

   radius  true_positives  false_positives  false_negatives        recall
0   0.900               6               23          4685304  1.280598e-06
1   0.950               2                8          4685308  4.268661e-07
2   0.990               2                0          4685308  4.268661e-07
3   0.999               2                0          4685308  4.268661e-07


## Search prompts

In [None]:
import json
import numpy as np
import faiss
import torch
from transformers import AutoTokenizer, AutoModel

# ─── CONFIG ────────────────────────────────────────────────────────────────────
LOOKUP_PATH = "output/embedding_lookup.json"
INDEX_PATH  = "output/video_embeddings.index"
MODEL_CHECKPOINT = "microsoft/xclip-base-patch32"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# ────────────────────────────────────────────────────────────────────────────────

# Load lookup table and FAISS index
with open(LOOKUP_PATH, "r") as f:
    lookup = json.load(f)
index = faiss.read_index(INDEX_PATH)

# Load tokenizer & text model
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModel.from_pretrained(MODEL_CHECKPOINT).to(DEVICE)
model.eval()

def search_prompts(prompts, top_k=1):
    """
    Encode text prompts, search the FAISS index, and return
    filename + timestamp for each top-k match.
    """
    # Tokenize and encode
    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        text_feats = model.get_text_features(**inputs)
    text_feats = text_feats.cpu().numpy()

    # Normalize for cosine similarity
    faiss.normalize_L2(text_feats)

    # Search
    D, I = index.search(text_feats, top_k)

    # Collect results
    results = []
    for i, prompt in enumerate(prompts):
        matches = []
        for score, clip_id in zip(D[i], I[i]):
            info = lookup[str(int(clip_id))]
            matches.append({
                "prompt": prompt,
                "file": info["video_file"],
                "start_time_sec": info["start_time_sec"],
                "clip_index": info["clip_index"],
                "similarity": float(score)
            })
        results.append(matches)
    return results

prompts = [
    "Videos of a man injured in the daytime. Smoke is rising in the background",
    "A clown eating a huge bowl of spagetti while riding a bicycle"
]
results = search_prompts(prompts, top_k=3)
for match_list in results:
    for match in match_list:
        print(f"Prompt: {match['prompt']}")
        print(f"  File: {match['file']}")
        print(f"  Start time: {match['start_time_sec']}s (clip index {match['clip_index']})")
        print(f"  Similarity: {match['similarity']:.4f}")
        print()

FileNotFoundError: [Errno 2] No such file or directory: 'output/embedding_lookup.json'