## Import & Config

In [1]:
import os
import json
import numpy as np
import av
import cv2
import torch
from collections import defaultdict
from transformers import AutoModel, AutoProcessor
import re
import faiss
from torch.utils.tensorboard import SummaryWriter


# ==== CONFIGURATION ====
# Notebook-wide defaults. VIDEO_DIR is passed to process_directory_to_embeddings
# by the entry-point guard; OUTPUT_DIR, SECONDS_PER_CLIP, CLIP_LEN and BATCH_SIZE
# are the default argument values of that function.
VIDEO_DIR          = "input_videos"  # directory scanned for video files
OUTPUT_DIR         = "xclip_output"  # where the .npy embeddings and JSON lookup are written
SECONDS_PER_CLIP   = 30        # clip (window) length in seconds
# NOTE(review): CLIP_INTERVAL_SEC is not read by any code visible in this
# notebook; clip starts are spaced one full window apart. Confirm intent.
CLIP_INTERVAL_SEC  = 10       # intended time between clip starts, in seconds
CLIP_LEN           = 8       # number of frames sampled per clip
BATCH_SIZE         = 8       # number of clips sent through the model per forward pass

# Pick the compute device: CUDA GPU first, then Apple MPS, otherwise CPU.
# `device` is bound in every branch (always as a torch.device) so the
# `.to(device)` calls that follow also work on CPU-only machines.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print(f"device:{device}")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print(f"device:{device}")
else:
    device = torch.device("cpu")
    print("Plain ol' CPU")

# ==== XCLIP SETUP ====
# Preprocessor and model for the "microsoft/xclip-base-patch16" checkpoint
# (model card: https://huggingface.co/microsoft/xclip-base-patch16).
processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch16")
# The model is moved to the selected device and put in eval mode for inference.
model     = AutoModel.from_pretrained("microsoft/xclip-base-patch16").to(device).eval()

  from .autonotebook import tqdm as notebook_tqdm


device:cuda:0


## Create embeddings

In [None]:
# ==== FRAME EXTRACTOR ====
def read_video_clip(container, indices, fps):
    """Decode one RGB frame per requested frame index from an open container.

    For every index the container is positioned at the keyframe at or before
    the target time, then decoded forward until the frame whose presentation
    time reaches the target. Taking the first decoded frame after the seek
    (the previous behaviour) returns the preceding keyframe instead, which can
    hand back the same picture for several different indices.

    Args:
        container: An opened PyAV container exposing ``seek`` and ``decode``.
        indices: Iterable of frame indices to extract.
        fps: Frame rate used to convert a frame index into a timestamp.

    Returns:
        A list with one ``rgb24`` ndarray per index that could be decoded.
        It is shorter than ``indices`` when decoding runs out of frames.
    """
    frames = []
    # Accept a frame that is up to half a frame period early so timestamp
    # rounding in the file cannot make us skip the frame that was asked for.
    tolerance = 0.5 / fps
    for idx in indices:
        target_sec = idx / fps
        # The offset is given in microseconds, as in the original code.
        container.seek(int(target_sec * 1e6), any_frame=False, backward=True)
        for frame in container.decode(video=0):
            frame_time = frame.time
            # Frames without a timestamp cannot be compared; take them as-is.
            if frame_time is not None and frame_time + tolerance < target_sec:
                continue
            frames.append(frame.to_ndarray(format="rgb24"))
            break
    return frames

# ==== BATCH PROCESSING ====
def _embed_clip_batch(clips):
    """Run the X-CLIP video encoder on a list of clips and return a NumPy array."""
    inputs = processor(videos=clips, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        return model.get_video_features(**inputs).cpu().numpy()


def _estimate_frame_count(container, stream, fps):
    """Return the stream's frame count, estimating it from duration when unknown."""
    if stream.frames:
        return stream.frames
    # Some containers report 0 frames; fall back to duration * fps.
    if stream.duration is not None and stream.time_base is not None:
        return int(float(stream.duration * stream.time_base) * fps)
    if container.duration is not None:
        # NOTE(review): container.duration is taken to be in microseconds — confirm.
        return int(container.duration / 1e6 * fps)
    return 0


def process_directory_to_embeddings(
    video_dir: str,
    clip_len: int = CLIP_LEN,
    seconds_per_clip: int = SECONDS_PER_CLIP,
    batch_size: int = BATCH_SIZE,
    output_dir: str = OUTPUT_DIR,
    clip_interval_sec=None,
):
    """Embed fixed-length clips of every video in a directory and save them.

    Each video is cut into windows of ``seconds_per_clip`` seconds, ``clip_len``
    frames are sampled evenly from each window, and the clips are encoded in
    batches of ``batch_size``. Two files are written to ``output_dir``:
    ``video_embeddings.npy`` (one row per clip, in clip_id order) and
    ``embedding_lookup.json`` (clip_id -> video file, clip index, start time).

    Args:
        video_dir: Directory scanned (non-recursively) for video files.
        clip_len: Number of frames sampled per clip.
        seconds_per_clip: Length of each clip window in seconds.
        batch_size: Number of clips per model forward pass.
        output_dir: Directory that receives the two output files.
        clip_interval_sec: Seconds between consecutive clip starts. ``None``
            (the default) keeps the original behaviour of back-to-back,
            non-overlapping windows.

    Raises:
        ValueError: If ``seconds_per_clip`` or ``clip_interval_sec`` is not positive.
    """
    if seconds_per_clip <= 0:
        raise ValueError("seconds_per_clip must be positive")
    if clip_interval_sec is not None and clip_interval_sec <= 0:
        raise ValueError("clip_interval_sec must be positive")

    os.makedirs(output_dir, exist_ok=True)
    # Every suffix needs its leading dot; audio-only formats are not listed.
    video_exts = ('.mp4', '.mov', '.avi', '.webm')
    embedding_batches = []
    lookup_dict = {}
    global_clip_id = 0
    current_batch = []
    current_meta = []

    def flush_batch():
        # Encode the pending clips and register their metadata under clip_id.
        embedding_batches.append(_embed_clip_batch(current_batch))
        for meta in current_meta:
            lookup_dict[str(meta["clip_id"])] = meta
        current_batch.clear()
        current_meta.clear()

    for filename in sorted(os.listdir(video_dir)):
        if not filename.lower().endswith(video_exts):
            continue

        path = os.path.join(video_dir, filename)
        # The context manager closes the container even if decoding fails.
        with av.open(path) as container:
            if not container.streams.video:
                print(f"Skipping {filename}: no video stream")
                continue

            stream = container.streams.video[0]
            fps = float(stream.average_rate) if stream.average_rate else 30.0
            total_f = _estimate_frame_count(container, stream, fps)
            window = max(1, int(fps * seconds_per_clip))
            if clip_interval_sec is None:
                stride = window
            else:
                stride = max(1, int(fps * clip_interval_sec))

            clip_index = 0
            for start in range(0, total_f - window + 1, stride):
                indices = np.linspace(
                    start, start + window, num=clip_len, endpoint=False, dtype=np.int64
                )
                frames = read_video_clip(container, indices, fps)
                # Drop windows where some frames could not be decoded.
                if len(frames) < clip_len:
                    continue

                current_batch.append(frames)
                current_meta.append({
                    "clip_id":        global_clip_id,
                    "clip_index":     clip_index,
                    "video_file":     filename,
                    "start_time_sec": int(start / fps),
                })
                clip_index += 1
                global_clip_id += 1

                if len(current_batch) == batch_size:
                    print(f"Inferencing clips {current_meta[0]['clip_id']}–{current_meta[-1]['clip_id']}")
                    flush_batch()

    # Final (possibly partial) batch
    if current_batch:
        print(f"Inferencing final {len(current_batch)} clips")
        flush_batch()

    # np.vstack raises on an empty list, so stop early when nothing was embedded.
    if not embedding_batches:
        print(f"No clips were extracted from {video_dir}; nothing saved")
        return

    # Save embeddings + lookup
    all_embeddings = np.vstack(embedding_batches)
    np.save(os.path.join(output_dir, "video_embeddings.npy"), all_embeddings)
    with open(os.path.join(output_dir, "embedding_lookup.json"), "w") as f:
        json.dump(lookup_dict, f, indent=2)

    print(f"✅ Saved {all_embeddings.shape[0]} embeddings to {output_dir}/video_embeddings.npy")
    print(f"✅ Saved lookup   to {output_dir}/embedding_lookup.json")

# Entry point: embed every video found under VIDEO_DIR using the default settings.
if __name__ == "__main__":
    process_directory_to_embeddings(VIDEO_DIR)

Inferencing clips 0–7


  return self.preprocess(images, **kwargs)


Inferencing clips 8–15
Inferencing clips 16–23
Inferencing clips 24–31
Inferencing clips 32–39
Inferencing clips 40–47
Inferencing clips 48–55
Inferencing clips 56–63
Inferencing clips 64–71
Inferencing clips 72–79
Inferencing clips 80–87
Inferencing clips 88–95
Inferencing clips 96–103
Inferencing clips 104–111
Inferencing clips 112–119
Inferencing clips 120–127
Inferencing clips 128–135
Inferencing clips 136–143
Inferencing clips 144–151
Inferencing clips 152–159
Inferencing clips 160–167
Inferencing clips 168–175
Inferencing clips 176–183
Inferencing clips 184–191
Inferencing clips 192–199
Inferencing clips 200–207
Inferencing clips 208–215
Inferencing clips 216–223
Inferencing clips 224–231
Inferencing clips 232–239
Inferencing clips 240–247
Inferencing clips 248–255
Inferencing clips 256–263
Inferencing clips 264–271
Inferencing clips 272–279
Inferencing clips 280–287
Inferencing clips 288–295
Inferencing clips 296–303
Inferencing clips 304–311
Inferencing clips 312–319
Inferencin

## Visualize embeddings

In [3]:
# Load lookup metadata
with open("xclip_output/embedding_lookup.json", "r") as f:
    lookup = json.load(f)

# Order entries by clip_id, which is assumed to match the embedding row index
sorted_lookup = sorted(lookup.values(), key=lambda x: int(x["clip_id"]))

# One TensorBoard label per clip, e.g. "video.mp4[3] @ 90s"
metadata = [
    f'{entry["video_file"]}[{entry["clip_index"]}] @ {entry["start_time_sec"]}s'
    for entry in sorted_lookup
]

# Load embedding matrix
emb_matrix = np.load("xclip_output/video_embeddings.npy")

# Ensure alignment. Raised explicitly (not `assert`) so the check still runs
# when Python is started with -O.
if emb_matrix.shape[0] != len(metadata):
    raise ValueError(
        f"❌ {emb_matrix.shape[0]} embeddings vs {len(metadata)} metadata entries"
    )

# Write to TensorBoard; the context manager closes the writer even if
# add_embedding fails.
with SummaryWriter(log_dir="runs/embeds") as writer:
    writer.add_embedding(
        emb_matrix,
        metadata=metadata,
        tag="my_embeddings",
    )

print("✅ Done. Run:\n  tensorboard --logdir=runs/embeds\nThen open http://localhost:6006/#projector")

✅ Done. Run:
  tensorboard --logdir=runs/embeds
Then open http://localhost:6006/#projector


## Index in FAISS

In [5]:
# Inputs (embeddings + lookup) and output (FAISS index) for this step.
EMB_PATH    = "xclip_output/video_embeddings.npy"
LOOKUP_PATH = "xclip_output/embedding_lookup.json"
INDEX_PATH  = "xclip_output/video_embeddings.index"

# Read the embedding matrix and the clip lookup table.
embeddings = np.load(EMB_PATH)
with open(LOOKUP_PATH, "r") as f:
    lookup = json.load(f)

# L2-normalise in place so that inner product equals cosine similarity.
faiss.normalize_L2(embeddings)

# Flat inner-product index wrapped in an ID map, so explicit IDs can be attached.
dim = embeddings.shape[1]
flat_index = faiss.IndexFlatIP(dim)
index = faiss.IndexIDMap(flat_index)

# Row i of the matrix is stored under ID i.
ids = np.arange(len(embeddings), dtype=np.int64)
index.add_with_ids(embeddings, ids)

# Persist the index for the later cells.
faiss.write_index(index, INDEX_PATH)

## Find duplicates and optimal radius

In [6]:
import numpy as np
import json
import faiss
import re
import pandas as pd
from collections import defaultdict

# Paths to embeddings, lookup, and index read by count_similar_clip_pairs.
# NOTE(review): these point at 'videoclip_output/', whereas the indexing cell
# earlier in this notebook writes to 'xclip_output/' — confirm which run is
# meant to be evaluated here.
EMB_PATH    = 'videoclip_output/video_embeddings.npy'
LOOKUP_PATH = 'videoclip_output/embedding_lookup.json'
INDEX_PATH  = 'videoclip_output/video_embeddings.index'

def count_similar_clip_pairs(radius):
    """Group range-search hits between clips of different videos by file pair.

    Loads the embeddings, lookup and index from EMB_PATH / LOOKUP_PATH /
    INDEX_PATH, normalises the embeddings, and runs
    ``index.range_search(embeddings, radius)`` with every clip as a query.
    Each unordered clip pair is considered once, and only when the two clips
    belong to different video files.

    NOTE(review): with an inner-product index the search is expected to return
    neighbours whose similarity exceeds ``radius`` — confirm.

    Returns:
        A list of ``(file_a, file_b, hit_count, mean_score)`` tuples, one per
        pair of video files that shares at least one hit.
    """
    vectors = np.load(EMB_PATH)
    with open(LOOKUP_PATH) as fh:
        clip_info = json.load(fh)
    faiss_index = faiss.read_index(INDEX_PATH)
    faiss.normalize_L2(vectors)
    lims, scores, neighbour_ids = faiss_index.range_search(vectors, radius)

    scores_by_file_pair = defaultdict(list)
    for query_id in range(len(vectors)):
        # lims[q]:lims[q+1] delimits the hits that belong to query q.
        lo, hi = int(lims[query_id]), int(lims[query_id + 1])
        for neighbour_id, score in zip(neighbour_ids[lo:hi], scores[lo:hi]):
            # Only keep neighbour > query: drops self-matches and mirrored pairs.
            if neighbour_id > query_id:
                query_file = clip_info[str(query_id)]['video_file']
                neighbour_file = clip_info[str(neighbour_id)]['video_file']
                if query_file != neighbour_file:
                    key = tuple(sorted((query_file, neighbour_file)))
                    scores_by_file_pair[key].append(score)

    summary = []
    for (file_a, file_b), hits in scores_by_file_pair.items():
        summary.append((file_a, file_b, len(hits), sum(hits) / len(hits)))
    return summary

def evaluate_duplicates(pairs, ground_truth):
    """Score detected video pairs against a principal/duplicate ground truth.

    IDs are the digits following ``TNS_`` in a file name or ground-truth key.
    Names without such an ID cannot be matched against the ground truth and
    are ignored (previously they raised ``TypeError`` when sorted with a
    string ID). Statuses are compared case-insensitively after stripping
    whitespace; non-string statuses (e.g. NaN from an empty CSV cell) are
    treated as neither principal nor duplicate.

    NOTE(review): the ground truth pairs every duplicate with every principal
    because ``ground_truth`` carries no duplicate -> principal mapping. This
    inflates the false-negative count; confirm whether a per-group mapping is
    available.

    Args:
        pairs: Iterable of ``(file_a, file_b, count, mean_score)`` tuples.
        ground_truth: Mapping of code -> status string.

    Returns:
        Dict with ``true_positives``, ``false_positives``, ``false_negatives``
        and ``recall`` (0.0 when there are no ground-truth pairs).
    """
    id_pattern = re.compile(r'TNS_(\d+)')

    def extract_id(name):
        m = id_pattern.search(str(name))
        return m.group(1) if m else None

    def ids_with_status(status):
        # Extract each ID once per entry instead of once per duplicate/principal combination.
        found = {
            extract_id(key)
            for key, value in ground_truth.items()
            if str(value).strip().upper() == status
        }
        found.discard(None)
        return found

    detected = set()
    for f1, f2, _, _ in pairs:
        id1, id2 = extract_id(f1), extract_id(f2)
        if id1 is None or id2 is None:
            continue
        detected.add(tuple(sorted((id1, id2))))

    truth_dups = ids_with_status('DUPLICATE')
    truth_principals = ids_with_status('PRINCIPAL')
    truth_pairs = {
        tuple(sorted((dup, principal)))
        for dup in truth_dups
        for principal in truth_principals
    }

    tp = len(detected & truth_pairs)
    fp = len(detected - truth_pairs)
    fn = len(truth_pairs - detected)
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    return {
        'true_positives': tp,
        'false_positives': fp,
        'false_negatives': fn,
        'recall': recall
    }

# --- Ground truth: code -> "Principal" / "Duplicate" status ---
gt_df = pd.read_csv('dup_groundtruth.csv')  # comma-separated file
ground_truth = dict(
    zip(gt_df['UAR Code'], gt_df['Principal / Duplicate'])
)

# --- Evaluate the duplicate detection at each search radius ---
radii = [0.9, 0.95, 0.99, 0.999]
results = []

for r in radii:
    pairs = count_similar_clip_pairs(r)
    metrics = evaluate_duplicates(pairs, ground_truth)
    row = {'radius': r}
    row.update(metrics)
    results.append(row)

# --- Write the per-radius metrics to CSV and show them ---
df = pd.DataFrame(results)
df.to_csv('evaluation_results.csv', index=False)
print(df)

   radius  true_positives  false_positives  false_negatives        recall
0   0.900               3               28          4685307  6.402991e-07
1   0.950               2               10          4685308  4.268661e-07
2   0.990               2                0          4685308  4.268661e-07
3   0.999               2                0          4685308  4.268661e-07


## Search prompts

In [None]:
import json
import numpy as np
import faiss
import torch
from transformers import AutoTokenizer, AutoModel

# ─── CONFIG ────────────────────────────────────────────────────────────────────
# NOTE(review): these paths point at "output/", while the indexing cell in this
# notebook writes to "xclip_output/" — confirm which index is meant.
LOOKUP_PATH = "output/embedding_lookup.json"
INDEX_PATH  = "output/video_embeddings.index"
# NOTE(review): the embedding cell loads "microsoft/xclip-base-patch16"; text
# features from a different checkpoint are presumably not comparable with video
# features from that one — confirm the index at INDEX_PATH was built with patch32.
MODEL_CHECKPOINT = "microsoft/xclip-base-patch32"
# Rebinds `device` for this cell; only CUDA or CPU are considered here.
device = "cuda" if torch.cuda.is_available() else "cpu"
# ────────────────────────────────────────────────────────────────────────────────

# Clip metadata (keyed by clip_id as a string) and the saved FAISS index
with open(LOOKUP_PATH, "r") as f:
    lookup = json.loads(f.read())
index = faiss.read_index(INDEX_PATH)

# Tokenizer and model for the text side, placed on `device` in eval mode
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModel.from_pretrained(MODEL_CHECKPOINT).to(device).eval()

def search_prompts(prompts, top_k=1):
    """
    Encode text prompts, search the FAISS index, and return
    filename + timestamp for each top-k match.

    Args:
        prompts: A single prompt string or an iterable of prompt strings.
        top_k: Maximum number of matches to return per prompt.

    Returns:
        A list with one entry per prompt; each entry is a list of dicts with
        the keys ``prompt``, ``file``, ``start_time_sec``, ``clip_index`` and
        ``similarity``. An entry holds fewer than ``top_k`` dicts when the
        index has fewer vectors than ``top_k``. Empty input returns ``[]``.
    """
    # A bare string would otherwise be iterated character by character below.
    if isinstance(prompts, str):
        prompts = [prompts]
    else:
        prompts = list(prompts)
    if not prompts:
        return []

    # Tokenize and encode
    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    with torch.no_grad():
        text_feats = model.get_text_features(**inputs)
    # FAISS needs a C-contiguous float32 matrix.
    text_feats = np.ascontiguousarray(text_feats.cpu().numpy(), dtype=np.float32)

    # Normalize for cosine similarity
    faiss.normalize_L2(text_feats)

    # Search
    D, I = index.search(text_feats, top_k)

    # Collect results
    results = []
    for prompt, scores, clip_ids in zip(prompts, D, I):
        matches = []
        for score, clip_id in zip(scores, clip_ids):
            # FAISS fills missing neighbours with label -1; there is no such clip.
            if clip_id < 0:
                continue
            info = lookup[str(int(clip_id))]
            matches.append({
                "prompt": prompt,
                "file": info["video_file"],
                "start_time_sec": info["start_time_sec"],
                "clip_index": info["clip_index"],
                "similarity": float(score)
            })
        results.append(matches)
    return results

# Example queries; the three best clips per query are printed below.
prompts = [
    "Videos of a man injured in the daytime. Smoke is rising in the background",
    "A clown eating a huge bowl of spagetti while riding a bicycle"
]
results = search_prompts(prompts, top_k=3)
for match_list in results:
    for match in match_list:
        # One four-line report per match, followed by a blank separator line.
        report = (
            f"Prompt: {match['prompt']}",
            f"  File: {match['file']}",
            f"  Start time: {match['start_time_sec']}s (clip index {match['clip_index']})",
            f"  Similarity: {match['similarity']:.4f}",
            "",
        )
        print("\n".join(report))