## Import & Config

In [11]:
import os
import json
import numpy as np
import torch
import av
import cv2
from transformers import AutoModel, AutoTokenizer
import faiss
import pickle
from torch.utils.tensorboard import SummaryWriter

# ==== CONFIGURATION ====
VIDEO_DIR       = "input_videos"
OUTPUT_DIR      = "videoclip_output"
FRAMES_PER_CLIP = 8
CLIP_DURATION   = 30
BATCH_SIZE      = 128

if torch.cuda.is_available() : 
    device = "cuda:0"
    print(f"device:{device}")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print(f"device:{device}")
else:
    print(f"Plain ol' CPU")

# ==== LOAD ViCLIP ====
model = AutoModel.from_pretrained("OpenGVLab/ViCLIP-L-14-hf", trust_remote_code=True).to(device).eval()

  from .autonotebook import tqdm as notebook_tqdm


device:cuda:0


  from pkg_resources import packaging


## Create embeddings

In [4]:
# ==== FRAME EXTRACTOR ====
def read_video_clip(container, indices, fps):
    frames = []
    for idx in indices:
        ts = int((idx / fps) * 1e6)
        container.seek(ts, any_frame=False, backward=True)
        for frame in container.decode(video=0):
            frames.append(frame.to_ndarray(format="rgb24"))
            break
    return frames


# ==== FRAME NORMALIZATION + TENSOR CONVERSION ====
v_mean = np.array([0.485, 0.456, 0.406]).reshape(1, 1, 3)
v_std = np.array([0.229, 0.224, 0.225]).reshape(1, 1, 3)

def normalize(data):
    return (data / 255.0 - v_mean) / v_std

def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda')):
    assert len(vid_list) >= fnum
    step = len(vid_list) // fnum
    vid_list = vid_list[::step][:fnum]
    vid_list = [cv2.resize(x[:, :, ::-1], target_size) for x in vid_list]
    vid_tube = [np.expand_dims(normalize(x), axis=0) for x in vid_list]  # (1, H, W, C) for each frame
    vid_tube = np.stack(vid_tube, axis=1)  # (1, T, H, W, C)
    vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3))  # (1, T, C, H, W)
    vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True).float()
    return vid_tube


# ==== BATCH PROCESSING FUNCTION ====
def process_directory_to_embeddings(
    video_dir: str,
    clip_len: int = 8,
    batch_size: int = BATCH_SIZE,
    output_dir: str = OUTPUT_DIR,
):
    os.makedirs(output_dir, exist_ok=True)
    embedding_batches = []
    lookup_dict = {}
    global_clip_id = 0
    current_batch = []
    current_meta = []

    exts = ('.mp4', '.mov', '.avi', 'wav', 'webm')

    for filename in sorted(os.listdir(video_dir)):
        if not filename.lower().endswith(exts):
            continue

        video_path = os.path.join(video_dir, filename)
        container = av.open(video_path)
        stream = container.streams.video[0]
        total_frames = stream.frames
        fps = float(stream.average_rate) if stream.average_rate else 1.0

        clip_index = 0
        window_size = int(10 * fps)

        for start in range(0, total_frames - window_size + 1, window_size):
            indices = np.linspace(start, start + window_size, num=clip_len, endpoint=False, dtype=np.int64)
            frames = read_video_clip(container, indices, fps)
            if len(frames) < clip_len:
                continue

            current_batch.append(frames)
            current_meta.append({
                "clip_id":        global_clip_id,
                "clip_index":     clip_index,
                "video_file":     filename,
                "start_time_sec": int(start / fps),
            })

            clip_index += 1
            global_clip_id += 1

            if len(current_batch) == batch_size:
                print(f"Running inference on batch of {batch_size} clips: IDs {current_meta[0]['clip_id']}-{current_meta[-1]['clip_id']}")
                batch_tensor = torch.cat([frames2tensor(frames, fnum=clip_len, device=device) for frames in current_batch], dim=0)
                with torch.no_grad():
                    embeds = model.get_vid_features(batch_tensor).cpu().numpy()

                embedding_batches.append(embeds)
                for m in current_meta:
                    lookup_dict[str(m["clip_id"])] = m

                current_batch = []
                current_meta = []

        container.close()

    # Final batch
    if current_batch:
        print(f"Running inference on final batch of {len(current_batch)} clips: IDs {current_meta[0]['clip_id']}-{current_meta[-1]['clip_id']}")
        batch_tensor = torch.cat([frames2tensor(frames, fnum=clip_len, device=device) for frames in current_batch], dim=0)
        with torch.no_grad():
            embeds = model.get_vid_features(batch_tensor).cpu().numpy()
        embedding_batches.append(embeds)
        for m in current_meta:
            lookup_dict[str(m["clip_id"])] = m

    all_embeddings = np.vstack(embedding_batches)
    np.save(os.path.join(output_dir, "video_embeddings.npy"), all_embeddings)
    with open(os.path.join(output_dir, "embedding_lookup.json"), "w") as f:
        json.dump(lookup_dict, f, indent=2)

    print(f"✅ Saved embeddings to {output_dir}/video_embeddings.npy")
    print(f"✅ Saved lookup to  {output_dir}/embedding_lookup.json")


# ==== MAIN ENTRY ====
if __name__ == "__main__":
    process_directory_to_embeddings(VIDEO_DIR)

Running inference on batch of 128 clips: IDs 0-127


  return fn(*args, **kwargs)


Running inference on batch of 128 clips: IDs 128-255
Running inference on batch of 128 clips: IDs 256-383
Running inference on batch of 128 clips: IDs 384-511
Running inference on batch of 128 clips: IDs 512-639
Running inference on batch of 128 clips: IDs 640-767
Running inference on batch of 128 clips: IDs 768-895
Running inference on batch of 128 clips: IDs 896-1023
Running inference on batch of 128 clips: IDs 1024-1151
Running inference on batch of 128 clips: IDs 1152-1279
Running inference on batch of 128 clips: IDs 1280-1407
Running inference on batch of 128 clips: IDs 1408-1535
Running inference on final batch of 113 clips: IDs 1536-1648
✅ Saved embeddings to videoclip_output/video_embeddings.npy
✅ Saved lookup to  videoclip_output/embedding_lookup.json


## Visualize embeddings

In [8]:
# Load lookup metadata
with open("videoclip_output/embedding_lookup.json", "r") as f:
    lookup = json.load(f)

# Ensure entries are sorted by clip_id (assumed to match row index)
sorted_lookup = sorted(lookup.values(), key=lambda x: int(x["clip_id"]))

# Create metadata strings like: "video.mp4[3] @ 90s"
metadata = [
    f'{entry["video_file"]}[{entry["clip_index"]}] @ {entry["start_time_sec"]}s'
    for entry in sorted_lookup
]

# Load embedding matrix
emb_matrix = np.load("videoclip_output/video_embeddings.npy")

# Ensure alignment
assert emb_matrix.shape[0] == len(metadata), (
    f"❌ {emb_matrix.shape[0]} embeddings vs {len(metadata)} metadata entries"
)

# Write to TensorBoard
writer = SummaryWriter(log_dir="runs/embeds")
writer.add_embedding(
    emb_matrix,
    metadata=metadata,
    tag="my_embeddings"
)
writer.close()

print("✅ Done. Run:\n  tensorboard --logdir=runs/embeds\nThen open http://localhost:6006/#projector")

✅ Done. Run:
  tensorboard --logdir=runs/embeds
Then open http://localhost:6006/#projector


## Index in FAISS

In [12]:
EMB_PATH    = "videoclip_output/video_embeddings.npy"
LOOKUP_PATH = "videoclip_output/embedding_lookup.json"
INDEX_PATH  = "videoclip_output/video_embeddings.index"

# Load embeddings and lookup dict
embeddings = np.load(EMB_PATH)  
with open(LOOKUP_PATH, "r") as f:
    lookup = json.load(f)

# Build FAISS index (inner-product) and add IDs
faiss.normalize_L2(embeddings)
dim   = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index = faiss.IndexIDMap(index)

ids = np.arange(embeddings.shape[0], dtype="int64")
index.add_with_ids(embeddings, ids)

# Save the index for later
faiss.write_index(index, INDEX_PATH)

## Find duplicates

In [None]:
from collections import defaultdict
import numpy as np
import json
import faiss

def count_similar_clip_pairs(radius: float):
    """
    For each unique pair of video files, count how many clip pairs are similar
    (i.e., within the given radius), and record the average distance.
    Returns: list of (file1, file2, count, avg_distance)
    """
    embeddings = np.load(EMB_PATH)
    with open(LOOKUP_PATH, "r") as f:
        lookup = json.load(f)

    index = faiss.read_index(INDEX_PATH)
    faiss.normalize_L2(embeddings)

    lims, distances, labels = index.range_search(embeddings, radius)

    # Map (file1, file2) to list of distances
    pair_stats = defaultdict(list)

    for q in range(len(embeddings)):
        start, end = lims[q], lims[q+1]
        for i in range(start, end):
            idx = labels[i]
            if idx <= q:
                continue  # skip self or repeated
            f1 = lookup[str(q)]["video_file"]
            f2 = lookup[str(idx)]["video_file"]
            if f1 != f2:
                key = tuple(sorted((f1, f2)))
                pair_stats[key].append(distances[i])

    results = []
    for (f1, f2), dists in pair_stats.items():
        avg_dist = sum(dists) / len(dists)
        results.append((f1, f2, len(dists), avg_dist))

    return results

# Choose your similarity radius (e.g., 0.8 for cosine similarity >= 0.8)
radius = 0.999
pairs = count_similar_clip_pairs(radius)

for f1, f2, count, avg_dist in sorted(pairs, key=lambda x: -x[2]):
    print(f"{f1} <--> {f2} | similar clips: {count} | avg distance: {avg_dist:.4f}")

TNS_0030_V.mp4 <--> TNS_0031_V.mp4 | similar clips: 205 | avg distance: 1.0000
TNS_0024_V.mp4 <--> TNS_0025_V.mp4 | similar clips: 59 | avg distance: 1.0000


## Evaluate duplicates

In [18]:
import re
import csv

with open("dup_groundtruth.csv", newline="") as f:
    reader = csv.reader(f)
    # if your CSV has a header, skip it:
    next(reader, None)
    # columns: filename, status
    ground_truth = {row[0]: row[1] for row in reader}


def evaluate_duplicates(pairs, ground_truth):
    def extract_id(filename):
        match = re.search(r'TNS_(\d+)', filename)
        return match.group(1) if match else None

    # Normalize detected pairs based on ID number
    detected_duplicates = set()
    for f1, f2, _, _ in pairs:
        id1 = extract_id(f1)
        id2 = extract_id(f2)
        if id1 and id2:
            detected_duplicates.add(tuple(sorted([id1, id2])))

    # Normalize ground-truth: find all DUPLICATE → PRINCIPAL mappings
    true_duplicates = {k for k, v in ground_truth.items() if v.upper() == "DUPLICATE"}
    true_pairs = set()
    for dup in true_duplicates:
        dup_id = extract_id(dup)
        for principal, status in ground_truth.items():
            if status.upper() == "PRINCIPAL":
                principal_id = extract_id(principal)
                if dup_id and principal_id:
                    true_pairs.add(tuple(sorted([dup_id, principal_id])))

    # Evaluation
    true_positives = detected_duplicates & true_pairs
    false_positives = detected_duplicates - true_pairs
    false_negatives = true_pairs - detected_duplicates

    recall = len(true_positives) / (len(true_positives) + len(false_negatives)) if (len(true_positives) + len(false_negatives)) > 0 else 0.0

    print(f"✅ True Positives (correctly detected): {len(true_positives)}")
    print(f"❌ False Positives (wrongly flagged):   {len(false_positives)}")
    print(f"🔍 False Negatives (missed duplicates): {len(false_negatives)}")
    print(f"📈 Recall: {recall:.2%}")

    return {
        "true_positives": true_positives,
        "false_positives": false_positives,
        "false_negatives": false_negatives,
        "recall": recall,
    }
eval_result = evaluate_duplicates(pairs, ground_truth)

✅ True Positives (correctly detected): 2
❌ False Positives (wrongly flagged):   0
🔍 False Negatives (missed duplicates): 4685308
📈 Recall: 0.00%


## Find duplicates and optimal radius

In [22]:
import numpy as np
import json
import faiss
import re
import pandas as pd
from collections import defaultdict

# Paths to embeddings, lookup, and index
EMB_PATH    = 'videoclip_output/video_embeddings.npy'
LOOKUP_PATH = 'videoclip_output/embedding_lookup.json'
INDEX_PATH  = 'videoclip_output/video_embeddings.index'

def count_similar_clip_pairs(radius):
    embeddings = np.load(EMB_PATH)
    with open(LOOKUP_PATH) as f:
        lookup = json.load(f)
    index = faiss.read_index(INDEX_PATH)
    faiss.normalize_L2(embeddings)
    lims, distances, labels = index.range_search(embeddings, radius)

    pair_stats = defaultdict(list)
    for q in range(len(embeddings)):
        for i in range(lims[q], lims[q+1]):
            idx = labels[i]
            if idx <= q:
                continue
            f1 = lookup[str(q)]['video_file']
            f2 = lookup[str(idx)]['video_file']
            if f1 != f2:
                pair = tuple(sorted((f1, f2)))
                pair_stats[pair].append(distances[i])

    return [
        (f1, f2, len(dists), sum(dists)/len(dists))
        for (f1, f2), dists in pair_stats.items()
    ]

def evaluate_duplicates(pairs, ground_truth):
    def extract_id(name):
        m = re.search(r'TNS_(\d+)', name)
        return m.group(1) if m else None

    detected = {
        tuple(sorted((extract_id(f1), extract_id(f2))))
        for f1, f2, _, _ in pairs
    }
    truth_dups = {
        extract_id(k)
        for k, v in ground_truth.items()
        if v.upper() == 'DUPLICATE'
    }
    truth_pairs = {
        tuple(sorted((dup, extract_id(pr))))
        for dup in truth_dups
        for pr, st in ground_truth.items()
        if st.upper() == 'PRINCIPAL'
    }

    tp = len(detected & truth_pairs)
    fp = len(detected - truth_pairs)
    fn = len(truth_pairs - detected)
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    return {
        'true_positives': tp,
        'false_positives': fp,
        'false_negatives': fn,
        'recall': recall
    }

# --- Load ground truth ---
gt_df = pd.read_csv('dup_groundtruth.csv')  # comma-separated file
ground_truth = dict(zip(gt_df['UAR Code'], gt_df['Principal / Duplicate']))

# --- Loop over different radii ---
radii = [0.9, 0.95, 0.99, 0.999]
results = []

for r in radii:
    pairs   = count_similar_clip_pairs(r)
    metrics = evaluate_duplicates(pairs, ground_truth)
    results.append({'radius': r, **metrics})

# --- Export & display ---
df = pd.DataFrame(results)
df.to_csv('evaluation_results.csv', index=False)
print(df)

   radius  true_positives  false_positives  false_negatives        recall
0   0.900               3               28          4685307  6.402991e-07
1   0.950               2               10          4685308  4.268661e-07
2   0.990               2                0          4685308  4.268661e-07
3   0.999               2                0          4685308  4.268661e-07


## Search prompts

In [None]:
import json
import numpy as np
import faiss
import torch
from transformers import AutoTokenizer, AutoModel

# ─── CONFIG ────────────────────────────────────────────────────────────────────
LOOKUP_PATH = "output/embedding_lookup.json"
INDEX_PATH  = "output/video_embeddings.index"
MODEL_CHECKPOINT = "microsoft/xclip-base-patch32"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# ────────────────────────────────────────────────────────────────────────────────

# Load lookup table and FAISS index
with open(LOOKUP_PATH, "r") as f:
    lookup = json.load(f)
index = faiss.read_index(INDEX_PATH)

# Load tokenizer & text model
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModel.from_pretrained(MODEL_CHECKPOINT).to(DEVICE)
model.eval()

def search_prompts(prompts, top_k=1):
    """
    Encode text prompts, search the FAISS index, and return
    filename + timestamp for each top-k match.
    """
    # Tokenize and encode
    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        text_feats = model.get_text_features(**inputs)
    text_feats = text_feats.cpu().numpy()

    # Normalize for cosine similarity
    faiss.normalize_L2(text_feats)

    # Search
    D, I = index.search(text_feats, top_k)

    # Collect results
    results = []
    for i, prompt in enumerate(prompts):
        matches = []
        for score, clip_id in zip(D[i], I[i]):
            info = lookup[str(int(clip_id))]
            matches.append({
                "prompt": prompt,
                "file": info["video_file"],
                "start_time_sec": info["start_time_sec"],
                "clip_index": info["clip_index"],
                "similarity": float(score)
            })
        results.append(matches)
    return results

prompts = [
    "Videos of a man injured in the daytime. Smoke is rising in the background",
    "A clown eating a huge bowl of spagetti while riding a bicycle"
]
results = search_prompts(prompts, top_k=3)
for match_list in results:
    for match in match_list:
        print(f"Prompt: {match['prompt']}")
        print(f"  File: {match['file']}")
        print(f"  Start time: {match['start_time_sec']}s (clip index {match['clip_index']})")
        print(f"  Similarity: {match['similarity']:.4f}")
        print()