## Import & Config

In [None]:
import os
import json
import av
import torch
import numpy as np

from transformers import AutoProcessor, AutoModel

# ─── CONFIG ────────────────────────────────────────────────────────────────────
VIDEO_DIR        = "input_videos"  # set your directory path here
OUTPUT_DIR       = "output"
FRAMES_PER_CLIP  = 8        # frames per clip
FRAME_SAMPLE     = 1        # sample every frame (sampling rate multiplier)
BATCH_SIZE       = 16      # clips per model inference batch
MODEL_CHECKPOINT = "alibaba-pai/VideoCLIP-XL-v2"
CLIP_DURATION    = 10       # Duration of each window in seconds


# Pick the compute device. DEVICE must be assigned on every branch because
# torch.device(DEVICE) below reads it unconditionally.
if torch.cuda.is_available():
    # Keep preferring the second GPU when there is one, but fall back to the
    # first so single-GPU machines do not fail with an invalid device ordinal.
    DEVICE = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
    print(f"device:{DEVICE}")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    DEVICE = torch.device("mps")
    print(f"device:{DEVICE}")
else:
    DEVICE = "cpu"
    print(f"Plain ol' CPU")
# ────────────────────────────────────────────────────────────────────────────────

# Prepare output
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load processor & model
device = torch.device(DEVICE)
processor = AutoProcessor.from_pretrained(MODEL_CHECKPOINT, trust_remote_code=True)
model = AutoModel.from_pretrained(
    MODEL_CHECKPOINT,
    trust_remote_code=True
)
# The embedding loop moves its inputs to `device`, so the weights have to live
# there too; eval() switches off dropout etc. for inference.
model = model.to(device).eval()
# NOTE(review): assumes the remote-code model exposes a `tokenizer` attribute — confirm.
tokenizer = model.tokenizer
models = {"viclip": model, "tokenizer": tokenizer}
print("loaded VideoCLIP-XL-v2")

## Extract clip embeddings

In [None]:
def read_video_clip(container, indices, fps):
    """
    Decode one RGB frame per requested frame index.

    For each index the container is positioned on the nearest keyframe at or
    before the target timestamp, then frames are decoded forward until the
    target is reached. Taking the first frame after the seek would return the
    keyframe itself, so indices that share a keyframe would all yield the
    same image.

    Args:
        container: an opened PyAV container (anything offering ``seek`` and
            ``decode(video=0)``).
        indices: iterable of frame indices to fetch.
        fps: frames per second used to convert an index into a timestamp;
            must be non-zero.

    Returns:
        A list with the ``rgb24`` ndarray of each frame that could be decoded.
        It is shorter than ``indices`` when nothing can be decoded after a seek.
    """
    frames = []
    # Accept a frame within half a frame period of the target, so float
    # rounding in the timestamps cannot push us one frame too far.
    half_frame = 0.5 / fps
    for idx in indices:
        target_sec = idx / fps
        # convert frame index → µs timestamp and seek to the nearest keyframe before it
        container.seek(int(target_sec * 1e6), any_frame=False, backward=True)

        chosen = None
        for frame in container.decode(video=0):
            chosen = frame
            # A frame without a timestamp cannot be compared; take it as-is.
            if frame.time is None or frame.time + half_frame >= target_sec:
                break
        # If the stream ended before the target, `chosen` is the last frame
        # decoded, which is the closest one available.
        if chosen is not None:
            frames.append(chosen.to_ndarray(format="rgb24"))
    return frames


def process_directory_to_embeddings(
    video_dir: str,
    clip_len: int = FRAMES_PER_CLIP,
    frame_sample_rate: int = FRAME_SAMPLE,
    batch_size: int = BATCH_SIZE,
    output_dir: str = OUTPUT_DIR,
    clip_duration: float = CLIP_DURATION,
):
    """
    Embed fixed-length clips from every video in `video_dir` and save the results.

    Each video is cut into consecutive, non-overlapping windows of
    `clip_duration` seconds. `clip_len` frames are sampled evenly from each
    window, clips are batched, encoded with the module-level `processor` and
    `model`, and the embeddings plus a clip-id → metadata lookup are written
    to `output_dir`.

    Args:
        video_dir: directory scanned (non-recursively) for .mp4/.mov/.avi files.
        clip_len: number of frames sampled per clip.
        frame_sample_rate: currently unused; kept so existing callers keep working.
        batch_size: number of clips per inference batch.
        output_dir: where `video_embeddings.npy` and `embedding_lookup.json` go;
            created if missing.
        clip_duration: window length in seconds.

    Returns:
        None. If no clip could be extracted, nothing is written.
    """
    os.makedirs(output_dir, exist_ok=True)

    embedding_batches = []
    lookup_dict       = {}
    global_clip_id    = 0
    current_batch     = []
    current_meta      = []

    def flush_batch(label):
        """Encode the pending clips, record their metadata, and empty the buffers."""
        if not current_batch:
            return
        first_id = current_meta[0]["clip_id"]
        last_id  = current_meta[-1]["clip_id"]
        print(f"Running inference on {label} of {len(current_batch)} clips: IDs {first_id}-{last_id}")

        inputs = processor(videos=current_batch, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            embeds = model.get_video_features(**inputs).cpu().numpy()

        embedding_batches.append(embeds)
        for m in current_meta:
            lookup_dict[str(m["clip_id"])] = m

        # Clear in place so the enclosing loop keeps appending to the same lists.
        current_batch.clear()
        current_meta.clear()

    # Supported video extensions
    exts = ('.mp4', '.mov', '.avi')

    for filename in sorted(os.listdir(video_dir)):
        if not filename.lower().endswith(exts):
            continue

        video_path = os.path.join(video_dir, filename)
        # The context manager closes the container even if decoding raises.
        with av.open(video_path) as container:
            stream = container.streams.video[0]
            fps = float(stream.average_rate) if stream.average_rate else 1.0

            total_frames = stream.frames
            # Some containers report 0 frames; estimate from the stream duration
            # instead of silently skipping the whole video.
            if not total_frames and stream.duration is not None and stream.time_base is not None:
                total_frames = int(float(stream.duration * stream.time_base) * fps)

            # Window length in frames; at least 1 so range() never gets a zero step.
            window_size = max(1, int(clip_duration * fps))
            clip_index = 0
            for start in range(0, total_frames - window_size + 1, window_size):
                indices = np.linspace(start,
                                      start + window_size,
                                      num=clip_len,
                                      endpoint=False,
                                      dtype=np.int64)
                frames = read_video_clip(container, indices, fps)
                if len(frames) < clip_len:
                    # Incomplete clip (decode failure): skip it without consuming an id.
                    continue

                current_batch.append(frames)
                current_meta.append({
                    "clip_id":        global_clip_id,
                    "clip_index":     clip_index,
                    "video_file":     filename,
                    "start_time_sec": int(start / fps),
                })

                clip_index     += 1
                global_clip_id += 1

                # When batch is ready, run inference
                if len(current_batch) == batch_size:
                    flush_batch("batch")

    # Process any remaining clips
    flush_batch("final batch")

    if not embedding_batches:
        # np.vstack([]) would raise; report the empty result instead.
        print(f"No clips were extracted from {video_dir}; nothing saved.")
        return

    # Concatenate & save
    all_embeddings = np.vstack(embedding_batches)
    print(f"Saving {all_embeddings.shape[0]} embeddings to disk...")
    np.save(os.path.join(output_dir, "video_embeddings.npy"), all_embeddings)
    with open(os.path.join(output_dir, "embedding_lookup.json"), "w") as f:
        json.dump(lookup_dict, f, indent=2)

    print(f"✅ Saved embeddings to {output_dir}/video_embeddings.npy")
    print(f"✅ Saved lookup to  {output_dir}/embedding_lookup.json")


if __name__ == "__main__":
    process_directory_to_embeddings(VIDEO_DIR)

In [None]:
import os
import json
import numpy as np
import torch
import av
import cv2
from transformers import AutoModel

# ==== CONFIGURATION ====
VIDEO_DIR       = "input_videos"
# NOTE(review): the FAISS cells further down read from "output/", not from this
# directory — confirm which location is intended.
OUTPUT_DIR      = "output_embeddings"
FRAMES_PER_CLIP = 12
FRAME_SAMPLE    = 8   # Unused here, but can be used to subsample frames
BATCH_SIZE      = 8

# Pick the compute device. `device` must be assigned on every branch because
# the model load below reads it unconditionally.
if torch.cuda.is_available():
    # Keep preferring the second GPU when there is one, but fall back to the
    # first so single-GPU machines do not fail with an invalid device ordinal.
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
    print(f"device:{device}")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print(f"device:{device}")
else:
    device = "cpu"
    print(f"Plain ol' CPU")

# ==== LOAD ViCLIP ====
model = AutoModel.from_pretrained("OpenGVLab/ViCLIP-L-14-hf", trust_remote_code=True).to(device).eval()


# ==== FRAME EXTRACTOR ====
def read_video_clip(container, indices, fps):
    """
    Decode one RGB frame per requested frame index.

    For each index the container is positioned on the nearest keyframe at or
    before the target timestamp, then frames are decoded forward until the
    target is reached. Taking the first frame after the seek would return the
    keyframe itself, so indices that share a keyframe would all yield the
    same image.

    Args:
        container: an opened PyAV container (anything offering ``seek`` and
            ``decode(video=0)``).
        indices: iterable of frame indices to fetch.
        fps: frames per second used to convert an index into a timestamp;
            must be non-zero.

    Returns:
        A list with the ``rgb24`` ndarray of each frame that could be decoded.
        It is shorter than ``indices`` when nothing can be decoded after a seek.
    """
    frames = []
    # Accept a frame within half a frame period of the target, so float
    # rounding in the timestamps cannot push us one frame too far.
    half_frame = 0.5 / fps
    for idx in indices:
        target_sec = idx / fps
        # Without a stream argument the seek offset is expressed in microseconds.
        container.seek(int(target_sec * 1e6), any_frame=False, backward=True)

        chosen = None
        for frame in container.decode(video=0):
            chosen = frame
            # A frame without a timestamp cannot be compared; take it as-is.
            if frame.time is None or frame.time + half_frame >= target_sec:
                break
        # If the stream ended before the target, `chosen` is the last frame
        # decoded, which is the closest one available.
        if chosen is not None:
            frames.append(chosen.to_ndarray(format="rgb24"))
    return frames


# ==== FRAME NORMALIZATION + TENSOR CONVERSION ====
# Per-channel mean and standard deviation, shaped (1, 1, 3) so they broadcast
# over an H x W x 3 frame. (Presumably the usual ImageNet statistics — confirm.)
v_mean = np.asarray([0.485, 0.456, 0.406])[np.newaxis, np.newaxis, :]
v_std = np.asarray([0.229, 0.224, 0.225])[np.newaxis, np.newaxis, :]


def normalize(data):
    """Scale 0-255 pixel values to 0-1, then standardise each channel with v_mean / v_std."""
    unit_range = data / 255.0
    centred = unit_range - v_mean
    return centred / v_std

def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda'), bgr_input=False):
    """
    Turn a list of H x W x 3 frames into one normalised video tensor.

    Args:
        vid_list: list of frames as ndarrays; must contain at least `fnum` frames.
        fnum: number of frames kept, taken at a regular stride from `vid_list`.
        target_size: (width, height) passed to ``cv2.resize``.
        device: device the resulting tensor is moved to.
        bgr_input: set True when the frames are in BGR order (e.g. read with
            OpenCV) so they are flipped to RGB first. The default treats the
            frames as RGB, which is what ``read_video_clip`` produces via
            ``to_ndarray(format="rgb24")``. Flipping those unconditionally fed
            BGR data to RGB-ordered mean/std values.

    Returns:
        A float32 tensor of shape (1, fnum, 3, H, W) on `device`.

    Raises:
        AssertionError: if fewer than `fnum` frames are supplied.
    """
    assert len(vid_list) >= fnum
    step = len(vid_list) // fnum
    vid_list = vid_list[::step][:fnum]
    if bgr_input:
        vid_list = [x[:, :, ::-1] for x in vid_list]
    vid_list = [cv2.resize(x, target_size) for x in vid_list]
    vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list]
    vid_tube = np.concatenate(vid_tube, axis=1)  # (1, T, H, W, C)
    vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3))  # (1, T, C, H, W)
    # normalize() yields float64. Cast to float32 on the host *before* the
    # transfer: the MPS backend cannot hold float64 tensors, and it halves the copy.
    vid_tube = np.ascontiguousarray(vid_tube, dtype=np.float32)
    vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True)
    return vid_tube


# ==== BATCH PROCESSING FUNCTION ====
def process_directory_to_embeddings(
    video_dir: str,
    clip_len: int = FRAMES_PER_CLIP,
    batch_size: int = BATCH_SIZE,
    output_dir: str = OUTPUT_DIR,
    clip_duration: float = 10.0,
):
    """
    Embed fixed-length clips from every video in `video_dir` with ViCLIP and save the results.

    Each video is cut into consecutive, non-overlapping windows of
    `clip_duration` seconds. `clip_len` frames are sampled evenly from each
    window, clips are batched, converted with `frames2tensor`, and encoded
    with the module-level `model`. The embeddings plus a clip-id → metadata
    lookup are written to `output_dir`.

    Args:
        video_dir: directory scanned (non-recursively) for .mp4/.mov/.avi files.
        clip_len: number of frames sampled per clip.
        batch_size: number of clips per inference batch.
        output_dir: where `video_embeddings.npy` and `embedding_lookup.json` go;
            created if missing.
        clip_duration: window length in seconds.

    Returns:
        None. If no clip could be extracted, nothing is written.
    """
    os.makedirs(output_dir, exist_ok=True)
    embedding_batches = []
    lookup_dict = {}
    global_clip_id = 0
    current_batch = []
    current_meta = []

    def flush_batch(label):
        """Encode the pending clips, record their metadata, and empty the buffers."""
        if not current_batch:
            return
        print(f"Running inference on {label} of {len(current_batch)} clips: IDs {current_meta[0]['clip_id']}-{current_meta[-1]['clip_id']}")
        # Each clip becomes (1, T, C, H, W); concatenating on dim 0 forms the batch.
        batch_tensor = torch.cat(
            [frames2tensor(clip, fnum=clip_len, device=device) for clip in current_batch],
            dim=0,
        )
        with torch.no_grad():
            embeds = model.get_vid_features(batch_tensor).cpu().numpy()

        embedding_batches.append(embeds)
        for m in current_meta:
            lookup_dict[str(m["clip_id"])] = m

        # Clear in place so the enclosing loop keeps appending to the same lists.
        current_batch.clear()
        current_meta.clear()

    exts = ('.mp4', '.mov', '.avi')

    for filename in sorted(os.listdir(video_dir)):
        if not filename.lower().endswith(exts):
            continue

        video_path = os.path.join(video_dir, filename)
        # The context manager closes the container even if decoding raises.
        with av.open(video_path) as container:
            stream = container.streams.video[0]
            fps = float(stream.average_rate) if stream.average_rate else 1.0

            total_frames = stream.frames
            # Some containers report 0 frames; estimate from the stream duration
            # instead of silently skipping the whole video.
            if not total_frames and stream.duration is not None and stream.time_base is not None:
                total_frames = int(float(stream.duration * stream.time_base) * fps)

            # Window length in frames; at least 1 so range() never gets a zero step.
            window_size = max(1, int(clip_duration * fps))
            clip_index = 0

            for start in range(0, total_frames - window_size + 1, window_size):
                indices = np.linspace(start, start + window_size, num=clip_len, endpoint=False, dtype=np.int64)
                frames = read_video_clip(container, indices, fps)
                if len(frames) < clip_len:
                    # Incomplete clip (decode failure): skip it without consuming an id.
                    continue

                current_batch.append(frames)
                current_meta.append({
                    "clip_id":        global_clip_id,
                    "clip_index":     clip_index,
                    "video_file":     filename,
                    "start_time_sec": int(start / fps),
                })

                clip_index += 1
                global_clip_id += 1

                if len(current_batch) == batch_size:
                    flush_batch("batch")

    # Final batch
    flush_batch("final batch")

    if not embedding_batches:
        # np.vstack([]) would raise; report the empty result instead.
        print(f"No clips were extracted from {video_dir}; nothing saved.")
        return

    all_embeddings = np.vstack(embedding_batches)
    np.save(os.path.join(output_dir, "video_embeddings.npy"), all_embeddings)
    with open(os.path.join(output_dir, "embedding_lookup.json"), "w") as f:
        json.dump(lookup_dict, f, indent=2)

    print(f"✅ Saved embeddings to {output_dir}/video_embeddings.npy")
    print(f"✅ Saved lookup to  {output_dir}/embedding_lookup.json")


# ==== MAIN ENTRY ====
if __name__ == "__main__":
    process_directory_to_embeddings(VIDEO_DIR)

  from .autonotebook import tqdm as notebook_tqdm


device:mps




## Index in FAISS

In [None]:
import numpy as np
import faiss
import json

# ─── CONFIG ─────────────────────────────────────────────────────────────
EMB_PATH    = "output/video_embeddings.npy"
LOOKUP_PATH = "output/embedding_lookup.json"
INDEX_PATH  = "output/video_embeddings.index"
# ────────────────────────────────────────────────────────────────────────

# Step 1: read the embedding matrix and the clip-id → metadata table.
embeddings = np.load(EMB_PATH)
with open(LOOKUP_PATH, "r") as f:
    lookup = json.load(f)

# Step 2: L2-normalise in place so that inner product equals cosine similarity,
# then wrap a flat inner-product index in an ID map so vectors carry explicit ids.
faiss.normalize_L2(embeddings)
dim = embeddings.shape[1]
index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))

# Ids are simply the row numbers of the embedding matrix.
ids = np.arange(len(embeddings), dtype=np.int64)
index.add_with_ids(embeddings, ids)

# Step 3 (optional): persist the index so later cells can reload it.
faiss.write_index(index, INDEX_PATH)

## Find duplicates

In [None]:
import numpy as np
import faiss
import json


EMB_PATH    = "output/video_embeddings.npy"
LOOKUP_PATH = "output/embedding_lookup.json"
INDEX_PATH  = "output/video_embeddings.index"


def find_similar_filenames(radius: float):
    """
    Return the set of video-file pairs that share at least one similar clip.

    Reads the saved embeddings, the lookup table and the FAISS index, runs a
    range search with every embedding as a query, and collects each unordered
    pair of *different* filenames whose clips were returned for `radius`
    (used as the cosine-similarity threshold of the inner-product index).
    Each pair is stored once, as an alphabetically sorted tuple.
    """
    # Inputs produced by the embedding and indexing cells
    vectors = np.load(EMB_PATH)
    with open(LOOKUP_PATH, "r") as f:
        lookup = json.load(f)
    index = faiss.read_index(INDEX_PATH)

    # Queries must be unit length for inner product to mean cosine similarity
    faiss.normalize_L2(vectors)

    # lims[q]:lims[q+1] delimits the hits of query q inside `labels`
    lims, distances, labels = index.range_search(vectors, radius)

    pairs = set()
    for query_id, (lo, hi) in enumerate(zip(lims[:-1], lims[1:])):
        hits = labels[lo:hi]
        # Only keep neighbours with a larger id: this drops the self-match and
        # visits every symmetric pair exactly once.
        for neighbour_id in hits[hits > query_id]:
            query_file = lookup[str(query_id)]["video_file"]
            neighbour_file = lookup[str(neighbour_id)]["video_file"]
            if query_file == neighbour_file:
                continue
            # sorted tuple so (a, b) and (b, a) collapse into one entry
            pairs.add(tuple(sorted((query_file, neighbour_file))))
    return pairs

if __name__ == "__main__":
    # Similarity threshold passed to the range search; 0.999 keeps only
    # near-identical clips (lower it, e.g. to 0.8, for looser matches).
    radius = 0.999

    similar_pairs = find_similar_filenames(radius)
    print("Similar file pairs within radius", radius, ":")
    for pair in sorted(similar_pairs):
        left, right = pair
        print(f"{left} <--> {right}")

## Visualize embeddings

In [None]:
# Imports this cell relies on. None of them were brought into scope anywhere
# else in the notebook, so the cell failed with NameError when run on its own.
import pickle

import numpy as np
from torch.utils.tensorboard import SummaryWriter

# NOTE(review): EMB_FILE was never defined in this notebook. It is pointed at
# the embeddings file the indexing cells load as EMB_PATH — confirm this is the
# matrix that matches clip_name_list.pkl.
EMB_FILE = "output/video_embeddings.npy"

# Reload your positional name_list
# NOTE(review): pickle.load can execute arbitrary code; only open a
# clip_name_list.pkl that you produced yourself.
with open("clip_name_list.pkl", "rb") as f:
    name_dict = pickle.load(f)

# Keep only as many embedding rows as there are names.
emb_matrix = np.load(EMB_FILE)[:len(name_dict)]

# Build metadata from name_list (assumes it holds (filename, index) pairs — TODO confirm)
metadata = [ f"{fn}[{idx}]" for fn, idx in name_dict ]

# Sanity check lengths match
assert emb_matrix.shape[0] == len(metadata), (
    f"❌ {emb_matrix.shape[0]} embeddings vs {len(metadata)} metadata entries"
)

# Write to TensorBoard
writer = SummaryWriter(log_dir="runs/embeds")
writer.add_embedding(
    emb_matrix,
    metadata=metadata,
    tag="my_embeddings"
)
writer.close()

print("Done. Run:\n  tensorboard --logdir=runs/embeds\nThen open http://localhost:6006/#projector")

## Search prompts

In [None]:
import json
import numpy as np
import faiss
import torch
from transformers import AutoTokenizer, AutoModel

# ─── CONFIG ────────────────────────────────────────────────────────────────────
LOOKUP_PATH = "output/embedding_lookup.json"
INDEX_PATH  = "output/video_embeddings.index"
# NOTE(review): the embedding cells above use VideoCLIP-XL-v2 / ViCLIP, not
# X-CLIP. Text features from a different model will not share the index's
# embedding space (and may differ in dimension) — confirm the checkpoint.
MODEL_CHECKPOINT = "microsoft/xclip-base-patch32"
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# ────────────────────────────────────────────────────────────────────────────────

# Clip-id → metadata table and the FAISS index written by the indexing cell
with open(LOOKUP_PATH, "r") as f:
    lookup = json.load(f)
index = faiss.read_index(INDEX_PATH)

# Tokenizer and model used to embed the text prompts; inference only, so eval mode
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModel.from_pretrained(MODEL_CHECKPOINT).to(DEVICE).eval()

def search_prompts(prompts, top_k=1):
    """
    Encode text prompts, search the FAISS index, and return
    filename + timestamp for each top-k match.

    Args:
        prompts: a single prompt string or an iterable of prompt strings.
        top_k: maximum number of matches returned per prompt.

    Returns:
        One list per prompt (in input order). Each holds up to `top_k` dicts
        with the keys "prompt", "file", "start_time_sec", "clip_index" and
        "similarity". An empty `prompts` yields ``[]``.

    Raises:
        ValueError: if the text features do not have the dimensionality of the
            index (i.e. the text model does not match the model that produced
            the indexed embeddings).
    """
    # A bare string would otherwise be iterated character by character below.
    if isinstance(prompts, str):
        prompts = [prompts]
    prompts = list(prompts)
    if not prompts:
        return []

    # Tokenize and encode
    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        text_feats = model.get_text_features(**inputs)
    # FAISS works on C-contiguous float32 arrays.
    text_feats = np.ascontiguousarray(text_feats.cpu().numpy(), dtype=np.float32)

    if text_feats.shape[1] != index.d:
        raise ValueError(
            f"Text features have dimension {text_feats.shape[1]} but the index "
            f"expects {index.d}; use the text encoder of the model that built the index."
        )

    # Normalize for cosine similarity
    faiss.normalize_L2(text_feats)

    # Search
    D, I = index.search(text_feats, top_k)

    # Collect results
    results = []
    for prompt, scores, clip_ids in zip(prompts, D, I):
        matches = []
        for score, clip_id in zip(scores, clip_ids):
            # FAISS pads with label -1 when fewer than top_k vectors exist.
            if clip_id < 0:
                continue
            info = lookup[str(int(clip_id))]
            matches.append({
                "prompt": prompt,
                "file": info["video_file"],
                "start_time_sec": info["start_time_sec"],
                "clip_index": info["clip_index"],
                "similarity": float(score)
            })
        results.append(matches)
    return results

if __name__ == "__main__":
    # Example queries; every match is printed as its own block.
    prompts = [
        "Videos of a man injured in the daytime. Smoke is rising in the background",
        "A clown eating a huge bowl of spagetti while riding a bicycle"
    ]
    results = search_prompts(prompts, top_k=3)
    for match_list in results:
        for match in match_list:
            report = [
                f"Prompt: {match['prompt']}",
                f"  File: {match['file']}",
                f"  Start time: {match['start_time_sec']}s (clip index {match['clip_index']})",
                f"  Similarity: {match['similarity']:.4f}",
                "",
            ]
            for line in report:
                print(line)