### Imports

In [1]:
import os
import re
import cv2
import numpy as np
import torch
import faiss
from PIL import Image
from transformers import AutoProcessor, AutoModel
import av
import math
from utils import *
import pandas as pd

# --- Paths shared by the cells below ---
# Directory handed to scan_dir() and process_video_directory().
VIDEO_DIR = "input_videos"
# Backing file for the float32 embedding memmap; also read back by prepare_search_matrices().
EMB_FILE = "embeddings.npy"
# Backing file for the int64 id memmap.
ID_FILE = "embedding_ids.npy"
# Where the FAISS index is written after inference and read from before the duplicate search.
INDEX_PATH = "clip_index.faiss"

  from .autonotebook import tqdm as notebook_tqdm


### Select compute device (GPU if available)

In [2]:
# Pick the compute device used by `model.to(device)` further down.
# The second GPU (cuda:1) is preferred, but only when it actually exists:
# torch.cuda.is_available() is True with a single GPU as well, in which case
# "cuda:1" would be an invalid device ordinal. Fall back to cuda:0, then CPU.
if torch.cuda.is_available():
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
    print(f"running on GPU: {device}")
else:
    device = "cpu"
    print("running on CPU")

running on GPU: cuda:1


### Infer, index, store 

In [None]:
# --- Model config ---
# Checkpoint id passed to the Hugging Face Auto* loaders; the model is moved to
# the device selected in the earlier cell.
CKPT      = "microsoft/xclip-base-patch32"
processor = AutoProcessor.from_pretrained(CKPT)
model     = AutoModel.from_pretrained(CKPT).to(device)
dim = 512     # embedding width; must match both the memmap shape and the FAISS index
BATCH = 256   # batch size forwarded to process_video_directory()

# --- Create memmaps ---
# scan_dir() supplies the row count used to pre-size both arrays.
max_rows = scan_dir(VIDEO_DIR)
print(f"number of total segments in directory: {max_rows}")
emb_memmap = create_memap(file_path=EMB_FILE, 
                          dtype=np.float32,
                          shape=(max_rows, dim),
                          init_value=0.0)

id_memmap = create_memap(file_path=ID_FILE,
                        dtype=np.int64,
                        shape=(max_rows,),
                        init_value=-1)

# --- Create index ---
# Flat (exact) L2 index wrapped in an ID map so vectors can be added with explicit ids.
index = faiss.IndexFlatL2(dim)
index = faiss.IndexIDMap2(index)

# --- Run inference and index ---
total_clips, final_write_ptr = process_video_directory(
    VIDEO_DIR, processor, model, index, emb_memmap, id_memmap, BATCH
)

print(f"Successfully processed {total_clips} video clips")
print(f"Data written to positions 0-{final_write_ptr-1} in memory arrays")
assert final_write_ptr == total_clips, "Mismatch between clips and write position!"

# The memmaps are sized from scan_dir(), which can exceed the number of clips
# actually embedded. Surface the gap: unwritten rows presumably keep their
# init_value fill (0.0 / -1) and anything that later reads the whole file will
# see them as extra entries — TODO confirm against utils.
if final_write_ptr < max_rows:
    print(
        f"Warning: {max_rows - final_write_ptr} of {max_rows} preallocated rows were never written "
        f"(rows {final_write_ptr}-{max_rows - 1})"
    )

faiss.write_index(index, INDEX_PATH)
print("FAISS index saved to disk.")

number of total segments in directory: 1737
[Batch 8] Inference on 256 IDs: 10 … 1626 → writing at rows 0 … 255
[Batch 8] added to FAISS (index size now: 256)
[Batch 9] Inference on 256 IDs: 1627 … 227 → writing at rows 256 … 511
[Batch 9] added to FAISS (index size now: 512)
[Batch 10] Inference on 256 IDs: 228 … 2796 → writing at rows 512 … 767
[Batch 10] added to FAISS (index size now: 768)
[Batch 11] Inference on 256 IDs: 2797 … 314 → writing at rows 768 … 1023
[Batch 11] added to FAISS (index size now: 1024)
[Batch 12] Inference on 256 IDs: 315 … 394 → writing at rows 1024 … 1279
[Batch 12] added to FAISS (index size now: 1280)
[Batch 13] Inference on 256 IDs: 395 … 643 → writing at rows 1280 … 1535
[Batch 13] added to FAISS (index size now: 1536)
[Batch 14] Inference on 200 IDs: 644 … 897 → writing at rows 1536 … 1735
[Batch 14] added to FAISS (index size now: 1736)
Total segments written: 1736
FAISS index size: 1736
Successfully processed 1736 video clips
Data written to positio

### Find duplicates

In [None]:
# Reload the persisted index and build the neighbour-distance matrix for every embedding.
index = faiss.read_index(INDEX_PATH)
distance_matrix, id_rows, faiss_ids, all_clip_ids = prepare_search_matrices(index, EMB_FILE)

# Overall distribution of the returned distances (all columns included).
overall_stats = {
    "Min distance": distance_matrix.min(),
    "Max distance": distance_matrix.max(),
    "Mean distance": distance_matrix.mean(),
    "Median distance": np.median(distance_matrix),
    "95th percentile": np.percentile(distance_matrix, 95),
}
print("Distance statistics:")
print("\n".join(f"{label}: {value:.4f}" for label, value in overall_stats.items()))

# Same check without column 0 (assumed to be each vector's self-match) and
# without exact-zero distances.
neighbour_distances = distance_matrix[:, 1:].ravel()
non_self_distances = neighbour_distances[neighbour_distances > 0]
print("\nNon-self distances:")
print(f"Min: {non_self_distances.min():.4f}")
print(f"Max: {non_self_distances.max():.4f}")
print(f"Mean: {non_self_distances.mean():.4f}")

# Candidate distance thresholds for the duplicate sweep. They are deliberately
# tight (0.0-0.05): only near-identical embeddings should count as duplicates.
thresholds = [
    0.0, 
    0.001, 0.002, 0.003, 0.004, 0.005,
    0.006, 0.007, 0.008, 0.009, 0.01,
    0.015, 0.02, 0.025, 0.03, 0.04, 0.05
]

# Drop any repeated values while preserving first-seen order
# (dicts keep insertion order, so fromkeys() is an ordered de-dup).
unique_thresholds = list(dict.fromkeys(thresholds))

thresholds = unique_thresholds
print(f"\nUnique thresholds in the critical range (0.0-0.05):")
print(f"Thresholds: {thresholds}")
print(f"Number of unique thresholds: {len(thresholds)}")

# Sweep every threshold, then order the resulting table by file name so the
# exported CSV has a stable row order.
df_results = analyze_multiple_thresholds(
    distance_matrix, id_rows, faiss_ids, thresholds
).sort_values(["File_name"], ignore_index=True)

# Persist the per-file results next to the notebook.
output_file = "threshold_deduplication_results.csv"
df_results.to_csv(output_file, index=False)
print(f"Results exported to {output_file}")

# Print summary
total_files = len(df_results)  # loop-invariant, so computed once up front
print(f"\nSummary:")
print(f"Total files analyzed: {total_files}")
print(f"Thresholds tested: {len(thresholds)}")

# Simple threshold effectiveness analysis
print(f"\nThreshold effectiveness analysis:")
if total_files == 0:
    # With no rows there is nothing to read via .iloc[0] and the percentage
    # below would divide by zero, so report and skip instead of raising.
    print("No files in results; skipping threshold effectiveness analysis.")
else:
    for thresh in thresholds:
        non_dup_col = f'Non_duplicates_{thresh:.3f}'

        # Try .2f formatting if .3f doesn't exist.
        # NOTE(review): under .2f several thresholds share one name (0.001-0.004
        # all format to "0.00"), so this fallback can map distinct thresholds to
        # the same column — confirm the naming used by analyze_multiple_thresholds.
        if non_dup_col not in df_results.columns:
            non_dup_col = f'Non_duplicates_{thresh:.2f}'

        # Thresholds without a matching column are silently skipped.
        if non_dup_col not in df_results.columns:
            continue

        # The count is read from the first row only; this assumes the column
        # holds the same value on every row — TODO confirm.
        non_dup_count = int(df_results[non_dup_col].iloc[0])
        dup_count = total_files - non_dup_count
        percentage = (dup_count / total_files) * 100

        print(f"Threshold {thresh:7.3f}: {dup_count:3d} duplicates ({percentage:5.1f}%), {non_dup_count:4d} unique files")

print(f"\nFirst 10 rows:")
print(df_results.head(10).to_string())

Loading embeddings and performing FAISS search...
Prepared matrices for 1737 embeddings with k=100 neighbors
Distance statistics:
Min distance: 0.0000
Max distance: 587.9579
Mean distance: 178.1177
Median distance: 167.3908
95th percentile: 338.4974

Non-self distances:
Min: 0.0000
Max: 587.9579
Mean: 180.2496

Thresholds in the critical range (0.0-0.1):
Thresholds: [0.0, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05]
Processing threshold: 0.00
Processing threshold: 0.00
Processing threshold: 0.00
Processing threshold: 0.00
Processing threshold: 0.00
Processing threshold: 0.01
Processing threshold: 0.01
Processing threshold: 0.01
Processing threshold: 0.01
Processing threshold: 0.01
Processing threshold: 0.01
Processing threshold: 0.01
Processing threshold: 0.02
Processing threshold: 0.03
Processing threshold: 0.03
Processing threshold: 0.04
Processing threshold: 0.05
Results exported to threshold_deduplication_results.csv

Su