# Video and Image Deduplication Workflow

This notebook demonstrates the complete deduplication workflow using functions from `deduplicate_utils.py`.

## Overview
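The workflow consists of three main stages; the lookup is built once, while stages 2 and 3 are run separately for the video and the image embeddings: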
1. **Build ID Lookup**: Create a mapping from FAISS IDs to metadata
2. **Collect Groups**: Use FAISS range search to find duplicate clusters
3. **Write Results**: Export duplicate pairs to JSONL format

## Setup and Imports

In [None]:
import numpy as np
import faiss
import json
from pathlib import Path
from deduplicate_utils import (
    build_id_lookup,
    collect_groups,
    write_dedup_jsonl,
    update_duplicate_flags
)

## Configuration

Set the input file paths and choose the deduplication mode, which determines the search radius.

In [None]:
# Input artifacts
DATABASE_PATH = Path("descriptions.jsonl")            # JSONL metadata database
VIDEO_INDEX_PATH = Path("video_embeddings.index")     # FAISS index for videos
IMAGE_INDEX_PATH = Path("image_embeddings.index")     # FAISS index for images
VIDEO_EMBEDDINGS_PATH = Path("video_embeddings.npy")  # Saved video embeddings
IMAGE_EMBEDDINGS_PATH = Path("image_embeddings.npy")  # Saved image embeddings

# Deduplication mode: "duplicate" is the strict setting, "similar" the looser one
MODE = "duplicate"

# Reject unknown modes up front, then derive the search radius from the mode.
# NOTE(review): how the radius is interpreted depends on collect_groups and the
# index metric, neither of which is visible here — confirm before tuning.
if MODE not in ("duplicate", "similar"):
    raise ValueError("MODE must be 'duplicate' or 'similar'")
RADIUS = 0.9999 if MODE == "duplicate" else 0.99

print(f"Mode: {MODE}", f"Radius: {RADIUS}", sep="\n")

## Step 1: Load Database and Build ID Lookup

Load the database and create a mapping from FAISS IDs to metadata

In [None]:
# Parse the JSONL database: every non-blank line is decoded as one JSON value
with open(DATABASE_PATH, 'r', encoding='utf-8') as db_file:
    database = [json.loads(raw_line) for raw_line in db_file if raw_line.strip()]

print(f"Loaded {len(database)} entries from database")

# Build the FAISS-ID lookup from the loaded entries
lookup = build_id_lookup(database)
print(f"Built lookup table with {len(lookup)} FAISS IDs")

# Show the first entry so the lookup structure can be inspected
if lookup:
    sample_id = next(iter(lookup))
    sample_json = json.dumps(lookup[sample_id], indent=2)
    print(f"\nSample lookup entry (FAISS ID {sample_id}):", sample_json, sep="\n")

## Step 2: Video Deduplication

Process video embeddings to find duplicate groups

In [None]:
# Read the saved video embeddings, then the FAISS index for videos
video_embeddings = np.load(VIDEO_EMBEDDINGS_PATH)
video_index = faiss.read_index(str(VIDEO_INDEX_PATH))

# Report both counts side by side so a mismatch is easy to spot
print(
    f"Loaded {len(video_embeddings)} video embeddings",
    f"FAISS index contains {video_index.ntotal} vectors",
    sep="\n",
)

In [None]:
# Group video embeddings that fall within RADIUS of each other
video_groups = collect_groups(
    video_index,
    video_embeddings,
    radius=RADIUS,
)

print(f"Found {len(video_groups)} duplicate video groups")

# Size statistics are only meaningful when at least one group was found
if video_groups:
    group_sizes = list(map(len, video_groups))
    smallest, biggest = min(group_sizes), max(group_sizes)
    mean_size = sum(group_sizes) / len(group_sizes)
    print("\nGroup size statistics:")
    print(f"  Min: {smallest}")
    print(f"  Max: {biggest}")
    print(f"  Average: {mean_size:.2f}")

    # Preview at most the first 10 IDs of the largest group
    largest_group = max(video_groups, key=len)
    ellipsis_suffix = '...' if len(largest_group) > 10 else ''
    print(f"\nLargest group has {len(largest_group)} items:")
    print(f"  FAISS IDs: {largest_group[:10]}{ellipsis_suffix}")

In [None]:
# Write video deduplication results to JSONL
video_output_path = Path(f"video_dedup_{MODE}.jsonl")
write_dedup_jsonl(video_groups, lookup, video_output_path)

print(f"Video deduplication results written to {video_output_path}")

# Read the output back to count the duplicate pairs. The summary cell reads
# `video_pairs`, so it must be defined here for Restart & Run All to succeed.
with open(video_output_path, 'r', encoding='utf-8') as f:
    video_pairs = [json.loads(line) for line in f if line.strip()]

print(f"Total video duplicate pairs: {len(video_pairs)}")

# Display sample pair
if video_pairs:
    print("\nSample video duplicate pair:")
    print(json.dumps(video_pairs[0], indent=2))

## Step 3: Image Deduplication

Process image embeddings to find duplicate groups

In [None]:
# Read the saved image embeddings, then the FAISS index for images
image_embeddings = np.load(IMAGE_EMBEDDINGS_PATH)
image_index = faiss.read_index(str(IMAGE_INDEX_PATH))

# Report both counts side by side so a mismatch is easy to spot
print(
    f"Loaded {len(image_embeddings)} image embeddings",
    f"FAISS index contains {image_index.ntotal} vectors",
    sep="\n",
)

In [None]:
# Group image embeddings that fall within RADIUS of each other
image_groups = collect_groups(
    image_index,
    image_embeddings,
    radius=RADIUS,
)

print(f"Found {len(image_groups)} duplicate image groups")

In [None]:
# Export the image duplicate groups to a JSONL file named after the active mode
image_output_path = Path(f"image_dedup_{MODE}.jsonl")
write_dedup_jsonl(image_groups, lookup, image_output_path)

print(f"Image deduplication results written to {image_output_path}")

# Re-read the output file: each non-blank line is parsed as one JSON record
image_pairs = []
with image_output_path.open('r', encoding='utf-8') as results_file:
    for raw_line in results_file:
        if raw_line.strip():
            image_pairs.append(json.loads(raw_line))

print(f"Total image duplicate pairs: {len(image_pairs)}")

# Show the first record as an example of the output format
if image_pairs:
    first_pair_json = json.dumps(image_pairs[0], indent=2)
    print("\nSample image duplicate pair:", first_pair_json, sep="\n")

## Step 4: Summary Statistics

Analyze the overall deduplication results

In [None]:
# Banner with the settings this run used
rule = "=" * 60
print(rule)
print(f"DEDUPLICATION SUMMARY (Mode: {MODE}, Radius: {RADIUS})")
print(rule)

# One report section per media type: (label, groups, pairs, output path).
# Requires `video_pairs` and `image_pairs` to exist in the kernel namespace.
sections = (
    ("Video", video_groups, video_pairs, video_output_path),
    ("Image", image_groups, image_pairs, image_output_path),
)
for label, groups, pairs, output_path in sections:
    print(f"\n{label} Deduplication:")
    print(f"  Groups found: {len(groups)}")
    print(f"  File pairs: {len(pairs)}")
    print(f"  Output: {output_path}")

total_pairs = len(video_pairs) + len(image_pairs)
print(f"\nTotal duplicate pairs: {total_pairs}")