# Vector Search Performance Benchmark

This notebook benchmarks vector search on the `jobs` table with and without an HNSW index, timing a top-10 nearest-neighbour query in both configurations.

In [None]:
import os
import sys
import time
import psycopg2
import numpy as np
import matplotlib.pyplot as plt
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

# Make modules that live in the parent directory importable from this notebook.
sys.path.append(os.path.abspath(os.pardir))

# Connection string: the DATABASE_URL environment variable wins, otherwise
# fall back to the local default below.
DB_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://postgres:postgres@localhost:5432/cv_matching",
)

In [None]:
def get_connection():
    """Open a new psycopg2 connection to ``DB_URL`` in autocommit mode.

    Returns:
        The open connection; the caller is responsible for closing it.
    """
    connection = psycopg2.connect(DB_URL)
    # Autocommit: every statement takes effect immediately, without an
    # explicit commit() from the caller.
    connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    return connection

def drop_indexes():
    """Drop the ``idx_jobs_embedding`` and ``idx_cv_embedding`` indexes if present.

    Uses a fresh connection from ``get_connection()`` (autocommit), so each
    ``DROP INDEX`` takes effect as soon as it is executed. The connection is
    always closed, even when a statement fails.

    Raises:
        psycopg2.Error: if connecting or executing a statement fails.
    """
    # NOTE(review): create_indexes() only recreates idx_jobs_embedding, so
    # idx_cv_embedding stays dropped after running this notebook -- confirm
    # that this is intended.
    conn = get_connection()
    try:
        # The cursor context manager closes the cursor on exit.
        with conn.cursor() as cur:
            cur.execute("DROP INDEX IF EXISTS idx_jobs_embedding;")
            cur.execute("DROP INDEX IF EXISTS idx_cv_embedding;")
    finally:
        # Release the connection even if a DROP fails, so repeated notebook
        # runs do not accumulate open connections.
        conn.close()
    print("Indexes dropped.")

def create_indexes():
    """Create the HNSW index ``idx_jobs_embedding`` on ``jobs.embedding``.

    The index uses the ``vector_cosine_ops`` operator class and is only
    created when it does not already exist. Uses a fresh connection from
    ``get_connection()`` (autocommit); the connection is always closed, even
    when index creation fails.

    Raises:
        psycopg2.Error: if connecting or executing the statement fails.
    """
    conn = get_connection()
    try:
        # The cursor context manager closes the cursor on exit.
        with conn.cursor() as cur:
            cur.execute(
                "CREATE INDEX IF NOT EXISTS idx_jobs_embedding "
                "ON jobs USING hnsw (embedding vector_cosine_ops);"
            )
    finally:
        # Release the connection even if CREATE INDEX fails.
        conn.close()
    print("Indexes created.")

def run_benchmark(iterations=10, dim=768):
    """Time repeated top-10 vector searches against the ``jobs`` table.

    One random query vector is generated, then the same query is executed
    ``iterations`` times on a single connection. Each timing covers both
    ``execute`` and ``fetchall``.

    Args:
        iterations: Number of timed query executions.
        dim: Length of the random query vector. Presumably this must match
            the dimension of ``jobs.embedding`` -- confirm against the schema.

    Returns:
        A ``(mean, std)`` tuple of the per-query durations in seconds, as
        computed by ``np.mean`` / ``np.std``. With ``iterations=0`` both
        values are NaN.

    Raises:
        psycopg2.Error: if connecting or executing the query fails.
    """
    # Generate a random query vector
    query_vector = np.random.rand(dim).tolist()

    # Loop-invariant statement and parameters, built once outside the timed
    # section. The vector is bound twice: once for the similarity column and
    # once for the ORDER BY.
    query = """
        SELECT id, 1 - (embedding <=> %s::vector) as similarity
        FROM jobs
        ORDER BY embedding <=> %s::vector
        LIMIT 10;
    """
    params = (query_vector, query_vector)

    times = []
    conn = get_connection()
    try:
        with conn.cursor() as cur:
            for _ in range(iterations):
                # perf_counter() is monotonic and high resolution; time.time()
                # can jump when the system clock is adjusted.
                start_time = time.perf_counter()
                cur.execute(query, params)
                cur.fetchall()
                times.append(time.perf_counter() - start_time)
    finally:
        # Release the connection even if a query fails mid-benchmark.
        conn.close()
    return np.mean(times), np.std(times)

## Benchmark without Index

In [None]:
# Baseline run: drop the embedding indexes first, then time the queries.
drop_indexes()
mean_no_index, std_no_index = run_benchmark()
# Report mean ± standard deviation of the per-query time, in seconds.
print(f"No Index: {mean_no_index:.4f}s ± {std_no_index:.4f}s")

## Benchmark with HNSW Index

In [None]:
# Indexed run: create the HNSW index on jobs.embedding, then time the queries.
create_indexes()
mean_index, std_index = run_benchmark()
# Report mean ± standard deviation of the per-query time, in seconds.
print(f"With Index: {mean_index:.4f}s ± {std_index:.4f}s")

In [None]:
# Bar chart of the mean query time for both configurations, with the
# standard deviation drawn as error bars.
labels = ["No Index", "HNSW Index"]
means = [mean_no_index, mean_index]
stds = [std_no_index, std_index]

# Draw on the current axes explicitly instead of through the pyplot wrappers.
axes = plt.gca()
axes.bar(labels, means, yerr=stds, capsize=5)
axes.set_ylabel("Execution Time (s)")
axes.set_title("Vector Search Performance")
plt.show()