# Semantic Matcher Comparison

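This notebook compares `HybridMatcher` matching strategies (naive vs. pgvector) and embedding providers (Ollama vs. Google), recording how long each combination takes to match and whether its top-ranked job is one of the expected matches. Only the Ollama provider runs by default; add `"google"` to `providers` once an API key is set.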

In [4]:
import os
import time
import sys
import pandas as pd

sys.path.append(os.path.abspath("../.."))


In [5]:
from core.matching.semantic_matcher import HybridMatcher


Could not connect to Redis: Error -3 connecting to redis:6379. Temporary failure in name resolution.. Caching will be disabled.


In [None]:

# Mock CV: the single candidate profile handed to the matcher together with
# the mock job candidates.
cv_data = dict(
    basics=dict(
        name="John Doe",
        summary="Experienced software engineer with a focus on backend systems and AI.",
    ),
    skills=["Python", "Docker", "Kubernetes", "PostgreSQL", "Machine Learning"],
    work=[
        dict(
            company="Tech Corp",
            position="Senior Engineer",
            summary="Led the development of microservices architecture.",
        ),
    ],
)

# Generate 10 mock jobs. Even ids are Python/AI roles (the ones the CV is
# expected to match best); odd ids are React/CSS roles.
job_candidates = []
for i in range(10):
    is_python_job = i % 2 == 0
    # Build the description from a single f-string so the job index is
    # interpolated for both kinds of job (the odd-job text previously lacked
    # the f prefix and contained a literal "{i}").
    requirement = "Python and AI skills" if is_python_job else "React and CSS"
    job_candidates.append({
        "job_id": str(i),
        "title": f"Job {i}",
        "company": f"Company {i}",
        "description": f"Description for job {i}. Requires {requirement}.",
        "skills": ["Python", "AI"] if is_python_job else ["React", "CSS"]
    })

print(f"Generated {len(job_candidates)} mock jobs.")

In [None]:
# Benchmark configuration: every (provider, strategy) pair is run once.
strategies = ["naive", "pgvector"]
providers = ["ollama"] # Add "google" if API key is set

# One summary row per (provider, strategy) pair, including failed runs.
results = []

for provider in providers:
    for strategy in strategies:
        print(f"Testing: Provider={provider}, Strategy={strategy}")
        try:
            matcher = HybridMatcher(embedding_provider=provider, strategy=strategy)

            # Time only the match() call (matcher construction is excluded).
            # perf_counter() is a monotonic, high-resolution clock, so the
            # interval is not skewed by wall-clock adjustments the way
            # time.time() can be.
            start_time = time.perf_counter()
            matches = matcher.match(cv_data, job_candidates)
            duration = time.perf_counter() - start_time

            # Report an empty result with a readable message instead of the
            # bare "list index out of range" that matches[0] would produce;
            # it is still recorded as a failed row by the handler below.
            if len(matches) == 0:
                raise ValueError("matcher returned no matches")

            # Check top match: the even-numbered mock jobs are the Python/AI
            # ones, so one of them is expected to come first.
            # NOTE(review): assumes match() returns results best-first — confirm.
            top_match_id = matches[0]['job_id']
            is_correct = int(top_match_id) % 2 == 0

            results.append({
                "Provider": provider,
                "Strategy": strategy,
                "Time (s)": duration,
                "Top Match ID": top_match_id,
                "Correct": is_correct
            })
        except Exception as e:
            # Deliberate best-effort: a failing combination is logged and
            # recorded as an error row so the remaining ones still run.
            print(f"Failed: {e}")
            results.append({
                "Provider": provider,
                "Strategy": strategy,
                "Time (s)": None,
                "Top Match ID": "Error",
                "Correct": False
            })

# Summary table of all runs.
df = pd.DataFrame(results)
print(df)