# Semantic Matcher Tests & Benchmarks

This notebook tests and benchmarks the upgraded `HybridMatcher`, which uses Ollama to generate embeddings and pgvector to store and retrieve them.

In [None]:
import os
import time
import json
import requests
from core.matching.semantic_matcher import HybridMatcher

# Ollama settings used by the availability check below.
# MODEL is the model name sent to the pull endpoint; the server address can
# be overridden with the OLLAMA_BASE_URL environment variable.
MODEL = "nomic-embed-text"
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")

def check_ollama():
    """Ask the Ollama server to pull ``MODEL`` and print the outcome.

    Sends a POST to ``{OLLAMA_BASE_URL}/api/pull``. The success message is
    printed only when the request completes, the HTTP status is not an
    error, and no line of the response body carries an ``error`` field.
    Request failures are reported with ``print`` instead of being raised,
    so the notebook cell always finishes.

    Returns:
        None.
    """
    try:
        response = requests.post(
            f"{OLLAMA_BASE_URL}/api/pull",
            json={"name": MODEL},
            # (connect, read) timeouts: fail fast if nothing is listening,
            # but tolerate long gaps between progress updates while a
            # model is downloading. Without a timeout the cell could hang
            # forever.
            timeout=(5, 300),
        )
        # requests does not raise on 4xx/5xx by itself; without this the
        # success message would be printed even for a failed request.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error connecting to Ollama: {e}")
        return

    # NOTE(review): per the Ollama API docs the pull endpoint streams
    # newline-delimited JSON status objects and reports a failed pull as an
    # object with an "error" key -- confirm against the deployed version.
    for line in response.text.splitlines():
        if not line.strip():
            continue
        try:
            status = json.loads(line)
        except ValueError:
            # Not JSON: ignore the line rather than fail the whole check.
            continue
        if isinstance(status, dict) and status.get("error"):
            print(f"Error pulling model {MODEL}: {status['error']}")
            return

    print(f"Model {MODEL} pulled successfully or already exists.")

# Run the check as soon as this cell executes; the outcome is printed.
check_ollama()

In [None]:
# Build the matcher with its default constructor arguments; the cells below
# reuse this single instance.
matcher = HybridMatcher()
print("Matcher initialized.")

In [None]:
# Sample CV used as the query side of every test and benchmark below.
cv_data = {
    # Headline information about the candidate.
    "basics": {
        "name": "John Doe",
        "summary": "Experienced software engineer with a focus on backend systems and AI.",
    },
    # Flat list of skill names.
    "skills": [
        "Python",
        "Docker",
        "Kubernetes",
        "PostgreSQL",
        "Machine Learning",
    ],
    # Work history, one dict per position.
    "work": [
        {
            "company": "Tech Corp",
            "position": "Senior Engineer",
            "summary": "Led the development of microservices architecture.",
        },
    ],
}

# Sample job postings the CV is matched against. Each entry carries a string
# id, a title, the hiring company, a free-text description and a skill list.
job_candidates = [
    # Backend role: overlaps with the CV on Python and Docker.
    {
        "job_id": "1",
        "title": "Backend Developer",
        "company": "Startup Inc",
        "description": "Looking for a Python expert with Docker and K8s experience.",
        "skills": [
            "Python",
            "Docker",
            "AWS",
        ],
    },
    # Frontend role: shares no listed skill with the CV.
    {
        "job_id": "2",
        "title": "Frontend Developer",
        "company": "Web Solutions",
        "description": "React and TypeScript developer needed.",
        "skills": [
            "React",
            "TypeScript",
            "CSS",
        ],
    },
    # AI role: overlaps with the CV on Python.
    {
        "job_id": "3",
        "title": "AI Engineer",
        "company": "AI Labs",
        "description": "Build LLM applications and RAG pipelines.",
        "skills": [
            "Python",
            "LLM",
            "RAG",
            "Vector DB",
        ],
    },
]

In [None]:
# Benchmark: time the matcher's two private helpers that turn the sample CV
# into a text representation and then into an embedding.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump when the system
# time is adjusted, which would corrupt the measurement.
start_time = time.perf_counter()
cv_text = matcher._get_text_representation(cv_data)
embedding = matcher._get_embedding(cv_text)
end_time = time.perf_counter()
# The reported duration covers both helper calls above.
print(f"Embedding generation time: {end_time - start_time:.4f} seconds")
print(f"Embedding dimension: {len(embedding)}")

In [None]:
# Benchmark: time a full match of the sample CV against the sample jobs.
# perf_counter() is used instead of time.time() because it is monotonic and
# high-resolution, so the interval cannot be skewed by wall-clock changes.
start_time = time.perf_counter()
results = matcher.match(cv_data, job_candidates)
end_time = time.perf_counter()
print(f"Matching time (including indexing): {end_time - start_time:.4f} seconds")

# Show each result in the order match() returned it. Every entry is expected
# to expose 'job_title', a numeric 'match_score' and 'matching_factors'.
for res in results:
    print(f"Job: {res['job_title']} - Score: {res['match_score']:.4f}")
    print(f"Factors: {res['matching_factors']}")
    print("-" * 20)

In [None]:
# Verification assertions
# There must be at least one result, and the first entry must be the
# Backend Developer ('1') or AI Engineer ('3') job.
# NOTE(review): assumes `results` is ordered best match first -- confirm
# against HybridMatcher.match.
assert len(results) > 0, "No results returned"
top_match = results[0]
assert top_match['job_id'] in ['1', '3'], "Expected Backend or AI job to be top match"
print("Verification passed!")