# Semantic Matcher Comparison

This notebook compares different matching strategies (Naive vs. Pgvector) and embedding providers (Ollama vs. Google).

In [2]:
import os
import time
import sys
import pandas as pd
import importlib

sys.path.append(os.path.abspath("../.."))

# Override environment variables for local testing (Host machine)
# This ensures we connect to localhost ports exposed by Docker, ignoring any Docker-specific values in .env
os.environ["DATABASE_URL"] = "postgresql://postgres:postgres@localhost:5432/cv_matching"
os.environ["OLLAMA_BASE_URL"] = "http://localhost:11434"

# Reload modules to pick up changes
import core.matching.semantic_matcher
import core.matching.embeddings
import core.matching.strategies
importlib.reload(core.matching.semantic_matcher)
importlib.reload(core.matching.embeddings)
importlib.reload(core.matching.strategies)

from core.matching.semantic_matcher import HybridMatcher

# Mock Data
cv_data = {
    "basics": {
        "name": "John Doe",
        "summary": "Experienced software engineer with a focus on backend systems and AI."
    },
    "skills": ["Python", "Docker", "Kubernetes", "PostgreSQL", "Machine Learning"],
    "work": [
        {
            "company": "Tech Corp",
            "position": "Senior Engineer",
            "summary": "Led the development of microservices architecture."
        }
    ]
}

# Generate 10 mock jobs
job_candidates = []
for i in range(10):
    job_candidates.append({
        "job_id": str(i),
        "title": f"Job {i}",
        "company": f"Company {i}",
        "description": f"Description for job {i}. Requires Python and AI skills." if i % 2 == 0 else "Description for job {i}. Requires React and CSS.",
        "skills": ["Python", "AI"] if i % 2 == 0 else ["React", "CSS"]
    })

print(f"Generated {len(job_candidates)} mock jobs.")

Could not connect to Redis: Error -3 connecting to redis:6379. Temporary failure in name resolution.. Caching will be disabled.


Generated 10 mock jobs.


In [3]:
strategies = ["naive", "pgvector"]
providers = ["ollama"] # Add "google" if API key is set

results = []
# for i in range(3):
# to verify the higher loading time
for provider in providers:
    for strategy in strategies:
        print(f"Testing: Provider={provider}, Strategy={strategy}")
        try:
            matcher = HybridMatcher(embedding_provider=provider, strategy=strategy)
            
            # Measure Matching Time
            start_time = time.time()
            matches = matcher.match(cv_data, job_candidates)
            end_time = time.time()
            duration = end_time - start_time
            
            # Check top match (Expect even numbered jobs to be better)
            if matches:
                top_match_id = matches[0]['job_id']
                is_correct = int(top_match_id) % 2 == 0
            else:
                top_match_id = "None"
                is_correct = False
            
            results.append({
                "Provider": provider,
                "Strategy": strategy,
                "Time (s)": duration,
                "Top Match ID": top_match_id,
                "Correct": is_correct
            })
        except Exception as e:
            print(f"Failed: {e}")
            results.append({
                "Provider": provider,
                "Strategy": strategy,
                "Time (s)": None,
                "Top Match ID": "Error",
                "Correct": False
            })

df = pd.DataFrame(results)
print(df)

Testing: Provider=ollama, Strategy=naive
model_name gemini-2.5-flash
passed gemini
Testing: Provider=ollama, Strategy=pgvector
model_name gemini-2.5-flash
passed gemini
  Provider  Strategy   Time (s) Top Match ID  Correct
0   ollama     naive  13.226385            6     True
1   ollama  pgvector  13.590019            6     True


## the pgvector strategy seems to be performing better than naive strategy

In [4]:
# let's test further
# it was from kaggle, lets just take a small portion 
# job_desc = pd.read_csv("/home/acer/Desktop/cv/core/matching/job_descs/job_descriptions.csv")

In [5]:
# job_desc.head()

In [6]:

# take 10% random sample
# sample = job_desc.sample(frac=0.1, random_state=42)

# save to new file
# sample.to_csv("/home/acer/Desktop/cv/core/matching/job_descs/sampled.csv", index=False)

In [7]:
job_desc = pd.read_csv("/home/acer/Desktop/cv/core/matching/job_descs/sampled.csv")

In [8]:
job_desc.head()

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1017340707950150,5 to 10 Years,BBA,$55K-$84K,Panama City,Panama,8.5379,-80.7821,Contract,93242,...,242.271.4459,Procurement Manager,Supplier Diversity Manager,The Muse,Promote diversity and inclusion in the supply ...,"{'Transportation Benefits, Professional Develo...",Supplier diversity programs Diversity and incl...,Promote supplier diversity initiatives and inc...,RWE AG,"{""Sector"":""Energy"",""Industry"":""Energy - Utilit..."
1,2421048253959975,0 to 12 Years,MBA,$61K-$108K,Tunis,Tunisia,33.8869,9.5375,Part-Time,18411,...,579.442.3566,Architectural Designer,Architectural Drafter,Idealist,Architectural Drafters assist architects and e...,"{'Employee Assistance Programs (EAP), Tuition ...",Architectural drafting AutoCAD 2D and 3D model...,Prepare detailed architectural drawings and pl...,Asian Paints,"{""Sector"":""Consumer Goods"",""Industry"":""Paints ..."
2,1822636506606589,0 to 11 Years,M.Com,$57K-$82K,Harare,Zimbabwe,-19.0154,29.1549,Full-Time,120621,...,858-776-8996,Art Teacher,Art Education Coordinator,ZipRecruiter,An Art Education Coordinator plans and manages...,"{'Employee Referral Programs, Financial Counse...",Art education curriculum Program development T...,"Coordinate art education programs, curriculum ...",Laboratory Corp. of America,"{""Sector"":""Healthcare Services"",""Industry"":""He..."
3,3068000579894602,5 to 12 Years,B.Com,$56K-$95K,Tirana,Albania,41.1533,20.1683,Temporary,128908,...,938.587.7586x35852,Environmental Consultant,Environmental Impact Analyst,Internships.com,Environmental Impact Analysts assess the envir...,"{'Transportation Benefits, Professional Develo...",Environmental impact analysis Data collection ...,Assess the environmental impact of projects an...,Massachusetts Mutual Life Insurance,"{""Sector"":""Insurance"",""Industry"":""Insurance: L..."
4,1747904829392680,4 to 13 Years,BCA,$58K-$122K,City of Baghdad,Iraq,33.2232,43.6793,Temporary,114717,...,(405)990-8581x57164,Art Teacher,Art Education Coordinator,LinkedIn,An Art Education Coordinator plans and manages...,"{'Employee Referral Programs, Financial Counse...",Art education curriculum Program development T...,"Coordinate art education programs, curriculum ...",Sartorius AG,"{""Sector"":""Lab Equipment"",""Industry"":""Life Sci..."


In [9]:
job_desc.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [10]:
jd = job_desc.iloc[2]["Job Description"]

In [11]:
from core.matching.embeddings import OllamaEmbedder


ol = OllamaEmbedder()

em = ol.embed_query(jd)

In [12]:
len(em)

768

In [13]:
# so this embedding model has dimension of 768 in its latent space.


In [14]:
em

[-0.17306236922740936,
 0.2512880265712738,
 -2.7322919368743896,
 -1.5434433221817017,
 0.521443784236908,
 0.900456964969635,
 1.1395894289016724,
 -0.15956056118011475,
 1.1757392883300781,
 0.14973707497119904,
 -0.15013960003852844,
 -0.7110105752944946,
 2.761680841445923,
 0.7259831428527832,
 0.6177282333374023,
 -0.5435665845870972,
 -0.48124706745147705,
 -0.461507648229599,
 -1.70650315284729,
 -0.04568539932370186,
 -0.021013641729950905,
 -0.8182064294815063,
 0.7228848934173584,
 0.14630192518234253,
 2.4110379219055176,
 0.5836649537086487,
 0.43763813376426697,
 -0.15887869894504547,
 -0.5995116233825684,
 0.3690204620361328,
 0.4218255877494812,
 -0.6024425625801086,
 -0.4826315939426422,
 0.06660065799951553,
 0.45111241936683655,
 -0.7472158670425415,
 0.37261146306991577,
 0.07330179959535599,
 -0.5050349235534668,
 -0.1711142659187317,
 0.28382018208503723,
 0.25231459736824036,
 0.05738614499568939,
 -1.269159197807312,
 -0.05831059068441391,
 -0.39996710419654846

In [15]:
# test without giving any JDs should look into pgvector
strategies = ["naive", "pgvector"]
providers = ["ollama"] # Add "google" if API key is set

results = []
# for i in range(3):
# to verify the higher loading time
for provider in providers:
    for strategy in strategies:
        print(f"Testing: Provider={provider}, Strategy={strategy}")
        try:
            matcher = HybridMatcher(embedding_provider=provider, strategy=strategy)
            
            # Measure Matching Time
            start_time = time.time()
            matches = matcher.match(cv_data)
            end_time = time.time()
            duration = end_time - start_time
            
            # Check top match (Expect even numbered jobs to be better)
            if matches:
                top_match_id = matches[0]['job_id']
                is_correct = int(top_match_id) % 2 == 0
            else:
                top_match_id = "None"
                is_correct = False
            
            results.append({
                "Provider": provider,
                "Strategy": strategy,
                "Time (s)": duration,
                "Top Match ID": top_match_id,
                "Correct": is_correct
            })
        except Exception as e:
            print(f"Failed: {e}")
            results.append({
                "Provider": provider,
                "Strategy": strategy,
                "Time (s)": None,
                "Top Match ID": "Error",
                "Correct": False
            })

df = pd.DataFrame(results)
print(df)

Testing: Provider=ollama, Strategy=naive
model_name gemini-2.5-flash
passed gemini
Testing: Provider=ollama, Strategy=pgvector
model_name gemini-2.5-flash
passed gemini
  Provider  Strategy   Time (s) Top Match ID  Correct
0   ollama     naive   0.057274         None    False
1   ollama  pgvector  16.123466            6     True
