In [2]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError
import atexit
import requests

# Root directory; every other path in this notebook is derived from it.
BASE = Path(".")

# Input tables (parquet) holding resumes and job descriptions.
RESUME_PARQUET = BASE.joinpath("processed", "resume_matched.parquet")
JOB_PARQUET = BASE.joinpath("processed", "dice_job_descriptions_matched.parquet")

# Output directory and the JSON file used to persist skill embeddings.
MATCH_DIR = BASE.joinpath("matches")
CACHE_FILE = MATCH_DIR.joinpath("skill_cache.json")

# Make sure the output directory exists before anything tries to write to it.
MATCH_DIR.mkdir(exist_ok=True)

# Model name sent with every embedding request.
EMBED_MODEL = "nomic-embed-text"

In [3]:
# Read both pre-processed tables from disk.
resumes = pd.read_parquet(RESUME_PARQUET)
jobs = pd.read_parquet(JOB_PARQUET)

# Keep only the resumes whose Category is INFORMATION-TECHNOLOGY.
is_it_resume = resumes['Category'] == 'INFORMATION-TECHNOLOGY'
resumes = resumes.loc[is_it_resume]

print(f"Loaded {len(resumes):,} resumes and {len(jobs):,} jobs")
print("Resume columns:", list(resumes.columns))
print("Job columns:   ", list(jobs.columns))

Loaded 120 resumes and 22,000 jobs
Resume columns: ['ID', 'Resume_str', 'Resume_html', 'Category', 'extracted_skills', 'length', 'matched_skills']
Job columns:    ['advertiserurl', 'company', 'employmenttype_jobstatus', 'jobdescription', 'jobid', 'joblocation_address', 'jobtitle', 'postdate', 'shift', 'site_name', 'skills', 'uniq_id', 'extracted_skills', 'matched_skills']


In [4]:
def get_esco_skills(row):
    """
    Extract ESCO labels from the 'matched_skills' field of a row.

    Works with: list, np.ndarray, tuple, str, None, NaN.

    Parameters
    ----------
    row : namedtuple-like (has ``_fields``, e.g. from ``DataFrame.itertuples()``)
          or any object exposing ``.get`` (dict, pandas Series).

    Returns
    -------
    list of str
        Stripped, non-empty labels. Missing entries (None / NaN / pd.NA)
        inside a collection are skipped instead of being turned into the
        literal strings "None" / "nan". An empty list is returned when the
        field is absent, missing, or of an unsupported type.
    """
    def _clean(items):
        # Normalise a collection of raw labels: drop missing values, strip
        # whitespace and discard anything that ends up empty.
        labels = []
        for item in items:
            if item is None or item is pd.NA:
                continue
            if isinstance(item, float) and item != item:  # NaN check
                continue
            label = str(item).strip()
            if label:
                labels.append(label)
        return labels

    # namedtuples have no .get(); everything else is expected to provide one.
    if hasattr(row, '_fields'):
        raw = getattr(row, 'matched_skills', None)
    else:
        raw = row.get('matched_skills', None)

    if raw is None:
        return []

    if isinstance(raw, float) and pd.isna(raw):
        return []

    if isinstance(raw, (list, np.ndarray, tuple)):
        return _clean(raw)

    if isinstance(raw, str):
        import ast
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. "C++, SQL"): fall back to comma splitting.
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return _clean(parsed)
        return [s.strip() for s in raw.split(',') if s.strip()]

    return []

In [5]:
# In-memory embedding cache: skill label -> numpy vector.
skill_cache = {}
if CACHE_FILE.exists():
    # Parse the file once and close the handle deterministically.
    with open(CACHE_FILE) as cache_fh:
        skill_cache = {k: np.array(v) for k, v in json.load(cache_fh).items()}
    print(f"Loading cache with {len(skill_cache)} skills...")

def embed_skill(skill, timeout=30):
    """
    Return the embedding vector for a single skill label.

    The label is stripped first. A vector already present in ``skill_cache``
    is returned as-is; otherwise the label is POSTed to the local embeddings
    endpoint and the resulting vector is stored in ``skill_cache``.

    Parameters
    ----------
    skill : str
        Skill label to embed.
    timeout : int or float
        Timeout (seconds) passed to ``requests.post``.

    Returns
    -------
    np.ndarray or None
        The embedding, or None when the request timed out or failed in any
        other way (a message is printed in both cases).
    """
    skill = skill.strip()

    # Cache hit: nothing to request.
    try:
        return skill_cache[skill]
    except KeyError:
        pass

    try:
        payload = {"model": EMBED_MODEL, "prompt": skill}
        response = requests.post(
            "http://localhost:11434/api/embeddings",
            json=payload,
            timeout=timeout,
        )
        response.raise_for_status()
        embedding = np.array(response.json()["embedding"])
        skill_cache[skill] = embedding
    except requests.Timeout:
        print(f"[TIMEOUT] Embedding '{skill}' took >{timeout}s")
        return None
    except Exception as e:
        print(f"[ERROR] Embedding '{skill}' failed: {e}")
        return None

    return embedding

def embed_list(skills):
    """
    Embed every label in ``skills`` and stack the vectors row-wise.

    Labels whose embedding fails (``embed_skill`` returns None) are left out,
    so the number of rows can be smaller than ``len(skills)``.

    Returns
    -------
    np.ndarray
        2-D array with one row per successfully embedded label, or an empty
        array of shape (0, 768) when nothing could be embedded.
    """
    candidates = (embed_skill(label) for label in skills)
    vectors = [vec for vec in candidates if vec is not None]
    if not vectors:
        return np.array([]).reshape(0, 768)
    return np.vstack(vectors)

def save_cache():
    """
    Persist ``skill_cache`` to CACHE_FILE as JSON (label -> list of floats).

    The data is serialised to a sibling ``.tmp`` file that is closed and then
    moved over CACHE_FILE, so an interrupted write cannot leave a truncated
    cache behind.
    """
    print(f"Saving {len(skill_cache):,} skills to {CACHE_FILE}")
    serialisable = {k: v.tolist() for k, v in skill_cache.items()}
    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
    with open(tmp_file, "w") as fh:
        json.dump(serialisable, fh)
    tmp_file.replace(CACHE_FILE)
# Flush the cache automatically when the interpreter (kernel) exits.
atexit.register(save_cache)

Loading cache with 3743 skills...


<function __main__.save_cache()>

In [6]:
def match_resume_job(resume_row, job_row, thresh=0.85, top_k=3):
    """
    Compare the ESCO skills of one resume with those of one job.

    Skills are de-duplicated and sorted (so results are reproducible across
    runs), embedded with ``embed_skill`` and compared by cosine similarity.
    A job skill counts as covered when at least one resume skill reaches
    ``thresh``.

    Skills whose embedding fails are removed from the skill lists as well as
    from the similarity matrix, so row/column ``i`` of the matrix always
    corresponds to ``r_skills[i]`` / ``j_skills[i]``.

    Parameters
    ----------
    resume_row, job_row : namedtuple-like or object with ``.get``
        Rows accepted by ``get_esco_skills``; ``ID`` / ``uniq_id`` are read
        for the summary.
    thresh : float
        Minimum cosine similarity for a job skill to count as covered.
    top_k : int
        Number of best resume skills reported per job skill (values <= 0
        report none).

    Returns
    -------
    dict or None
        None when either side has no (embeddable) skills. Otherwise a dict
        with keys ``summary`` (ids, coverage ratio, gaps, counts and
        ``top_matches``), ``sim_matrix`` (resume x job similarities),
        ``r_skills`` and ``j_skills``.
    """
    def _embed_aligned(skills):
        # Embed labels one by one, keeping label and vector lists in lockstep.
        kept, vectors = [], []
        for label in skills:
            vec = embed_skill(label)
            if vec is not None:
                kept.append(label)
                vectors.append(vec)
        return kept, vectors

    r_skills = sorted(set(get_esco_skills(resume_row)))
    j_skills = sorted(set(get_esco_skills(job_row)))
    if not r_skills or not j_skills:
        return None

    r_skills, r_vecs = _embed_aligned(r_skills)
    j_skills, j_vecs = _embed_aligned(j_skills)
    if not r_vecs or not j_vecs:
        return None

    # sim[i, j] = similarity between resume skill i and job skill j.
    sim = cosine_similarity(np.vstack(r_vecs), np.vstack(j_vecs))

    covered = (sim >= thresh).any(axis=0)
    covered_job_skills = [js for js, cov in zip(j_skills, covered) if cov]
    missing_job_skills = [js for js, cov in zip(j_skills, covered) if not cov]

    n_top = max(int(top_k), 0)
    top_matches = []
    for j, js in enumerate(j_skills):
        scores = sim[:, j]
        # Descending order first, then truncate: avoids the [-0:] pitfall
        # that would select every row when top_k is 0.
        best_idx = np.argsort(scores)[::-1][:n_top]
        for idx in best_idx:
            if scores[idx] > 0:
                top_matches.append({
                    "job_skill": js,
                    "resume_skill": r_skills[idx],
                    "score": float(scores[idx]),
                    # Plain bool rather than np.bool_ for clean serialisation.
                    "is_match": bool(scores[idx] >= thresh)
                })

    summary = {
        "resume_id"         : resume_row.ID if hasattr(resume_row, 'ID') else resume_row.get('ID'),
        "job_id"            : job_row.uniq_id if hasattr(job_row, 'uniq_id') else job_row.get('uniq_id'),
        "pct_job_covered"   : float(len(covered_job_skills) / len(j_skills)),
        "gaps"              : missing_job_skills,
        "n_gaps"            : len(missing_job_skills),
        "n_job_skills"      : len(j_skills),
        "n_job_covered"     : len(covered_job_skills),
        "top_matches"       : top_matches
    }

    return {
        "summary"    : summary,
        "sim_matrix" : sim,
        "r_skills"   : r_skills,
        "j_skills"   : j_skills
    }

In [7]:
print("Collecting unique ESCO skills from resumes and jobs...")
unique_skills = set()

# Resumes
for row in tqdm(resumes.itertuples(), total=len(resumes), desc="Resumes"):
    unique_skills.update(get_esco_skills(row))

# Jobs
for row in tqdm(jobs.itertuples(), total=len(jobs), desc="Jobs"):
    unique_skills.update(get_esco_skills(row))

unique_skills = sorted(unique_skills)
print(f"Found {len(unique_skills):,} unique ESCO skills")

print("Pre-embedding unique skills...")
new_embeddings = 0

for skill in tqdm(unique_skills, desc="Embedding"):
    if skill in skill_cache:
        continue
    # embed_skill stores every successful vector in skill_cache itself,
    # so only the outcome needs to be counted here.
    if embed_skill(skill) is not None:
        new_embeddings += 1
    else:
        print(f"[SKIP] Failed to embed: {skill}")

print(f"Added {new_embeddings:,} new embeddings")

print(f"Saving full cache ({len(skill_cache):,} skills) → {CACHE_FILE}")
# Use a context manager so the file is flushed and closed right away.
with open(CACHE_FILE, "w") as cache_fh:
    json.dump({k: v.tolist() for k, v in skill_cache.items()}, cache_fh)
print("Pre-embedding complete!")

Collecting unique ESCO skills from resumes and jobs...


Resumes:   0%|          | 0/120 [00:00<?, ?it/s]

Jobs:   0%|          | 0/22000 [00:00<?, ?it/s]

Found 2,604 unique ESCO skills
Pre-embedding unique skills...


Embedding:   0%|          | 0/2604 [00:00<?, ?it/s]

Added 0 new embeddings
Saving full cache (3,743 skills) → matches\skill_cache.json
Pre-embedding complete!


In [8]:
# Matching configuration.
THRESH          = 0.85   # cosine-similarity threshold passed to match_resume_job
MAX_WORKERS     = 8      # worker threads used for matching
BATCH_SIZE      = 1000   # pairs submitted per batch
PAIR_TIMEOUT    = 60     # seconds allowed per resume/job pair
SUMMARY_FILE    = MATCH_DIR / "ind_skills_scores_085.parquet"

print(f"Using {len(resumes):,} tech resumes × {len(jobs):,} jobs "
      f"= {len(resumes)*len(jobs):,} total possible pairs")

print("\nBuilding valid pairs...")
# Whether a job has skills does not depend on the resume, so decide it once
# instead of re-parsing every job for every resume. The same job tuples are
# then shared by all pairs.
valid_jobs = [j for j in jobs.itertuples() if get_esco_skills(j)]

pairs = []
for r in tqdm(resumes.itertuples(), total=len(resumes), desc="Scanning resumes"):
    if not get_esco_skills(r):
        continue
    pairs.extend((r, j) for j in valid_jobs)

total_pairs = len(pairs)
total_batches = (total_pairs + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

print(f"\nVALID PAIRS FOUND     : {total_pairs:,}")
print(f"TOTAL BATCHES TO RUN  : {total_batches:,}")
print(f"{'='*70}\n")

def match_with_timeout(r, j):
    """
    Run ``match_resume_job(r, j, THRESH)`` with a wall-clock limit.

    The call runs in a dedicated single-thread executor and the result is
    awaited for at most PAIR_TIMEOUT seconds.

    Returns
    -------
    dict or None
        Whatever ``match_resume_job`` returns, or None when the call timed
        out or raised (a message is printed in both cases).

    Notes
    -----
    The executor is shut down with ``wait=False``. Using it as a context
    manager would wait for the running task on exit, which would block the
    caller past the timeout. A timed-out task cannot be killed and keeps
    running in its background thread until it finishes on its own.
    """
    exe = ThreadPoolExecutor(max_workers=1)
    fut = exe.submit(match_resume_job, r, j, THRESH)
    try:
        return fut.result(timeout=PAIR_TIMEOUT)
    except TimeoutError:
        print(f"[TIMEOUT] Resume {getattr(r, 'ID', '?')} ↔ Job {getattr(j, 'uniq_id', '?')}")
        return None
    except Exception as e:
        print(f"[ERROR] Resume {getattr(r, 'ID', '?')} ↔ Job {getattr(j, 'uniq_id', '?')}: {e}")
        return None
    finally:
        # Release the executor without waiting for a possibly stuck task.
        exe.shutdown(wait=False)

print(f"\nStarting matching of {total_pairs:,} pairs in {total_batches} batches\n")
print(f"{'='*70}")

summary_rows = []

progress_bar = tqdm(total=total_batches, desc="Overall Progress", unit="batch",
                    bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{percentage:3.0f}%]")

# One shared pool for the whole run; the bar advances once per finished batch.
with progress_bar as pbar, ThreadPoolExecutor(max_workers=MAX_WORKERS) as exe:
    for start in range(0, total_pairs, BATCH_SIZE):
        batch = pairs[start:start + BATCH_SIZE]

        futures = [exe.submit(match_with_timeout, r, j) for r, j in batch]

        # Collect summaries as tasks finish; pairs that returned None are skipped.
        finished = (fut.result() for fut in as_completed(futures))
        summary_rows.extend(res["summary"] for res in finished if res)

        pbar.update(1)

# ────────────────────── FINAL SAVE ──────────────────────
summary_df = pd.DataFrame(summary_rows)
summary_df.to_parquet(SUMMARY_FILE, index=False)

print(f"\nDONE! {len(summary_df):,} matched pairs → {SUMMARY_FILE}")

Using 120 tech resumes × 22,000 jobs = 2,640,000 total possible pairs

Building valid pairs...


Scanning resumes:   0%|          | 0/120 [00:00<?, ?it/s]


VALID PAIRS FOUND     : 2,553,383
TOTAL BATCHES TO RUN  : 2,554


Starting matching of 2,553,383 pairs in 2554 batches



Overall Progress:   0%|          | 0/2554 [  0%]


DONE! 2,553,383 matched pairs → matches\ind_skills_scores_085.parquet


In [9]:
# Reload the saved per-pair summaries from disk.
summary_path = Path("matches/ind_skills_scores_085.parquet")
df = pd.read_parquet(summary_path)

In [10]:
from ast import literal_eval


def _parse_gaps(value):
    """Evaluate a stringified literal back into a Python object; pass anything else through."""
    if isinstance(value, str):
        return literal_eval(value)
    return value


df['gaps'] = df['gaps'].apply(_parse_gaps)

In [11]:
# Load the full (unfiltered) resume and job tables used to look up rows by id.
resumes_all, jobs_all = (
    pd.read_parquet(table_path)
    for table_path in (
        "processed/resume_matched.parquet",
        "processed/dice_job_descriptions_matched.parquet",
    )
)

In [12]:
# Ranking score: coverage ratio weighted by log(1 + number of job skills).
# np.log1p is applied to the whole column at once instead of calling a
# Python lambda for every row.
df['score'] = df['pct_job_covered'] * np.log1p(df['n_job_skills'])

# For each resume keep the single row with the highest score.
best = df.loc[df.groupby('resume_id')['score'].idxmax()]

In [13]:
# Inspect the best-scoring job for one specific resume.
row = best.loc[best['resume_id'] == 16186411].iloc[0]

rid, jid = row['resume_id'], row['job_id']
thresh = 0.85

print(f"Inspecting Resume {rid} → Job {jid}")
print(f"Reported pct_job_covered = {row['pct_job_covered']:.3f} ({row['n_job_covered']}/{row['n_job_skills']})")
print(f"Predicted gaps ({len(row['gaps'])}): {row['gaps']}")
print("=" * 130)

# Look the original rows up again so the match can be recomputed from scratch.
resume_row = resumes_all.loc[resumes_all['ID'] == rid].iloc[0]
job_row = jobs_all.loc[jobs_all['uniq_id'] == jid].iloc[0]

r_skills = list(set(get_esco_skills(resume_row)))
j_skills = list(set(get_esco_skills(job_row)))

print(f"Resume ESCO skills found : {len(r_skills)}")
print(f"Job ESCO skills found    : {len(j_skills)}")
print("-" * 130)
print(j_skills)
print(r_skills)

# Recompute the similarity matrix (rows: resume skills, columns: job skills).
r_emb = embed_list(r_skills)
j_emb = embed_list(j_skills)
sim = cosine_similarity(r_emb, j_emb)

# A job skill is covered when any resume skill reaches the threshold.
covered = (sim >= thresh).any(axis=0)
covered_job_skills, missing_job_skills = [], []
for js, cov in zip(j_skills, covered):
    if cov:
        covered_job_skills.append(js)
    else:
        missing_job_skills.append(js)

print(len(covered_job_skills))
print(len(missing_job_skills))

print("TOP 10 STRONGEST SKILL PAIRS:")
matches = [
    (sim[i, j], rs, js)
    for i, rs in enumerate(r_skills)
    for j, js in enumerate(j_skills)
]
for score, rs, js in sorted(matches, reverse=True)[:10]:
    if score >= thresh:
        status = "MATCH"
    else:
        status = "close"
    print(f"  {score:.3f}  →  \"{rs}\"  ↔  \"{js}\"  [{status}]")

Inspecting Resume 16186411 → Job 87fbe5a19977daf7b44de1f025da7cb2
Reported pct_job_covered = 0.818 (9/11)
Predicted gaps (2): ['PHP' 'Ruby (computer programming)']
Resume ESCO skills found : 11
Job ESCO skills found    : 11
----------------------------------------------------------------------------------------------------------------------------------
['ASP.NET', 'SQL Server', 'PHP', 'Ruby (computer programming)', 'CSS', 'Java (computer programming)', 'MySQL', 'C#', 'LINQ', 'AJAX', 'JavaScript']
['ASP.NET', 'C++', 'CSS', 'Java (computer programming)', 'SQL', 'MySQL', 'Microsoft Access', 'AJAX', 'C#', 'LINQ', 'JavaScript']
9
2
TOP 10 STRONGEST SKILL PAIRS:
  1.000  →  "ASP.NET"  ↔  "ASP.NET"  [MATCH]
  1.000  →  "C#"  ↔  "C#"  [MATCH]
  1.000  →  "AJAX"  ↔  "AJAX"  [MATCH]
  1.000  →  "MySQL"  ↔  "MySQL"  [MATCH]
  1.000  →  "CSS"  ↔  "CSS"  [MATCH]
  1.000  →  "Java (computer programming)"  ↔  "Java (computer programming)"  [MATCH]
  1.000  →  "LINQ"  ↔  "LINQ"  [MATCH]
  1.000  →  "J

In [15]:
# Inspect one reproducibly sampled best match with more than 50% coverage.
row = best[best['pct_job_covered'] > 0.5].sample(1, random_state=45).iloc[0]

rid = row['resume_id']
jid = row['job_id']
# Defined here so this cell does not depend on a variable left behind by
# another inspection cell.
thresh = 0.85

print(f"Inspecting Resume {rid} → Job {jid}")
print(f"Reported pct_job_covered = {row['pct_job_covered']:.3f} ({row['n_job_covered']}/{row['n_job_skills']})")
print(f"Predicted gaps ({len(row['gaps'])}): {row['gaps']}")
print("="*130)

# Look the original rows up again so the match can be recomputed from scratch.
resume_row = resumes_all[resumes_all['ID'] == rid].iloc[0]
job_row    = jobs_all[jobs_all['uniq_id'] == jid].iloc[0]

r_skills = list(set(get_esco_skills(resume_row)))
j_skills = list(set(get_esco_skills(job_row)))

print(f"Resume ESCO skills found : {len(r_skills)}")
print(f"Job ESCO skills found    : {len(j_skills)}")
print("-"*130)
print(j_skills)
print(r_skills)

# Recompute the similarity matrix (rows: resume skills, columns: job skills).
r_emb = embed_list(r_skills)
j_emb = embed_list(j_skills)
sim   = cosine_similarity(r_emb, j_emb)

# A job skill is covered when any resume skill reaches the threshold.
covered = (sim >= thresh).any(axis=0)
covered_job_skills = [js for js, cov in zip(j_skills, covered) if cov]
missing_job_skills = [js for js, cov in zip(j_skills, covered) if not cov]

print(len(covered_job_skills))
print(len(missing_job_skills))

print("TOP 10 STRONGEST SKILL PAIRS:")
matches = []
for i, rs in enumerate(r_skills):
    for j, js in enumerate(j_skills):
        matches.append((sim[i,j], rs, js))
for score, rs, js in sorted(matches, reverse=True)[:10]:
    status = "MATCH" if score >= thresh else "close"
    print(f"  {score:.3f}  →  \"{rs}\"  ↔  \"{js}\"  [{status}]")

Inspecting Resume 17111768 → Job e7e326053c586bd94e59f1fd74de4a1b
Reported pct_job_covered = 0.750 (3/4)
Predicted gaps (1): ['coordinate security']
Resume ESCO skills found : 16
Job ESCO skills found    : 4
----------------------------------------------------------------------------------------------------------------------------------
['coordinate security', 'process applications', 'operating systems', 'project management']
['perform system analysis', 'database management systems', 'computer technology', 'financial analysis', 'government policy implementation', 'process applications', 'project management', 'deliver business research proposals', 'organise project meetings', 'customer service', 'investigation research methods', 'perform political negotiation', 'policy analysis', 'create a financial report', 'operating systems', 'analyse network configuration and performance']
3
1
TOP 10 STRONGEST SKILL PAIRS:
  1.000  →  "project management"  ↔  "project management"  [MATCH]
  1.000  →