In [1]:
# Recruiter mode: lets recruiters paste or upload a job posting and find the top-matching CVs.

# 1. Paste job posting text or upload PDF/DOCX/TXT file
# 2. Extract skills using NLP (PhraseMatcher with skill dictionary)
# 3. Build embedding string from extracted fields
# 4. Encode job with bi-encoder and search CV index
# 5. Rerank top candidates with cross-encoder
# 6. Display results with action buttons for feedback

# Input:
# - Job posting text (pasted or uploaded)
# - CV index: `training/output/indexes/cvs_index.faiss` (7,299 CVs)
# - CV data: `ingest_cv/output/cv_query_text.parquet`
# - Skill dictionary: `training/output/skill_dictionary/all_skills`
# - Bi-encoder model: `training/output/models/cv-job-matcher-e5`

# Output:
# - Top 10 matching CVs ranked by cross-encoder score
# - Recruiter feedback logged to `demo/data/feedback/feedback.db`

import os
import sys
import re
import uuid
import numpy as np
import pandas as pd
import faiss
import torch
import spacy
from pathlib import Path
from datetime import datetime

# ipython display
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets

# Make the project root importable.
# A working directory whose path contains 'notebooks' is treated as sitting two
# levels below the project root; any other working directory is used as-is.
cwd = os.getcwd()
PROJECT_ROOT = os.path.dirname(os.path.dirname(cwd)) if 'notebooks' in cwd else cwd
sys.path.insert(0, PROJECT_ROOT)

# models
from sentence_transformers import SentenceTransformer, CrossEncoder

# our modules
from demo.scripts.feedback_storage import init_db, log_action, get_action_count
from demo.scripts.document_parser import parse_document

print("Imports loaded")
print(f"Project root: {PROJECT_ROOT}")
print(f"CUDA available: {torch.cuda.is_available()}")

Imports loaded
Project root: /home/developer/project
CUDA available: True


In [2]:
# Locations of the model, index and data artefacts, all relative to PROJECT_ROOT
MODEL_PATH = os.path.join(PROJECT_ROOT, 'training', 'output', 'models', 'cv-job-matcher-e5')
CV_INDEX_PATH = os.path.join(PROJECT_ROOT, 'training', 'output', 'indexes', 'cvs_index.faiss')
CV_DATA_PATH = os.path.join(PROJECT_ROOT, 'ingest_cv', 'output', 'cv_query_text.parquet')
SKILL_DICT_PATH = os.path.join(PROJECT_ROOT, 'training', 'output', 'skill_dictionary', 'all_skills')

# Report which artefacts are present; a missing one is only reported, not raised
for path in (MODEL_PATH, CV_INDEX_PATH, CV_DATA_PATH, SKILL_DICT_PATH):
    print(f"OK: {os.path.basename(path)}" if os.path.exists(path) else f"MISSING: {path}")

OK: cv-job-matcher-e5
OK: cvs_index.faiss
OK: cv_query_text.parquet
OK: all_skills


In [3]:
# load bi-encoder model
# Half precision is only requested when a GPU is available: on the CPU fallback
# the model keeps its default dtype, because fp16 inference on CPU is slow and
# not supported by every kernel.
# NOTE: the `torch_dtype` key triggers a deprecation warning (the library asks for
# `dtype`); it is kept so the load keeps working with the currently installed version.
_bi_encoder_on_gpu = torch.cuda.is_available()
bi_encoder = SentenceTransformer(
    MODEL_PATH,
    device='cuda' if _bi_encoder_on_gpu else 'cpu',
    model_kwargs={"torch_dtype": torch.float16} if _bi_encoder_on_gpu else None
)
embedding_dim = bi_encoder.get_sentence_embedding_dimension()
print(f"Loaded bi-encoder: {embedding_dim}D embeddings")

2026-01-28 18:05:43,562 - Load pretrained SentenceTransformer: /home/developer/project/training/output/models/cv-job-matcher-e5
`torch_dtype` is deprecated! Use `dtype` instead!


Loaded bi-encoder: 768D embeddings


In [4]:
# Cross-encoder used for the reranking stage.
# The bi-encoder was trained in the CV->Job direction while this notebook searches
# Job->CV; the MS MARCO cross-encoder is used for reranking in either direction.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L12-v2", device=device)
print(f"Loaded cross-encoder on {device}")

Loaded cross-encoder on cuda


In [5]:
# Load the prebuilt FAISS index of CV embeddings from CV_INDEX_PATH.
cvs_index = faiss.read_index(CV_INDEX_PATH)
# set nprobe for IVF index if applicable
# (index objects that expose no `nprobe` attribute are left untouched)
if hasattr(cvs_index, 'nprobe'):
    cvs_index.nprobe = 20
# ntotal is reported as the number of indexed CVs
print(f"Loaded CV index: {cvs_index.ntotal:,} CVs")

Loaded CV index: 7,299 CVs


In [6]:
# load CV data
cvs_df = pd.read_parquet(CV_DATA_PATH)

# The search code reads the 'id' and 'text' columns and looks CVs up by FAISS row
# position (cvs_df.iloc[idx]). Fail fast if the table does not line up with the
# index, otherwise searches would silently return the wrong CVs.
_missing_cv_columns = {'id', 'text'} - set(cvs_df.columns)
assert not _missing_cv_columns, f"CV data is missing columns: {sorted(_missing_cv_columns)}"
assert len(cvs_df) == cvs_index.ntotal, (
    f"CV data has {len(cvs_df):,} rows but the index holds {cvs_index.ntotal:,} vectors"
)

print(f"Loaded {len(cvs_df):,} CVs")
print(f"Columns: {cvs_df.columns.tolist()}")
print(f"Sample ID: {cvs_df['id'].iloc[0]}")

Loaded 7,299 CVs
Columns: ['id', 'text']
Sample ID: A1


In [7]:
# load skill dictionary
skills_df = pd.read_parquet(SKILL_DICT_PATH)
# skill column is the first column
skill_col = skills_df.columns[0]
# Normalise to lowercase, stripped strings. Missing values and blank entries are
# dropped: a NaN would otherwise land in the set as a float and break
# nlp.make_doc() when the PhraseMatcher patterns are built.
_normalized_skills = skills_df[skill_col].dropna().astype(str).str.lower().str.strip()
skill_set = set(_normalized_skills[_normalized_skills != ''].tolist())
print(f"Loaded {len(skill_set):,} skills")
# show some examples (set order is arbitrary, so the sample varies between runs)
sample_skills = list(skill_set)[:10]
print(f"Sample: {sample_skills}")

Loaded 2,770,595 skills
Sample: ['vorne data analysis', 'victim and witness interviews', 'ability to recognize each person as a unique individual', 'ability to follow directions and take initiative', 'manoverboard rescues', 'flexibile working hours', 'telephonic setting', 'counting accuracy', 'tolerance for change/ambiguity', 'operational policing']


In [8]:
# setup spacy PhraseMatcher for skill extraction
from spacy.matcher import PhraseMatcher

# Blank English pipeline: only the tokenizer is needed for phrase matching
nlp = spacy.blank('en')
# attr='LOWER' makes the matcher compare lowercased token text
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

# add skills in batches to avoid memory issues
skill_list = list(skill_set)
batch_size = 10000
num_batches = 0
for i in range(0, len(skill_list), batch_size):
    batch = skill_list[i:i+batch_size]
    patterns = [nlp.make_doc(skill) for skill in batch]
    matcher.add(f"SKILLS_{i}", patterns)
    num_batches += 1

# Count the batches actually added: `len // batch_size + 1` over-reports by one
# whenever the number of skills is an exact multiple of batch_size.
print(f"PhraseMatcher ready with {len(skill_list):,} skills in {num_batches} batches")

PhraseMatcher ready with 2,770,595 skills in 278 batches


In [9]:
# CONFIGURATION: how many reranked CVs are shown per search.
# Raise or lower this value to see more or fewer results.
NUM_RESULTS = 10

# Set up the feedback database, then report its state and the active setting
init_db()
print("Feedback database initialized")
print("Current action count: {}".format(get_action_count()))
print("Results to show: {}".format(NUM_RESULTS))

Database initialized at /home/developer/project/demo/data/feedback/feedback.db
Feedback database initialized
Current action count: 38
Results to show: 10


In [10]:
# nlp
# these functions extract structured information from raw job posting text

# skill extraction function
def extract_skills_from_job(job_text):
    """
    Extract skills from a job posting using the module-level PhraseMatcher.

    The text is lowercased, tokenized with `nlp` and matched against the skill
    patterns registered on `matcher`. Every match is returned, including
    overlapping ones (e.g. both 'aws' and 'aws experience').

    Args:
        job_text: job posting text. Non-string values are converted with str().

    Returns:
        list of matched phrases (lowercase), in order of first appearance and
        without duplicates. Empty list for empty, whitespace-only or None input.
    """
    if not job_text or not str(job_text).strip():
        return []

    # Coerce once so the tokenizer receives the same value the guard inspected;
    # calling .lower() on a non-string input used to raise AttributeError.
    text = str(job_text)

    doc = nlp(text.lower())
    matched_phrases = (doc[start:end].text for _match_id, start, end in matcher(doc))

    # dict keys keep insertion order -> order-preserving de-duplication.
    # Single-character matches are skipped.
    return list(dict.fromkeys(phrase for phrase in matched_phrases if len(phrase) > 1))

# Smoke test on a one-line sample posting
test_text = (
    "Senior Python Developer with Django and AWS experience. "
    "PostgreSQL required."
)
extracted = extract_skills_from_job(test_text)
print("Test extraction:", extracted)

Test extraction: ['senior', 'python', 'developer', 'with', 'django', 'and', 'aws', 'aws experience', 'experience', 'postgresql', 'required']


In [11]:
# extract structured job fields
# Lines starting with one of these labels are metadata, not the job title.
# The word boundary keeps titles such as "TypeScript Developer" from being skipped.
_LABEL_LINE = re.compile(r'(?:company|location|salary|type)\b', re.I)

# Building blocks for salary / experience ranges: "-", en dash or the word "to"
_RANGE_SEP = r'(?:-|\u2013|\bto\b)'
_AMOUNT = r'(\d[\d,]*)\s*(k\b)?'

# A range only counts as salary when it is introduced by "$" or by a salary label;
# a bare "3-5" (years) or "2020-2021" must not be read as money.
_DOLLAR_RANGE = re.compile(
    r'\$\s*' + _AMOUNT + r'\s*' + _RANGE_SEP + r'\s*\$?\s*' + _AMOUNT, re.I
)
_LABELLED_RANGE = re.compile(
    r'\b(?:salary|compensation|pay)\b[^\n\d]{0,20}'
    + _AMOUNT + r'\s*' + _RANGE_SEP + r'\s*\$?\s*' + _AMOUNT,
    re.I,
)

# "5+ years", "3-5 years", "3 to 5 yrs": group 1 is the lower bound
_EXPERIENCE_YEARS = re.compile(
    r'\b(\d{1,2})\s*(?:' + _RANGE_SEP + r'\s*\d{1,2}\s*)?\+?\s*(?:years?|yrs?)\b', re.I
)

# Checked in order, first hit wins. Whole-word matching avoids hits inside other
# words ("International" -> intern, "Staffing" -> staff).
_SENIORITY_RULES = (
    ('intern', re.compile(r'\b(?:interns?|internship|trainee)\b')),
    ('principal', re.compile(r'\b(?:principal|staff|distinguished)\b')),
    ('lead', re.compile(r'\b(?:lead|leader|head of|director|[sea]?vp|chief)\b')),
    ('senior', re.compile(r'\b(?:senior|sr)\b')),
    ('junior', re.compile(r'\b(?:junior|jr|entry)\b')),
)


def _labelled_value(text, label):
    """Return the value of a 'Label: value' line (max 100 chars), or '' if absent."""
    patterns = (
        # label at the start of a line (optionally after a bullet), colon optional
        rf'^[ \t]*(?:[-*\u2022][ \t]*)?{label}\b[ \t:]+(\S[^\n]*)',
        # label anywhere, but then the colon is required ("Hiring company: Acme")
        rf'\b{label}[ \t]*:[ \t]*(\S[^\n]*)',
    )
    for pattern in patterns:
        found = re.search(pattern, text, re.I | re.M)
        if found:
            return found.group(1).strip()[:100]
    return ''


def _parse_salary_range(text):
    """Return (salary_min, salary_max) as ints, or (None, None) if no range is found."""
    found = _DOLLAR_RANGE.search(text) or _LABELLED_RANGE.search(text)
    if not found:
        return None, None
    low = int(found.group(1).replace(',', ''))
    high = int(found.group(3).replace(',', ''))
    if found.group(4):
        high *= 1000
        # "$150-180k": a suffix on the upper bound applies to both ends
        if found.group(2) or low < 1000:
            low *= 1000
    elif found.group(2):
        low *= 1000
    return low, high


def extract_job_fields(job_text):
    """
    Extract structured fields from job posting text.

    Args:
        job_text: raw job posting text (non-string values are converted with str()).

    Returns:
        dict with keys:
          title            first non-empty line that is not a metadata line (max 100 chars)
          company          value of a "Company: ..." line, '' if none
          location         value of a "Location: ..." line, '' if none
          skills           list returned by extract_skills_from_job
          experience_years lower bound of the first "N years" mention as 'N+', '' if none
          salary_min/max   ints from a "$"-prefixed or salary-labelled range, else None
          remote_status    'remote', 'hybrid' or 'onsite' ('' for empty input)
          seniority        'intern', 'principal', 'lead', 'senior', 'junior' or 'mid',
                           derived from the title
    """
    fields = {
        'title': '',
        'company': '',
        'location': '',
        'skills': [],
        'experience_years': '',
        'salary_min': None,
        'salary_max': None,
        'remote_status': '',
        'seniority': 'mid'  # default
    }

    if not job_text:
        return fields
    job_text = str(job_text)

    # extract skills using PhraseMatcher
    fields['skills'] = extract_skills_from_job(job_text)

    # title: first non-empty line that is not a "Label ..." metadata line
    for line in job_text.strip().split('\n'):
        line = line.strip()
        if line and not _LABEL_LINE.match(line):
            fields['title'] = line[:100]
            break

    fields['company'] = _labelled_value(job_text, 'company')
    fields['location'] = _labelled_value(job_text, 'location')
    fields['salary_min'], fields['salary_max'] = _parse_salary_range(job_text)

    exp_match = _EXPERIENCE_YEARS.search(job_text)
    if exp_match:
        fields['experience_years'] = exp_match.group(1) + '+'

    # remote takes precedence over hybrid; anything else counts as onsite
    if re.search(r'\bremote\b', job_text, re.I):
        fields['remote_status'] = 'remote'
    elif re.search(r'\bhybrid\b', job_text, re.I):
        fields['remote_status'] = 'hybrid'
    else:
        fields['remote_status'] = 'onsite'

    # seniority from the title; stays 'mid' when no rule matches
    title_lower = fields['title'].lower()
    for level, rule in _SENIORITY_RULES:
        if rule.search(title_lower):
            fields['seniority'] = level
            break

    return fields

# Smoke test on a small, fully labelled posting
test_job = """Senior Python Developer

Company: TechCorp Inc.
Location: San Francisco, CA (Remote OK)
Salary: $150,000 - $180,000

Requirements:
- 5+ years of Python development
- Django or FastAPI experience
"""
fields = extract_job_fields(test_job)
# one print call, one field per line
print(
    f"Title: {fields['title']}",
    f"Company: {fields['company']}",
    f"Location: {fields['location']}",
    f"Skills: {fields['skills'][:5]}",
    f"Salary: ${fields['salary_min']:,} - ${fields['salary_max']:,}",
    f"Experience: {fields['experience_years']}",
    f"Remote: {fields['remote_status']}",
    f"Seniority: {fields['seniority']}",
    sep="\n",
)

Title: Senior Python Developer
Company: TechCorp Inc.
Location: San Francisco, CA (Remote OK)
Skills: ['senior', 'python', 'developer', 'company', 'inc']
Salary: $150,000 - $180,000
Experience: 5+
Remote: remote
Seniority: senior


In [12]:
# cell 12: build embedding string from extracted fields
# follows pattern from notebook 04_embedding_strings.ipynb
def build_job_embedding_string(fields):
    """
    Build the natural-language string that gets embedded for a job posting.

    Template:
        "Role of {title} at {company} in {location}. Required skills: {skills}.
         Experience level: {seniority}. Salary range: {salary}. Work type: {remote}."

    Args:
        fields: dict as produced by extract_job_fields. Missing or empty values
            fall back to placeholders ('Unknown Position', 'a company', 'mid');
            optional sentences (location, skills, salary, work type) are left out.

    Returns:
        str: the sentence sequence prefixed with "passage: " for the e5 model.
    """
    parts = []

    # `or` instead of a .get() default: extract_job_fields always sets these keys,
    # so an empty string must trigger the placeholder as well.
    title = fields.get('title') or 'Unknown Position'
    company = fields.get('company') or 'a company'
    location = fields.get('location') or ''

    role_part = f"Role of {title} at {company}"
    if location:
        role_part += f" in {location}"
    parts.append(role_part + ".")

    # skills - limit to top 10
    skills = fields.get('skills') or []
    if skills:
        skills_str = ', '.join(skills[:10])
        parts.append(f"Required skills: {skills_str}.")

    # seniority with expanded descriptions; unknown labels are used verbatim
    seniority = fields.get('seniority') or 'mid'
    seniority_map = {
        'intern': 'Intern level, entry position',
        'junior': 'Junior level, 0-2 years experience',
        'mid': 'Mid-level, 3-5 years experience',
        'senior': 'Senior level, 5+ years experience',
        'lead': 'Lead level, 7+ years experience with leadership',
        'principal': 'Principal level, expert with technical leadership'
    }
    level_desc = seniority_map.get(seniority, seniority)
    parts.append(f"Experience level: {level_desc}.")

    # salary if available
    salary_min = fields.get('salary_min')
    salary_max = fields.get('salary_max')
    if salary_min and salary_max:
        parts.append(f"Salary range: ${salary_min:,} to ${salary_max:,}.")
    elif salary_min:
        parts.append(f"Minimum salary: ${salary_min:,}.")

    # remote status; unknown labels are used verbatim
    remote = fields.get('remote_status') or ''
    if remote:
        remote_map = {
            'remote': 'Remote work available',
            'hybrid': 'Hybrid work, partially remote',
            'onsite': 'Onsite work'
        }
        work_type = remote_map.get(remote, remote)
        parts.append(f"Work type: {work_type}.")

    text = ' '.join(parts)

    # add passage prefix for e5 model
    return "passage: " + text

# Smoke test: build the embedding string for the fields extracted above
embedding_str = build_job_embedding_string(fields)
print(f"Embedding string ({len(embedding_str)} chars):", embedding_str, sep="\n")

Embedding string (319 chars):
passage: Role of Senior Python Developer at TechCorp Inc. in San Francisco, CA (Remote OK). Required skills: senior, python, developer, company, inc, inc., location, location:, san, san francisco. Experience level: Senior level, 5+ years experience. Salary range: $150,000 to $180,000. Work type: Remote work available.


In [13]:
# These functions find and rank matching CVs for a given job posting

# find matching CVs using bi-encoder and faiss
def find_matching_cvs(job_text, top_k=50):
    """
    Find the top-k CVs for a job posting with the bi-encoder and the FAISS index.

    Steps:
      1. extract fields from the job text
      2. build the embedding string
      3. encode it with the bi-encoder (normalized embedding)
      4. search the CV index

    Args:
        job_text: raw job posting text.
        top_k: number of neighbours requested from the index.

    Returns:
        (matches, fields) where matches is a list of dicts with keys
        'rank', 'cv_id', 'bi_score', 'text' and 'idx', and fields is the dict
        returned by extract_job_fields.
    """
    # extract fields
    fields = extract_job_fields(job_text)
    print(f"  Extracted {len(fields['skills'])} skills: {fields['skills'][:5]}")

    # build embedding string
    embedding_text = build_job_embedding_string(fields)
    print(f"  Embedding string: {embedding_text[:80]}")

    # encode with bi-encoder
    job_embedding = bi_encoder.encode(
        [embedding_text],
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    # The bi-encoder is loaded in fp16, so the embedding can come back as float16.
    # Hand FAISS an explicit C-contiguous float32 array instead of relying on the
    # installed FAISS version to convert it.
    job_embedding = np.ascontiguousarray(job_embedding, dtype=np.float32)

    # search CV index
    distances, indices = cvs_index.search(job_embedding, top_k)

    # build results; positions outside cvs_df (including negative ids) are skipped
    matches = []
    for rank, (dist, idx) in enumerate(zip(distances[0], indices[0]), 1):
        if idx >= 0 and idx < len(cvs_df):
            cv_row = cvs_df.iloc[idx]
            matches.append({
                'rank': rank,
                'cv_id': cv_row['id'],
                'bi_score': float(dist),
                'text': cv_row['text'],
                'idx': int(idx)
            })

    return matches, fields

print("find_matching_cvs() defined")

find_matching_cvs() defined


In [14]:
# rerank CVs with cross-encoder
def rerank_cvs(job_text, matches, top_k=10):
    """
    Rerank bi-encoder candidates with the cross-encoder.

    Args:
        job_text: raw job posting text
        matches: list of match dicts from find_matching_cvs; each dict gets a
            'cross_score' key added in place
        top_k: number of results to return

    Returns:
        the top_k matches ordered by cross_score, highest first
        (empty list when there are no matches)
    """
    if not matches:
        return []

    # strip e5-style prefixes so the cross-encoder sees plain text
    query_text = job_text.replace("passage: ", "").replace("query: ", "")
    pairs = [
        (query_text, candidate['text'].replace("query: ", "").replace("Query: ", ""))
        for candidate in matches
    ]

    print(f"  Reranking {len(pairs)} candidates with cross-encoder")
    cross_scores = cross_encoder.predict(pairs, batch_size=50)

    # attach each score to its candidate
    for candidate, raw_score in zip(matches, cross_scores):
        candidate['cross_score'] = float(raw_score)

    # best score first; ties keep their bi-encoder order
    ranked = list(matches)
    ranked.sort(key=lambda candidate: candidate['cross_score'], reverse=True)
    return ranked[:top_k]

print("rerank_cvs() defined")

rerank_cvs() defined


In [15]:
# Session state shared by the widget callbacks
session_id = uuid.uuid4().hex[:8]  # short random id for this notebook session
current_job_text = current_job_id = None
current_matches, current_job_skills = [], []
current_run_id = 0  # identifies the search run that displayed buttons belong to
logged_actions = set()  # keys of actions already logged, to avoid duplicates

print(f"Session ID: {session_id}")

Session ID: 0d46cf80


In [16]:
# display single CV match
def display_cv_match(match, idx, job_skills, run_id):
    """
    Display a single CV match: score card, matching skills, preview and action buttons.

    Args:
        match: match dict with 'cv_id', 'text', 'cross_score' and 'bi_score'
        idx: zero-based position of the match in the result list
        job_skills: skills extracted from the job posting
        run_id: id of the search run that produced this match; button clicks
            are ignored once a newer run has started
    """
    # CV text, ids and skill phrases come from external documents, so everything
    # interpolated into the HTML below is escaped first.
    from html import escape

    # skills from the job that literally occur in the CV text
    cv_text_lower = match['text'].lower()
    matching_skills = [s for s in job_skills if s.lower() in cv_text_lower]

    # determine score color
    cross_score = match['cross_score']
    if cross_score > 5:
        score_color = '#90EE90'  # green
    elif cross_score > 0:
        score_color = '#FFD700'  # gold
    else:
        score_color = '#FFB6C1'  # pink

    cv_id = escape(str(match['cv_id']))
    card = f'''
    <div style="border: 1px solid #ddd; padding: 15px; margin: 10px 0; border-radius: 5px;">
        <h3>CV #{idx+1}: {cv_id}</h3>
        <p><strong>Cross-Encoder Score:</strong> 
           <span style="background-color: {score_color}; padding: 2px 8px; border-radius: 3px;">
               {cross_score:.2f}
           </span>
           (bi-encoder: {match['bi_score']:.4f})
        </p>
    '''

    if matching_skills:
        skills_str = escape(', '.join(matching_skills[:8]))
        card += f'<p><strong>Matching Skills:</strong> <span style="color: green;">{skills_str}</span></p>'

    # cv preview
    preview = escape(match['text'][:300].replace("query: ", "").replace("Query: ", ""))
    card += f'<p style="color: #666; font-style: italic;">{preview}</p>'
    card += '</div>'
    display(HTML(card))

    # action buttons: (label, action name passed to the handler, button style)
    actions = [
        ('View Full CV', 'view_full', 'info'),
        ('Contact', 'contact', 'success'),
        ('Shortlist', 'save', ''),
        ('Skip', 'skip', 'warning'),
        ('Not a Match', 'not_interested', 'danger')
    ]

    def make_callback(action):
        # bind the action per button; match and run_id come from the enclosing call
        def on_click(button):
            # only process if this run is still current
            if run_id != current_run_id:
                return  # stale button, ignore
            handle_recruiter_action(match, action, button)
        return on_click

    buttons = []
    for label, action, style in actions:
        btn = widgets.Button(
            description=label,
            button_style=style,
            layout=widgets.Layout(width='110px')
        )
        btn.on_click(make_callback(action))
        buttons.append(btn)

    display(widgets.HBox(buttons))

print("display_cv_match() defined")

display_cv_match() defined


In [17]:
# handle recruiter action
def handle_recruiter_action(match, action, button=None):
    """
    Handle a recruiter action on a CV match and log it to the feedback database.

    Each (run, CV, action) combination is logged at most once. The combination
    is only marked as handled, and the button only disabled, after the database
    write succeeded, so a failed write can be retried with another click.

    Args:
        match: match dict with 'cv_id', 'text', 'cross_score' and 'bi_score'
        action: action name, e.g. 'contact', 'save', 'skip', 'not_interested', 'view_full'
        button: the clicked widget button, if any; disabled after a successful log
    """
    # unique key for this action within the current search run
    action_key = f"{current_run_id}_{match['cv_id']}_{action}"

    # skip if already logged (prevents duplicate clicks)
    if action_key in logged_actions:
        return

    # log to database
    log_action(
        session_id=session_id,
        role='recruiter',
        doc_id=current_job_id,
        match_id=match['cv_id'],
        action=action,
        similarity=match['cross_score'],
        cv_text=match['text'][:2000],
        job_text=current_job_text[:2000] if current_job_text else ''
    )

    # the write went through: remember it and lock the button
    logged_actions.add(action_key)
    if button:
        button.disabled = True

    # feedback messages
    action_msg = {
        'contact': 'Marked for contact!',
        'save': 'Added to shortlist.',
        'skip': 'Skipped.',
        'not_interested': 'Noted as not a match.',
        'view_full': 'Showing full CV'
    }

    print(f"\n{action_msg.get(action, 'Action recorded.')}")
    print(f"Total actions logged: {get_action_count()}")

    # show full CV if requested
    if action == 'view_full':
        full_text = match['text'].replace("query: ", "").replace("Query: ", "")
        print(f"\nFULL CV: {match['cv_id']}")
        print(f"Bi-Score: {match['bi_score']:.4f}")
        print(f"Cross-Score: {match['cross_score']:.2f}")
        print(f"\nCV Text:")
        print(full_text)


def display_cv_results(matches, job_skills, show_top=10):
    """
    Print a short header and render the first show_top matches.

    The current search run id is read once and handed to every card, so their
    buttons can tell whether they still belong to the latest search.
    """
    run_id = current_run_id
    shown = matches[:show_top]

    print(f"TOP {len(shown)} MATCHING CVs")
    print(f"Showing {len(shown)} of {len(matches)}")
    print("Click View Full CV to see complete CV.\n")

    for position in range(len(shown)):
        display_cv_match(shown[position], position, job_skills, run_id)


print("handle_recruiter_action() defined")
print("display_cv_results() defined")

handle_recruiter_action() defined
display_cv_results() defined


In [18]:
## Main Interface

# Interactive widgets for job posting input and CV matching
# Widgets that make up the recruiter interface

# search progress and results are rendered into this area
output_area = widgets.Output()

# free-text job posting
job_input = widgets.Textarea(
    layout=widgets.Layout(height='200px', width='100%'),
    placeholder='Paste job posting here...\n\nOr upload a file below.',
)

# single-file upload as an alternative to pasting
file_upload = widgets.FileUpload(
    description='Upload Job',
    accept='.pdf,.docx,.doc,.txt',
    multiple=False,
)

# starts the search
match_button = widgets.Button(
    layout=widgets.Layout(width='200px'),
    button_style='primary',
    description='Find Matching CVs!',
)

print("Widgets created")

Widgets created


In [19]:
# match button click handler
# The debounce timestamp is kept in the module globals so it survives re-running this cell.
globals().setdefault('_nb13_last_search', 0)


def _read_uploaded_job(uploaded_data):
    """
    Return (filename, content) for the first uploaded file, or None if the
    upload value has an unexpected layout.

    Two layouts are supported: a tuple of dicts with 'name'/'content' keys, and
    a dict keyed by filename whose values carry 'metadata' and 'content'.
    """
    if isinstance(uploaded_data, tuple) and len(uploaded_data) > 0:
        uploaded = uploaded_data[0]
        return uploaded.get('name', 'unknown'), uploaded.get('content', b'')
    if isinstance(uploaded_data, dict) and len(uploaded_data) > 0:
        first_key = list(uploaded_data.keys())[0]
        uploaded = uploaded_data[first_key]
        return uploaded.get('metadata', {}).get('name', first_key), uploaded.get('content', b'')
    return None


def on_match_click(b):
    """
    Handle a click on the match button: read the job posting (pasted text takes
    precedence over an uploaded file), search and rerank CVs, render the results.

    Updates the module-level session state (current job, matches, skills, run id)
    and resets the set of logged actions for the new run. The button is disabled
    while the search runs and re-enabled afterwards, also on errors.
    """
    global current_job_text, current_job_id, current_matches, current_job_skills
    global current_run_id, logged_actions

    import tempfile
    import time
    import traceback

    # DEBOUNCE: Prevent duplicate calls within 1 second
    now = time.time()
    if now - globals().get('_nb13_last_search', 0) < 1.0:
        return
    globals()['_nb13_last_search'] = now

    b.disabled = True

    try:
        with output_area:
            clear_output(wait=True)

            # new search, increment run_id and clear logged actions
            current_run_id += 1
            logged_actions = set()

            # get job text from input or file upload
            if job_input.value.strip():
                current_job_text = job_input.value.strip()
                current_job_id = f"input_{uuid.uuid4().hex[:8]}"
                print(f"Using pasted text ({len(current_job_text)} chars)")

            elif file_upload.value:
                print("Processing uploaded file")
                try:
                    upload = _read_uploaded_job(file_upload.value)
                    if upload is None:
                        print(f"ERROR: Unexpected upload format: {type(file_upload.value)}")
                        return
                    filename, content = upload

                    print(f"File: {filename} ({len(content)} bytes)")

                    # The filename comes from the client: keep only its last path
                    # component so it cannot point outside the temporary directory.
                    safe_name = Path(str(filename).replace('\\', '/')).name
                    if safe_name in ('', '.', '..'):
                        safe_name = 'upload'

                    # A private temporary directory is portable (no hard-coded /tmp)
                    # and is removed even when parsing fails.
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        temp_path = Path(tmp_dir) / safe_name
                        temp_path.write_bytes(content)

                        print(f"Parsing {filename}")
                        parsed = parse_document(temp_path)

                    if parsed and parsed.get('text'):
                        current_job_text = parsed['text']
                        current_job_id = f"file_{uuid.uuid4().hex[:8]}"
                        print(f"Parsed: {parsed['word_count']} words")
                    else:
                        print("ERROR: Could not parse uploaded file")
                        return

                except Exception as e:
                    # best effort: report the problem in the output area and stop this search
                    print(f"ERROR processing file: {e}")
                    traceback.print_exc()
                    return
            else:
                print("Please enter a job posting or upload a file!")
                return

            print(f"\nJob posting: {len(current_job_text)} characters")
            print("Finding matching CVs")
            print()

            # find matches with bi-encoder
            current_matches, fields = find_matching_cvs(current_job_text, top_k=50)
            current_job_skills = fields['skills']

            print(f"Found {len(current_matches)} candidates from bi-encoder")
            print()

            # rerank with cross-encoder
            current_matches = rerank_cvs(current_job_text, current_matches, top_k=NUM_RESULTS)

            # display results
            print(f"\nJob: {fields['title'][:50]}")
            print(f"Skills: {', '.join(fields['skills'][:5])}\n")

            display_cv_results(current_matches, current_job_skills, show_top=NUM_RESULTS)

    finally:
        b.disabled = False

# Register the click handler so that exactly one copy is attached.
# The button object is created in an earlier cell, so re-running only this cell
# would otherwise stack another handler on the same button and run every search
# more than once. Remove the handler registered by the previous run first;
# removing a handler that is not attached is a no-op.
_nb13_previous_handler = globals().get('_nb13_click_handler')
if _nb13_previous_handler is not None:
    match_button.on_click(_nb13_previous_handler, remove=True)
match_button.on_click(on_match_click)
_nb13_click_handler = on_match_click

print("Click handler attached")

Click handler attached


In [20]:
# Render the recruiter UI: banner text first, then the input widgets and the results area
print(
    "RECRUITER MODE: Find Matching CVs",
    f"Searching {cvs_index.ntotal:,} CVs with trained bi-encoder model",
    "Paste job posting text OR upload a PDF/DOCX/TXT file",
    "",
    sep="\n",
)

# display() shows each object in the order given
display(
    widgets.Label("Enter job posting:"),
    job_input,
    file_upload,
    match_button,
    output_area,
)

RECRUITER MODE: Find Matching CVs
Searching 7,299 CVs with trained bi-encoder model
Paste job posting text OR upload a PDF/DOCX/TXT file



Label(value='Enter job posting:')

Textarea(value='', layout=Layout(height='200px', width='100%'), placeholder='Paste job posting here...\n\nOr u…

FileUpload(value=(), accept='.pdf,.docx,.doc,.txt', description='Upload Job')

Button(button_style='primary', description='Find Matching CVs!', layout=Layout(width='200px'), style=ButtonSty…

Output()

In [21]:
# demo instructions
# The number of results is taken from NUM_RESULTS so the text stays in sync
# with the configuration cell instead of hard-coding "10".
print(f"""
RECRUITER MODE DEMO INSTRUCTIONS

1. ENTER JOB POSTING:
   - Paste job description text directly into the text area, OR
   - Click "Upload" to select a PDF, DOCX, or TXT file

2. CLICK "Find Matching CVs!" BUTTON

3. REVIEW RESULTS:
   - Top {NUM_RESULTS} CVs are ranked by cross-encoder score
   - Score colors:
     * Green (>5) = strong match
     * Gold (0-5) = moderate match
     * Pink (<0) = weak match
   - Matching skills are highlighted in green

4. TAKE ACTIONS ON EACH CV:
   - Contact (+1.0) - mark to reach out to candidate
   - Shortlist (+0.5) - save for later review
   - View Full CV (+0.3) - see complete CV text
   - Skip (0) - pass for now
   - Not a Match (-0.5) - mark as poor fit

5. FEEDBACK LOOP:
   - All actions are logged to the feedback database
   - This feedback can be used to retrain the matching model
   - Better matching over time

TRY IT: Load sample_senior_python.txt from incoming/job/
""")


RECRUITER MODE DEMO INSTRUCTIONS

1. ENTER JOB POSTING:
   - Paste job description text directly into the text area, OR
   - Click "Upload" to select a PDF, DOCX, or TXT file

2. CLICK "Find Matching CVs!" BUTTON

3. REVIEW RESULTS:
   - Top 10 CVs are ranked by cross-encoder score
   - Score colors:
     * Green (>5) = strong match
     * Gold (0-5) = moderate match
     * Pink (<0) = weak match
   - Matching skills are highlighted in green

4. TAKE ACTIONS ON EACH CV:
   - Contact (+1.0) - mark to reach out to candidate
   - Shortlist (+0.5) - save for later review
   - View Full CV (+0.3) - see complete CV text
   - Skip (0) - pass for now
   - Not a Match (-0.5) - mark as poor fit

5. FEEDBACK LOOP:
   - All actions are logged to the feedback database
   - This feedback can be used to retrain the matching model
   - Better matching over time

TRY IT: Load sample_senior_python.txt from incoming/job/



In [22]:
# direct search function for programmatic use
def search_cvs(job_text, top_k=10, candidate_k=50):
    """
    Search CVs for a job posting without the widget interface.

    Example:
        results = search_cvs("Senior Python Developer with Django experience")

    Args:
        job_text: job posting text
        top_k: number of results to return
        candidate_k: minimum number of bi-encoder candidates handed to the
            cross-encoder; the pool is enlarged to top_k when top_k is larger,
            so asking for more than candidate_k results is not silently capped

    Returns:
        list of match dicts (cv_id, cross_score, text, ...) ordered by cross_score
    """
    # find candidates with bi-encoder
    pool_size = max(candidate_k, top_k)
    matches, fields = find_matching_cvs(job_text, top_k=pool_size)

    # rerank with cross-encoder
    reranked = rerank_cvs(job_text, matches, top_k=top_k)

    # display summary
    print(f"\nJob: {fields['title']}")
    print(f"Skills: {', '.join(fields['skills'][:5])}")
    print(f"\nTop {len(reranked)} matching CVs:")

    for i, m in enumerate(reranked, 1):
        print(f"{i}. {m['cv_id']}: cross_score={m['cross_score']:.2f}")
        preview = m['text'][:80].replace("query:", "").replace("Query:", "").strip()
        print(f"   {preview}")

    return reranked

print("search_cvs() function defined")
print("")
print("Example usage:")
print('  results = search_cvs("Senior Python Developer with Django and AWS experience")')

search_cvs() function defined

Example usage:
  results = search_cvs("Senior Python Developer with Django and AWS experience")


In [23]:
# test with sample job posting
# uncomment to test

# sample_path = os.path.join(PROJECT_ROOT, "incoming/job/sample_senior_python.txt")
# with open(sample_path) as f:
#     sample_job = f.read()
# 
# results = search_cvs(sample_job, top_k=5)

print("Use the widgets above to search, or call search_cvs() directly.")

Use the widgets above to search, or call search_cvs() directly.
