In [1]:
# Interactive CV-Job Matching
# You can upload a CV (or paste text) and find matching jobs from 1.3M+ job postings

# - Upload CV as PDF/DOCX/TXT or paste directly
# - Search 1.3M+ jobs using bi-encoder + cross-encoder
# - Take actions (apply, save, skip) to provide feedback
# - Feedback is logged to SQLite for future model retraining




import os
import sys

# Find project root (cv-job-matcher-project/)
# Notebooks are at */notebooks/ so we need to go up TWO levels
cwd = os.getcwd()
if 'notebooks' in cwd:
    PROJECT_ROOT = os.path.dirname(os.path.dirname(cwd))  # TWO levels up
else:
    PROJECT_ROOT = cwd
sys.path.insert(0, PROJECT_ROOT)

print(f"Project root: {PROJECT_ROOT}")

Project root: /home/developer/project


In [2]:
# set working directory first
import os
os.chdir(PROJECT_ROOT)
print(f"Working directory: {os.getcwd()}")

Working directory: /home/developer/project


In [3]:
# standard imports
import sys
import uuid
import numpy as np
import pandas as pd
import faiss
import torch
from pathlib import Path
from datetime import datetime

# IPython display stuff
from IPython.display import display, HTML, clear_output

# widgets for interactive UI
import ipywidgets as widgets

# add project root to path so we can import our modules
sys.path.insert(0, str(Path.cwd()))

# our ML models
from sentence_transformers import SentenceTransformer, CrossEncoder

# our custom modules from demo scripts
from demo.scripts.feedback_storage import init_db, log_action, get_action_count, get_action_summary
from demo.scripts.document_parser import parse_document, detect_document_type

print("Imports loaded successfully")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

Imports loaded successfully
PyTorch version: 2.6.0+cu124
CUDA available: True


In [4]:
# make sure all required files exist before loading
# this prevents confusing errors later


MODEL_PATH = os.path.join(PROJECT_ROOT, "training", "output", "models", "cv-job-matcher-e5")
INDEX_PATH = os.path.join(PROJECT_ROOT, "training", "output", "indexes", "jobs_full_index.faiss")
IDS_PATH = os.path.join(PROJECT_ROOT, "training", "output", "indexes", "jobs_full_ids.npy")
JOBS_PATH = os.path.join(PROJECT_ROOT, "ingest_job_postings", "output", "unified_job_postings", "unified_jobs.parquet")

# check each file
checks = [
    (MODEL_PATH, "Bi-encoder model"),
    (INDEX_PATH, "Faiss index"),
    (IDS_PATH, "Job IDs"),
    (JOBS_PATH, "Jobs parquet")
]

all_ok = True
for path, name in checks:
    if Path(path).exists():
        # show file size too
        if Path(path).is_file():
            size_mb = Path(path).stat().st_size / 1e6
            print(f"  [OK] {name}: {path} ({size_mb:.1f} MB)")
        else:
            print(f"  [OK] {name}: {path} (directory)")
    else:
        print(f"  [MISSING] {name}: {path}")
        all_ok = False

if all_ok:
    print("\nAll files found, ready to load.")
else:
    print("\nERROR: Some files missing. Run previous notebooks first.")
    raise FileNotFoundError("Required files missing")

  [OK] Bi-encoder model: /home/developer/project/training/output/models/cv-job-matcher-e5 (directory)
  [OK] Faiss index: /home/developer/project/training/output/indexes/jobs_full_index.faiss (4149.4 MB)
  [OK] Job IDs: /home/developer/project/training/output/indexes/jobs_full_ids.npy (193.8 MB)
  [OK] Jobs parquet: /home/developer/project/ingest_job_postings/output/unified_job_postings/unified_jobs.parquet (directory)

All files found, ready to load.


In [5]:
# load bi-encoder (our fine-tuned model)
# load with fp16 for faster inference
bi_encoder = SentenceTransformer(
    MODEL_PATH,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    model_kwargs={"torch_dtype": torch.float16}  # fp16 precision
)
emb_dim = bi_encoder.get_sentence_embedding_dimension()
print(f" Bi-encoder loaded")
print(f" Embedding dimension: {emb_dim}")

2026-01-28 18:05:00,542 - Load pretrained SentenceTransformer: /home/developer/project/training/output/models/cv-job-matcher-e5
`torch_dtype` is deprecated! Use `dtype` instead!


 Bi-encoder loaded
 Embedding dimension: 768


In [6]:
# load cross-encoder for reranking
# this is a pre-trained model, we use it as-is
device = "cuda" if torch.cuda.is_available() else "cpu"
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L12-v2", device=device)
print(f" Cross-encoder loaded on {device}")

 Cross-encoder loaded on cuda


In [7]:
# load Faiss index
import time
start = time.time()

jobs_index = faiss.read_index(INDEX_PATH)
jobs_index.nprobe = 20  # number of clusters to search (balance speed/accuracy)

load_time = time.time() - start
print(f"  Index loaded in {load_time:.1f}s")
print(f"  Total jobs: {jobs_index.ntotal:,}")
print(f"  nprobe set to: {jobs_index.nprobe}")

  Index loaded in 2.9s
  Total jobs: 1,345,711
  nprobe set to: 20


In [8]:
# load job IDs and metadata

# these are the IDs that correspond to index positions
job_ids = np.load(IDS_PATH, allow_pickle=True)
print(f"  Job IDs loaded: {len(job_ids):,}")

# full job data
jobs_df = pd.read_parquet(JOBS_PATH)
print(f"  Jobs dataframe loaded: {len(jobs_df):,} rows")

# build a lookup dict: job_id -> dataframe index
# because job_ids array might not be in same order as dataframe
job_id_to_row = {jid: idx for idx, jid in enumerate(jobs_df['id'])}
print(f"  ID lookup built")

print(f"\nJob columns: {list(jobs_df.columns)}")

  Job IDs loaded: 1,345,711
  Jobs dataframe loaded: 1,345,711 rows
  ID lookup built

Job columns: ['id', 'job_title', 'company', 'job_location', 'skills', 'seniority', 'embedding_text']


In [9]:
# initialize the feedback database
# this creates the SQLite file if it doesn't exist
init_db()

# show current stats
current_count = get_action_count()
print(f"\nFeedback database ready")
print(f"Current action count: {current_count}")

Database initialized at /home/developer/project/demo/data/feedback/feedback.db

Feedback database ready
Current action count: 36


In [10]:
# final status
print(f"  Jobs searchable: {jobs_index.ntotal:,}")
print(f"  Bi-encoder: {emb_dim}D embeddings")
print(f"  Cross-encoder: on {device}")
print(f"  Feedback actions logged: {current_count}")

  Jobs searchable: 1,345,711
  Bi-encoder: 768D embeddings
  Cross-encoder: on cuda
  Feedback actions logged: 36


In [11]:
# CONFIGURATION: Number of results to show
# Change this value to see more or fewer results
NUM_RESULTS = 10

# session state, keeps track of current user, CV, and matches
class DemoState:
    '''holds state for the demo session'''

    def __init__(self):
        # unique session ID for this run
        self.session_id = str(uuid.uuid4())[:8]

        # role is fixed to job_seeker in this demo
        self.role = 'job_seeker'

        # CV text that user uploaded/pasted
        self.uploaded_text = None
        self.uploaded_id = None

        # matching results
        self.matches = []
        self.current_page = 0

        # run counter to prevent stale button clicks
        # increments each time we do a new search
        self.run_id = 0
        
        # track which actions already logged (prevent duplicates)
        self.logged_actions = set()

    def reset(self):
        '''reset for a new CV search'''
        self.uploaded_text = None
        self.uploaded_id = None
        self.matches = []
        self.current_page = 0
        self.run_id += 1
        self.logged_actions = set()

# create the global state object
state = DemoState()

print(f"Session ID: {state.session_id}")
print(f"Mode: {state.role.replace('_', ' ').title()} (upload CV -> find matching jobs)")
print(f"Results to show: {NUM_RESULTS}")

Session ID: c69d705d
Mode: Job Seeker (upload CV -> find matching jobs)
Results to show: 10


In [12]:
def find_matches(query_text, top_k=50):
    # Find top-k job matches for a CV using bi-encoder
    # Args:
    #     query_text: CV text (raw)
    #     top_k: Number of candidates to retrieve (default 50)
    # Returns:
    #     List of match dicts with: job_id, bi_score, title, company...
    # e5 models expect a prefix for queries vs passages
    # our CV is the query, jobs are passages
    prefix = "query: "
    
    # clean up the text in case user accidentally pasted with prefix
    clean_text = query_text.replace("query: ", "").replace("Query: ", "").replace("passage: ", "")
    prefixed_text = prefix + clean_text
    
    # encode the query
    # normalize_embeddings=True for cosine similarity
    query_emb = bi_encoder.encode(
        [prefixed_text], 
        convert_to_numpy=True, 
        normalize_embeddings=True
    )
    
    # search the Faiss index
    # returns: similarities (inner product = cosine for normalized vectors)
    #          indices (positions in the index)
    similarities, indices = jobs_index.search(query_emb, top_k)
    
    # build result list
    matches = []
    for rank, (sim, idx) in enumerate(zip(similarities[0], indices[0]), 1):
        # get job ID from the IDs array
        job_id = job_ids[idx]
        
        # get job metadata from dataframe
        # need to find the row with this job_id
        if job_id in job_id_to_row:
            row_idx = job_id_to_row[job_id]
            job_row = jobs_df.iloc[row_idx]
        else:
            # fallback: try direct index access
            job_row = jobs_df.iloc[idx]
        
        matches.append({
            'rank': rank,
            'job_id': str(job_id),
            'bi_score': float(sim),
            'title': job_row.get('job_title', 'Unknown Title'),
            'company': job_row.get('company', 'Unknown Company'),
            'location': job_row.get('job_location', 'Unknown'),
            'skills': job_row.get('skills', ''),
            'seniority': job_row.get('seniority', ''),
            'text': job_row.get('embedding_text', '')
        })
    
    return matches


def rerank_matches(query_text, matches, top_k=10):
    # Rerank matches using cross-encoder for better accuracy.
    # Cross-encoder looks at query+doc together (not separately),
    # so it's more accurate but slower, that's why we only use it
    # on the top-50 candidates from bi-encoder
    
    # Args:
    #     query_text: Original CV text
    #     matches: List of match dicts from find_matches()
    #     top_k: Number to return after reranking (default 10)
    
    # Returns:
    #     Reranked list with cross_score added
    # clean the query (remove any prefixes)
    clean_query = query_text.replace("query: ", "").replace("Query: ", "").replace("passage: ", "")
    
    # create pairs for cross-encoder: (query, document)
    pairs = []
    for m in matches:
        doc_text = m['text'].replace("passage: ", "")
        pairs.append((clean_query, doc_text))
    
    # score all pairs
    # batch_size=128 is good for GPU
    cross_scores = cross_encoder.predict(pairs, batch_size=128)
    
    # add cross-encoder scores to matches
    for m, score in zip(matches, cross_scores):
        m['cross_score'] = float(score)
    
    # sort by cross-encoder score (higher is better)
    reranked = sorted(matches, key=lambda x: x['cross_score'], reverse=True)
    
    return reranked[:top_k]


print("Matching functions defined")

Matching functions defined


In [13]:
# quick test of matching functions
# just to make sure they work before we build the UI

test_cv = "Python developer with 5 years experience in Django, PostgreSQL, and AWS."
print(f"Testing with: '{test_cv}'")
print("")

# step 1: bi-encoder retrieval
print("Step 1: Bi-encoder search")
import time
start = time.time()
test_matches = find_matches(test_cv, top_k=5)
bi_time = time.time() - start
print(f"  Found {len(test_matches)} matches in {bi_time*1000:.1f}ms")

# step 2: cross-encoder rerank
print("Step 2: Cross-encoder rerank")
start = time.time()
test_reranked = rerank_matches(test_cv, test_matches, top_k=3)
cross_time = time.time() - start
print(f" Reranked to top-3 in {cross_time*1000:.1f}ms")

# show results
print("\nTop 3 matches:")
for i, m in enumerate(test_reranked, 1):
    print(f"  {i}. {m['title']} at {m['company']}")
    print(f"     bi={m['bi_score']:.3f}, cross={m['cross_score']:.2f}")

Testing with: 'Python developer with 5 years experience in Django, PostgreSQL, and AWS.'

Step 1: Bi-encoder search


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Found 5 matches in 224.7ms
Step 2: Cross-encoder rerank


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 Reranked to top-3 in 50.1ms

Top 3 matches:
  1. Senior Engineer / Python Developer with Django at Allnessjobs
     bi=0.626, cross=8.30
  2. Python/Django Application Developer at Pinnacle Group, Inc.
     bi=0.635, cross=8.15
  3. Python Developer at SThree
     bi=0.586, cross=7.28


In [14]:
def display_match(match, idx, current_run_id):
    # display a single match with formatted HTML and action buttons.
    # current_run_id is used to prevent stale button clicks
    
    score = match['cross_score']
    if score > 5:
        color = '#90EE90'  # light green
    elif score > 0:
        color = '#FFD700'  # gold
    else:
        color = '#FFB6C1'  # light pink

    html = f'''
    <div style="border: 1px solid #ddd; padding: 15px; margin: 10px 0; 
                border-radius: 8px; background: #fafafa;">
        <h3 style="margin-top: 0;">#{idx+1}: {match['title']}</h3>
        <p style="margin: 5px 0;">
            <strong>Company:</strong> {match['company']} &nbsp;|&nbsp; 
            <strong>Location:</strong> {match['location']} &nbsp;|&nbsp;
            <strong>Level:</strong> {match['seniority']}
        </p>
        <p style="margin: 5px 0;">
            <strong>Match Score:</strong> 
            <span style="background: {color}; padding: 2px 8px; border-radius: 4px; 
                         font-weight: bold;">{score:.2f}</span>
            <span style="color: #888;">(bi-encoder: {match['bi_score']:.4f})</span>
        </p>
    '''

    skills = match.get('skills', '')
    if skills:
        if isinstance(skills, list):
            skills_str = ', '.join(skills[:8])
            if len(skills) > 8:
                skills_str += f" (+{len(skills)-8} more)"
        else:
            skills_list = str(skills).split(',')[:8]
            skills_str = ', '.join(s.strip() for s in skills_list)
        html += f'<p style="margin: 5px 0;"><strong>Skills:</strong> {skills_str}</p>'

    preview = match['text'][:400].replace('passage: ', '')
    html += f'''
        <p style="color: #555; font-style: italic; margin-top: 10px; 
                  padding: 10px; background: #f0f0f0; border-radius: 4px;">
            {preview}...
        </p>
    </div>
    '''

    display(HTML(html))

    button_configs = [
        ('View Full', 'view_full', 'info'),
        ('Apply', 'apply', 'success'),
        ('Save', 'save', ''),
        ('Skip', 'skip', 'warning'),
        ('Not Interested', 'not_interested', 'danger')
    ]

    buttons = []
    for label, action, style in button_configs:
        btn = widgets.Button(
            description=label,
            button_style=style,
            layout=widgets.Layout(width='120px', margin='2px')
        )
        
        # capture match, action, and run_id in closure
        def make_callback(m=match, a=action, rid=current_run_id):
            def callback(b):
                # only process if this run is still current
                if rid != state.run_id:
                    return  # stale button, ignore
                handle_action(m, a, b)
            return callback

        btn.on_click(make_callback())
        buttons.append(btn)

    display(widgets.HBox(buttons))


def handle_action(match, action, button=None):
    # handle user action on a match
    global state

    # create unique key for this action
    action_key = f"{state.run_id}_{match['job_id']}_{action}"
    
    # skip if already logged (prevents duplicate clicks)
    if action_key in state.logged_actions:
        return
    state.logged_actions.add(action_key)
    
    # disable button to prevent double-click
    if button:
        button.disabled = True

    # log to SQLite database
    log_action(
        session_id=state.session_id,
        role=state.role,
        doc_id=state.uploaded_id or 'unknown',
        match_id=match['job_id'],
        action=action,
        similarity=match['cross_score'],
        cv_text=state.uploaded_text[:2000] if state.uploaded_text else '',
        job_text=match['text'][:2000]
    )

    messages = {
        'apply': 'Applied! Application logged.',
        'save': 'Saved for later review.',
        'skip': 'Skipped this job.',
        'not_interested': 'Marked as not interested.',
        'view_full': 'Viewing full details...'
    }

    msg = messages.get(action, f'Action "{action}" recorded.')
    print(f"\n{msg}")
    print(f"Total feedback actions: {get_action_count()}")

    if action == 'view_full':
        print(f"\nFULL DETAILS: {match['title']}")
        print(f"Company: {match['company']}")
        print(f"Location: {match['location']}")
        print(f"Seniority: {match['seniority']}")
        print(f"Skills: {match['skills']}")
        print(f"\nFull Text:")
        print(match['text'].replace('passage: ', ''))


def display_top_matches(matches, show_top=10):
    # display matches
    page_matches = matches[:show_top]
    current_run = state.run_id  # capture run_id at display time

    print(f"TOP {len(page_matches)} JOB MATCHES")
    print(f"Showing {len(page_matches)} of {len(matches)}")

    for i, match in enumerate(page_matches):
        display_match(match, i, current_run)


print("Display functions defined")

Display functions defined


In [15]:
# output area where results will be displayed
output_area = widgets.Output()

# text input for pasting CV
text_input = widgets.Textarea(
    placeholder='Paste your CV text here...\n\nOr upload a file below.',
    layout=widgets.Layout(width='100%', height='200px')
)

# file upload widget
file_upload = widgets.FileUpload(
    accept='.pdf,.docx,.txt',
    multiple=False,
    description='Upload CV'
)

# main search button
match_button = widgets.Button(
    description='Find Matching Jobs!',
    button_style='primary',
    icon='search',
    layout=widgets.Layout(width='200px', height='40px')
)

# status label
status_label = widgets.Label(
    value='Ready. Enter CV text or upload a file, then click the button.'
)

# ROBUST DEBOUNCING using globals() for reliable persistence
if '_nb12_last_search' not in globals():
    globals()['_nb12_last_search'] = 0

def on_match_click(b):
    # callback when user clicks Find Matching Jobs
    global state
    
    # DEBOUNCE: Prevent duplicate calls within 1 second
    import time
    now = time.time()
    if now - globals().get('_nb12_last_search', 0) < 1.0:
        return
    globals()['_nb12_last_search'] = now
    
    # disable button during search
    b.disabled = True

    try:
        with output_area:
            clear_output(wait=True)

            # reset state for new search
            state.reset()

            cv_text = None

            # check text input first
            if text_input.value.strip():
                cv_text = text_input.value.strip()
                state.uploaded_id = f"text_{uuid.uuid4().hex[:8]}"
                print(f"Using pasted text ({len(cv_text)} chars)")

            # if no text, check file upload
            elif file_upload.value:
                print("Processing uploaded file...")
                try:
                    uploaded_data = file_upload.value

                    if isinstance(uploaded_data, tuple) and len(uploaded_data) > 0:
                        uploaded = uploaded_data[0]
                        filename = uploaded.get('name', 'unknown')
                        content = uploaded.get('content', b'')
                    elif isinstance(uploaded_data, dict) and len(uploaded_data) > 0:
                        first_key = list(uploaded_data.keys())[0]
                        uploaded = uploaded_data[first_key]
                        filename = uploaded.get('metadata', {}).get('name', first_key)
                        content = uploaded.get('content', b'')
                    else:
                        print(f"ERROR: Unexpected upload format: {type(uploaded_data)}")
                        return

                    print(f"File: {filename} ({len(content)} bytes)")

                    temp_path = Path(f"/tmp/{filename}")
                    temp_path.write_bytes(content)
                    print(f"Parsing {filename}...")

                    parsed = parse_document(temp_path)

                    if parsed and parsed.get('text'):
                        cv_text = parsed['text']
                        state.uploaded_id = f"file_{uuid.uuid4().hex[:8]}"
                        print(f"Parsed: {parsed['word_count']} words")
                        print(f"Preview: {cv_text[:200]}...")
                    else:
                        print("ERROR: Could not extract text from file")
                        return

                    temp_path.unlink(missing_ok=True)

                except Exception as e:
                    print(f"ERROR processing file: {e}")
                    import traceback
                    traceback.print_exc()
                    return

            else:
                print("Please enter CV text or upload a file!")
                return

            state.uploaded_text = cv_text

            # bi-encoder retrieval
            print(f"\nSearching {jobs_index.ntotal:,} jobs...")
            start = time.time()
            candidates = find_matches(cv_text, top_k=50)
            bi_time = time.time() - start
            print(f"Found {len(candidates)} candidates in {bi_time*1000:.0f}ms")

            # cross-encoder reranking
            print("Reranking with cross-encoder...")
            start = time.time()
            state.matches = rerank_matches(cv_text, candidates, top_k=NUM_RESULTS)
            cross_time = time.time() - start
            print(f"Reranked to top {NUM_RESULTS} in {cross_time*1000:.0f}ms")

            print(f"\nTotal time: {(bi_time + cross_time)*1000:.0f}ms")

            # display results
            display_top_matches(state.matches, show_top=NUM_RESULTS)

    finally:
        b.disabled = False


# ALWAYS register handler - button is created fresh each cell run
match_button.on_click(on_match_click)

# build the UI layout
print("INTERACTIVE CV-JOB MATCHING DEMO")
print(f"Model trained on 6K pairs | Searching {jobs_index.ntotal:,} jobs")
print("Mode: Job Seeker (upload CV -> find matching jobs)")

display(widgets.Label("Enter your CV text:"))
display(text_input)
display(widgets.Label("Or upload a file (PDF/DOCX/TXT):"))
display(file_upload)
display(match_button)
display(status_label)
display(widgets.HTML("<hr/>"))
display(output_area)

INTERACTIVE CV-JOB MATCHING DEMO
Model trained on 6K pairs | Searching 1,345,711 jobs
Mode: Job Seeker (upload CV -> find matching jobs)


Label(value='Enter your CV text:')

Textarea(value='', layout=Layout(height='200px', width='100%'), placeholder='Paste your CV text here...\n\nOr …

Label(value='Or upload a file (PDF/DOCX/TXT):')

FileUpload(value=(), accept='.pdf,.docx,.txt', description='Upload CV')

Button(button_style='primary', description='Find Matching Jobs!', icon='search', layout=Layout(height='40px', …

Label(value='Ready. Enter CV text or upload a file, then click the button.')

HTML(value='<hr/>')

Output()

In [28]:
# show feedback statistics
print("FEEDBACK STATISTICS")

summary = get_action_summary()

total = summary.get('total', 0)
print(f"\nTotal actions logged: {total}")

# actions by type
by_action = summary.get('by_action', {})
if by_action:
    print("\nActions by type:")
    for action_name, data in by_action.items():
        count = data['count']
        weight = data['total_weight']
        print(f"  {action_name:20s} count={count:4d}  weight_sum={weight:+.1f}")

# actions by role
by_role = summary.get('by_role', {})
if by_role:
    print("\nActions by role:")
    for role, count in by_role.items():
        print(f"  {role}: {count}")

# retraining status
RETRAIN_THRESHOLD = 50
meaningful_actions = get_action_count()  # excludes weight=0 actions
actions_until_retrain = max(0, RETRAIN_THRESHOLD - meaningful_actions)

print(f"\nRetraining threshold: {RETRAIN_THRESHOLD} meaningful actions")
print(f"Meaningful actions (weight != 0): {meaningful_actions}")
print(f"Actions until retrain eligible: {actions_until_retrain}")

if actions_until_retrain == 0:
    print("\n READY FOR RETRAINING")
    print("Run the retraining cell below to update the model.")

FEEDBACK STATISTICS

Total actions logged: 44

Actions by type:
  view_full            count=  20  weight_sum=+6.0
  apply                count=   9  weight_sum=+9.0
  skip                 count=   6  weight_sum=+0.0
  save                 count=   4  weight_sum=+2.0
  contact              count=   3  weight_sum=+3.0
  not_interested       count=   2  weight_sum=-1.0

Actions by role:
  job_seeker: 27
  recruiter: 17

Retraining threshold: 50 meaningful actions
Meaningful actions (weight != 0): 38
Actions until retrain eligible: 12


In [17]:
# show recent actions (for debugging)
recent = summary.get('recent', [])
if recent:
    print("\nRecent actions (last 10):")
    for r in recent:
        session, role, action, sim, ts = r
        sim_str = f"{sim:.2f}" if sim else "N/A"
        print(f"  [{ts}] session={session} role={role} action={action} sim={sim_str}")
else:
    print("\nNo actions logged yet. Try the demo above")


Recent actions (last 10):
  [2026-01-28 16:39:18] session=dff7cbc1 role=job_seeker action=skip sim=-10.17
  [2026-01-28 16:39:15] session=dff7cbc1 role=job_seeker action=view_full sim=-10.17
  [2026-01-28 16:39:06] session=dff7cbc1 role=job_seeker action=view_full sim=-9.93
  [2026-01-28 15:28:23] session=9e4f0306 role=job_seeker action=view_full sim=-10.08
  [2026-01-28 15:22:33] session=ac48d6c4 role=recruiter action=contact sim=-0.52
  [2026-01-28 15:22:32] session=ac48d6c4 role=recruiter action=save sim=-0.51
  [2026-01-28 15:22:32] session=ac48d6c4 role=recruiter action=contact sim=-0.46
  [2026-01-28 15:22:31] session=ac48d6c4 role=recruiter action=save sim=-0.10
  [2026-01-28 15:18:11] session=e458310a role=job_seeker action=view_full sim=-10.06
  [2026-01-28 15:11:31] session=5b5bfc87 role=recruiter action=view_full sim=7.71


In [18]:
print("""
DEMO INSTRUCTIONS (Job Seeker Mode)

1. ENTER YOUR CV:
   - Paste CV text directly in the text area above, OR
   - Click "Upload CV" and select a PDF/DOCX/TXT file

2. CLICK "Find Matching Jobs" to search 1.3M+ jobs

3. REVIEW MATCHES:
   - Each match shows title, company, location, and skills
   - Score is from cross-encoder (higher = better match)
   - Green (>5): Great match
   - Gold (0-5): Good match
   - Pink (<0): Poor match

4. TAKE ACTIONS on results:
   - Apply (weight +1.0): Strong positive signal
   - Save (weight +0.5): Moderate interest
   - View Full (weight +0.3): Shows details, mild interest
   - Skip (weight 0.0): No signal
   - Not Interested (weight -0.5): Negative signal

5. FEEDBACK LOOP:
   After 50 meaningful actions, the model can be retrained with your feedback.

TRY THIS SAMPLE CV:

Senior Python Developer with 8 years of experience in Django, PostgreSQL,
and AWS. Led teams of 5+ engineers. Expert in microservices, REST APIs,
and CI/CD pipelines. Strong background in machine learning and data
engineering. Looking for Staff Engineer or Lead roles.
      
NOTE: Recruiter mode (upload job -> find matching CVs) requires CV embeddings
""")


DEMO INSTRUCTIONS (Job Seeker Mode)

1. ENTER YOUR CV:
   - Paste CV text directly in the text area above, OR
   - Click "Upload CV" and select a PDF/DOCX/TXT file

2. CLICK "Find Matching Jobs" to search 1.3M+ jobs

3. REVIEW MATCHES:
   - Each match shows title, company, location, and skills
   - Score is from cross-encoder (higher = better match)
   - Green (>5): Great match
   - Gold (0-5): Good match
   - Pink (<0): Poor match

4. TAKE ACTIONS on results:
   - Apply (weight +1.0): Strong positive signal
   - Save (weight +0.5): Moderate interest
   - View Full (weight +0.3): Shows details, mild interest
   - Skip (weight 0.0): No signal
   - Not Interested (weight -0.5): Negative signal

5. FEEDBACK LOOP:
   After 50 meaningful actions, the model can be retrained with your feedback.

TRY THIS SAMPLE CV:

Senior Python Developer with 8 years of experience in Django, PostgreSQL,
and AWS. Led teams of 5+ engineers. Expert in microservices, REST APIs,
and CI/CD pipelines. Strong back

In [19]:
# quick convenience: copy-paste sample CV
sample_cv = """Senior Python Developer with 8 years of experience in Django, PostgreSQL,
and AWS. Led teams of 5+ engineers. Expert in microservices, REST APIs,
and CI/CD pipelines. Strong background in machine learning and data
engineering. Looking for Staff Engineer or Lead roles."""

print("Sample CV (you can copy this):")
print(sample_cv)

Sample CV (you can copy this):
Senior Python Developer with 8 years of experience in Django, PostgreSQL,
and AWS. Led teams of 5+ engineers. Expert in microservices, REST APIs,
and CI/CD pipelines. Strong background in machine learning and data
engineering. Looking for Staff Engineer or Lead roles.


In [20]:
# direct Search (Alternative)
# if the widgets don't work in Jupyter environment

def search_jobs(cv_text, show_top=10):
    # direct search function
    
    # Usage:
    #     search_jobs("Python developer with 5 years experience...")
    global state
    
    # reset state
    state.reset()
    state.uploaded_text = cv_text
    state.uploaded_id = f"direct_{uuid.uuid4().hex[:8]}"
    
    print(f"CV length: {len(cv_text)} chars")
    print(f"\nSearching {jobs_index.ntotal:,} jobs")
    
    # bi-encoder
    import time
    start = time.time()
    candidates = find_matches(cv_text, top_k=50)
    bi_time = time.time() - start
    print(f" Bi-encoder: {len(candidates)} candidates in {bi_time*1000:.0f}ms")
    
    # cross-encoder
    start = time.time()
    state.matches = rerank_matches(cv_text, candidates, top_k=show_top)
    cross_time = time.time() - start
    print(f" Cross-encoder: top {show_top} in {cross_time*1000:.0f}ms")
    
    # display
    display_top_matches(state.matches, show_top=show_top)
    
    return state.matches


print("Direct search function defined.")
print("")
print("Usage:")
print('  results = search_jobs("Your CV text here")')

Direct search function defined.

Usage:
  results = search_jobs("Your CV text here")


In [21]:
# example direct search
# uncomment and run this to test without widgets:

# results = search_jobs(sample_cv)

In [22]:
# log manual action function

def log_manual_action(match_index, action):
    # log an action for a match from direct search results.
    
    # Args:
    #     match_index: Index in state.matches (0-based)
    #     action: One of 'apply', 'save', 'skip', 'not_interested', 'view_full'
    
    # Usage:
    #     log_manual_action(0, 'apply')  # apply to first match
    if not state.matches:
        print("No matches loaded. Run search_jobs() first.")
        return
    
    if match_index < 0 or match_index >= len(state.matches):
        print(f"Invalid index. Use 0-{len(state.matches)-1}")
        return
    
    match = state.matches[match_index]
    handle_action(match, action)


print("Manual action function defined.")
print("")
print("Usage (after search_jobs()):")
print("  log_manual_action(0, 'apply')   # apply to first match")
print("  log_manual_action(1, 'save')    # save second match")
print("  log_manual_action(2, 'not_interested')  # reject third")

Manual action function defined.

Usage (after search_jobs()):
  log_manual_action(0, 'apply')   # apply to first match
  log_manual_action(1, 'save')    # save second match
  log_manual_action(2, 'not_interested')  # reject third


In [23]:
# when users take actions (apply, save, not interested), those signals can be used to retrain
# the bi-encoder model. This improves matching quality over time based on real user feedback

# Requirements for retraining:
# - At least 50 meaningful actions (weight != 0)
# - Mixes 80% original training data + 20% feedback data
# - Uses low learning rate to avoid catastrophic forgetting

# Check retraining status
from demo.scripts.model_retrainer import (
    retrain_from_feedback,
    check_retrain_needed,
    get_latest_model
)

print("FEEDBACK-DRIVEN RETRAINING")

# Check current status
needed, count = check_retrain_needed(threshold=50)
print(f"Current actions: {count}")
print(f"Threshold for auto-retrain: 50")
print(f"Retrain ready: {needed}")

if needed:
    print("\nModel is ready for retraining! Run the next cell to start.")
else:
    print(f"\nNeed {50 - count} more actions before retraining.")

FEEDBACK-DRIVEN RETRAINING
Current actions: 36
Threshold for auto-retrain: 50
Retrain ready: False

Need 14 more actions before retraining.


In [24]:
# MANUAL RETRAIN TRIGGER
# Run this cell to trigger retraining (even if threshold not met)

FORCE_RETRAIN = False  # Set to True to force retraining

if FORCE_RETRAIN or check_retrain_needed(threshold=10)[0]:
    print("Starting retraining")
    
    result = retrain_from_feedback(
        threshold=10 if FORCE_RETRAIN else 50,
        epochs=2,
        batch_size=32,
        learning_rate=1e-5
    )
    
    if result['success']:
        print("\n" + "="*50)
        print("RETRAINING COMPLETE!")
        print("="*50)
        print(f"New model: {result['model_path']}")
        print(f"Training pairs: {result['training_pairs']}")
        print(f"Time: {result['training_time']:.1f}s")
        print("\nTo use new model, reload the notebook kernel.")
    else:
        print(f"Retraining not needed: {result.get('reason', 'unknown')}")
else:
    print("Not enough actions for retraining.")
    print("Either collect more feedback or set FORCE_RETRAIN = True")

2026-01-28 18:05:06,939 - Only 36 actions, need 50 for retraining


Starting retraining
Retraining not needed: Need 50 actions, have 36


In [25]:
# when users upload CVs and view job matches, we track potential new skills that aren't
# in our dictionary. when a skill appears frequently (3+ times), it's proposed for
# dictionary updates
# this helps keep the skill dictionary current as new technologies emerge

# Skill tracking statistics
from demo.scripts.skill_tracker import (
    get_skill_proposals,
    get_skill_statistics,
    approve_skill
)

print("SKILL DICTIONARY UPDATES")

# Get statistics
stats = get_skill_statistics()
print(f"Total skills tracked: {stats['total_tracked']}")
print(f"Pending approval: {stats['pending']}")
print(f"Already approved: {stats['approved']}")

# Show proposals (frequency >= 3)
proposals = get_skill_proposals(min_frequency=3)
if proposals:
    print(f"\nProposed new skills (frequency >= 3):")
    for skill, freq in proposals[:10]:
        print(f"  {skill}: {freq} occurrences")
    print("\nTo approve a skill, run: approve_skill('skill_name')")
else:
    print("\nNo new skill proposals yet.")
    print("Skills are tracked from uploaded CVs and matched jobs.")
    print("When new skills appear 3+ times, they'll be proposed here.")

SKILL DICTIONARY UPDATES
Total skills tracked: 0
Pending approval: 0
Already approved: 0

No new skill proposals yet.
Skills are tracked from uploaded CVs and matched jobs.
When new skills appear 3+ times, they'll be proposed here.


In [26]:
# Show retraining history
import sqlite3
from pathlib import Path

print("RETRAINING HISTORY")

db_path = os.path.join(PROJECT_ROOT, "demo", "data", "feedback.db")
if Path(db_path).exists():
    conn = sqlite3.connect(db_path)
    history = conn.execute('''
        SELECT model_version, num_actions_used, num_positive_pairs,
               training_time_sec, timestamp
        FROM retraining_log
        ORDER BY timestamp DESC
        LIMIT 5
    ''').fetchall()
    conn.close()
    
    if history:
        print("Recent retraining runs:")
        for row in history:
            print(f"  {row[4]}: v{row[0]}")
            print(f"    Actions: {row[1]}, Pairs: {row[2]}, Time: {row[3]:.1f}s")
    else:
        print("No retraining runs yet.")
else:
    print("Database not initialized")

# Show available models
print("\nAvailable models:")
from demo.scripts.model_retrainer import get_latest_model
latest = get_latest_model()
print(f"  Latest: {latest}")
print(f"  Base: training/output/models/cv-job-matcher-e5")

RETRAINING HISTORY
Database not initialized

Available models:
  Latest: /home/developer/project/training/output/models/cv-job-matcher-e5
  Base: training/output/models/cv-job-matcher-e5


In [27]:
print("""

1. Interactive CV-Job matching from 1.3M+ jobs
2. User actions (apply, save, skip, not interested)
3. Feedback logging to SQLite database
4. Batch retraining from user feedback
5. Skill dictionary updates from new skill discovery

- Model trained on 6,000 pairs, matching against 1.3M+ unseen jobs
- Cross-encoder reranking improves match quality
- Feedback loop enables continuous improvement
- System learns from both positive and negative signals
- New skills are tracked and proposed for dictionary updates

Files created:
- notebooks/12_interactive_matching.ipynb (this notebook)
- demo/scripts/feedback_storage.py (SQLite storage)
- demo/scripts/document_parser.py (file parsing)
- demo/scripts/model_retrainer.py (batch retraining)
- demo/scripts/skill_tracker.py (skill dictionary updates)
- demo/data/feedback.db (action logs)
- training/output/indexes/jobs_full_index.faiss (1.3M job index)



""")



1. Interactive CV-Job matching from 1.3M+ jobs
2. User actions (apply, save, skip, not interested)
3. Feedback logging to SQLite database
4. Batch retraining from user feedback
5. Skill dictionary updates from new skill discovery

- Model trained on 6,000 pairs, matching against 1.3M+ unseen jobs
- Cross-encoder reranking improves match quality
- Feedback loop enables continuous improvement
- System learns from both positive and negative signals
- New skills are tracked and proposed for dictionary updates

Files created:
- notebooks/12_interactive_matching.ipynb (this notebook)
- demo/scripts/feedback_storage.py (SQLite storage)
- demo/scripts/document_parser.py (file parsing)
- demo/scripts/model_retrainer.py (batch retraining)
- demo/scripts/skill_tracker.py (skill dictionary updates)
- demo/data/feedback.db (action logs)
- training/output/indexes/jobs_full_index.faiss (1.3M job index)




