In [None]:
# ============================================================================
# NOTEBOOK 2: MATCHAI INFERENCE, 3 EXTRA MODELS, AND SCORING PIPELINE
# ============================================================================


# ============================================================================
# STEP 0: INSTALL REQUIRED PACKAGES
# ============================================================================


# Cell 1: Install required libraries and import

!pip install -q transformers sentence-transformers datasets

import torch
import numpy as np
import time
import json

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from torch.nn.functional import cosine_similarity, softmax

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

In [None]:
# ============================================================================
# STEP 1: LOAD PROCESSED DATASET & LABEL MAPPINGS
# ============================================================================


# Cell 2: Load dataset from Hugging Face and rebuild numeric labels

# Column names used by the rest of the notebook to read the resume text,
# the job-description text and the raw label of a row.
resume_col, jd_col, label_col = "resume_text", "job_description_text", "label"

print("\n=== STEP 1: Loading dataset from Hugging Face ===")

# Fetch the resume / job-description fit dataset by its Hub id and show its layout.
dataset = load_dataset("cnamuangtoun/resume-job-description-fit")
print(dataset)

# Label normalization rules (same alias lists as Notebook 1).
# Each canonical class lists the raw spellings accepted for it.
label_variations = {
    'no fit': ['no fit','no_fit','no-fit','0','no','not fit','unfit'],
    'potential fit': ['potential fit','potential_fit','potential-fit','1','potential','maybe','partial'],
    'good fit': ['good fit','good_fit','good-fit','2','good','excellent','perfect','best'],
}

# Display name returned for each key of label_variations.
_CANONICAL_LABEL_NAMES = {
    'no fit': "No Fit",
    'potential fit': "Potential Fit",
    'good fit': "Good Fit",
}


def _canonicalize_raw_label(raw):
    """Lower-case, trim and unify separators so 'Good_Fit' equals 'good fit'."""
    text = str(raw).lower().strip().replace("_", " ").replace("-", " ")
    return " ".join(text.split())


# Exact-match lookup table: canonicalized alias -> display name.
_LABEL_ALIAS_LOOKUP = {
    _canonicalize_raw_label(alias): _CANONICAL_LABEL_NAMES[key]
    for key, aliases in label_variations.items()
    for alias in aliases
}


def normalize_label(raw):
    """
    Map a raw dataset label to "No Fit", "Potential Fit" or "Good Fit".

    The value is matched as a whole against the aliases in ``label_variations``
    (case, surrounding whitespace and ``_``/``-`` separators are ignored).
    Whole-value matching is deliberate: testing aliases as substrings made
    one-character aliases fire inside unrelated values, e.g. "2.0" contains
    "0" and was therefore reported as "No Fit".

    Numeric spellings are reduced to their integer form first, so ``2``,
    ``2.0`` and ``"2.0"`` all resolve to "Good Fit".

    Args:
        raw: label value of any type; it is converted with ``str()``.

    Returns:
        The canonical label name. Values that match no alias fall back to
        "No Fit".
    """
    text = str(raw).strip().lower()
    try:
        number = float(text)
    except ValueError:
        # Not a number: compare the separator-normalized text.
        key = _canonicalize_raw_label(text)
    else:
        # Whole numbers such as 2.0 are looked up as "2"; anything else
        # (fractions, inf, nan) is left as-is and ends up in the fallback.
        key = str(int(number)) if number.is_integer() else text
    return _LABEL_ALIAS_LOOKUP.get(key, "No Fit")  # default fallback

# Canonical class name -> integer id stored in the "numeric_label" column.
numeric_mapping = dict(zip(("No Fit", "Potential Fit", "Good Fit"), range(3)))

# Integer id -> class name, obtained by inverting numeric_mapping.
reverse_mapping = dict((class_id, class_name) for class_name, class_id in numeric_mapping.items())

# Integer id -> class name table used when reporting classifier predictions.
label_id2name = dict(enumerate(("No Fit", "Potential Fit", "Good Fit")))

def add_numeric_label(example):
    """
    Add the integer class id to one dataset row.

    The raw value under ``label_col`` is normalized with ``normalize_label``,
    translated through ``numeric_mapping`` and stored under the
    ``"numeric_label"`` key. The same row object is returned.
    """
    example["numeric_label"] = numeric_mapping[normalize_label(example[label_col])]
    return example

# Run the per-row label conversion over the loaded dataset.
dataset = dataset.map(add_numeric_label)

print("\nDataset with numeric_label added:")
print(dataset)

# Handles on the two splits that the later cells read from.
train_split, test_split = dataset["train"], dataset["test"]

print(f"Train size: {len(train_split)}, Test size: {len(test_split)}")

In [None]:
# ============================================================================
# STEP 2: LOAD FINE-TUNED CLASSIFIER FROM HUGGING FACE
# ============================================================================


# Cell 3: Load fine-tuned classifier from Hugging Face

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import numpy as np
import torch

print("\n=== STEP 2: Loading fine-tuned classifier from Hugging Face ===")

# CHANGE THIS to your actual model on Hugging Face, e.g. "ericachen/matchai-fit-classifier"
FINE_TUNED_MODEL_ID = "your-username/matchai-fit-classifier"  # <-- change this

# Fail fast with an actionable message when the placeholder above was never
# edited, instead of surfacing an opaque model-lookup error from the Hub.
if FINE_TUNED_MODEL_ID.startswith("your-username/"):
    raise ValueError(
        f"FINE_TUNED_MODEL_ID is still the placeholder '{FINE_TUNED_MODEL_ID}'. "
        "Set it to the Hugging Face repo that holds the classifier fine-tuned "
        "in Notebook 1, then re-run this cell."
    )

# 'device' should already be defined in Cell 1, but we guard just in case
try:
    device
except NameError:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

clf_tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_MODEL_ID)
clf_model = AutoModelForSequenceClassification.from_pretrained(FINE_TUNED_MODEL_ID)
clf_model.to(device)
clf_model.eval()  # inference only: switch off dropout and similar training behaviour

print("Loaded classifier model from:", FINE_TUNED_MODEL_ID)
print("Number of labels:", clf_model.config.num_labels)

# The scoring code further down reads the "Good Fit" probability at index 2,
# so a checkpoint with a different number of classes would silently mis-score
# or fail there. Surface the mismatch here, where the cause is obvious.
if clf_model.config.num_labels != 3:
    print(
        f"[WARN] Classifier reports {clf_model.config.num_labels} labels; "
        "this notebook expects 3 (No Fit / Potential Fit / Good Fit)."
    )

# label_id2name should have been defined in Cell 2:
# label_id2name = {0: "No Fit", 1: "Potential Fit", 2: "Good Fit"}

def predict_fit_label(jd_text: str, res_text: str):
    """
    Classify one resume / job-description pair with the fine-tuned model.

    Args:
        jd_text: job description text.
        res_text: resume text.

    Returns:
        dict with
          - "label_id":   index of the highest-probability class,
          - "label_name": its name from ``label_id2name`` (or "Class <id>"
                          when the id is not in that table),
          - "probs":      softmax probabilities as a plain list.
    """
    # Resume first, then the JD, joined by a literal " [SEP] "
    # (same concatenation used in training).
    pair_text = " [SEP] ".join((res_text, jd_text))
    encoded = clf_tokenizer(
        pair_text,
        return_tensors="pt",
        truncation=True,   # anything past 512 tokens is cut off
        max_length=512,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        logits = clf_model(**encoded).logits
        class_probs = softmax(logits, dim=-1).cpu().numpy()[0]

    best_id = int(np.argmax(class_probs))
    predicted_name = label_id2name.get(best_id, f"Class {best_id}")
    return {
        "label_id": best_id,
        "label_name": predicted_name,
        "probs": class_probs.tolist(),
    }

# Quick smoke test: classify the first test row and print the raw prediction dict.
example = test_split[0]
test_pred = predict_fit_label(jd_text=example[jd_col], res_text=example[resume_col])
print("Test prediction:", test_pred)

In [24]:
# ============================================================================
# STEP 3: SUMMARIZATION MODEL SELECTION
# ============================================================================


# Cell 4: Summarization Model Selection With Ranking

print("\n=== STEP 3: Summarization Model Selection With Ranking ===")

from transformers import pipeline
import time
import numpy as np

# Candidate summarizers: short internal name -> Hugging Face model id.
# Only lightweight models are loaded for this comparison.
summarizer_candidates = dict(
    t5_base="t5-base",
    distilbart="sshleifer/distilbart-cnn-12-6",
)

# Small evaluation sample: the JD and the resume of the first three training
# rows, interleaved as JD, resume, JD, resume, ...
sample_texts = [
    train_split[row_idx][text_col]
    for row_idx in range(3)
    for text_col in (jd_col, resume_col)
]

def keyword_score(original, summary):
    """
    Share of the original's long words that also occur in the summary.

    Both texts are split on whitespace and lower-cased; only words longer
    than five characters are considered. Returns 0 when the original
    contains no such word.
    """
    def long_words(text):
        return {token.lower() for token in text.split() if len(token) > 5}

    source_words = long_words(original)
    summary_words = long_words(summary)
    if len(source_words) == 0:
        return 0
    return len(source_words & summary_words) / len(source_words)

# One result dict per evaluated summarizer (plus one bookkeeping entry for the
# model that was excluded up front).
raw_results = []

print("\nRunning evaluation on candidate models...\n")

for name, model_name in summarizer_candidates.items():
    print(f"--- Testing {name} ({model_name}) ---")

    summ = pipeline("summarization", model=model_name, device=0 if torch.cuda.is_available() else -1)

    comp_ratios, speeds, kw_scores = [], [], []

    for text in sample_texts:
        t = (text or "")[:2000]  # truncate long texts for stability
        if not t.strip():
            # Nothing to summarize; this also keeps len(t) below from being 0.
            continue

        # NOTE(review): during the recorded run transformers logged that
        # `max_new_tokens` (=256) takes precedence over `max_length` (=150);
        # confirm the 150 cap is really applied with the installed version.
        start = time.time()
        summary = summ(t, max_length=150, min_length=40, do_sample=False)[0]["summary_text"]
        elapsed = time.time() - start

        comp = len(summary) / len(t)      # summary length relative to the input
        kw = keyword_score(t, summary)    # share of long input words kept

        comp_ratios.append(comp)
        speeds.append(elapsed)
        kw_scores.append(kw)

    if not comp_ratios:
        # Averaging an empty list would yield NaN and poison the ranking.
        print(f"  [SKIP] No usable sample text for {model_name}; not ranked.")
        continue

    raw_results.append({
        "name": name,
        "model": model_name,
        "avg_compression_ratio": float(np.mean(comp_ratios)),
        "avg_keyword_score": float(np.mean(kw_scores)),
        "avg_inference_time": float(np.mean(speeds)),
    })

# Add BART-large in the ranking table as excluded
raw_results.append({
    "name": "bart_large (excluded)",
    "model": "facebook/bart-large-cnn",
    "avg_compression_ratio": None,
    "avg_keyword_score": None,
    "avg_inference_time": 20.0,
    "note": "Excluded due to memory failures and >20s inference"
})

# Compute normalized scores for candidates only (entries with real measurements)
valid = [r for r in raw_results if r["avg_keyword_score"] is not None]

if not valid:
    raise RuntimeError("No summarization candidate produced results; cannot rank models.")

# Normalize metrics to a 0-1 scale relative to the best candidate
max_kw = max(r["avg_keyword_score"] for r in valid)
min_speed = min(r["avg_inference_time"] for r in valid)
max_comp = max(r["avg_compression_ratio"] for r in valid)

for r in valid:
    # Zero denominators are guarded the same way as in the similarity and
    # NER ranking cells, so a degenerate sample cannot abort the cell.
    r["keyword_norm"] = r["avg_keyword_score"] / max_kw if max_kw > 0 else 0.0
    # faster = higher score
    r["speed_norm"] = min_speed / r["avg_inference_time"] if r["avg_inference_time"] > 0 else 0.0
    r["compression_norm"] = r["avg_compression_ratio"] / max_comp if max_comp > 0 else 0.0

    # Composite score
    r["final_score"] = (
        0.4 * r["keyword_norm"] +
        0.3 * r["compression_norm"] +
        0.3 * r["speed_norm"]
    )

# Sort by final score (highest first)
ranked = sorted(valid, key=lambda x: x["final_score"], reverse=True)

print("\nüèÜ Summarization Model Ranking (Composite Score)")
print("------------------------------------------------------------")
# The model column is 40 wide (as in the other ranking tables) so that full
# Hub ids such as 'sshleifer/distilbart-cnn-12-6' no longer break alignment.
print(f"{'Rank':<5} {'Model':<40} {'Score':<10} {'Time(s)':<10}")
print("------------------------------------------------------------")

for i, r in enumerate(ranked, 1):
    print(f"{i:<5} {r['model']:<40} {r['final_score']:.4f}    {r['avg_inference_time']:.2f}")

print("\nNote: 'facebook/bart-large-cnn' excluded for OOM and >20s inference.")


=== STEP 3: Summarization Model Selection With Ranking ===

Running evaluation on candidate models...

--- Testing t5_base (t5-base) ---


Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_cla

--- Testing distilbart (sshleifer/distilbart-cnn-12-6) ---


Device set to use cpu



üèÜ Summarization Model Ranking (Composite Score)
------------------------------------------------------------
Rank  Model                     Score      Time(s)   
------------------------------------------------------------
1     sshleifer/distilbart-cnn-12-6 1.0000    11.31
2     t5-base                   0.9162    11.39

Note: 'facebook/bart-large-cnn' excluded for OOM and >20s inference.


In [25]:
# Cell 5: Use top-ranked summarizer + define summarize_text()


from transformers import pipeline as hf_pipeline

# Winner of the composite ranking from Cell 4 ('ranked' is sorted best-first).
BEST_SUMMARIZER = ranked[0]
FINAL_SUMMARIZER_MODEL = BEST_SUMMARIZER["model"]

print("\nSelected summarization model based on composite score:")
for caption, shown_value in (
    ("  Internal name :", BEST_SUMMARIZER["name"]),
    ("  HF model ID   :", FINAL_SUMMARIZER_MODEL),
    ("  Final score   :", f"{BEST_SUMMARIZER['final_score']:.4f}"),
    ("  Avg time (s)  :", f"{BEST_SUMMARIZER['avg_inference_time']:.2f}"),
):
    print(caption, shown_value)

# Pipeline for the chosen model: first CUDA device when available, else CPU (-1).
summarizer = hf_pipeline(
    task="summarization",
    model=FINAL_SUMMARIZER_MODEL,
    device=-1 if not torch.cuda.is_available() else 0,
)

def summarize_text(text: str, max_len: int = 150) -> str:
    """
    Safely summarize text using the selected model.

    Args:
        text: text to summarize. ``None``, non-string values and blank
            (empty or whitespace-only) strings yield ``""`` without calling
            the model.
        max_len: value passed to the pipeline as ``max_length``.

    Returns:
        The model's summary. If summarization raises, a warning is printed
        and the first 300 characters of the (truncated) input are returned
        instead, so callers always receive a string.
    """
    if not isinstance(text, str):
        return ""

    truncated_text = text[:2000]  # safety for very long resumes / JDs
    if not truncated_text.strip():
        # Blank input carries nothing to summarize; skip the model call.
        return ""

    # Keep the lower bound from exceeding a small caller-supplied max_len.
    min_len = min(40, max_len)

    try:
        result = summarizer(
            truncated_text,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
        )[0]["summary_text"]
        return result
    except Exception as e:
        # Deliberate best-effort: a failed summary must not abort scoring.
        print(f"[WARN] Summarization failed due to: {e}")
        # Fallback: return a truncated version of the original text
        return truncated_text[:300]



Selected summarization model based on composite score:
  Internal name : distilbart
  HF model ID   : sshleifer/distilbart-cnn-12-6
  Final score   : 1.0000
  Avg time (s)  : 11.31


Device set to use cpu


In [26]:
# ============================================================================
# Step 4: SEMANTIC SIMILARITY MODEL SELECTION
# ============================================================================


# Cell 6: Semantic similarity model selection with ranking

print("\n=== STEP 4: Semantic similarity model selection with ranking ===")

from torch.nn.functional import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
import time

# Candidate embedding models: short internal name -> Hugging Face model id.
embedding_candidates = dict(
    minilm="sentence-transformers/all-MiniLM-L6-v2",
    mpnet="sentence-transformers/all-mpnet-base-v2",
    multiqa="sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
)

# Collect up to 15 (JD, resume) pairs per class from the train split:
# numeric_label 2 -> Good Fit (positives), numeric_label 0 -> No Fit (negatives).
good_fit_pairs = []
no_fit_pairs = []
pairs_by_label = {2: good_fit_pairs, 0: no_fit_pairs}

for ex in train_split:
    bucket = pairs_by_label.get(ex["numeric_label"])
    if bucket is not None and len(bucket) < 15:
        bucket.append((ex[jd_col], ex[resume_col]))
    # Stop scanning as soon as both classes are full.
    if all(len(pairs) >= 15 for pairs in pairs_by_label.values()):
        break

print("Good Fit pairs:", len(good_fit_pairs))
print("No Fit pairs  :", len(no_fit_pairs))

def _pair_similarities(encoder, pairs):
    """Cosine similarity between the JD and resume embedding of each pair."""
    scores = []
    for jd_text, res_text in pairs:
        emb_jd = encoder.encode(jd_text, convert_to_tensor=True)
        emb_res = encoder.encode(res_text, convert_to_tensor=True)
        scores.append(cosine_similarity(emb_jd, emb_res, dim=0).item())
    return scores


# One result dict per embedding model.
sim_raw_results = []

for name, model_name in embedding_candidates.items():
    print(f"\n--- Testing embedding model: {name} ({model_name}) ---")
    model = SentenceTransformer(model_name)

    # A single timer spans both pair sets, so time_sec covers every encode
    # call made for this model (Good Fit pairs first, then No Fit pairs).
    start = time.time()
    sims_good = _pair_similarities(model, good_fit_pairs)
    sims_no = _pair_similarities(model, no_fit_pairs)
    elapsed = time.time() - start

    avg_good = float(np.mean(sims_good))
    avg_no = float(np.mean(sims_no))
    gap = avg_good - avg_no  # we want Good Fit >> No Fit

    sim_raw_results.append(dict(
        name=name,
        model=model_name,
        avg_good=avg_good,
        avg_no=avg_no,
        gap=gap,
        time_sec=elapsed,
    ))

print("\nRaw similarity evaluation results:")
for r in sim_raw_results:
    print(r)

# Every evaluated model takes part in the ranking.
valid_sim = sim_raw_results

# Reference values for normalization: widest gap and shortest run time.
max_gap = max(entry["gap"] for entry in valid_sim)
min_time = min(entry["time_sec"] for entry in valid_sim)

for r in valid_sim:
    # larger gap = better (0.0 when no model separates the classes)
    if max_gap > 0:
        r["gap_norm"] = r["gap"] / max_gap
    else:
        r["gap_norm"] = 0.0

    # faster = better
    if r["time_sec"] > 0:
        r["speed_norm"] = min_time / r["time_sec"]
    else:
        r["speed_norm"] = 0.0

    # composite score (gap weighted more heavily than speed)
    r["final_score"] = 0.7 * r["gap_norm"] + 0.3 * r["speed_norm"]

# Best composite score first.
sim_ranked = sorted(valid_sim, key=lambda entry: entry["final_score"], reverse=True)

print("\nüèÜ Similarity Model Ranking (Composite Score)")
print("------------------------------------------------------------")
print(f"{'Rank':<5} {'Model':<40} {'Score':<10} {'Gap':<10} {'Time(s)':<10}")
print("------------------------------------------------------------")
for i, r in enumerate(sim_ranked, start=1):
    print(f"{i:<5} {r['model']:<40} {r['final_score']:.4f}   {r['gap']:.4f}   {r['time_sec']:.2f}")


=== STEP 4: Semantic similarity model selection with ranking ===
Good Fit pairs: 15
No Fit pairs  : 15

--- Testing embedding model: minilm (sentence-transformers/all-MiniLM-L6-v2) ---

--- Testing embedding model: mpnet (sentence-transformers/all-mpnet-base-v2) ---

--- Testing embedding model: multiqa (sentence-transformers/multi-qa-MiniLM-L6-cos-v1) ---

Raw similarity evaluation results:
{'name': 'minilm', 'model': 'sentence-transformers/all-MiniLM-L6-v2', 'avg_good': 0.47122112711270653, 'avg_no': 0.33195153176784514, 'gap': 0.13926959534486139, 'time_sec': 8.818103313446045}
{'name': 'mpnet', 'model': 'sentence-transformers/all-mpnet-base-v2', 'avg_good': 0.5821873227755229, 'avg_no': 0.4211725036303202, 'gap': 0.16101481914520266, 'time_sec': 85.70857048034668}
{'name': 'multiqa', 'model': 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1', 'avg_good': 0.5364052474498748, 'avg_no': 0.3831758052110672, 'gap': 0.15322944223880763, 'time_sec': 15.846481323242188}

üèÜ Similarity M

In [27]:
# Cell 7: Use best similarity model + define compute_similarity()


# Winner of the composite ranking from Cell 6 ('sim_ranked' is sorted best-first).
BEST_SIM = sim_ranked[0]
FINAL_EMBEDDING_MODEL = BEST_SIM["model"]

print("\nSelected similarity model based on composite score:")
for caption, shown_value in (
    ("  Internal name :", BEST_SIM["name"]),
    ("  HF model ID   :", FINAL_EMBEDDING_MODEL),
    ("  Final score   :", f"{BEST_SIM['final_score']:.4f}"),
    ("  Gap (Good-No) :", f"{BEST_SIM['gap']:.4f}"),
    ("  Time (s)      :", f"{BEST_SIM['time_sec']:.2f}"),
):
    print(caption, shown_value)

# Embedding model instance used by compute_similarity().
sim_model = SentenceTransformer(FINAL_EMBEDDING_MODEL)

def compute_similarity(text1: str, text2: str) -> float:
    """
    Cosine similarity of two texts under the selected embedding model.

    Each text is encoded separately with ``sim_model`` and the similarity of
    the two resulting tensors is returned as a Python float.
    """
    first_emb, second_emb = (
        sim_model.encode(piece, convert_to_tensor=True) for piece in (text1, text2)
    )
    return float(cosine_similarity(first_emb, second_emb, dim=0).item())


Selected similarity model based on composite score:
  Internal name : minilm
  HF model ID   : sentence-transformers/all-MiniLM-L6-v2
  Final score   : 0.9055
  Gap (Good-No) : 0.1393
  Time (s)      : 8.82


In [28]:
# ============================================================================
# STEP 5: NER MODEL SELECTION
# ============================================================================


# Cell 8: NER model selection with simple heuristic ranking

print("\n=== STEP 5: NER model selection with heuristic ranking ===")

from transformers import pipeline as hf_pipeline
import time

# Candidate NER models: short internal name -> Hugging Face model id.
# (The invalid deepset/roberta-base-medium-ner entry was removed from this list.)
ner_candidates = dict(
    bert_ner="dslim/bert-base-NER",                      # English, widely used
    xlm_ner="Davlan/xlm-roberta-base-ner-hrl",           # multilingual high-resource
    multi_ner="Babelscape/wikineural-multilingual-ner",  # multilingual NER
)

# Resume text of the first rows of the train split (at most five) for the
# heuristic comparison.
sample_resumes = [
    row[resume_col]
    for row in train_split.select(range(min(5, len(train_split))))
]

# One result dict per NER model that could be loaded.
ner_raw_results = []

for name, model_name in ner_candidates.items():
    print(f"\n--- Testing NER model: {name} ({model_name}) ---")
    try:
        # aggregation_strategy="simple" is the documented replacement for the
        # deprecated grouped_entities=True and yields the same grouped output.
        # The device is passed explicitly, as for the summarization pipelines.
        ner_pipe = hf_pipeline(
            "ner",
            model=model_name,
            aggregation_strategy="simple",
            device=0 if torch.cuda.is_available() else -1,
        )
    except Exception as e:
        # A model that cannot be loaded is skipped rather than aborting the comparison.
        print(f"  [SKIP] Failed to load {model_name}: {e}")
        continue

    total_org, total_entities, total_time = 0, 0, 0.0

    for res in sample_resumes:
        text = res[:1000]  # truncate for speed
        start = time.time()
        ents = ner_pipe(text)
        elapsed = time.time() - start
        total_time += elapsed

        total_entities += len(ents)
        total_org += sum(1 for e in ents if e.get("entity_group") == "ORG")

    avg_org = total_org / len(sample_resumes)
    avg_ents = total_entities / len(sample_resumes)
    avg_time = total_time / len(sample_resumes)

    ner_raw_results.append({
        "name": name,
        "model": model_name,
        "avg_org": avg_org,          # how many ORG entities we detect on average
        "avg_entities": avg_ents,    # total entities
        "avg_time": avg_time,
    })
    print(f"  Avg ORG entities: {avg_org:.2f}, Avg total entities: {avg_ents:.2f}, Avg time: {avg_time:.2f}s")

if not ner_raw_results:
    raise RuntimeError("All NER models failed to load. Please check internet or model IDs.")

# Normalize ORG count and time -> composite score
valid_ner = ner_raw_results

max_org = max(r["avg_org"] for r in valid_ner)
min_time_ner = min(r["avg_time"] for r in valid_ner)

for r in valid_ner:
    r["org_norm"] = r["avg_org"] / max_org if max_org > 0 else 0.0
    r["speed_norm"] = min_time_ner / r["avg_time"] if r["avg_time"] > 0 else 0.0

    # composite: more ORG, faster speed
    r["final_score"] = 0.7 * r["org_norm"] + 0.3 * r["speed_norm"]

# Rank NER models (best composite score first)
ner_ranked = sorted(valid_ner, key=lambda x: x["final_score"], reverse=True)

print("\nüèÜ NER Model Ranking (Composite Score)")
print("------------------------------------------------------------")
print(f"{'Rank':<5} {'Model':<40} {'Score':<10} {'ORG':<10} {'Time(s)':<10}")
print("------------------------------------------------------------")
for i, r in enumerate(ner_ranked, 1):
    print(f"{i:<5} {r['model']:<40} {r['final_score']:.4f}   {r['avg_org']:.2f}   {r['avg_time']:.2f}")




=== STEP 5: NER model selection with heuristic ranking ===

--- Testing NER model: bert_ner (dslim/bert-base-NER) ---


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


  Avg ORG entities: 3.00, Avg total entities: 7.20, Avg time: 1.59s

--- Testing NER model: xlm_ner (Davlan/xlm-roberta-base-ner-hrl) ---


Device set to use cpu


  Avg ORG entities: 1.80, Avg total entities: 5.40, Avg time: 2.26s

--- Testing NER model: multi_ner (Babelscape/wikineural-multilingual-ner) ---


Device set to use cpu


  Avg ORG entities: 3.60, Avg total entities: 14.60, Avg time: 2.12s

üèÜ NER Model Ranking (Composite Score)
------------------------------------------------------------
Rank  Model                                    Score      ORG        Time(s)   
------------------------------------------------------------
1     Babelscape/wikineural-multilingual-ner   0.9249   3.60   2.12
2     dslim/bert-base-NER                      0.8833   3.00   1.59
3     Davlan/xlm-roberta-base-ner-hrl          0.5611   1.80   2.26


In [29]:
# Cell 9: Use best NER model + define extract_entities()


# Winner of the composite ranking from Cell 8 ('ner_ranked' is sorted best-first).
BEST_NER = ner_ranked[0]
FINAL_NER_MODEL = BEST_NER["model"]

print("\nSelected NER model based on composite score:")
print("  Internal name :", BEST_NER["name"])
print("  HF model ID   :", FINAL_NER_MODEL)
print("  Final score   :", f"{BEST_NER['final_score']:.4f}")
print("  Avg ORG count :", f"{BEST_NER['avg_org']:.2f}")
print("  Avg time (s)  :", f"{BEST_NER['avg_time']:.2f}")

# aggregation_strategy="simple" is the documented replacement for the
# deprecated grouped_entities=True and yields the same grouped output.
# The device is passed explicitly, as for the summarization pipelines.
ner = hf_pipeline(
    "ner",
    model=FINAL_NER_MODEL,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1,
)

def extract_entities(text: str):
    """
    Collect the named entities found in the first 1000 characters of a text.

    Returns:
        dict with the keys "ORG", "PER" and "LOC", each mapping to the list of
        entity strings reported under that group (whitespace-trimmed, empty
        strings dropped, duplicates kept). Entities of any other group are
        ignored. Empty or non-string input yields three empty lists.
    """
    grouped = {"ORG": [], "PER": [], "LOC": []}
    if not text or not isinstance(text, str):
        return grouped

    for entity in ner(text[:1000]):
        group = entity.get("entity_group")
        surface = entity.get("word", "").strip()
        if group in grouped and surface:
            grouped[group].append(surface)
    return grouped


Selected NER model based on composite score:
  Internal name : multi_ner
  HF model ID   : Babelscape/wikineural-multilingual-ner
  Final score   : 0.9249
  Avg ORG count : 3.60
  Avg time (s)  : 2.12


Device set to use cpu


In [30]:
# ============================================================================
# STEP 6: PROCESSING HELPERS (JD & RESUME)
# ============================================================================


# Cell 10: Processing helpers for JD and resume

print("\n=== STEP 6: Defining JD & resume processing helpers ===")

def process_job_description(jd_text: str):
    """
    Summarize a job description and derive its keyword list.

    Returns:
        dict with "raw" (the input text), "summary" (output of
        ``summarize_text``) and "keywords" (the unique lower-cased words of
        the summary that are longer than four characters, in no particular
        order).
    """
    summary = summarize_text(jd_text)

    # simple keyword extraction: unique words longer than 4 chars
    unique_keywords = set()
    for word in summary.split():
        if len(word) > 4:
            unique_keywords.add(word.lower())

    return dict(raw=jd_text, summary=summary, keywords=list(unique_keywords))

def process_resume(res_text: str):
    """
    Summarize a resume and extract its named entities.

    Returns:
        dict with "raw" (the input text), "summary" (output of
        ``summarize_text``) and "entities" (output of ``extract_entities``
        on the raw text).
    """
    resume_info = {"raw": res_text}
    resume_info["summary"] = summarize_text(res_text)
    resume_info["entities"] = extract_entities(res_text)
    return resume_info

print("Helpers process_job_description() and process_resume() are ready.")


=== STEP 6: Defining JD & resume processing helpers ===
Helpers process_job_description() and process_resume() are ready.


In [None]:
# ============================================================================
# STEP 7: FINAL SCORING FUNCTION COMBINING 4 MODELS
# ============================================================================


# Cell 11: Final MatchAI scoring function (4-model pipeline)

print("\n=== STEP 7: Building final MatchAI scoring function ===")

# Default contribution of each signal to the final score
# (later can be HR-adjustable in Streamlit).
WEIGHTS = dict(
    classifier=0.5,   # probability of Good Fit
    similarity=0.3,   # semantic similarity on summaries
    keywords=0.2,     # JD keyword coverage in resume summary
)

def _normalize_keyword_token(token) -> str:
    """Lower-case a token and trim the sentence punctuation wrapped around it."""
    # Only opening brackets/quotes are trimmed on the left and sentence
    # punctuation on the right, so terms such as "c++", "c#" or ".net" keep
    # the symbols that belong to them.
    return str(token).lower().lstrip("([{\"'").rstrip(".,;:!?)]}\"'")


def keyword_match_score(jd_keywords, resume_summary: str) -> float:
    """
    Simple keyword coverage score:
    proportion of JD keywords that appear in the resume summary.

    Keywords and resume words are compared case-insensitively and without
    surrounding punctuation, so the keyword "python," matches the resume word
    "Python" (raw whitespace tokens would never match in that case).
    Keywords that become identical after normalization count once.

    Args:
        jd_keywords: iterable of keyword strings; may be empty or None.
        resume_summary: text whose whitespace-separated words are searched.

    Returns:
        A value in [0, 1]; 0.0 when there are no usable keywords.
    """
    if not jd_keywords:
        return 0.0

    wanted = {_normalize_keyword_token(kw) for kw in jd_keywords}
    wanted.discard("")  # tokens that were punctuation only
    if not wanted:
        return 0.0

    resume_words = {_normalize_keyword_token(w) for w in resume_summary.split()}
    return len(wanted & resume_words) / len(wanted)

def evaluate_candidate(jd_text: str, res_text: str, weights: dict = None):
    """
    Full evaluation pipeline:
    - Summarize JD and resume (summarization model selected in STEP 3)
    - Extract entities from resume (NER model selected in STEP 5)
    - Compute semantic similarity between summaries (embedding model from STEP 4)
    - Run fine-tuned classifier (Notebook 1 model, loaded in STEP 2)
    - Compute keyword match between JD summary keywords and resume summary
    - Combine into final weighted suitability score

    Args:
        jd_text: job description text.
        res_text: resume text.
        weights: optional dict with the keys "classifier", "similarity" and
            "keywords"; the module-level ``WEIGHTS`` is used when omitted.

    Returns:
        dict with the processed JD and resume ("jd", "resume"), the raw and
        normalized similarity ("similarity_raw", "similarity"), the
        "keyword_score", the classifier output ("fit"), "prob_good_fit" and
        the combined "final_score".

    Raises:
        IndexError: if the classifier returns fewer than three class
            probabilities, so there is no "Good Fit" entry at index 2.
    """
    active_weights = WEIGHTS if weights is None else weights

    # Process JD and resume
    jd = process_job_description(jd_text)
    res = process_resume(res_text)

    # Similarity between summaries
    sim_raw = compute_similarity(jd["summary"], res["summary"])
    # Map cosine similarity from [-1, 1] onto [0, 1]; clamping both ends keeps
    # floating-point overshoot from pushing the score outside that range.
    sim_norm = min(max((sim_raw + 1) / 2, 0.0), 1.0)

    # Keyword match
    kw_score = keyword_match_score(jd["keywords"], res["summary"])

    # Classifier prediction (uses fine-tuned model)
    fit = predict_fit_label(jd_text, res_text)
    probs = fit["probs"]
    if len(probs) < 3:
        raise IndexError(
            f"Classifier returned {len(probs)} class probabilities; "
            "3 are required to read the Good Fit probability (label id 2)."
        )
    prob_good_fit = probs[2]  # probability of Good Fit (label id 2)

    # Final weighted score
    final_score = (
        active_weights["classifier"] * prob_good_fit +
        active_weights["similarity"] * sim_norm +
        active_weights["keywords"] * kw_score
    )

    return {
        "jd": jd,
        "resume": res,
        "similarity_raw": sim_raw,
        "similarity": sim_norm,
        "keyword_score": kw_score,
        "fit": fit,
        "prob_good_fit": prob_good_fit,
        "final_score": float(final_score),
    }

print("evaluate_candidate(jd_text, res_text) is ready.")


# ============================================================================
# EXTRA: Generate human-readable candidate highlights
# ============================================================================

def generate_candidate_highlights(result: dict, thresholds: dict = None) -> dict:
    """
    Generate a short, recruiter-friendly highlight summary for one candidate,
    based on the evaluation result from evaluate_candidate().

    Input:
        result:   dict returned by evaluate_candidate(jd_text, res_text).
                  Missing keys fall back to neutral defaults (0.0 scores,
                  "Unknown" label, no entities).
        thresholds (optional): overrides for individual default thresholds;
                  the passed dict is not modified.

    Output:
        {
            "highlights": [list of bullet strings],
            "summary": single_string_summary
        }
        "summary" is the only highlight when there is one, otherwise the
        first two highlights joined by a space.
    """
    # Default thresholds (tunable or later exposed in UI)
    t = {
        "final_score_strong": 0.80,
        "final_score_good":  0.70,
        "prob_good_fit_high": 0.80,
        "similarity_high":    0.70,
        "keyword_high":       0.60,
        "org_count_high":     3,
    }
    if thresholds is not None:
        t.update(thresholds)

    highlights = []

    final_score = result.get("final_score", 0.0)
    prob_good_fit = result.get("prob_good_fit", 0.0)
    similarity = result.get("similarity", 0.0)
    keyword_score = result.get("keyword_score", 0.0)
    # `or {}` also covers keys that are present but set to None.
    fit_label = (result.get("fit") or {}).get("label_name", "Unknown")
    entities = (result.get("resume") or {}).get("entities") or {}
    # De-duplicate before counting (order preserved): the same organisation
    # mentioned several times is one employer, not a "rich" background.
    unique_orgs = list(dict.fromkeys(entities.get("ORG") or []))

    # 1. Overall fit strength (final score + classifier)
    if final_score >= t["final_score_strong"] and prob_good_fit >= t["prob_good_fit_high"]:
        highlights.append(
            f"Very strong overall match (final score {final_score:.2f}, Good Fit probability {prob_good_fit:.2f})."
        )
    elif final_score >= t["final_score_good"]:
        highlights.append(
            f"Solid match (final score {final_score:.2f}) with classifier label: {fit_label}."
        )
    elif fit_label == "Good Fit":
        highlights.append(
            f"Classifier marks this candidate as '{fit_label}', although overall score is moderate ({final_score:.2f})."
        )

    # 2. JD-resume alignment via semantic similarity
    if similarity >= t["similarity_high"]:
        highlights.append(
            f"Resume narrative is highly aligned with the job description (semantic similarity {similarity:.2f})."
        )

    # 3. Keyword coverage from JD in resume summary
    if keyword_score >= t["keyword_high"]:
        highlights.append(
            f"Strong coverage of key requirements mentioned in the job description (keyword match {keyword_score:.2f})."
        )

    # 4. Company background (distinct ORG entities)
    if len(unique_orgs) >= t["org_count_high"]:
        top_orgs = ", ".join(unique_orgs[:3])
        highlights.append(
            f"Rich company background, including experience with: {top_orgs}."
        )

    # 5. If nothing stands out, provide a neutral note
    if not highlights:
        highlights.append(
            "No particular standout factors detected; scores are moderate across fit, similarity, and keyword coverage."
        )

    # Build a single-line summary (for UI or report)
    if len(highlights) == 1:
        summary = highlights[0]
    else:
        # Take the first 1-2 points for a concise summary
        summary = " ".join(highlights[:2])

    return {
        "highlights": highlights,
        "summary": summary
    }

In [31]:
# ============================================================================
# STEP 8: SANITY TEST ON A FEW CANDIDATES
# ============================================================================


# Cell 12: Sanity test on a few candidates

print("\n=== STEP 8: Sanity test on a few candidates ===")

# The demo needs test[0] for the JD plus test[1..3] for the resumes.
if len(test_split) < 4:
    print("Not enough test samples for demo.")
else:
    jd_example = test_split[0][jd_col]
    print("\nUsing JD from test[0] for demo.")

    candidate_resumes = [test_split[row_idx][resume_col] for row_idx in (1, 2, 3)]

    for idx, res_text in enumerate(candidate_resumes, start=1):
        # Score the resume against the demo JD, then turn the scores into bullets.
        result = evaluate_candidate(jd_example, res_text)
        highlights = generate_candidate_highlights(result)

        print(f"\n--- Candidate {idx} ---")
        report_rows = (
            ("Fit label      :", result["fit"]["label_name"]),
            ("Prob Good Fit  :", f"{result['prob_good_fit']:.3f}"),
            ("Final score    :", f"{result['final_score']:.3f}"),
            ("Similarity     :", f"{result['similarity']:.3f}"),
            ("Keyword score  :", f"{result['keyword_score']:.3f}"),
            ("ORG entities   :", result["resume"]["entities"]["ORG"][:3]),
            ("Highlight summary:", highlights["summary"]),
        )
        for caption, shown_value in report_rows:
            print(caption, shown_value)

        print("Full highlights:")
        for h in highlights["highlights"]:
            print(" -", h)


=== STEP 8: Sanity test on a few candidates ===

Using JD from test[0] for demo.


NameError: name 'predict_fit_label' is not defined

In [None]:
# ============================================================================
# STEP 9: SAVE MATCHAI CONFIG FOR STREAMLIT / DEPLOYMENT
# ============================================================================


# Cell 13: Save MatchAI configuration for reuse

print("\n=== STEP 9: Saving MatchAI configuration ===")

# Model choices and scoring weights selected in this notebook, gathered for reuse.
matchai_config = dict(
    fine_tuned_model_id=FINE_TUNED_MODEL_ID,
    summarization_model=FINAL_SUMMARIZER_MODEL,
    embedding_model=FINAL_EMBEDDING_MODEL,
    ner_model=FINAL_NER_MODEL,
    weights=WEIGHTS,
    # JSON object keys are always strings, so the integer ids of this table
    # are written as "0", "1", "2".
    label_id2name=label_id2name,
)

# Serialize to text, then write it to the working directory.
config_text = json.dumps(matchai_config, indent=2)
with open("matchai_config.json", "w") as f:
    f.write(config_text)

print("Saved matchai_config.json with model choices and weights.")
print("\n=== Notebook 2 completed: 4-model MatchAI pipeline is ready. ===")