In [1]:
# ==============================
# NOTEBOOK 3: INFERENCE PIPELINE
# ==============================

# CELL 1: Install & imports
!pip install -q transformers sentence-transformers datasets

import torch
import numpy as np
import json

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline as hf_pipeline
)
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from torch.nn.functional import softmax, cosine_similarity

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [2]:
# CELL 2: Load dataset & add numeric labels (for testing pipeline)

dataset = load_dataset("cnamuangtoun/resume-job-description-fit")

resume_col = "resume_text"
jd_col = "job_description_text"
label_col = "label"

label_variations = {
    'no fit': ['no fit','no_fit','no-fit','0','no','not fit','unfit'],
    'potential fit': ['potential fit','potential_fit','potential-fit','1','potential','maybe','partial'],
    'good fit': ['good fit','good_fit','good-fit','2','good','excellent','perfect','best'],
}

def normalize_label(raw):
    s = str(raw).lower().strip()
    if any(v in s for v in label_variations['no fit']):
        return "No Fit"
    if any(v in s for v in label_variations['potential fit']):
        return "Potential Fit"
    if any(v in s for v in label_variations['good fit']):
        return "Good Fit"
    return "No Fit"

numeric_mapping = {
    "No Fit": 0,
    "Potential Fit": 1,
    "Good Fit": 2
}
reverse_mapping = {v: k for k, v in numeric_mapping.items()}
label_id2name = {
    0: "No Fit",
    1: "Potential Fit",
    2: "Good Fit"
}

def add_numeric_label(example):
    norm = normalize_label(example[label_col])
    example["numeric_label"] = numeric_mapping[norm]
    return example

dataset = dataset.map(add_numeric_label)

train_split = dataset["train"]
test_split = dataset["test"]

print("Train size:", len(train_split))
print("Test size :", len(test_split))

train.csv:   0%|          | 0.00/53.4M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/15.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1759 [00:00<?, ? examples/s]

Map:   0%|          | 0/6241 [00:00<?, ? examples/s]

Map:   0%|          | 0/1759 [00:00<?, ? examples/s]

Train size: 6241
Test size : 1759


In [6]:
# CELL 3: Load matchai_config.json and models

print("\n=== Loading MatchAI configuration ===")

with open("matchai_config.json", "r") as f:
    config = json.load(f)

FINE_TUNED_MODEL_ID    = config["fine_tuned_model_id"]
FINAL_SUMMARIZER_MODEL = config["summarization_model"]
FINAL_EMBEDDING_MODEL  = config["embedding_model"]
FINAL_NER_MODEL        = config["ner_model"]
WEIGHTS                = config["weights"]
label_id2name          = {int(k): v for k, v in config["label_id2name"].items()}

print("Fine-tuned model ID:", FINE_TUNED_MODEL_ID)
print("Summarisation model:", FINAL_SUMMARIZER_MODEL)
print("Embedding model    :", FINAL_EMBEDDING_MODEL)
print("NER model          :", FINAL_NER_MODEL)
print("Weights            :", WEIGHTS)


=== Loading MatchAI configuration ===
Fine-tuned model ID: distilbert-base-uncased-finetuned-sst-2-english
Summarisation model: sshleifer/distilbart-cnn-12-6
Embedding model    : sentence-transformers/all-MiniLM-L6-v2
NER model          : Babelscape/wikineural-multilingual-ner
Weights            : {'classifier': 0.5, 'similarity': 0.3, 'keywords': 0.2}


In [7]:
# CELL 4: Load summariser & helper

print("\n=== Loading summarisation model ===")
summarizer = hf_pipeline(
    "summarization",
    model=FINAL_SUMMARIZER_MODEL,
    device=0 if torch.cuda.is_available() else -1,
)

def summarize_text(text: str, max_len: int = 150) -> str:
    """
    Summarise text safely:
    - Handles empty / None
    - Truncates very long input
    - Fallback to truncated text if summarisation fails
    """
    if not text or not isinstance(text, str):
        return ""
    truncated_text = text[:2000]
    try:
        result = summarizer(
            truncated_text,
            max_length=max_len,
            min_length=40,
            do_sample=False,
        )[0]["summary_text"]
        return result
    except Exception as e:
        print(f"[WARN] Summarisation failed: {e}")
        return truncated_text[:300]


=== Loading summarisation model ===


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


In [8]:
# CELL 5: Load embedding model & similarity helper

print("\n=== Loading embedding model ===")
sim_model = SentenceTransformer(FINAL_EMBEDDING_MODEL)

def compute_similarity(text1: str, text2: str) -> float:
    """
    Compute cosine similarity between two texts.
    """
    emb1 = sim_model.encode(text1, convert_to_tensor=True)
    emb2 = sim_model.encode(text2, convert_to_tensor=True)
    sim = cosine_similarity(emb1, emb2, dim=0).item()
    return float(sim)


=== Loading embedding model ===


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# CELL 6: Load NER model & helper

print("\n=== Loading NER model ===")
ner = hf_pipeline(
    "ner",
    model=FINAL_NER_MODEL,
    grouped_entities=True
)

def extract_entities(text: str):
    """
    Extract ORG / PER / LOC from resume text.
    """
    if not text or not isinstance(text, str):
        return {"ORG": [], "PER": [], "LOC": []}

    ents = ner(text[:1000])
    result = {"ORG": [], "PER": [], "LOC": []}
    for e in ents:
        label = e.get("entity_group")
        word = e.get("word", "").strip()
        if label in result and word:
            result[label].append(word)
    return result


=== Loading NER model ===


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


In [10]:
# CELL 7: Load fine-tuned classifier & helper

print("\n=== Loading fine-tuned classifier ===")
clf_tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_MODEL_ID)
clf_model = AutoModelForSequenceClassification.from_pretrained(FINE_TUNED_MODEL_ID)
clf_model.to(device)
clf_model.eval()

print("Classifier loaded. Num labels:", clf_model.config.num_labels)

def predict_fit_label(jd_text: str, res_text: str):
    """
    Use the fine-tuned classifier to predict No Fit / Potential Fit / Good Fit.
    """
    combined = res_text + " [SEP] " + jd_text
    inputs = clf_tokenizer(
        combined,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = clf_model(**inputs)
        probs = softmax(outputs.logits, dim=-1).cpu().numpy()[0]

    pred_id = int(np.argmax(probs))
    return {
        "label_id": pred_id,
        "label_name": label_id2name.get(pred_id, f"Class {pred_id}"),
        "probs": probs.tolist()
    }


=== Loading fine-tuned classifier ===


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Classifier loaded. Num labels: 2


In [11]:
# CELL 8: JD & resume processing helpers

print("\n=== Defining JD & resume processing helpers ===")

def process_job_description(jd_text: str):
    summary = summarize_text(jd_text)
    keywords = list({w.lower() for w in summary.split() if len(w) > 4})
    return {
        "raw": jd_text,
        "summary": summary,
        "keywords": keywords,
    }

def process_resume(res_text: str):
    summary = summarize_text(res_text)
    entities = extract_entities(res_text)
    return {
        "raw": res_text,
        "summary": summary,
        "entities": entities,
    }

print("process_job_description() and process_resume() ready.")


=== Defining JD & resume processing helpers ===
process_job_description() and process_resume() ready.


In [15]:
# CELL 9: Scoring logic & candidate highlights

# ============================================================================
# SCORING ENGINE: weights, helpers, highlights, final evaluation
# ============================================================================

import numpy as np

# Default weights (can later be exposed in Streamlit UI)
WEIGHTS = {
    "classifier": 0.5,   # contribution from fit classifier
    "similarity": 0.3,   # contribution from semantic similarity
    "keywords":  0.2,    # contribution from keyword coverage
}


def get_good_fit_probability(fit_dict):
    """
    Safely derive a 'Good Fit' probability from the classifier output.

    It works for:
    - 3-class model (No Fit / Potential Fit / Good Fit)
    - 2-class temporary model (e.g. SST-2 sentiment)
    - any other num_labels as a fallback

    Logic:
    1. If label_id2name contains a class whose name includes 'good':
       use that index if it exists in probs.
    2. If there are 2 classes, use the last one as 'fit' (e.g. POSITIVE).
    3. Otherwise, just take the max probability.
    """
    probs = np.array(fit_dict["probs"], dtype=float)
    n = len(probs)

    # Try to find “Good Fit” explicitly in label names
    good_idx = None
    for idx, name in label_id2name.items():
        if "good" in str(name).lower():
            good_idx = idx
            break

    # Case A: 3-class model with explicit Good Fit label
    if good_idx is not None and good_idx < n:
        return float(probs[good_idx])

    # Case B: 2-class model (e.g. SST-2) – use last index as “fit / positive”
    if n == 2:
        return float(probs[1])

    # Case C: generic fallback – strongest class = best fit
    return float(probs[np.argmax(probs)])


def keyword_match_score(jd_keywords, resume_summary: str) -> float:
    """
    Compute a simple keyword coverage score:
    fraction of JD keywords that appear in the resume summary.
    """
    resume_words = set(w.lower() for w in resume_summary.split())
    if not jd_keywords:
        return 0.0
    hits = sum(1 for kw in jd_keywords if kw in resume_words)
    return hits / len(jd_keywords)


def process_job_description(jd_text: str):
    """
    JD processing pipeline:
    - summarise JD text
    - extract naive keywords from the summary
    """
    summary = summarize_text(jd_text)
    keywords = list({
        w.lower()
        for w in summary.split()
        if len(w) > 4
    })
    return {
        "raw": jd_text,
        "summary": summary,
        "keywords": keywords,
    }


def process_resume(res_text: str):
    """
    Resume processing pipeline:
    - summarise resume text
    - extract entities (ORG, PER, LOC, etc.) via NER
    """
    summary = summarize_text(res_text)
    entities = extract_entities(res_text)
    return {
        "raw": res_text,
        "summary": summary,
        "entities": entities,
    }


def generate_candidate_highlights(result_dict):
    """
    Generate 2–3 concise bullet highlights based on the evaluation result.
    These are for HR to quickly understand why a candidate looks strong.
    """
    highlights = []

    fit_label = result_dict["fit"]["label_name"]
    prob_good = result_dict["prob_good_fit"]
    sim = result_dict["similarity"]
    kw = result_dict["keyword_score"]
    ents = result_dict["resume"].get("entities", {})

    # 1. Fit / classifier-based highlight
    if prob_good >= 0.75:
        highlights.append(
            f"Classifier indicates a strong overall fit (Good Fit score ≈ {prob_good:.0%})."
        )
    elif prob_good >= 0.5:
        highlights.append(
            f"Classifier suggests a moderate fit with potential (Good Fit score ≈ {prob_good:.0%})."
        )
    else:
        highlights.append(
            f"Classifier flags this profile as weaker for this JD (Good Fit score ≈ {prob_good:.0%})."
        )

    # 2. Similarity / keyword coverage highlight
    if sim >= 0.75 and kw >= 0.6:
        highlights.append(
            "Resume content closely matches the JD requirements both semantically and by key terms."
        )
    elif sim >= 0.6:
        highlights.append(
            "Resume is broadly aligned with the JD, though some specific requirements may be missing."
        )
    else:
        highlights.append(
            "Semantic overlap with the JD is limited, indicating a more peripheral match."
        )

    # 3. Entity-based highlight (e.g. companies / locations)
    orgs = ents.get("ORG", []) if isinstance(ents, dict) else []
    if orgs:
        unique_orgs = list(dict.fromkeys(orgs))  # preserve order, remove duplicates
        top_orgs = ", ".join(unique_orgs[:3])
        highlights.append(
            f"Experience includes organisations such as: {top_orgs}."
        )

    # Keep it short
    return highlights[:3]


def evaluate_candidate(jd_text: str, res_text: str):
    """
    Full 4-model evaluation pipeline:
    - Summarise JD and resume (summarisation model)
    - Extract entities from resume (NER model)
    - Compute semantic similarity between JD and resume summaries (embedding model)
    - Run fine-tuned classifier to obtain fit probabilities
    - Compute keyword coverage between JD keywords and resume summary
    - Combine into a final weighted suitability score
    - Generate short human-readable highlights
    """
    # 1. Process text with summariser + NER
    jd = process_job_description(jd_text)
    res = process_resume(res_text)

    # 2. Semantic similarity between summaries
    sim_raw = compute_similarity(jd["summary"], res["summary"])
    # normalise from [-1, 1] to [0, 1] (defensive)
    sim_norm = (sim_raw + 1) / 2 if sim_raw < 1 else min(sim_raw, 1.0)

    # 3. Keyword coverage
    kw_score = keyword_match_score(jd["keywords"], res["summary"])

    # 4. Classifier prediction (works with 2-class now, 3-class later)
    fit = predict_fit_label(jd_text, res_text)
    prob_good_fit = get_good_fit_probability(fit)

    # 5. Weighted final score
    final_score = (
        WEIGHTS["classifier"] * prob_good_fit +
        WEIGHTS["similarity"] * sim_norm +
        WEIGHTS["keywords"] * kw_score
    )

    # 6. Assemble result object
    result = {
        "jd": jd,
        "resume": res,
        "similarity_raw": sim_raw,
        "similarity": sim_norm,
        "keyword_score": kw_score,
        "fit": fit,
        "prob_good_fit": float(prob_good_fit),
        "final_score": float(final_score),
    }

    # 7. Add highlights
    result["highlights"] = generate_candidate_highlights(result)

    return result

print("Scoring engine ready: evaluate_candidate(jd_text, res_text)")

Scoring engine ready: evaluate_candidate(jd_text, res_text)


In [16]:
# CELL 10: Sanity test on a few candidates

print("\n=== Sanity test: evaluate a few candidates ===")

if len(test_split) >= 4:
    jd_example = test_split[0][jd_col]
    print("\nUsing JD from test[0] for demo.")

    candidate_resumes = [test_split[i][resume_col] for i in range(1, 4)]

    for idx, res_text in enumerate(candidate_resumes, start=1):
        result = evaluate_candidate(jd_example, res_text)
        print(f"\n--- Candidate {idx} ---")
        print("Fit label      :", result["fit"]["label_name"])
        print("Prob Good Fit  :", f"{result['prob_good_fit']:.3f}")
        print("Final score    :", f"{result['final_score']:.3f}")
        print("Similarity     :", f"{result['similarity']:.3f}")
        print("Keyword score  :", f"{result['keyword_score']:.3f}")
        print("Highlights     :")
        for h in result["highlights"]:
            print("  -", h)
else:
    print("Not enough test samples for demo.")


=== Sanity test: evaluate a few candidates ===

Using JD from test[0] for demo.

--- Candidate 1 ---
Fit label      : Potential Fit
Prob Good Fit  : 0.958
Final score    : 0.658
Similarity     : 0.596
Keyword score  : 0.000
Highlights     :
  - Classifier indicates a strong overall fit (Good Fit score ≈ 96%).
  - Semantic overlap with the JD is limited, indicating a more peripheral match.
  - Experience includes organisations such as: ##ntSenior Analyst Data Quality & GovernanceMilliman, ##Healthcare Data AnalystGeneral Motors.

--- Candidate 2 ---
Fit label      : Potential Fit
Prob Good Fit  : 0.665
Final score    : 0.488
Similarity     : 0.518
Keyword score  : 0.000
Highlights     :
  - Classifier suggests a moderate fit with potential (Good Fit score ≈ 67%).
  - Semantic overlap with the JD is limited, indicating a more peripheral match.
  - Experience includes organisations such as: PublicMicrosoft Office Professional Professional Experience, Account, Health.

--- Candidate 3 ---