In [18]:
# ==== Cell 1: Imports + Setup (Gemini-only: LLM + Embeddings) ====

# Std libs
import os, math, uuid, json, time
from typing import List, Dict, Any, Optional

# LangChain utilities we’ll reuse later
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Gemini via LangChain (LLM + Embeddings)
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings


# ---------- API key (Gemini) ----------
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_APIKEY")
if not GEMINI_API_KEY or not GEMINI_API_KEY.strip():
    raise RuntimeError("Missing Gemini key. Set GEMINI_API_KEY in your environment before running.")

# Make sure downstream libs find the key
os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY
# Avoid ADC fallback confusion
os.environ.pop("GOOGLE_APPLICATION_CREDENTIALS", None)


# ---------- Model handles ----------
# LLM (we’ll keep calls modest; e.g., 1 compact JD snapshot call later)
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    api_key=GEMINI_API_KEY,
)

# Semantic embeddings (Gemini text-embedding-004) — used for:
# retrieval, role/project similarity, skill matching (OOP≈OOPS≈object-oriented programming)
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=GEMINI_API_KEY,
)

# ---------- Utilities (no outbound calls here) ----------
def cosine_sim(u: List[float], v: List[float]) -> float:
    """Cosine similarity for two equal-length vectors."""
    if not u or not v:
        return 0.0
    dot = sum(a * b for a, b in zip(u, v))
    nu = math.sqrt(sum(a * a for a in u))
    nv = math.sqrt(sum(b * b for b in v))
    if nu == 0 or nv == 0:
        return 0.0
    return dot / (nu * nv)

def _batch(iterable: List[Any], size: int) -> List[List[Any]]:
    """Yield successive batches of given size from a list."""
    return [iterable[i : i + size] for i in range(0, len(iterable), size)]

def embed_texts(texts: List[str], batch_size: int = 64, sleep_between: float = 0.0) -> List[List[float]]:
    """
    Embed a list of texts using Gemini embeddings with optional batching.
    Keep this function for later cells; do NOT call it here to avoid API usage in Cell 1.
    """
    if not texts:
        return []
    vecs: List[List[float]] = []
    for chunk in _batch(texts, batch_size):
        # This performs a single embeddings API call per batch
        chunk_vecs = embedding_model.embed_documents(chunk)
        vecs.extend(chunk_vecs)
        if sleep_between:
            time.sleep(sleep_between)
    return vecs

def embed_query(text: str) -> List[float]:
    """Single-text embedding (used for queries like JD vectors)."""
    return embedding_model.embed_query(text or "")

print("[ok] Gemini LLM ready (gemini-2.0-flash)")
print("[ok] Gemini embeddings ready (models/text-embedding-004)")
print("[ok] Cell 1 complete — using Gemini for both reasoning and semantic analysis (no local models).")


[ok] Gemini LLM ready (gemini-2.0-flash)
[ok] Gemini embeddings ready (models/text-embedding-004)
[ok] Cell 1 complete — using Gemini for both reasoning and semantic analysis (no local models).


In [None]:
# ==== Cell 2 (updated): State + Options + Prompts (context-rich, domain-agnostic, friendly ATS) ====
import re, json, uuid
from pathlib import Path
from typing import Optional, Dict, Any, List, TypedDict

# ---------- User Inputs (set these in your notebook run) ----------
# Provide either a file path or raw text for resume/JD. Leave as None if you plan to set later.
RESUME_FILE: Optional[str] = "JALADI ASHISH RESUME_oracle.pdf"      # e.g. "/mnt/data/resume.pdf"
RESUME_TEXT: Optional[str] = None      # paste resume text if no file

JD_FILE: Optional[str] = None          # e.g. "/mnt/data/jd.pdf"
JD_TEXT: Optional[str] = """
🧠 Job Title: Associate / Junior Generative AI Engineer (Fresher)
Location: [Hybrid / Onsite / Remote — e.g., Bangalore, Pune, Hyderabad]
Experience: 0–1 Years
Employment Type: Full-time
Department: AI / Machine Learning / Product Engineering
About the Role

We are looking for an enthusiastic and innovative Generative AI Engineer (Fresher) to join our AI team.
In this role, you’ll work on designing, fine-tuning, and deploying large language model (LLM)-based applications using frameworks like LangChain, LlamaIndex, and OpenAI APIs. You will collaborate with data scientists, backend engineers, and product teams to build intelligent systems that understand, reason, and generate human-like text and content.

This is an exciting opportunity for someone passionate about AI, NLP, and prompt engineering, eager to learn real-world GenAI workflows and contribute to production-ready AI systems.

Key Responsibilities

Design and implement LLM-powered applications using frameworks such as LangChain, LlamaIndex, or Haystack.

Develop prompt engineering pipelines and retrieval-augmented generation (RAG) workflows.

Work with vector databases (FAISS, Chroma, Pinecone, Weaviate, Milvus) for semantic search and document retrieval.

Fine-tune or adapt open-source LLMs (LLaMA, Mistral, Falcon, Gemma, etc.) for domain-specific tasks.

Integrate AI APIs (OpenAI, Anthropic, Google Gemini, etc.) into web or backend systems.

Collaborate with backend teams to deploy AI models using FastAPI, Flask, or Streamlit.

Analyze and evaluate model performance (precision, recall, BLEU, ROUGE, perplexity, etc.).

Write efficient, production-quality Python code for data preprocessing, model integration, and automation.

Stay up-to-date with advancements in Generative AI, Transformer architectures, and MLOps practices.

Technical Skills (Required)
🧩 Programming & Tools:

Proficiency in Python and experience with libraries like transformers, torch, langchain, sentence-transformers, chromadb.

Familiarity with REST APIs and JSON for integrating AI services.

Experience with Jupyter Notebooks or VS Code for experimentation.

Understanding of data preprocessing, text embeddings, and tokenization.

🤖 AI/ML Concepts:

Basic understanding of:

Neural networks and transformers (BERT, GPT, etc.)

RAG (Retrieval Augmented Generation)

Embeddings & vector search

Fine-tuning and model evaluation

Awareness of Hugging Face ecosystem and OpenAI API usage.

🗄️ Databases & Backend:

Familiarity with NoSQL / vector databases (Chroma, Pinecone, FAISS, Milvus).

Knowledge of FastAPI or Flask for backend integration.

Basic understanding of cloud services (AWS, GCP, or Azure).

Soft Skills

Strong problem-solving and analytical abilities.

Curiosity and a learning mindset toward cutting-edge AI technologies.

Ability to work collaboratively in a fast-paced environment.

Good written and verbal communication skills for technical documentation.

Educational Qualifications

B.E. / B.Tech / M.Tech / MCA in Computer Science, Artificial Intelligence, Data Science, or related fields.

Good understanding of Mathematics, Statistics, and Machine Learning fundamentals.

Academic or personal projects related to AI / NLP / LLMs are a plus.

Bonus / Nice-to-Have Skills

Experience with LangGraph, CrewAI, or OpenDevin-style agent frameworks.

Exposure to MLOps pipelines (Docker, MLflow, Hugging Face Hub).

Knowledge of document loaders, PDF parsing, or knowledge base QA systems.

Understanding of prompt optimization, few-shot learning, and tool use in LLMs.
"""          # paste JD text if no file

# ---------- Configuration / thresholds ----------
class RunOptions(TypedDict):
    # chunking + retrieval
    chunk_tokens: int               # approx tokens per chunk (proxy via chars*4)
    chunk_overlap: float            # 0..1 overlap ratio
    faiss_topk: int                 # how many JD-relevant chunks to use
    extract_max_chunks: int         # cap LLM extractions (one call per selected chunk)
    hybrid_alpha: float             # weight for semantic vs lexical (0..1, higher = more semantic)
    mmr_lambda: float               # MMR diversity weight (0..1), higher = more diversity

    # API budgets
    llm_budget_calls: int           # total Gemini LLM call budget
    embed_batch_size: int           # embeddings batch size per call

    # semantic thresholds (embeddings)
    strong_sim: float               # cosine ≥ strong match
    partial_sim: float              # cosine ≥ partial match
    canon_threshold: float          # clustering threshold for canonicalizing observed terms

    # complexity / transferability
    max_projects_for_complexity: int  # max projects to score with LLM (batched once)
    complexity_weight_cap: float      # cap added by complexity/transferability bonus (0..1 influence)

    # scoring weights (0–100 total)
    weights: Dict[str, float]       # component weights sum ~100
    gate_hard_cap: int              # cap score when a hard gate fails

    # eligibility + UX
    eligibility_threshold: int      # eligible if score_100 ≥ this
    sections_to_emit: List[str]     # three sections for frontend

    # misc
    recency_months_ideal: int       # months considered freshest for roles
    language: str                   # hint for JD/resume language (e.g., "en")
    keep_artifacts: bool            # write intermediate JSONs to ./tmp

DEFAULT_OPTIONS: RunOptions = {
    # retrieval
    "chunk_tokens": 900,
    "chunk_overlap": 0.15,
    "faiss_topk": 8,
    "extract_max_chunks": 4,
    "hybrid_alpha": 0.6,      # 60% semantic, 40% lexical
    "mmr_lambda": 0.35,

    # budgets
    "llm_budget_calls": 7,    # JD snapshot (1) + per-chunk (≤4) + consolidation (1) + complexity batch (1)
    "embed_batch_size": 64,

    # semantic thresholds
    "strong_sim": 0.78,
    "partial_sim": 0.65,
    "canon_threshold": 0.82,

    # complexity / transferability
    "max_projects_for_complexity": 4,
    "complexity_weight_cap": 0.12,   # at most +12 points influence

    # scoring weights (sum to ~100)
    "weights": {
        "must_have_coverage": 22.0,
        "required_coverage": 16.0,
        "preferred_coverage": 6.0,
        "role_alignment": 16.0,
        "project_alignment": 10.0,
        "evidence_depth": 6.0,
        "seniority_fit": 8.0,
        "responsibility_overlap": 6.0,
        "transferability_bonus": 10.0,  # complexity × transferability
    },
    "gate_hard_cap": 59,

    # eligibility + UX
    "eligibility_threshold": 50,   # ≥ 50 is eligible
    "sections_to_emit": [
        "present_against_jd",      # what’s present (must/required/preferred with evidence)
        "missing_against_jd",      # gaps vs JD
        "extra_strengths"          # additional strengths (complex/transferable projects, internships, certs)
    ],

    # misc
    "recency_months_ideal": 6,
    "language": "en",
    "keep_artifacts": True,
}

# ---------- Pipeline STATE ----------
class PipelineState(TypedDict, total=False):
    run_id: str
    options: RunOptions
    inputs: Dict[str, Optional[str]]
    raw: Dict[str, Optional[str]]
    provenance: Dict[str, Any]
    chunks: List[Dict[str, Any]]
    faiss: Dict[str, Any]
    contacts: Dict[str, Any]
    high_level: Dict[str, Any]
    education: List[Dict[str, Any]]
    timeline: List[Dict[str, Any]]
    projects: List[Dict[str, Any]]
    skills: List[Dict[str, Any]]
    certs: List[Dict[str, Any]]
    awards: List[Dict[str, Any]]
    locations: List[str]
    jd_snapshot: Dict[str, Any]
    canon: Dict[str, Any]
    jd_alignment: Dict[str, Any]
    coverage: Dict[str, Any]
    gates: Dict[str, Any]
    complexity: Dict[str, Any]
    final: Dict[str, Any]
    artifacts: Dict[str, Any]
    audit: List[str]
    _llm_calls: int

def new_state(
    resume_file: Optional[str],
    resume_text: Optional[str],
    jd_file: Optional[str],
    jd_text: Optional[str],
    options: RunOptions = DEFAULT_OPTIONS,
) -> PipelineState:
    run_id = uuid.uuid4().hex[:12]
    base_dir = Path(f"./tmp/{run_id}")
    paths = {
        "base_dir": str(base_dir),
        "chunks_json": str(base_dir / "chunks.json"),
        "entities_by_chunk_json": str(base_dir / "entities_by_chunk.json"),
        "final_json": str(base_dir / "final.json"),
        "faiss_dir": str(base_dir / "faiss"),
        "jd_snapshot_json": str(base_dir / "jd_snapshot.json"),
        "complexity_json": str(base_dir / "complexity.json"),
    }
    st: PipelineState = {
        "run_id": run_id,
        "options": options,
        "inputs": {
            "resume_file": resume_file, "resume_text": resume_text,
            "jd_file": jd_file, "jd_text": jd_text,
        },
        "raw": {"resume_text": None, "jd_text": None},
        "provenance": {"chunks": [], "hybrid_retrieval": [], "mmr_selected": []},
        "chunks": [],
        "faiss": {"index_path": None, "topk_ids": []},
        "contacts": {"name": None, "email": None, "phone": None,
                     "links": {"linkedin": None, "github": None, "portfolio": None, "website": None}},
        "high_level": {"summary": None, "location": None, "years_experience": None},
        "education": [], "timeline": [], "projects": [],
        "skills": [], "certs": [], "awards": [], "locations": [],
        "jd_snapshot": {
            "title": None,
            "must_haves": [],
            "required": [],
            "preferred": [],
            "responsibilities": [],
            "hard_gates": {
                "degree_required": False,
                "min_years": None,
                "license": [],
                "work_auth": None,
                "clearance": None,
                "location_mode": None,
                "onsite_city": None,
                "shift": None,
                "travel": None,
            },
            "evidence": {"must": {}, "req": {}, "pref": {}, "resp": {}},
            "conf": {"must": {}, "req": {}, "pref": {}, "resp": {}},
        },
        "canon": {
            "skill_alias": {},              # NO domain hardcoding; embeddings will unify semantics later
            "normalized_skills": [],
            "normalized_required": [],
            "normalized_preferred": [],
        },
        "jd_alignment": {
            "must_have": [],
            "required": [],
            "preferred": [],
            "responsibilities": {"coverage": 0.0, "count": 0}
        },
        "coverage": {},
        "gates": {"failed": [], "notes": []},
        "complexity": {"scored": [], "bonus": 0.0},
        "final": {},
        "artifacts": {"base_dir": paths["base_dir"], "paths": paths},
        "audit": [],
        "_llm_calls": 0,
    }
    if options.get("keep_artifacts", True):
        base_dir.mkdir(parents=True, exist_ok=True)
    print(f"[ok] STATE initialized — run_id={st['run_id']} → artifacts: {paths['base_dir']}")
    return st

STATE = new_state(RESUME_FILE, RESUME_TEXT, JD_FILE, JD_TEXT, DEFAULT_OPTIONS)

# ---------- Prompt helpers ----------
def _escape_braces_keep_vars(template: str, keep_vars: List[str]) -> str:
    esc = template.replace("{", "{{").replace("}", "}}")
    for v in keep_vars:
        esc = esc.replace("{{" + v + "}}", "{" + v + "}")
    return esc

PROMPTS: Dict[str, str] = {}

# 1) JD snapshot + hard gates (detailed, domain-agnostic, evidence-bound, friendly to implied tokens)
PROMPTS["jd_snapshot_and_gates"] = _escape_braces_keep_vars(r"""
You are a **strict but fair** parser for an Applicant Tracking System. Read the Job Description (any domain)
and convert it into a compact, **evidence-bound** JSON snapshot used for automated matching.

General rules:
- Tokens must be **atomic capabilities/credentials**, in **lowercase** (e.g., "object oriented programming", "excel", "b2b sales", "sterile technique", "lean six sigma").
- Include items that are **explicit** or **clearly implied** by phrasing; do **NOT** list examples not present.
- Do **NOT** expand families (no listing "gcc/clang" if only "compilers" is mentioned).
- For every token/phrase/gate, supply a short **evidence** snippet (≤160 chars). If unclear, use "".
- Include a **confidence** score (0.0–1.0) reflecting how certain the token/phrase was required or implied.

Return STRICT JSON only:
{
  "title": "<short role title>",
  "must_haves": [{"token": "", "evidence": "", "conf": 0.0}],
  "required": [{"token": "", "evidence": "", "conf": 0.0}],
  "preferred": [{"token": "", "evidence": "", "conf": 0.0}],
  "responsibilities": [{"phrase": "", "evidence": "", "conf": 0.0}],
  "hard_gates": {
    "degree_required": { "value": true|false, "evidence": "" },
    "min_years": { "value": null | 0, "evidence": "" },
    "license": [{ "token": "", "evidence": "" }],
    "work_auth": { "value": null | "us citizen|eu work permit|...", "evidence": "" },
    "clearance": { "value": null | "active secret|public trust|...", "evidence": "" },
    "location_mode": { "value": null | "onsite|hybrid|remote", "evidence": "" },
    "onsite_city": { "value": null | "<city or region>", "evidence": "" },
    "shift": { "value": null | "night|rotational|weekend|...", "evidence": "" },
    "travel": { "value": null | "<% or phrase>", "evidence": "" }
  }
}

Guidance:
- Prefer role-agnostic phrasing; avoid technology bias. Examples across domains: "inventory optimization", "sterile technique", "crm", "lead generation", "risk analysis", "six sigma", "autocad", "wound care".
- Keep lists **minimal & atomic**; never merge two different tokens into one.

JD:
---
{jd_text}
---
""", ["jd_text"])

# 2) Chunk extractor (one call per selected chunk) — capture verbatim items + short evidence per entry
PROMPTS["extract_all_from_chunk"] = _escape_braces_keep_vars(r"""
Extract **only** what appears in THIS chunk of a resume. Do not infer content from outside this chunk.

Return STRICT JSON only:
{
  "contacts": { "name": null, "email": null, "phone": null, "links": ["urls"] },
  "education": [
    { "degree": "", "field": "", "institution": "", "start": null, "end": null, "location": null, "evidence": "" }
  ],
  "experience": [
    { "title": "", "company": "", "location": null, "start": null, "end": null,
      "highlights": ["impact/achievements, keep concise; include metrics if any"], "evidence": "" }
  ],
  "projects": [
    { "name": "", "tech": ["as written tokens"], "impact": null, "duration": null, "role": null, "links": ["urls"], "evidence": "" }
  ],
  "skills": ["as written tokens"],
  "certifications": ["as written short tokens"],
  "awards": ["as written"],
  "locations": ["city/state/country names mentioned"]
}

Chunk ID: {chunk_id}
Chunk:
---
{chunk_text}
---
""", ["chunk_id","chunk_text"])

# 3) Consolidator (single call) — dedupe, unify synonyms ONLY if evidenced; keep multiple evidences
PROMPTS["consolidate_resume_json"] = _escape_braces_keep_vars(r"""
You are merging multiple chunk-level JSON extractions of one resume.

Rules:
- **Deduplicate** across chunks.
- **Unify obvious synonyms** ONLY if each synonym has some evidence (e.g., "o.o.p", "oop", "object oriented programming").
  Use a single **canonical** string; keep all synonyms in **aliases**.
- **Keep evidence arrays** for each item (gather from input snippets).
- Normalize dates to "YYYY-MM" when possible; if ambiguous, retain the raw string.
- Preserve short, high-signal highlights (metrics, scale, scope).

Return STRICT JSON only:
{
  "contacts": { "name": null, "email": null, "phone": null, "links": ["urls"] },
  "education": [ { "degree": "", "field": "", "institution": "", "start": null, "end": null, "location": null, "evidence": ["..."] } ],
  "experience": [ { "title": "", "company": "", "location": null, "start": null, "end": null, "highlights": ["..."], "evidence": ["..."] } ],
  "projects": [ { "name": "", "tech": ["tokens"], "impact": null, "duration": null, "role": null, "links": ["urls"], "evidence": ["..."] } ],
  "skills": [ { "canonical": "", "aliases": ["..."], "evidence": ["..."] } ],
  "certifications": [ { "canonical": "", "aliases": ["..."], "evidence": ["..."] } ],
  "awards": [ { "canonical": "", "evidence": ["..."] } ],
  "locations": [ { "canonical": "", "evidence": ["..."] } ]
}

Inputs:
---
{chunk_json}
---
""", ["chunk_json"])

# 4) Project/Internship complexity & transferability scorer (batched once for top-N projects)
PROMPTS["score_project_complexity_batch"] = _escape_braces_keep_vars(r"""
Evaluate each PROJECT/INTERNSHIP against the Job Description **in principle** (domain-agnostic).
Score two axes from 0.0–1.0, and provide a concise rationale:

- "complexity": problem difficulty, scale (#users, data/transactions), technical or domain depth,
  constraints (latency, safety, compliance), duration/tenure, ownership, novelty, integration breadth.
- "transferability": how well the **capabilities and patterns** demonstrated can apply to the JD role, even if the domain differs
  (e.g., experimentation, stakeholder comms, process automation, safety standards, optimization, data handling,
   customer workflows, regulations, reliability, cost control, quality metrics, design-to-constraints).

Important:
- Do NOT penalize domain mismatch if fundamentals/approaches are transferable.
- Use the project text only (no invention). Prefer metrics when present.

Return STRICT JSON only:
{
  "items": [
    { "name": "", "complexity": 0.0, "transferability": 0.0, "rationale": "1–2 lines grounded in the text" }
  ]
}

Context:
JD_title: {jd_title}
JD_tokens: {jd_tokens}

PROJECTS_JSON:
---
{projects_json}
---
""", ["jd_title","jd_tokens","projects_json"])

print("[ok] Prompts ready (detailed, domain-agnostic): jd_snapshot_and_gates, extract_all_from_chunk, consolidate_resume_json, score_project_complexity_batch")
print("[ok] Cell 2 updated — context-aware, not strict, supports project complexity/transferability, eligibility ≥ 50.")


[ok] STATE initialized — run_id=52d59f5aaedc → artifacts: tmp\52d59f5aaedc
[ok] Prompts ready: jd_snapshot_and_gates (1 LLM call), extract_all_from_chunk (per selected chunk).
[ok] Cell 2 complete — state/options/prompts configured for ATS-style scoring with Gemini-only APIs.


In [20]:
# ==== Cell 3 (updated): Load → Chunk → Batched Embeddings → Hybrid Retrieval → Resume Sanity ====
import json, re, zipfile, xml.etree.ElementTree as ET
from pathlib import Path
from typing import Optional, List, Dict

# Ensure flags exists
STATE.setdefault("flags", {})
STATE["flags"].setdefault("is_resume", False)

# --- helpers: file reading ---
def read_pdf_pymupdf(path: str) -> str:
    loader = PyMuPDFLoader(path)
    docs = loader.load()
    return "\n".join((d.page_content or "") for d in docs)

def read_docx_quick(path: str) -> str:
    with zipfile.ZipFile(path) as z:
        xml_bytes = z.read("word/document.xml")
    root = ET.fromstring(xml_bytes)
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    lines = []
    for p in root.findall(".//w:p", ns):
        txt = "".join((t.text or "") for t in p.findall(".//w:t", ns)).strip()
        if txt:
            lines.append(txt)
    return "\n".join(lines)

def read_text_file(path: str) -> str:
    return Path(path).read_text(encoding="utf-8", errors="ignore")

def read_any(path_or_text: Optional[str]) -> str:
    """Accept either a filesystem path or raw text."""
    if not path_or_text:
        return ""
    p = Path(path_or_text)
    if p.exists():
        ext = p.suffix.lower()
        if ext == ".pdf":  return read_pdf_pymupdf(str(p))
        if ext == ".docx": return read_docx_quick(str(p))
        return read_text_file(str(p))
    return str(path_or_text)

# --- ensure artifacts dir ---
paths = STATE["artifacts"]["paths"]
base_dir = Path(paths["base_dir"])
if STATE["options"].get("keep_artifacts", True):
    base_dir.mkdir(parents=True, exist_ok=True)

# --- load resume & JD into STATE["raw"] ---
resume_text = STATE["raw"].get("resume_text") or (
    read_any(STATE["inputs"].get("resume_file")) if STATE["inputs"].get("resume_file") else read_any(STATE["inputs"].get("resume_text"))
)
jd_text = STATE["raw"].get("jd_text") or (
    read_any(STATE["inputs"].get("jd_file")) if STATE["inputs"].get("jd_file") else read_any(STATE["inputs"].get("jd_text"))
)

if not resume_text.strip():
    raise RuntimeError("No resume content found. Provide RESUME_FILE or RESUME_TEXT in Cell 2.")
if not jd_text.strip():
    raise RuntimeError("No JD content found. Provide JD_FILE or JD_TEXT in Cell 2.")

STATE["raw"]["resume_text"] = resume_text
STATE["raw"]["jd_text"] = jd_text

# --- chunking (char proxy for tokens) ---
opts = STATE["options"]
chunk_chars = int(opts["chunk_tokens"] * 4)  # ~4 chars/token
overlap_chars = int(chunk_chars * opts["chunk_overlap"])

splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_chars,
    chunk_overlap=overlap_chars,
    separators=["\n\n", "\n", ". ", " ", ""],
)
chunks_texts: List[str] = splitter.split_text(resume_text)
STATE["chunks"] = [{"id": f"c_{i:02d}", "text": t} for i, t in enumerate(chunks_texts)]

if STATE["options"].get("keep_artifacts", True):
    with open(paths["chunks_json"], "w", encoding="utf-8") as f:
        json.dump(STATE["chunks"], f, ensure_ascii=False, indent=2)
print(f"[ok] Chunked resume into {len(STATE['chunks'])} chunks")

# --- Batched embeddings (Gemini) for all chunks (one batched op, not per-chunk) ---
texts = [c["text"] for c in STATE["chunks"]]
chunk_vecs = embed_texts(texts, batch_size=opts.get("embed_batch_size", 64))  # batched
STATE["provenance"]["chunk_vecs"] = None  # don’t persist big arrays by default
# (Keep vectors in memory only; avoids artifact bloat)

# --- Hybrid retrieval: semantic (cosine) + lexical coverage (JD tokens) ---
# Rationale: robust retrieval mixes semantics with lexical overlap; avoids overweighting generic semantics.
# See enterprise guidance on combining semantic + traditional scores.  # refs in comments
def _lex_keywords(s: str) -> List[str]:
    s = re.sub(r"[^A-Za-z0-9+#.\-/ ]+", " ", s.lower())
    toks = [t for t in s.split() if len(t) >= 3]
    # keep symbols like c++, c#, .net as tokens too
    return list(dict.fromkeys(toks))

jd_tokens = _lex_keywords(jd_text)
jd_token_set = set(jd_tokens)

# precompute lexical scores per chunk = (# JD tokens present) / (sqrt(len(chunk_tokens)) to damp long chunks)
lex_scores = []
for c in STATE["chunks"]:
    toks = set(_lex_keywords(c["text"]))
    overlap = len(jd_token_set.intersection(toks))
    norm = max(1.0, (len(toks) ** 0.5))
    lex_scores.append(overlap / norm)

# semantic scores via cosine with single JD query vector
jd_vec = embed_query(jd_text[:4000])
sem_scores = [cosine_sim(v, jd_vec) for v in chunk_vecs]

# combine (alpha semantic, (1-alpha) lexical) → select top-k
alpha = 0.6  # heavier weight to semantic; lexical still matters
# normalize lexical to [0,1]
lex_max = max(lex_scores) if lex_scores else 1.0
lex_norm = [ls / (lex_max or 1.0) for ls in lex_scores]
combo = [alpha * s + (1 - alpha) * l for s, l in zip(sem_scores, lex_norm)]

ranked_idx = sorted(range(len(combo)), key=lambda i: combo[i], reverse=True)
topk = max(1, min(opts["faiss_topk"], len(ranked_idx)))
sel_idx = ranked_idx[:topk]
STATE["faiss"]["topk_ids"] = [STATE["chunks"][i]["id"] for i in sel_idx]

# Save a tiny audit
hybrid_audit = [
    {"id": STATE["chunks"][i]["id"], "semantic": round(sem_scores[i], 3), "lexical": round(lex_norm[i], 3), "score": round(combo[i], 3)}
    for i in sel_idx
]
STATE["provenance"]["hybrid_retrieval"] = hybrid_audit
if STATE["options"].get("keep_artifacts", True):
    with open(base_dir / "hybrid_scores.json", "w", encoding="utf-8") as f:
        json.dump(hybrid_audit, f, ensure_ascii=False, indent=2)

print(f"[ok] Hybrid JD-relevant chunks (top-{len(STATE['faiss']['topk_ids'])}): {STATE['faiss']['topk_ids']}")

# --- resume/not_resume detector (LOCAL heuristic; no LLM call) ---
lower = resume_text.lower()
signals = sum([
    bool(re.search(r"\beducation\b", lower)),
    bool(re.search(r"\bexperience\b|\bwork\b", lower)),
    bool(re.search(r"\bskills?\b", lower)),
    bool(re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", resume_text)),
])
STATE["flags"]["is_resume"] = bool(signals >= 2 or len(resume_text.split()) > 150)
STATE["audit"].append(f"resume_detector.local={STATE['flags']['is_resume']}; signals={signals}")

print(f"[ok] Detector → {'resume-like' if STATE['flags']['is_resume'] else 'not-resume'}")
if not STATE["flags"]["is_resume"]:
    STATE["final"] = {
        "score_100": 0,
        "selected": False,
        "breakdown": {},
        "reasons": ["Document did not appear to be a resume (insufficient structural signals)."],
        "strong_matches": [],
        "skill_gaps": [],
        "risk_flags": ["resume_detector_failed"]
    }
    if STATE["options"].get("keep_artifacts", True):
        with open(paths["final_json"], "w", encoding="utf-8") as f:
            json.dump(STATE["final"], f, ensure_ascii=False, indent=2)
    print("[END] not_resume → score=0. Stop here.")
else:
    print("[ok] Resume confirmed — proceed to Cell 4 (strict JD snapshot + batched AI extraction).")


[ok] Chunked resume into 2 chunks
[ok] Hybrid JD-relevant chunks (top-2): ['c_01', 'c_00']
[ok] Detector → resume-like
[ok] Resume confirmed — proceed to Cell 4 (strict JD snapshot + batched AI extraction).


In [22]:
# ==== Cell 4 (fixed): Strict JD snapshot → Batched LLM extraction (evidence) → Consolidation → Semantic alignment ====
# API plan:
#   • 1x LLM call: strict JD snapshot (NO expansions; evidence-bound)
#   • ≤N x LLM calls: per-chunk extraction (N = extract_max_chunks & budget)
#   • 1x LLM call: consolidation (dedupe & synonym unify only when evidence exists)
# Embeddings are batched (Gemini) for matching; no extra LLM for matching.

import json, re
from typing import Any, Dict, List, Optional
from pathlib import Path
from datetime import datetime
from langchain_core.prompts import ChatPromptTemplate

# ---------------- helpers & budget ----------------
def _json_loose(s: str) -> Any:
    s = (s or "").strip()
    try:
        return json.loads(s)
    except Exception:
        m = re.search(r"\{.*\}|\[.*\]", s, flags=re.S)
        if m:
            return json.loads(m.group(0))
        raise

def _budget_ok(n: int = 1) -> bool:
    return STATE["_llm_calls"] + n <= STATE["options"]["llm_budget_calls"]

def _bump(n: int = 1):
    STATE["_llm_calls"] += n

def _clip(s: Optional[str], n: int = 1200) -> str:
    if not s: return ""
    s = str(s)
    return s if len(s) <= n else s[:n]

def _now() -> datetime:
    return datetime.utcnow()

# date parser (defined early so we can use it later)
def _parse_date_soft(s: Optional[str]) -> Optional[datetime]:
    """Forgiving date parser; supports 'MMM YYYY', 'YYYY', 'Present'."""
    if not s: return None
    tl = str(s).strip().lower()
    if any(k in tl for k in ["present", "current", "now"]):
        return _now()
    try:
        from dateutil import parser as _dp
        dt = _dp.parse(tl, default=datetime(2000,1,1), fuzzy=True)
        if 1900 <= dt.year <= 2100:
            return datetime(dt.year, dt.month if dt.month else 1, 1)
    except Exception:
        pass
    m = re.search(r"(20\d{2}|19\d{2})", tl)
    if m:
        return datetime(int(m.group(1)), 1, 1)
    return None

def _months_between(a: Optional[datetime], b: Optional[datetime]) -> int:
    if not a or not b: return 0
    return max(0, (b.year - a.year) * 12 + (b.month - a.month))

# ---------------- 1) STRICT JD snapshot (1 LLM call; no expansions; with evidence) ----------------
JD_PROMPT_STRICT = r"""
You are assisting an Applicant Tracking System.

TASK: Convert the Job Description into a STRICT, EVIDENCE-BOUND JSON snapshot.
RULES (read carefully):
- Do NOT invent or expand lists. Only include items that appear EXPLICITLY in the JD text.
- If a family is mentioned (e.g., "compilers"), record the family term ONLY, do NOT enumerate (no "gcc", "clang" unless written).
- Tokens should be atomic capability/credential terms in lowercase.
- Provide a short evidence snippet (<=160 chars) for every token and gate you return.

Return STRICT JSON ONLY:
{
  "title": "<short>",
  "must_haves": [{"token": "", "evidence": ""}],
  "required":    [{"token": "", "evidence": ""}],
  "preferred":   [{"token": "", "evidence": ""}],
  "responsibilities": [{"phrase": "", "evidence": ""}],
  "hard_gates": {
    "degree_required": { "value": true|false, "evidence": "" },
    "min_years": { "value": null | 0, "evidence": "" },
    "license": [{ "token": "", "evidence": "" }],
    "work_auth": { "value": null | "us citizen|eu work permit|...", "evidence": "" },
    "clearance": { "value": null | "active secret|...", "evidence": "" },
    "location_mode": { "value": null | "onsite|hybrid|remote", "evidence": "" },
    "onsite_city": { "value": null | "<city>", "evidence": "" },
    "shift": { "value": null | "night|rotational|...", "evidence": "" },
    "travel": { "value": null | "<% or phrase>", "evidence": "" }
  }
}

JD:
---
{jd_text}
---
"""
# escape braces for LangChain template (only keep {jd_text} as a variable)
JD_PROMPT_STRICT = _escape_braces_keep_vars(JD_PROMPT_STRICT, ["jd_text"])

if not STATE["jd_snapshot"].get("required") and _budget_ok(1):
    jd_prompt = ChatPromptTemplate.from_template(JD_PROMPT_STRICT)
    jd_chain = jd_prompt | llm
    jd_raw = jd_chain.invoke({"jd_text": STATE["raw"]["jd_text"]})
    _bump(1)
    obj = _json_loose(getattr(jd_raw, "content", str(jd_raw)))

    # Normalize & store (lists + evidence maps)
    js = STATE["jd_snapshot"]
    js["title"] = obj.get("title") or js.get("title") or None

    def _pull_tokens(items, key="token"):
        out = []
        evid = {}
        for it in items or []:
            tok = (it.get(key) or "").strip().lower()
            ev  = (it.get("evidence") or "").strip()
            if tok:
                out.append(tok)
                evid[tok] = ev
        return list(dict.fromkeys(out)), evid

    must_list, must_ev = _pull_tokens(obj.get("must_haves") or [])
    req_list,  req_ev  = _pull_tokens(obj.get("required") or [])
    pref_list, pref_ev = _pull_tokens(obj.get("preferred") or [])
    resp_list, resp_ev = [], {}
    for it in obj.get("responsibilities") or []:
        phr = (it.get("phrase") or "").strip().lower()
        ev  = (it.get("evidence") or "").strip()
        if phr:
            resp_list.append(phr); resp_ev[phr] = ev

    js["must_haves"] = must_list
    js["required"]   = req_list
    js["preferred"]  = pref_list
    js["responsibilities"] = resp_list
    js["evidence"] = {"must": must_ev, "req": req_ev, "pref": pref_ev, "resp": resp_ev}

    hg_in = obj.get("hard_gates") or {}
    def _gate_val(key, default=None):
        g = hg_in.get(key) or {}
        return (g.get("value") if isinstance(g, dict) else None) if g else default
    def _gate_ev(key):
        g = hg_in.get(key) or {}
        return (g.get("evidence") or "") if isinstance(g, dict) else ""

    js["hard_gates"] = {
        "degree_required": _gate_val("degree_required", False),
        "min_years": _gate_val("min_years", None),
        "license": [x.get("token","").lower() for x in (hg_in.get("license") or []) if x.get("token")],
        "work_auth": _gate_val("work_auth", None),
        "clearance": _gate_val("clearance", None),
        "location_mode": _gate_val("location_mode", None),
        "onsite_city": _gate_val("onsite_city", None),
        "shift": _gate_val("shift", None),
        "travel": _gate_val("travel", None),
    }
    js["hard_gate_evidence"] = {
        k: _gate_ev(k) for k in [
            "degree_required","min_years","license","work_auth",
            "clearance","location_mode","onsite_city","shift","travel"
        ]
    }

    if STATE["options"].get("keep_artifacts", True):
        with open(STATE["artifacts"]["paths"]["jd_snapshot_json"], "w", encoding="utf-8") as f:
            json.dump(STATE["jd_snapshot"], f, ensure_ascii=False, indent=2)

print("[ok] JD snapshot (strict) ready; no expansions were allowed.")

# ---------------- 2) Select chunks within budget (from Cell 3 hybrid retrieval) ----------------
opts = STATE["options"]
sel_ids = list(STATE["faiss"].get("topk_ids") or [])
if not sel_ids:
    sel_ids = [c["id"] for c in STATE["chunks"][:opts["extract_max_chunks"]]]
sel_ids = sel_ids[:opts["extract_max_chunks"]]

id_to_chunk = {c["id"]: c for c in STATE["chunks"]}
sel_chunks = [id_to_chunk[i] for i in sel_ids if i in id_to_chunk]

# Leave room for consolidation if possible
remaining_llm = max(0, opts["llm_budget_calls"] - STATE["_llm_calls"])
reserve_for_consolidation = 1 if remaining_llm > 1 else 0
allowed = max(0, min(len(sel_chunks), remaining_llm - reserve_for_consolidation))
if allowed < len(sel_chunks):
    sel_chunks = sel_chunks[:allowed]
print(f"[ok] Planning extraction for {len(sel_chunks)} chunk(s) within budget (used={STATE['_llm_calls']}/{opts['llm_budget_calls']}).")

# ---------------- 3) Per-chunk AI extraction with EVIDENCE (LLM; strict; one chunk per call) ----------------
EXTRACT_PROMPT_STRICT = r"""
You are extracting structured resume data from ONE chunk. 
RULES:
- Return ONLY items that are EXPLICITLY present in this chunk. Do NOT infer or expand.
- For every field, include an "evidence" snippet (<=160 chars) copied from this chunk.
- Keep tokens lowercase and atomic where applicable.

Return STRICT JSON only:
{
  "contacts": { "name": null, "email": null, "phone": null, "links": ["urls"], "evidence": {"name":"", "email":"", "phone":"", "links":["..."]} },
  "education": [ { "degree": "", "field": "", "institution": "", "start": null, "end": null, "location": null, "evidence": "" } ],
  "experience": [ { "title": "", "company": "", "location": null, "start": null, "end": null, "highlights": ["..."], "evidence": "" } ],
  "projects": [ { "name": "", "tech": ["tokens"], "impact": null, "links": ["urls"], "evidence": "" } ],
  "skills": [ { "token": "", "evidence": "" } ],
  "certifications": [ { "token": "", "evidence": "" } ],
  "awards": [ { "token": "", "evidence": "" } ],
  "locations": [ { "token": "", "evidence": "" } ]
}

Chunk ID: {chunk_id}
Chunk:
---
{chunk_text}
---
"""
EXTRACT_PROMPT_STRICT = _escape_braces_keep_vars(EXTRACT_PROMPT_STRICT, ["chunk_id","chunk_text"])

entities_by_chunk: Dict[str, Dict[str, Any]] = {}
if sel_chunks:
    extract_chain = ChatPromptTemplate.from_template(EXTRACT_PROMPT_STRICT) | llm
    for ch in sel_chunks:
        if not _budget_ok(1): break
        out = extract_chain.invoke({"chunk_id": ch["id"], "chunk_text": ch["text"]})
        _bump(1)
        try:
            obj = _json_loose(getattr(out, "content", str(out)))
        except Exception:
            obj = {}
        entities_by_chunk[ch["id"]] = obj

if STATE["options"].get("keep_artifacts", True):
    with open(Path(STATE["artifacts"]["base_dir"]) / "extract_pass_raw.json", "w", encoding="utf-8") as f:
        json.dump(entities_by_chunk, f, ensure_ascii=False, indent=2)

# ---------------- 4) CONSOLIDATION (1 LLM call): dedupe & unify synonyms ONLY with evidence ----------------
CONSOLIDATE_PROMPT = r"""
You are merging multiple chunk-level JSON extractions of a single resume.
RULES:
- Deduplicate across chunks.
- Unify obvious synonyms ONLY if each synonym has evidence text somewhere in the inputs (e.g., "oop","o.o.p","object oriented programming" → canonical "object oriented programming").
- If there is no evidence for an item, DROP it.
- Keep for each final item an array of "evidence" snippets (from the inputs).

Return STRICT JSON only:
{
  "contacts": { "name": null, "email": null, "phone": null, "links": ["urls"] },
  "education": [ { "degree": "", "field": "", "institution": "", "start": null, "end": null, "location": null, "evidence": ["..."] } ],
  "experience": [ { "title": "", "company": "", "location": null, "start": null, "end": null, "highlights": ["..."], "evidence": ["..."] } ],
  "projects": [ { "name": "", "tech": ["tokens"], "impact": null, "links": ["urls"], "evidence": ["..."] } ],
  "skills": [ { "canonical": "", "aliases": ["..."], "evidence": ["..."] } ],
  "certifications": [ { "canonical": "", "aliases": ["..."], "evidence": ["..."] } ],
  "awards": [ { "canonical": "", "evidence": ["..."] } ],
  "locations": [ { "canonical": "", "evidence": ["..."] } ]
}

Inputs:
---
{chunk_json}
---
"""
CONSOLIDATE_PROMPT = _escape_braces_keep_vars(CONSOLIDATE_PROMPT, ["chunk_json"])

merged: Dict[str, Any] = {}
if entities_by_chunk and _budget_ok(1):
    cons_chain = ChatPromptTemplate.from_template(CONSOLIDATE_PROMPT) | llm
    payload = json.dumps(entities_by_chunk, ensure_ascii=False)
    cons_raw = cons_chain.invoke({"chunk_json": payload})
    _bump(1)
    merged = _json_loose(getattr(cons_raw, "content", str(cons_raw)))
else:
    merged = {}

# ---------------- 5) Write consolidated into STATE ----------------
def _uniq_preserve(xs):
    seen=set(); out=[]
    for x in xs:
        k = json.dumps(x, sort_keys=True) if isinstance(x,(dict,list)) else str(x).lower()
        if k not in seen:
            seen.add(k); out.append(x)
    return out

# Contacts
if isinstance(merged.get("contacts"), dict):
    c = merged["contacts"]
    if c.get("name"):  STATE["contacts"]["name"]  = STATE["contacts"].get("name")  or c["name"]
    if c.get("email"): STATE["contacts"]["email"] = STATE["contacts"].get("email") or c["email"]
    if c.get("phone"): STATE["contacts"]["phone"] = STATE["contacts"].get("phone") or c["phone"]
    if isinstance(c.get("links"), list):
        for u in c["links"]:
            if "linkedin.com" in u and not STATE["contacts"]["links"].get("linkedin"): STATE["contacts"]["links"]["linkedin"] = u
            elif "github.com" in u and not STATE["contacts"]["links"].get("github"):   STATE["contacts"]["links"]["github"] = u

# Lists
for key in ["education","timeline","projects","skills","certs","awards","locations"]:
    STATE[key] = []

def _extend(dst_key: str, seq: List[Dict[str, Any]]):
    for it in seq or []:
        STATE[dst_key].append(dict(it))

_extend("education", merged.get("education") or [])
_extend("timeline",  merged.get("experience") or [])
_extend("projects",  merged.get("projects") or [])

for s in merged.get("skills") or []:
    STATE["skills"].append({
        "name": (s.get("canonical") or "").lower(),
        "aliases": [a.lower() for a in (s.get("aliases") or [])],
        "evidence": s.get("evidence") or [],
        "chunk_ids": [],
    })
for c in merged.get("certifications") or []:
    STATE["certs"].append({"name": (c.get("canonical") or "").lower(), "evidence": c.get("evidence") or []})
for a in merged.get("awards") or []:
    STATE["awards"].append({"name": (a.get("canonical") or "").lower(), "evidence": a.get("evidence") or []})
for l in merged.get("locations") or []:
    STATE["locations"].append((l.get("canonical") or "").lower())

# De-dup simple lists
STATE["education"] = _uniq_preserve(STATE["education"])
STATE["timeline"]  = _uniq_preserve(STATE["timeline"])
STATE["projects"]  = _uniq_preserve(STATE["projects"])
STATE["skills"]    = _uniq_preserve(STATE["skills"])
STATE["certs"]     = _uniq_preserve(STATE["certs"])
STATE["awards"]    = _uniq_preserve(STATE["awards"])
STATE["locations"] = _uniq_preserve(STATE["locations"])

# ---------------- 6) High-level: years of experience ----------------
total_months = 0
now = _now()
for r in STATE["timeline"]:
    st = _parse_date_soft(r.get("start"))
    en = _parse_date_soft(r.get("end")) or now
    if st:
        total_months += _months_between(st, en)
STATE["high_level"]["years_experience"] = round(total_months / 12.0, 2) if total_months else None

# ---------------- 7) Canonical resume terms (aliases + semantic clustering over observed terms) ----------------
alias = STATE["canon"]["skill_alias"]
raw_terms: List[str] = []
for s in STATE["skills"]:
    if s.get("name"): raw_terms.append(s["name"])
    for a in s.get("aliases") or []: raw_terms.append(a)
for p in STATE["projects"]:
    for t in (p.get("tech") or []): raw_terms.append(str(t).lower())
for r in STATE["timeline"]:
    if r.get("title"): raw_terms.append(str(r["title"]).lower())
    for h in (r.get("highlights") or []): raw_terms.append(str(h).lower())
for c in STATE["certs"]:
    if c.get("name"): raw_terms.append(str(c["name"]).lower())

norm_terms = []
for tok in raw_terms:
    t = tok.strip().lower()
    t = alias.get(t, t)
    norm_terms.append(t)
unique_terms = list(dict.fromkeys([t for t in norm_terms if t]))

# semantic clustering (observed terms only; cosine≥0.82)
term_vecs = embed_texts(unique_terms, batch_size=STATE["options"]["embed_batch_size"])
canons: List[Dict[str, Any]] = []
def _find_bucket(vec):
    best_i, best_sc = None, -1.0
    for i, c in enumerate(canons):
        sc = cosine_sim(vec, c["vec"])
        if sc > best_sc:
            best_sc, best_i = sc, i
    return best_i, best_sc
THRESH_CANON = 0.82
for term, vec in zip(unique_terms, term_vecs):
    if not canons:
        canons.append({"name": term, "vec": vec, "members": {term}})
        continue
    bi, sc = _find_bucket(vec)
    if sc >= THRESH_CANON:
        canons[bi]["members"].add(term)
    else:
        canons.append({"name": term, "vec": vec, "members": {term}})

TERM_TO_CANON: Dict[str, str] = {}
for c in canons:
    label = min(c["members"], key=len)
    for m in c["members"]:
        TERM_TO_CANON[m] = label
RESUME_CANON_TERMS = sorted(set(TERM_TO_CANON.get(t, t) for t in unique_terms))
STATE["canon"]["normalized_skills"] = RESUME_CANON_TERMS

# ---------------- 8) JD tokens (NO expansions) ----------------
jd = STATE["jd_snapshot"]
JD_MUST = jd.get("must_haves", []) or []
JD_REQ  = jd.get("required", []) or []
JD_PREF = jd.get("preferred", []) or []
JD_RESP = jd.get("responsibilities", []) or []

# ---------------- 9) Alignment (semantic, batched) ----------------
def _best_match(token: str, candidate_terms: List[str]) -> Dict[str, Any]:
    if not token or not candidate_terms:
        return {"match": None, "score": 0.0}
    pairs = [token] + candidate_terms
    vecs = embed_texts(pairs, batch_size=STATE["options"]["embed_batch_size"])
    tv = vecs[0]; cvs = vecs[1:]
    best_sc = -1.0; best_term = None
    for tm, vv in zip(candidate_terms, cvs):
        sc = cosine_sim(tv, vv)
        if sc > best_sc:
            best_sc = sc; best_term = tm
    return {"match": best_term, "score": float(best_sc)}

def _align_list(tokens: List[str]) -> List[Dict[str, Any]]:
    out = []
    STRONG = STATE["options"]["bert_strong"]
    PART   = STATE["options"]["bert_partial"]
    for t in tokens:
        bm = _best_match(t, RESUME_CANON_TERMS)
        status = "missing"
        if bm["score"] >= STRONG: status = "present_strong"
        elif bm["score"] >= PART: status = "present_partial"
        out.append({"name": t, "status": status, "evidence": bm["match"], "similarity": round(bm["score"], 3)})
    return out

STATE["jd_alignment"]["must_have"] = _align_list(JD_MUST)
STATE["jd_alignment"]["required"]  = _align_list(JD_REQ)
STATE["jd_alignment"]["preferred"] = _align_list(JD_PREF)

# responsibilities coverage via semantic best-hit
resp_cover = 0.0
highlights: List[str] = []
for r in STATE["timeline"]:
    for h in (r.get("highlights") or []):
        highlights.append(h)
for p in STATE["projects"]:
    if p.get("impact"): highlights.append(p["impact"])
if JD_RESP and highlights:
    rv = embed_texts(JD_RESP, batch_size=STATE["options"]["embed_batch_size"])
    hv = embed_texts(highlights[:50], batch_size=STATE["options"]["embed_batch_size"])  # cap 50 for efficiency
    PART = STATE["options"]["bert_partial"]
    hits = 0
    for rvec in rv:
        best = max(cosine_sim(rvec, hvec) for hvec in hv) if hv else 0.0
        hits += 1 if best >= PART else 0
    resp_cover = hits / max(1, len(JD_RESP))
STATE["jd_alignment"]["responsibilities"] = {"coverage": round(resp_cover, 3), "count": len(JD_RESP)}

# role & project relevance vs a single JD vector
jd_vec = embed_query(" | ".join(JD_REQ + JD_RESP)[:4000])

def _recency_weight(months: Optional[int]) -> float:
    if months is None: return 0.7
    if months <= 6: return 1.0
    if months <= 24: return 0.5 + 0.5 * (24 - months) / 18.0
    if months <= 60: return 0.3 + 0.2 * (60 - months) / 36.0
    return 0.3

now = _now()
for r in STATE["timeline"]:
    # compute recency months if missing
    end_dt = _parse_date_soft(r.get("end")) or now
    r["recency_months"] = max(0, (now.year - end_dt.year) * 12 + (now.month - end_dt.month))
    blob = " ".join([_clip(r.get("title")), _clip(r.get("company"))] + [ _clip(h) for h in (r.get("highlights") or []) ])
    rv = embed_query(blob)
    r["jd_relevance"] = float(cosine_sim(rv, jd_vec))

for p in STATE["projects"]:
    blob = " ".join([_clip(p.get("name"))] + [ _clip(t) for t in (p.get("tech") or []) ] + ([ _clip(p.get("impact")) ] if p.get("impact") else []))
    pv = embed_query(blob)
    p["jd_relevance"] = float(cosine_sim(pv, jd_vec))

print("[ok] Strict extraction + consolidation + semantic alignment complete.")
print(f"  - roles: {len(STATE['timeline'])}, projects: {len(STATE['projects'])}, skills (canonical observed): {len(STATE['canon']['normalized_skills'])}")
print(f"  - alignment: must_have={len(STATE['jd_alignment']['must_have'])}, req={len(STATE['jd_alignment']['required'])}, pref={len(STATE['jd_alignment']['preferred'])}")
print(f"  - responsibilities coverage≈{int(STATE['jd_alignment']['responsibilities']['coverage']*100)}%")


[ok] JD snapshot (strict) ready; no expansions were allowed.
[ok] Planning extraction for 2 chunk(s) within budget (used=1/6).
[ok] Strict extraction + consolidation + semantic alignment complete.
  - roles: 1, projects: 4, skills (canonical observed): 14
  - alignment: must_have=12, req=0, pref=7
  - responsibilities coverage≈100%


In [23]:
# ==== Cell 5: Scoring → Gates → Final Decision → Persist (Gemini-only embeddings; no extra LLM calls) ====
# Produces a 0–100 score, selected flag, detailed coverage/gaps/reasons, and persists final JSON.

import json, re
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional

opts = STATE["options"]
weights = opts["weights"]

def _now():
    return datetime.utcnow()

def _parse_date_soft(s: Optional[str]) -> Optional[datetime]:
    if not s: return None
    tl = str(s).strip().lower()
    if any(k in tl for k in ["present", "current", "now"]): return _now()
    from dateutil import parser
    try:
        dt = parser.parse(tl, default=datetime(2000,1,1), fuzzy=True)
        if 1900 <= dt.year <= 2100:
            return datetime(dt.year, dt.month if dt.month else 1, 1)
    except Exception:
        pass
    m = re.search(r"(20\d{2}|19\d{2})", tl)
    if m: return datetime(int(m.group(1)), 1, 1)
    return None

def _months_between(a: Optional[datetime], b: Optional[datetime]) -> int:
    if not a or not b: return 0
    return max(0, (b.year - a.year) * 12 + (b.month - a.month))

def _recency_weight(months: Optional[int]) -> float:
    if months is None: return 0.7
    if months <= 6: return 1.0
    if months <= 24: return 0.5 + 0.5 * (24 - months) / 18.0
    if months <= 60: return 0.3 + 0.2 * (60 - months) / 36.0
    return 0.3

# ---------- 1) Evidence depth (metrics/impact signals) ----------
highlights: List[str] = []
for r in STATE["timeline"]:
    highlights.extend(r.get("highlights") or [])
for p in STATE["projects"]:
    if p.get("impact"): highlights.append(p["impact"])

metric_pat = re.compile(r"\b\d+(?:\.\d+)?%|\b\d+(?:\.\d+)?(?:k|m|b)\b|\b\d{2,}\b|\$\s?\d[\d,]*(?:\.\d+)?", re.I)
metric_count = sum(len(metric_pat.findall(h or "")) for h in highlights)
# Smooth to [0,1] with gentle saturation: ~8+ metrics => 1.0
evidence_depth = min(1.0, metric_count / 8.0)

# ---------- 2) Seniority fit ----------
jd = STATE["jd_snapshot"]
req_years = None
try:
    req_years = int(jd.get("hard_gates", {}).get("min_years") or 0) or None
except Exception:
    req_years = None

yrs = STATE["high_level"].get("years_experience")
seniority_fit = 1.0
if yrs is not None and req_years is not None:
    # Treat 70% of JD years as near-fit for friendliness
    denom = max(1.0, 0.7 * req_years)
    seniority_fit = max(0.0, min(1.0, float(yrs) / denom))

# ---------- 3) Responsibilities overlap (already computed in Cell 4) ----------
resp_cover = float(STATE["jd_alignment"].get("responsibilities", {}).get("coverage", 0.0))

# ---------- 4) Role & Project alignment ----------
# Ensure recency_months & tenure months computed, and combine with jd_relevance
now = _now()
role_scores = []
for r in STATE["timeline"]:
    st = _parse_date_soft(r.get("start"))
    en = _parse_date_soft(r.get("end")) or now
    tenure_mo = _months_between(st, en) if st else 0
    r["tenure_months"] = tenure_mo
    rec_w = _recency_weight(r.get("recency_months", _months_between(en, now)))
    ten_w = min(1.0, (tenure_mo or 0) / 9.0)  # 9+ months ~ full credit
    rel = float(r.get("jd_relevance") or 0.0)  # 0..1 from Cell 4
    role_scores.append(rel * rec_w * ten_w)

role_alignment = sum(role_scores) / max(1, len(role_scores)) if role_scores else 0.0

proj_scores = [float(p.get("jd_relevance") or 0.0) for p in STATE["projects"]]
project_alignment = sum(proj_scores) / max(1, len(proj_scores)) if proj_scores else 0.0

# ---------- 5) Coverage (must-have / required / preferred) ----------
def _coverage(items: List[Dict[str, Any]]) -> float:
    if not items: return 0.0
    total = len(items)
    strong = sum(1 for x in items if x.get("status") == "present_strong")
    partial = sum(1 for x in items if x.get("status") == "present_partial")
    # partial credit weighting
    return (strong + 0.6 * partial) / max(1, total)

must_cov = _coverage(STATE["jd_alignment"].get("must_have", []))
req_cov  = _coverage(STATE["jd_alignment"].get("required", []))
pref_cov = _coverage(STATE["jd_alignment"].get("preferred", []))

# Small contextual boosts
if role_alignment >= 0.6: req_cov = min(1.0, req_cov + 0.05)
if project_alignment >= 0.6: pref_cov = min(1.0, pref_cov + 0.04)

# ---------- 6) Hard gates evaluation ----------
failed_gates: List[str] = []
gate_notes: List[str] = []
hg = jd.get("hard_gates", {}) or {}

# degree gate
degree_required = bool(hg.get("degree_required"))
has_degree = any((e.get("degree") or e.get("field")) for e in STATE["education"])
if degree_required and not has_degree:
    failed_gates.append("degree_required")
    gate_notes.append("JD requires a degree; none detected in education section")

# min years gate (hard gate; we already compute seniority_fit but also check)
if req_years is not None and (yrs is None or yrs + 0.01 < 0.6 * req_years):  # allow some leeway
    failed_gates.append("min_years_experience")
    gate_notes.append(f"Requires ~{req_years}y; inferred ~{yrs or 0}y")

# license gate (CPA/RN/PE/PMP/etc.)
licenses_needed = [str(x).lower() for x in (hg.get("license") or [])]
if licenses_needed:
    resume_lics = [str(c.get("name","")).lower() for c in STATE["certs"]]
    lic_missing = [ln for ln in licenses_needed if all(ln not in rl for rl in resume_lics)]
    if lic_missing:
        failed_gates.append("license_required")
        gate_notes.append(f"Missing license(s): {', '.join(lic_missing)}")

# work authorization (very light; text scan)
if hg.get("work_auth"):
    wanted = str(hg.get("work_auth")).lower()
    resume_text_lower = (STATE["raw"].get("resume_text") or "").lower()
    if wanted not in resume_text_lower:
        failed_gates.append("work_authorization")
        gate_notes.append(f"Work authorization required: {wanted}")

# clearance
if hg.get("clearance"):
    need = str(hg.get("clearance")).lower()
    have = (STATE["raw"].get("resume_text") or "").lower()
    if need not in have:
        failed_gates.append("security_clearance")
        gate_notes.append(f"Security clearance required: {need}")

# location mode (onsite/hybrid/remote) — treat mismatch as soft penalty unless explicitly "onsite only"
location_soft_penalty = 0
if hg.get("location_mode") == "onsite":
    city = (hg.get("onsite_city") or "").lower()
    # if resume locations don't mention the city, flag soft penalty
    resume_locs = [str(x).lower() for x in STATE.get("locations") or []]
    if city and all(city not in l for l in resume_locs):
        location_soft_penalty = 5  # soft, not a hard fail
        gate_notes.append(f"Onsite location preference '{city}' not evidenced in resume")

# shift/travel — informational; turn into soft penalties if extreme (optional)
shift_note = hg.get("shift")
travel_note = hg.get("travel")
if shift_note: gate_notes.append(f"Shift requirement: {shift_note}")
if travel_note: gate_notes.append(f"Travel requirement: {travel_note}")

STATE["gates"]["failed"] = failed_gates
STATE["gates"]["notes"] = gate_notes

# ---------- 7) Score computation ----------
score = (
    weights["must_have_coverage"]      * must_cov +
    weights["required_coverage"]       * req_cov +
    weights["preferred_coverage"]      * pref_cov +
    weights["role_alignment"]          * role_alignment +
    weights["project_alignment"]       * project_alignment +
    weights["evidence_depth"]          * evidence_depth +
    weights["seniority_fit"]           * seniority_fit +
    weights["responsibility_overlap"]  * resp_cover
)

# Soft penalties
missing_must = [x["name"] for x in STATE["jd_alignment"].get("must_have", []) if x["status"] == "missing"]
if len(missing_must) > 4:
    score -= 5  # too many missing must-haves
if degree_required and not has_degree:
    score -= 3  # extra nudge if JD explicitly asked for a degree
if metric_count == 0:
    score -= 3  # no measurable outcomes
score -= location_soft_penalty

# Hard gate cap
if failed_gates:
    score = min(score, float(opts.get("gate_hard_cap", 59)))

# Clamp and round
score_100 = int(round(max(0.0, min(100.0, score))))

# Selection rule
selected = (score_100 >= 70) and (len(failed_gates) == 0)

# ---------- 8) Reasons / Strong matches / Gaps / Risks ----------
def _names_with_status(items: List[Dict[str, Any]], status: str) -> List[str]:
    return [x["name"] for x in items if x.get("status") == status]

strong_from_must = _names_with_status(STATE["jd_alignment"].get("must_have", []), "present_strong")
strong_from_req  = _names_with_status(STATE["jd_alignment"].get("required", []), "present_strong")
strong_matches = sorted(set(strong_from_must + strong_from_req))[:20]

gaps = [x["name"] for x in STATE["jd_alignment"].get("must_have", []) if x["status"] == "missing"]
gaps += [x["name"] for x in STATE["jd_alignment"].get("required", []) if x["status"] == "missing"]
gaps = sorted(set(gaps))[:20]

reasons: List[str] = []
reasons.append(f"Must-have coverage: {int(round(100*must_cov))}% ; Required: {int(round(100*req_cov))}% ; Preferred: {int(round(100*pref_cov))}%")
if yrs is not None:
    if req_years is not None:
        reasons.append(f"Experience: {yrs} yrs vs JD ~{req_years} yrs (fit≈{int(round(100*seniority_fit))}%).")
    else:
        reasons.append(f"Experience: {yrs} yrs (JD years not specified).")
reasons.append(f"Responsibilities overlap≈{int(round(100*resp_cover))}% ; Evidence signals={metric_count}.")
reasons.append(f"Role alignment≈{int(round(100*role_alignment))}% ; Project alignment≈{int(round(100*project_alignment))}%.")

risk_flags: List[str] = []
if failed_gates:
    risk_flags.append("Hard gates failed: " + ", ".join(failed_gates))
if not STATE["contacts"].get("email"):
    risk_flags.append("No email detected")
if yrs is None:
    risk_flags.append("Years of experience could not be inferred")
if metric_count == 0:
    risk_flags.append("No quantifiable achievements detected")
if location_soft_penalty:
    risk_flags.append("Onsite location not evidenced")

# ---------- 9) Persist final ----------
STATE["final"] = {
    "score_100": score_100,
    "selected": bool(selected),
    "breakdown": {
        "must_have_coverage": round(weights["must_have_coverage"] * must_cov, 2),
        "required_coverage": round(weights["required_coverage"] * req_cov, 2),
        "preferred_coverage": round(weights["preferred_coverage"] * pref_cov, 2),
        "role_alignment": round(weights["role_alignment"] * role_alignment, 2),
        "project_alignment": round(weights["project_alignment"] * project_alignment, 2),
        "evidence_depth": round(weights["evidence_depth"] * evidence_depth, 2),
        "seniority_fit": round(weights["seniority_fit"] * seniority_fit, 2),
        "responsibility_overlap": round(weights["responsibility_overlap"] * resp_cover, 2),
        "soft_penalties": {
            "missing_many_must_haves": int(len(missing_must) > 4) * 5,
            "degree_missing_penalty": (3 if (degree_required and not has_degree) else 0),
            "zero_metrics_penalty": (3 if metric_count == 0 else 0),
            "location_penalty": location_soft_penalty,
        },
        "hard_gate_cap": (opts.get("gate_hard_cap", 59) if failed_gates else None),
    },
    "reasons": reasons,
    "strong_matches": strong_matches,
    "skill_gaps": gaps,
    "risk_flags": risk_flags,
    "gates": {
        "failed": failed_gates,
        "notes": gate_notes,
    },
}

if STATE["options"].get("keep_artifacts", True):
    Path(STATE["artifacts"]["base_dir"]).mkdir(parents=True, exist_ok=True)
    with open(STATE["artifacts"]["paths"]["final_json"], "w", encoding="utf-8") as f:
        json.dump(STATE["final"], f, ensure_ascii=False, indent=2)

print(f"[ok] Final score: {STATE['final']['score_100']}/100  | selected={STATE['final']['selected']}")
print("[ok] Breakdown:", json.dumps(STATE["final"]["breakdown"], indent=2))
print("[ok] Reasons:", *STATE["final"]["reasons"], sep="\n  - ")
print("[ok] Strong matches:", ", ".join(STATE["final"]["strong_matches"]) or "—")
print("[ok] Gaps:", ", ".join(STATE["final"]["skill_gaps"]) or "—")
print("[ok] Risk flags:", ", ".join(STATE["final"]["risk_flags"]) or "—")
print(f"[ok] Saved → {STATE['artifacts']['paths']['final_json']}")


[ok] Final score: 53/100  | selected=False
[ok] Breakdown: {
  "must_have_coverage": 30.0,
  "required_coverage": 0.0,
  "preferred_coverage": 8.0,
  "role_alignment": 0.0,
  "project_alignment": 2.73,
  "evidence_depth": 0.75,
  "seniority_fit": 8.0,
  "responsibility_overlap": 4.0,
  "soft_penalties": {
    "missing_many_must_haves": 0,
    "degree_missing_penalty": 0,
    "zero_metrics_penalty": 0,
    "location_penalty": 0
  },
  "hard_gate_cap": null
}
[ok] Reasons:
  - Must-have coverage: 100% ; Required: 0% ; Preferred: 100%
  - Responsibilities overlap≈100% ; Evidence signals=1.
  - Role alignment≈0% ; Project alignment≈34%.
[ok] Strong matches: algorithms, c++, cmake, data structures, exception handling, file handling, git, memory management, multithreading, object-oriented programming, sdlc, stl
[ok] Gaps: —
[ok] Risk flags: Years of experience could not be inferred
[ok] Saved → tmp\52d59f5aaedc\final.json
