### Adding path of the project folder in system variable to find modules

In [1]:
import sys
import os

# get project root path (parent of 'notebooks' directory)
root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
if root_path not in sys.path:
    sys.path.append(root_path)

print(sys.path)

['C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\python311.zip', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\DLLs', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\Lib', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0', 'd:\\Work\\Capstone_Project\\resume-nlp\\venv', '', 'd:\\Work\\Capstone_Project\\resume-nlp\\venv\\Lib\\site-packages', 'd:\\Work\\Capstone_Project\\resume-nlp']


### Step 1: Extract text from pdf

In [2]:
import re
from src.ingest import extract_text_pymupdf


text = extract_text_pymupdf('../data/Abhishek_Singh_Resume.pdf')
print(text[:400])
print('email found:', bool(re.search(r"[\w\.-]+@[\w\.-]+", text)))
print('phone found:', bool(re.search(r"\+?\d[\d\s\-()]{6,}\d", text)))
print('education header:', 'Education' in text or 'EDUCATION' in text)

Abhishek Singh
8010852459 | abhisheksingh.vizag@gmail.com | LinkedIn | GitHub | LeetCode
Education
•
VIT Bhopal University | CGPA 9.01
Oct 2022 – Present
Bachelor of Technology in Computer Science and Engineering
Bhopal, Madhya Pradesh
•
Higher Secondary Education | Grade: 95.4%
July 2021
Navy Children School, Goa
Vasco Da Gama, Goa
Technical Skills
• Languages/Databases: C++, Python, SQL
• Framew
email found: True
phone found: True
education header: True


- setting up some configuration for the project

In [3]:
# CONFIG
from pathlib import Path
DATA_ROOT = Path('../data/resumes_raw_pdf')
PDF_DIR = DATA_ROOT / 'pdfs'
TXT_DIR = DATA_ROOT / 'txt'
FAIL_DIR = DATA_ROOT / 'failures'
REPORT_JSON = DATA_ROOT / 'extraction_report.json'
REPORT_CSV = DATA_ROOT / 'extraction_report.csv'
for d in [PDF_DIR, TXT_DIR, FAIL_DIR]:
    d.mkdir(parents=True, exist_ok=True)


# How many resumes to download for testing (start small: 50-100)
N_SAMPLES = 100


# Toggle OCR fallback (requires system Tesseract and pytesseract)
ENABLE_OCR = False
OCR_LANGUAGE = 'eng'


# thresholds for checks
MIN_TEXT_CHARS = 200
MIN_ALPHA_RATIO = 0.2

In [None]:
# Option 1: set token for this notebook session and re-run the download cell
import os
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env file

token = os.getenv("HUGGINGFACE_HUB_TOKEN")   # <-- paste your token here
os.environ["HUGGINGFACE_HUB_TOKEN"] = token

# optional check who you are
from huggingface_hub import whoami
print("Authenticated as:", whoami(token=token).get("name"))

  from .autonotebook import tqdm as notebook_tqdm


Authenticated as: AbhishekProgrammer22


### Step 2: Selecting a dataset and proceeding with NER Topic modelling

In [25]:
# Use cached HF snapshot to copy PDFs -> run extraction and produce report
from huggingface_hub import list_repo_files, hf_hub_download
from pathlib import Path
import os, shutil, json, csv, traceback
from tqdm.auto import tqdm

# your extractors
from src.ingest import extract_text_pymupdf, extract_text_pdfplumber

REPO_ID = "d4rk3r/resumes-raw-pdf"
OUT_ROOT = Path("../data/resumes_raw_pdf_direct")
PDF_DIR = OUT_ROOT / "pdfs"
TXT_DIR = OUT_ROOT / "txt"
FAIL_DIR = OUT_ROOT / "failures"
REPORT_JSON = OUT_ROOT / "extraction_report.json"
REPORT_CSV  = OUT_ROOT / "extraction_report.csv"

for d in (OUT_ROOT, PDF_DIR, TXT_DIR, FAIL_DIR):
    d.mkdir(parents=True, exist_ok=True)

N_SAMPLES = 100          # change if you want fewer
MIN_TEXT_CHARS = 200
MIN_ALPHA_RATIO = 0.2

token = os.getenv("HUGGINGFACE_HUB_TOKEN")
if not token:
    raise RuntimeError("HUGGINGFACE_HUB_TOKEN missing - set it before running.")

# list files (we already saw this works)
all_files = list_repo_files(REPO_ID, repo_type="dataset", token=token)
pdf_files = [f for f in all_files if f.lower().endswith(".pdf")]
print(f"Found {len(pdf_files)} pdf files in the repo. Will copy/process first {min(N_SAMPLES, len(pdf_files))}.")

# helper: hf_hub_download returns a local cached path when available
def get_cached_path(fname):
    try:
        local = hf_hub_download(repo_id=REPO_ID, filename=fname, repo_type="dataset", token=token)
        return Path(local)
    except Exception as e:
        print("hf_hub_download failed for", fname, "->", type(e).__name__, str(e)[:200])
        return None

report = []
to_process = pdf_files[:min(N_SAMPLES, len(pdf_files))]

for idx, fname in enumerate(tqdm(to_process, desc="files")):
    rec = {"filename": Path(fname).name, "repo_path": fname, "downloaded": False,
           "extraction_method": None, "ocr_used": False, "error": None, "checks": None,
           "failure_reasons": [], "failure_file": None}
    try:
        cached = get_cached_path(fname)
        if cached is None:
            rec["error"] = "cache_lookup_failed"
            report.append(rec)
            continue

        # copy cached file to our PDF_DIR with normalized name
        tgt = PDF_DIR / f"resume_{idx:05d}.pdf"
        shutil.copyfile(cached, tgt)
        rec["downloaded"] = True

        # extract text: pymupdf primary, pdfplumber fallback
        txt = ""
        try:
            txt = extract_text_pymupdf(str(tgt))
            rec["extraction_method"] = "pymupdf"
            if len(txt) < MIN_TEXT_CHARS // 2:
                txt2 = extract_text_pdfplumber(str(tgt))
                if len(txt2) > len(txt):
                    txt = txt2
                    rec["extraction_method"] += "+pdfplumber"
        except Exception as e1:
            try:
                txt = extract_text_pdfplumber(str(tgt))
                rec["extraction_method"] = "pdfplumber"
            except Exception as e2:
                rec["error"] = f"both_extractors_failed: {e1} | {e2}"
                badf = FAIL_DIR / f"downloaded_but_extractfail_{idx:05d}.txt"
                with open(badf, "w", encoding="utf-8") as f:
                    f.write(f"Cached path: {cached}\\nErrors:\\n{e1}\\n{e2}")
                rec["failure_file"] = str(badf)
                report.append(rec)
                continue

        txt = (txt or "").replace("\r", "\n").strip()
        # save text
        with open(TXT_DIR / (tgt.stem + ".txt"), "w", encoding="utf-8") as f:
            f.write(txt)

        # checks
        import re
        EMAIL_RE = re.compile(r"[\w\.-]+@[\w\.-]+\.\w+")
        PHONE_RE = re.compile(r"\+?\d[\d\s\-\(\)]{6,}\d")
        SECTION_KEYWORDS = ['education','experience','skills','projects','certifications','publications','summary','objective']

        email = bool(EMAIL_RE.search(txt))
        phone = bool(PHONE_RE.search(txt))
        txt_low = txt.lower()
        sections = {kw: (kw in txt_low) for kw in SECTION_KEYWORDS}
        any_section = any(sections.values())
        letters = sum(c.isalpha() for c in txt)
        alpha_ratio = letters / max(1, len(txt))

        checks = {'email': email, 'phone': phone, 'sections': sections, 'any_section': any_section,
                  'len_chars': len(txt), 'alpha_ratio': alpha_ratio, 'preview': txt[:800].replace("\n","\\n")}
        rec["checks"] = checks

        failures = []
        if rec["error"]:
            failures.append("extractor_error")
        if checks['len_chars'] < MIN_TEXT_CHARS:
            failures.append("short_text")
        if checks['alpha_ratio'] < MIN_ALPHA_RATIO:
            failures.append("low_alpha_ratio")
        if not checks['any_section']:
            failures.append("no_key_section")
        if not (checks['email'] and checks['phone']):
            failures.append("missing_contact")

        rec["failure_reasons"] = failures
        if failures:
            fname_fail = FAIL_DIR / (tgt.stem + "_failure.txt")
            with open(fname_fail, "w", encoding="utf-8") as f:
                f.write(f"FILENAME: {tgt.name}\\nREPO_PATH: {fname}\\nEXTRACTION_METHOD: {rec['extraction_method']}\\nFAILURE_REASONS: {failures}\\n\\n---PREVIEW---\\n\\n")
                f.write(txt)
            rec["failure_file"] = str(fname_fail)

    except Exception as e:
        rec["error"] = f"fatal:{type(e).__name__}:{e}"
        rec["failure_file"] = None
    report.append(rec)

# save reports
with open(REPORT_JSON, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)
with open(REPORT_CSV, "w", newline="", encoding="utf-8") as csvf:
    writer = csv.writer(csvf)
    writer.writerow(['filename','repo_path','downloaded','extraction_method','len_chars','alpha_ratio','email','phone','any_key_section','failure_reasons','failure_file','error'])
    for r in report:
        ch = r.get('checks') or {}
        writer.writerow([r.get('filename'), r.get('repo_path'), r.get('downloaded'), r.get('extraction_method'),
                         ch.get('len_chars'), round(ch.get('alpha_ratio',0),3), ch.get('email'), ch.get('phone'), ch.get('any_section'),
                         ';'.join(r.get('failure_reasons',[])), r.get('failure_file',''), r.get('error','')])

# pretty summary
from collections import Counter
total = len(report)
failures = [r for r in report if r.get('failure_reasons')]
success = total - len(failures)
fail_reasons = Counter()
for r in failures:
    fail_reasons.update(r.get('failure_reasons',[]))

print("\\n=== SUMMARY ===")
print("Total files processed:", total)
print("Success (no detected failure reasons):", success, f"({round(100*success/total if total else 0,2)}%)")
print("Files with failures:", len(failures))
print("Failure reasons counts:", dict(fail_reasons))
print("Report saved ->", REPORT_JSON)
print("PDFs saved  ->", PDF_DIR)
print("Text saved  ->", TXT_DIR)
print("Failures    ->", FAIL_DIR)


Found 1940 pdf files in the repo. Will copy/process first 100.


files:  29%|██▉       | 29/100 [00:27<01:48,  1.53s/it]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
files:  31%|███       | 31/100 [00:32<02:08,  1.87s/it]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
files:  32%|███▏      | 32/100 [00:35<02:26,  2.15s/it]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
files:  36%|███▌      | 36/100 [00:41<01:42,  1.60s/it]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Fal

\n=== SUMMARY ===
Total files processed: 100
Success (no detected failure reasons): 1 (1.0%)
Files with failures: 99
Failure reasons counts: {'missing_contact': 99, 'no_key_section': 67, 'short_text': 24, 'low_alpha_ratio': 24}
Report saved -> ..\data\resumes_raw_pdf_direct\extraction_report.json
PDFs saved  -> ..\data\resumes_raw_pdf_direct\pdfs
Text saved  -> ..\data\resumes_raw_pdf_direct\txt
Failures    -> ..\data\resumes_raw_pdf_direct\failures





In [9]:
# Load HF dataset properly and write out text files per example
# Paste & run in the same notebook environment

from datasets import load_dataset
from pathlib import Path
import json, csv, shutil
from tqdm.auto import tqdm
import os

OUT_ROOT = Path("../data/resumes_preocr")
TXT_DIR = OUT_ROOT / "txt"
PDF_DIR = OUT_ROOT / "pdfs"   # may remain empty for this dataset
REPORT = OUT_ROOT / "report_preocr.json"
SUMMARY_CSV = OUT_ROOT / "summary_preocr.csv"

OUT_ROOT.mkdir(parents=True, exist_ok=True)
TXT_DIR.mkdir(exist_ok=True)
PDF_DIR.mkdir(exist_ok=True)

REPO_ID = "lhoestq/resumes-raw-pdf-for-ocr"
SPLIT = "train"   # dataset split
N_SAMPLES = 500   # adjust to how many you want to extract (max ~ full dataset size ~ 1585)

print(f"Loading dataset {REPO_ID} split={SPLIT} (this may take a minute)...")
ds = load_dataset(REPO_ID, split=SPLIT)

print("Dataset loaded. Columns:", ds.column_names, "Num examples:", len(ds))

# Which field contains text? Common names: 'text'
text_field = None
for candidate in ("text", "ocr", "page_text", "raw_text"):
    if candidate in ds.column_names:
        text_field = candidate
        break
# fallback: look for any string column
if text_field is None:
    for c in ds.column_names:
        # sample few rows to check if column is string-like and non-empty
        try:
            sample = ds[0].get(c)
            if isinstance(sample, str):
                text_field = c
                break
        except Exception:
            pass

if text_field is None:
    raise RuntimeError(f"Couldn't find a text column in dataset. Columns: {ds.column_names}")

print("Using text field:", text_field)

report = []
count = 0
for i, ex in enumerate(tqdm(ds, desc="writing text")):
    if count >= N_SAMPLES:
        break
    txt = ex.get(text_field) or ""
    # sometimes the text may be empty (filter as desired)
    if not txt or not txt.strip():
        # skip empty text entries (optionally you can save them)
        continue
    fname = f"sample_{count:05d}.txt"
    tgt = TXT_DIR / fname
    with open(tgt, "w", encoding="utf-8") as f:
        f.write(txt)
    report.append({"index": i, "filename": fname, "text_len": len(txt), "preview": txt[:500].replace("\n","\\n")})
    count += 1

# save the report json and a CSV summary with basic checks
with open(REPORT, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

import re
EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{6,12}")
SECTIONS = ['education','experience','skills','projects','certifications','publications','summary','objective','work experience']

rows = []
for r in report:
    txt = (TXT_DIR / r['filename']).read_text(encoding="utf-8", errors="ignore")
    email = bool(EMAIL_RE.search(txt))
    phone = bool(PHONE_RE.search(txt))
    any_section = any(k in txt.lower() for k in SECTIONS)
    rows.append({
        "file": r['filename'],
        "chars": len(txt),
        "email": email,
        "phone": phone,
        "any_section": any_section,
        "preview": txt[:400].replace("\n","\\n")
    })

# write CSV
import csv
with open(SUMMARY_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()) if rows else ["file"])
    writer.writeheader()
    for row in rows:
        writer.writerow(row)

print(f"Saved {len(report)} text samples to {TXT_DIR}")
print("Report:", REPORT)
print("Summary CSV:", SUMMARY_CSV)


Loading dataset lhoestq/resumes-raw-pdf-for-ocr split=train (this may take a minute)...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating train split: 100%|██████████| 1585/1585 [00:00<00:00, 3149.88 examples/s]


Dataset loaded. Columns: ['label', 'images', 'text'] Num examples: 1585
Using text field: text


writing text:  32%|███▏      | 500/1585 [00:03<00:07, 139.17it/s]


Saved 500 text samples to ..\data\resumes_preocr\txt
Report: ..\data\resumes_preocr\report_preocr.json
Summary CSV: ..\data\resumes_preocr\summary_preocr.csv


##### Multilingual NER using a different model than SpaCy

In [1]:
# === Multilingual NER for PERSON names (fixes name extraction) ===
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import json
from pathlib import Path
from tqdm.auto import tqdm

ROOT = Path("../data/resumes_preocr")
TXT_DIR = ROOT / "txt"
NER_ENH = ROOT / "ner_enhanced.json"
OUT = ROOT / "ner_fixed_names.json"

# load previous enhanced NER
with open(NER_ENH, "r", encoding="utf-8") as f:
    data = json.load(f)

MODEL = "Davlan/xlm-roberta-base-ner-hrl"   # multilingual NER (very good on PERSON)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForTokenClassification.from_pretrained(MODEL)
ner_pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

def load_text(fname):
    p = TXT_DIR / fname
    return p.read_text(encoding="utf-8", errors="ignore") if p.exists() else ""

def extract_person_name(text):
    """Return first PERSON detected by multilingual NER."""
    try:
        ents = ner_pipe(text[:800])  # only beginning of CV for speed
    except:
        return None
    persons = [e["word"] for e in ents if e["entity_group"] == "PER"]
    return persons[0] if persons else None

fixed = []
for entry in tqdm(data):
    txt = load_text(entry["file"])
    person = extract_person_name(txt)
    entry["primary_name_fixed"] = person if person else entry["primary_name"]
    fixed.append(entry)

with open(OUT, "w", encoding="utf-8") as f:
    json.dump(fixed, f, indent=2)

print("Saved fixed names to:", OUT)
print("Sample:", fixed[0]["primary_name"], "->", fixed[0]["primary_name_fixed"])


  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu
100%|██████████| 500/500 [02:33<00:00,  3.25it/s]

Saved fixed names to: ..\data\resumes_preocr\ner_fixed_names.json
Sample: Nguyen Dang Binh -> Nguyen Dang Binh





In [2]:
# 1) Name-cleaning heuristics
import json, re
from pathlib import Path

ROOT = Path("../data/resumes_preocr")
TXT_DIR = ROOT / "txt"
NER_FIXED = ROOT / "ner_fixed_names.json"   # or ner_fixed_names_spacy.json if you used spaCy fallback
OUT_CLEAN = ROOT / "ner_fixed_names_clean.json"

BAD_NAME_PATTERNS = [
    re.compile(r"^\s*$"),
    re.compile(r"^(?:ph|pv|hr|cv)$", re.I),
    re.compile(r"^\d{4}[\s\-:]\d{4}$"),    # year ranges
    re.compile(r"^(?:jun|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b", re.I),
    re.compile(r"^(?:\d{1,2}[:/.-])"),     # starts with numbers like dates
    re.compile(r"^[\W_]+$")                # punctuation-only
]

def looks_bad_name(s):
    if s is None: return True
    s = s.strip()
    if len(s) < 2 or len(s.split()) > 6 and len(s) > 80:
        # too short or suspiciously long
        return True if len(s) < 2 or len(s) > 80 else False
    low = s.lower()
    if any(p.search(s) for p in BAD_NAME_PATTERNS):
        return True
    # contains words that are headings
    if any(k in low for k in ("experience","objective","summary","profile","address","phone","email","marital","date","present","current","managed")):
        return True
    # contains many digits -> bad
    if sum(c.isdigit() for c in s) / max(1,len(s)) > 0.15:
        return True
    return False

def fallback_name_from_text(text):
    # return first plausible short line near top that isn't a heading
    for ln in text.splitlines()[:12]:
        ln = ln.strip()
        if not ln: continue
        if len(ln) > 100: continue
        low = ln.lower()
        if any(k in low for k in ("objective","cv","curriculum","resume","skills","experience","education","address","phone","email","profile","summary","contact")):
            continue
        # filter lines that are mostly dates/locations or bullets
        if re.match(r"^[\-\u2022\•\*]\s*", ln): 
            # remove bullet char and keep going
            ln = re.sub(r"^[\-\u2022\•\*]\s*", "", ln).strip()
        if len(ln) < 3 or len(ln) > 80:
            continue
        # good candidate
        return ln
    return None

# load
with open(NER_FIXED, "r", encoding="utf-8") as f:
    data = json.load(f)

cleaned = []
fix_count = 0
for entry in data:
    orig = entry.get("primary_name_fixed") or entry.get("primary_name") or ""
    if looks_bad_name(orig):
        txt_path = TXT_DIR / entry["file"]
        txt = txt_path.read_text(encoding="utf-8", errors="ignore") if txt_path.exists() else ""
        fallback = fallback_name_from_text(txt)
        if fallback:
            entry["primary_name_clean"] = fallback
            fix_count += 1
        else:
            entry["primary_name_clean"] = None
    else:
        entry["primary_name_clean"] = orig
    cleaned.append(entry)

with open(OUT_CLEAN, "w", encoding="utf-8") as f:
    json.dump(cleaned, f, indent=2)

print(f"Saved cleaned NER with 'primary_name_clean' -> {OUT_CLEAN}  (fixed {fix_count} names)")
# show a small sample mapping
for e in cleaned[:6]:
    print(e["file"], "->", e.get("primary_name"), "->", e.get("primary_name_fixed"), "->", e.get("primary_name_clean"))


Saved cleaned NER with 'primary_name_clean' -> ..\data\resumes_preocr\ner_fixed_names_clean.json  (fixed 83 names)
sample_00000.txt -> Nguyen Dang Binh -> Nguyen Dang Binh -> Nguyen Dang Binh
sample_00001.txt -> Phường Lái Thiêu -> Phường Lái Thiêu -> Phường Lái Thiêu
sample_00002.txt -> Kumar Tiwari -> Akhilesh Kumar Tiwari -> Akhilesh Kumar Tiwari
sample_00003.txt -> Thiện Chí Trần
IOS -> Thiện Chí Trần -> Thiện Chí Trần
sample_00004.txt -> Địa -> ĐINH PHƯƠNG HUYỀN -> ĐINH PHƯƠNG HUYỀN
sample_00005.txt -> nghiệp chuyên -> Lê Trung Giang -> Lê Trung Giang


### Step 3: Generating BERT sentence embedding for matching cosine similarity

In [3]:
# 2) Regenerate embeddings using primary_name_clean
from sentence_transformers import SentenceTransformer
import numpy as np, json
from pathlib import Path
from tqdm.auto import tqdm

ROOT = Path("../data/resumes_preocr")
TXT_DIR = ROOT / "txt"
CLEAN = ROOT / "ner_fixed_names_clean.json"
EMB_DIR = ROOT / "embeddings"
EMB_DIR.mkdir(parents=True, exist_ok=True)
MODEL_NAME = "all-mpnet-base-v2"   # good BERT-like sentence embedder

# load cleaned metadata
with open(CLEAN, "r", encoding="utf-8") as f:
    meta = json.load(f)

model = SentenceTransformer(MODEL_NAME)
print("Model loaded:", MODEL_NAME)

def load_text(fname, n_chars=1600):
    p = TXT_DIR / fname
    return p.read_text(encoding="utf-8", errors="ignore")[:n_chars] if p.exists() else ""

count = 0
for entry in tqdm(meta, desc="Embedding (clean names)"):
    fname = entry["file"]
    name = entry.get("primary_name_clean") or ""
    snippet = load_text(fname)
    skills = ", ".join(entry.get("skills",[])[:12])
    orgs = ", ".join(entry.get("orgs",[])[:6])
    combined = (name + "\n\n" + snippet + "\n\nSkills: " + skills + "\nOrgs: " + orgs).strip()
    if not combined:
        combined = snippet or " "
    emb = model.encode(combined, show_progress_bar=False)
    npy_path = EMB_DIR / (Path(fname).stem + ".npy")
    np.save(npy_path, emb)
    count += 1

# write emb_index.json
emb_index = [{"file": e["file"], "npy": str(EMB_DIR / (Path(e["file"]).stem + ".npy"))} for e in meta]
with open(ROOT / "emb_index.json", "w", encoding="utf-8") as f:
    json.dump(emb_index, f, indent=2)

print("Regenerated embeddings for", count, "resumes; emb_index updated at", ROOT / "emb_index.json")


Model loaded: all-mpnet-base-v2


Embedding (clean names): 100%|██████████| 500/500 [04:36<00:00,  1.81it/s]

Regenerated embeddings for 500 resumes; emb_index updated at ..\data\resumes_preocr\emb_index.json





### Step 4: Scoring resumes using custom scoring heurisitcs

In [4]:
# 3) Recompute refined scoring using cleaned names
import json, re, csv
from pathlib import Path
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

ROOT = Path("../data/resumes_preocr")
TXT_DIR = ROOT / "txt"
CLEAN = ROOT / "ner_fixed_names_clean.json"
EMB_INDEX = ROOT / "emb_index.json"
OUT_JSON = ROOT / "resume_scores_refined_clean.json"
OUT_CSV = ROOT / "leaderboard_refined_clean.csv"

# loads
with open(CLEAN, "r", encoding="utf-8") as f: meta = json.load(f)
with open(EMB_INDEX, "r", encoding="utf-8") as f: emb_index = json.load(f)

file_to_entry = {e["file"]: e for e in meta}
file_to_npy = {e["file"]: e["npy"] for e in emb_index}

# Job description embedding (same JD or modify)
JOB_DESC = """
We are hiring an NLP Engineer with strong Python skills, proven experience with PyTorch and Transformers,
solid foundations in NLP methods (tokenization, attention, sequence models), and experience deploying models to production.
Experience with ML pipelines, REST APIs, and cloud deployment (AWS/GCP) is a plus.
"""
MODEL_NAME = "all-mpnet-base-v2"
model = SentenceTransformer(MODEL_NAME)
job_emb = model.encode(JOB_DESC)

# small helper to estimate experience (reuse previous helper)
YEARS_PHRASE_RE = re.compile(r"(\d+)\s+(?:years|yrs)\b", re.I)
def estimate_experience(text):
    m = YEARS_PHRASE_RE.search(text)
    if m:
        try: return min(int(m.group(1)), 40)
        except: pass
    # fallback: look for 2010-2014 style
    m2 = re.search(r"((19|20)\d{2})\s*[\-–—]\s*((19|20)\d{2})", text)
    if m2:
        try:
            return min(int(m2.group(3)) - int(m2.group(1)), 40)
        except: pass
    return 0

# scoring weights (tweak as needed)
WEIGHT_CONTACT = 8
WEIGHT_SKILL = 5
WEIGHT_SIM = 45
WEIGHT_NAME_QUALITY = 6
WEIGHT_LANG_MATCH = 6
WEIGHT_EXP_PER_YEAR = 1.2
CAP_SKILLS = 6
CAP_EXP = 15

results = []
for rec in emb_index:
    fname = rec["file"]
    npy = rec["npy"]
    if fname not in file_to_entry: continue
    entry = file_to_entry[fname]
    try:
        emb = np.load(npy)
    except Exception:
        continue
    raw_sim = float(cosine_similarity([emb], [job_emb])[0,0])
    sim_norm = (raw_sim + 1)/2.0
    contact_bonus = WEIGHT_CONTACT if entry.get("contact_status") == "found" else 0
    n_skills = len(entry.get("skills",[]))
    skill_score = min(CAP_SKILLS, n_skills) * WEIGHT_SKILL
    # name_quality heuristic: prefer names with at least 2 alphabetic words
    name = entry.get("primary_name_clean") or ""
    name_quality = 1.0 if (len(name.split())>=2 and re.search(r"[A-Za-z\u00C0-\u017F]", name)) else 0.6 if len(name.split())==1 and re.search(r"[A-Za-z\u00C0-\u017F]", name) else 0
    name_score = name_quality * WEIGHT_NAME_QUALITY
    # language bonus
    lang_bonus = WEIGHT_LANG_MATCH if entry.get("language","") == "en" else 0
    # experience
    text = (TXT_DIR / fname).read_text(encoding="utf-8", errors="ignore") if (TXT_DIR/ fname).exists() else ""
    exp = estimate_experience(text)
    exp_score = min(exp, CAP_EXP) * WEIGHT_EXP_PER_YEAR
    sim_score = sim_norm * WEIGHT_SIM
    total = contact_bonus + skill_score + sim_score + name_score + lang_bonus + exp_score
    results.append({
        "file": fname,
        "primary_name": name,
        "contact_status": entry.get("contact_status","missing"),
        "n_skills": n_skills,
        "sim_raw": round(raw_sim,4),
        "sim_norm": round(sim_norm,4),
        "name_quality": round(name_quality,3),
        "exp_years": exp,
        "score": round(total,3),
        "top_skills": entry.get("skills",[])[:8],
        "preview": entry.get("original_preview","")[:300].replace("\n","\\n")
    })

results_sorted = sorted(results, key=lambda x: x["score"], reverse=True)

# save
with open(OUT_JSON, "w", encoding="utf-8") as f: json.dump(results_sorted, f, indent=2)
with open(OUT_CSV, "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["score","sim_raw","sim_norm","name_quality","exp_years","contact_status","n_skills","primary_name","file","top_skills","preview"])
    writer.writeheader()
    for r in results_sorted: writer.writerow({k:r.get(k,"") for k in writer.fieldnames})

# print top 10
print("\nCleaned Refined Top 10 resumes:")
for s in results_sorted[:10]:
    print(f"{s['score']:7.3f} | sim={s['sim_raw']:.3f} ({s['sim_norm']:.3f}) | nameQ={s['name_quality']:.2f} | exp={s['exp_years']:2} | contact={s['contact_status']:<7} | skills={s['n_skills']:2} | {s['primary_name'][:40]:40} | {s['file']}")
print("\nSaved:", OUT_JSON, "and", OUT_CSV)



Cleaned Refined Top 10 resumes:
101.219 | sim=0.476 (0.738) | nameQ=1.00 | exp=25 | contact=found   | skills= 9 | LE HOANG                                 | sample_00419.txt
 99.868 | sim=0.416 (0.708) | nameQ=1.00 | exp=17 | contact=found   | skills=10 | Hoàng Quang Hưng                         | sample_00218.txt
 98.123 | sim=0.339 (0.669) | nameQ=1.00 | exp=40 | contact=found   | skills= 8 | Doan Minh Hoang                          | sample_00220.txt
 97.977 | sim=0.332 (0.666) | nameQ=1.00 | exp=17 | contact=found   | skills= 9 | Nguyen Ngoc Dang                         | sample_00159.txt
 96.126 | sim=0.517 (0.758) | nameQ=1.00 | exp=10 | contact=found   | skills= 9 | NGUYEN VAN HUONG
Day                     | sample_00261.txt
 92.210 | sim=0.432 (0.716) | nameQ=1.00 | exp=16 | contact=missing | skills=11 | Chung Vi Huy                             | sample_00267.txt
 91.011 | sim=0.378 (0.689) | nameQ=1.00 | exp=15 | contact=missing | skills= 6 | DINH NGUYEN DANG KHOA            

##### OLD NER Technique using SpaCy -> Drawback: Only english compatible

In [10]:
# Step 3 (run): SpaCy NER + rule-based extraction
import json, re, sys
from pathlib import Path
from tqdm.auto import tqdm

# ensure spaCy model installed
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
except Exception as e:
    print("spaCy model en_core_web_sm not found. Install it and re-run:")
    print("    python -m spacy download en_core_web_sm")
    raise

ROOT = Path("../data/resumes_preocr")
TXT_DIR = ROOT / "txt"
OUT_EXTRACT = ROOT / "ner_extract.json"

EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{6,12}")

# small initial skills vocabulary — expand with your domain
SKILLS = {
    "python","java","c++","c","sql","nlp","natural language processing","deep learning",
    "machine learning","tensorflow","pytorch","react","docker","kubernetes","git",
    "aws","gcp","azure","linux","pandas","numpy","scikit-learn","keras"
}

def extract_skills(text):
    tl = text.lower()
    found = []
    for s in SKILLS:
        if s in tl:
            found.append(s)
    return sorted(found)

results = []
files = sorted(TXT_DIR.glob("*.txt"))
print("Processing", len(files), "text files with spaCy NER...")

for p in tqdm(files, desc="NER"):
    txt = p.read_text(encoding="utf-8", errors="ignore")
    doc = nlp(txt)
    names = []
    orgs = []
    gpes = []
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            names.append(ent.text)
        elif ent.label_ in ("ORG","NORP"):
            orgs.append(ent.text)
        elif ent.label_ == "GPE":
            gpes.append(ent.text)
    emails = EMAIL_RE.findall(txt)
    phones = PHONE_RE.findall(txt)
    skills = extract_skills(txt)
    results.append({
        "file": p.name,
        "names": list(dict.fromkeys(names))[:3],
        "orgs": list(dict.fromkeys(orgs))[:6],
        "locations": list(dict.fromkeys(gpes))[:4],
        "emails": emails,
        "phones": phones,
        "skills": skills,
        "chars": len(txt),
        "preview": txt[:600].replace("\n","\\n")
    })

# save results
with open(OUT_EXTRACT, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

# print quick stats
total = len(results)
emails_found = sum(1 for r in results if r["emails"])
phones_found = sum(1 for r in results if r["phones"])
skills_found = sum(1 for r in results if r["skills"])
avg_len = sum(r["chars"] for r in results) / total if total else 0

print("\n=== NER SUMMARY ===")
print(f"Files processed: {total}")
print(f"Emails found: {emails_found}")
print(f"Phones found: {phones_found}")
print(f"Files with detected skills: {skills_found}")
print(f"Average text length: {avg_len:.0f} chars")
print("Saved ner extract ->", OUT_EXTRACT)
print("\nSample entry (first file):")
if results:
    import pprint
    pprint.pprint(results[0], compact=True, width=120)
else:
    print("No text files found in", TXT_DIR)


Processing 500 text files with spaCy NER...


NER: 100%|██████████| 500/500 [03:54<00:00,  2.13it/s]


=== NER SUMMARY ===
Files processed: 500
Emails found: 1
Phones found: 43
Files with detected skills: 499
Average text length: 3762 chars
Saved ner extract -> ..\data\resumes_preocr\ner_extract.json

Sample entry (first file):
{'chars': 2812,
 'emails': [],
 'file': 'sample_00000.txt',
 'locations': ['Hanoi', 'Robot', 'Tkinter', 'Taiwan'],
 'names': ['Nguyen Dang Binh', 'Luster LightTech', 'Debug'],
 'orgs': ['AI/Computer Vision Engineer\nAddress', 'Vision Software Senior', 'BacNinh', 'AOI', 'Luxshare-ICT',
          'Medical Image Segmentation'],
 'phones': [],
 'preview': 'Nguyen Dang Binh – AI/Computer Vision Engineer\\nAddress: Bac Ninh City\\nE '
            'mail:\\nMobile/Zalo:\\nBirthday: Jan 8th 1986\\nK EY STRENGTHS\\nMachine vision \uf020Team '
            'Leader\\n\uf0a7\uf020\\n\uf0a7\uf020AI engineer Automation software\\nP ROFESSIONAL EXPERIENCE\\nVision '
            'Software Senior (Aug 2023 – Now): Luster LightTech, BacNinh, VN:\\n\uf0d8 Develop the low-code softwa




In [11]:
# Post-process NER outputs: normalize names, improve phone/email detection, expand skills, detect language
import json, re, os
from pathlib import Path
from collections import Counter
from pprint import pprint

ROOT = Path("../data/resumes_preocr")
NER_IN = ROOT / "ner_extract.json"
NER_OUT = ROOT / "ner_enhanced.json"
TXT_DIR = ROOT / "txt"

# load NER results
with open(NER_IN, "r", encoding="utf-8") as f:
    ner = json.load(f)

# stronger regexes
EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
# phone: look for sequences of digits with common separators, allow country code, require 7-15 digits total
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{1,4}\)?[-.\s]?)?(?:\d[-.\s]?){6,14}\d")
LINKEDIN_RE = re.compile(r"(linkedin\.com/[A-Za-z0-9_\-./]+|linkedin:[A-Za-z0-9_\-/]+)", re.I)
GITHUB_RE = re.compile(r"(github\.com/[A-Za-z0-9_.\-]+|github:[A-Za-z0-9_.\-]+)", re.I)

# expanded skills vocabulary (extendable)
MORE_SKILLS = [
    "python","java","c++","c","c#","sql","nlp","natural language processing","deep learning",
    "machine learning","tensorflow","pytorch","keras","scikit-learn","spark","hadoop",
    "pandas","numpy","matplotlib","seaborn","docker","kubernetes","aws","gcp","azure",
    "react","node.js","express","flask","django","rest api","graphql","git","linux",
    "bash","mlops","computer vision","opencv","pandas","communication","leadership",
    "excel","tableau","power bi","spark","hive","bigquery","seo","marketing","sales"
]
# normalize to lowercase for substring matching
MORE_SKILLS = list(dict.fromkeys([s.lower() for s in MORE_SKILLS]))

# optional fuzzy matching: use rapidfuzz if installed
try:
    from rapidfuzz import process, fuzz
    FUZZY_AVAILABLE = True
except Exception:
    FUZZY_AVAILABLE = False

# helper: load raw text for a sample file if present
def load_text_for_file(filename):
    txt_path = TXT_DIR / filename
    if txt_path.exists():
        return txt_path.read_text(encoding="utf-8", errors="ignore")
    return ""

# helper: clean orgs (remove entries that look like addresses or job titles)
JOB_TITLE_KEYWORDS = set(["engineer","developer","manager","senior","lead","intern","assistant","consultant",
                          "officer","analyst","specialist","architect","director","president","coordinator",
                          "supervisor"])
def clean_org_list(orgs):
    cleaned = []
    for o in orgs:
        s = o.strip()
        # skip if empty or obviously a sentence fragment or contains newline markers
        if not s or len(s) < 2:
            continue
        # skip if it contains 'address' or 'mobile' or 'email' (likely not org)
        low = s.lower()
        if any(x in low for x in ("address","mobile","email","phone","birthday","birth","cv","c.v","objective","profile")):
            continue
        # skip if it's too long garbage with many punctuation characters
        punct_ratio = sum(1 for ch in s if not ch.isalnum() and not ch.isspace()) / max(1,len(s))
        if punct_ratio > 0.25 and len(s) < 50:
            continue
        # optionally skip entries that are likely job titles (we want organizations)
        if any(jk in low for jk in JOB_TITLE_KEYWORDS):
            # allow if it contains a comma and an apparent company name later
            if "," not in s and len(s.split()) < 6:
                # likely a job title, skip
                continue
        cleaned.append(s)
    # dedupe keeping order
    seen = set()
    out = []
    for o in cleaned:
        if o not in seen:
            out.append(o); seen.add(o)
    return out

# helper: expand skills by substring (and fuzzy if available)
def extract_skills_from_text(text):
    tl = text.lower()
    found = set()
    for s in MORE_SKILLS:
        if s in tl:
            found.add(s)
    # fuzzy: if available, match tokens
    if FUZZY_AVAILABLE and not found:
        # take top fuzzy matches for single-word tokens
        tokens = set(re.findall(r"[A-Za-z0-9+#\.\-]+", tl))
        for tok in tokens:
            best = process.extractOne(tok, MORE_SKILLS, scorer=fuzz.partial_ratio)
            if best and best[1] >= 90:
                found.add(best[0])
    return sorted(found)

# try langdetect if available
try:
    from langdetect import detect, DetectorFactory
    DetectorFactory.seed = 0
    LANGDET_AVAILABLE = True
except Exception:
    LANGDET_AVAILABLE = False

# process and enhance
enhanced = []
counters = Counter()
for entry in ner:
    fname = entry.get("file")
    raw_text = load_text_for_file(fname)
    # combine original fields and raw text for better detection
    emails = entry.get("emails", []) or EMAIL_RE.findall(raw_text)
    phones = entry.get("phones", []) or PHONE_RE.findall(raw_text)
    linkedin = LINKEDIN_RE.findall(raw_text) or LINKEDIN_RE.findall(" ".join(entry.get("orgs",[])))
    github = GITHUB_RE.findall(raw_text) or GITHUB_RE.findall(" ".join(entry.get("orgs",[])))
    # choose primary name: prefer first PERSON entity; if no person, try first line of text
    primary_name = None
    if entry.get("names"):
        # pick first name-like token but avoid when it's obviously organization/job (heuristic)
        for nm in entry["names"]:
            if nm and len(nm) > 1 and not any(t.lower() in nm.lower() for t in ("engineer","developer","company","address","mobile")):
                primary_name = nm.strip()
                break
        if primary_name is None:
            primary_name = entry["names"][0].strip()
    if not primary_name and raw_text:
        # take the first non-empty line up to 80 chars as a fallback
        for ln in raw_text.splitlines():
            ln = ln.strip()
            if ln and len(ln) < 80:
                primary_name = ln
                break
    # clean orgs
    orgs_clean = clean_org_list(entry.get("orgs",[]))
    # expand skills
    text_for_skills = " ".join([raw_text, entry.get("preview","")])
    skills = sorted(set(entry.get("skills",[])) | set(extract_skills_from_text(text_for_skills)))
    # detect language
    lang = "unknown"
    if LANGDET_AVAILABLE and raw_text.strip():
        try:
            lang = detect(raw_text[:2000])
        except Exception:
            lang = "unknown"
    # contact status
    if emails or phones:
        contact_status = "found"
    else:
        # can't reliably detect redaction here (we didn't run redaction detector on this dataset)
        contact_status = "missing"
    # assemble enhanced entry
    e = {
        "file": fname,
        "primary_name": primary_name or "",
        "orgs": orgs_clean,
        "emails": emails,
        "phones": phones,
        "linkedin": linkedin,
        "github": github,
        "skills": skills,
        "chars": entry.get("chars", 0),
        "language": lang,
        "contact_status": contact_status,
        "original_preview": entry.get("preview","")
    }
    enhanced.append(e)
    counters.update([contact_status])
    if emails: counters.update(["has_email"])
    if phones: counters.update(["has_phone"])
    counters.update(["lang:"+lang])

# save
with open(NER_OUT, "w", encoding="utf-8") as f:
    json.dump(enhanced, f, indent=2)

# print a concise summary
total = len(enhanced)
print("Wrote enhanced NER ->", NER_OUT)
print("Total entries:", total)
print("Contact found:", counters["found"] if "found" in counters else sum(1 for e in enhanced if e["contact_status"]=="found"))
print("States:", {k:v for k,v in counters.items() if k.startswith("lang:") or k in ('has_email','has_phone')})
print("\nSample enhanced entry (first):")
pprint(enhanced[0])


Wrote enhanced NER -> ..\data\resumes_preocr\ner_enhanced.json
Total entries: 500
Contact found: 152
States: {'lang:en': 183, 'lang:vi': 315, 'has_phone': 152, 'lang:nl': 1, 'lang:zh-cn': 1, 'has_email': 1}

Sample enhanced entry (first):
{'chars': 2812,
 'contact_status': 'missing',
 'emails': [],
 'file': 'sample_00000.txt',
 'github': [],
 'language': 'en',
 'linkedin': [],
 'orgs': ['BacNinh', 'AOI', 'Luxshare-ICT', 'Medical Image Segmentation'],
 'original_preview': 'Nguyen Dang Binh – AI/Computer Vision '
                     'Engineer\\nAddress: Bac Ninh City\\nE '
                     'mail:\\nMobile/Zalo:\\nBirthday: Jan 8th 1986\\nK EY '
                     'STRENGTHS\\nMachine vision \uf020Team '
                     'Leader\\n\uf0a7\uf020\\n\uf0a7\uf020AI engineer '
                     'Automation software\\nP ROFESSIONAL EXPERIENCE\\nVision '
                     'Software Senior (Aug 2023 – Now): Luster LightTech, '
                     'BacNinh, VN:\\n\uf0d8 Develop th

In [3]:
# === Create BERT-based embeddings (all-mpnet-base-v2) and compute baseline scores ===
from sentence_transformers import SentenceTransformer
import numpy as np, json, os
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

ROOT = Path("../data/resumes_preocr")
TXT_DIR = ROOT / "txt"
NER_ENH = ROOT / "ner_enhanced.json"
EMB_DIR = ROOT / "embeddings"
EMB_DIR.mkdir(parents=True, exist_ok=True)

# 1) load enhanced NER entries
with open(NER_ENH, "r", encoding="utf-8") as f:
    ner = json.load(f)

# helper to read resume text
def load_text(fname):
    p = TXT_DIR / fname
    return p.read_text(encoding="utf-8", errors="ignore") if p.exists() else ""

# 2) select BERT-style sentence-transformer model (MPNet = BERT-family)
MODEL_NAME = "all-mpnet-base-v2"   # strong BERT-like sentence embedding model
print("Loading model:", MODEL_NAME)
model = SentenceTransformer(MODEL_NAME)

# 3) build & save embeddings for each resume (uses name + chunk + skills + orgs)
emb_index = []
for entry in tqdm(ner, desc="Embedding resumes"):
    fname = entry["file"]
    txt = load_text(fname) or ""
    snippet = txt[:1600]  # representative chunk; increase if you'd like
    name = entry.get("primary_name","") or ""
    skills_str = ", ".join(entry.get("skills",[]))
    orgs_str = ", ".join(entry.get("orgs",[]))
    combined = (name + "\n\n" + snippet + "\n\nSkills: " + skills_str + "\nOrgs: " + orgs_str).strip()
    if not combined:
        combined = " "
    emb = model.encode(combined, show_progress_bar=False)
    npy_path = EMB_DIR / (Path(fname).stem + ".npy")
    np.save(npy_path, emb)
    emb_index.append({"file": fname, "npy": str(npy_path)})

# save emb index
with open(ROOT / "emb_index.json", "w", encoding="utf-8") as f:
    json.dump(emb_index, f, indent=2)

print("Saved embeddings:", len(emb_index), "->", EMB_DIR)

# 4) Job description (edit this to match the role you want)
JOB_DESC = """
We are hiring an NLP Engineer with strong Python skills, proven experience with PyTorch and Transformers,
solid foundations in NLP methods (tokenization, attention, sequence models), and experience deploying models to production.
Experience with ML pipelines, REST APIs, and cloud deployment (AWS/GCP) is a plus.
"""

job_emb = model.encode(JOB_DESC)

# 5) scoring parameters (adjustable later)
WEIGHT_CONTACT = 12        # bonus if contact exists
WEIGHT_SKILL = 6           # per matched skill (cap)
WEIGHT_SIM = 55            # multiplier for similarity score
CAP_SKILLS = 5

# 6) compute similarity + baseline score
file_to_entry = {e["file"]: e for e in ner}
scores = []
for rec in emb_index:
    fname = rec["file"]
    emb = np.load(rec["npy"])
    sim = float(cosine_similarity([emb], [job_emb])[0,0])
    entry = file_to_entry.get(fname, {})
    contact_bonus = WEIGHT_CONTACT if entry.get("contact_status") == "found" else 0
    skills_count = len(entry.get("skills", []))
    skill_score = min(CAP_SKILLS, skills_count) * WEIGHT_SKILL
    sim_score = sim * WEIGHT_SIM
    total = contact_bonus + skill_score + sim_score
    scores.append({
        "file": fname,
        "primary_name": entry.get("primary_name",""),
        "contact_status": entry.get("contact_status","missing"),
        "n_skills": skills_count,
        "sim": round(sim, 4),
        "score": round(total, 4),
        "top_skills": entry.get("skills", [])[:6],
        "preview": entry.get("original_preview","")[:300].replace("\n","\\n")
    })

scores_sorted = sorted(scores, key=lambda x: x["score"], reverse=True)

# save results
with open(ROOT / "resume_scores.json", "w", encoding="utf-8") as f:
    json.dump(scores_sorted, f, indent=2)

# print top 10
print("\nTop 10 resumes by baseline score:")
for s in scores_sorted[:10]:
    print(f"{s['score']:7.3f} | sim={s['sim']:.3f} | contact={s['contact_status']:<7} | skills={s['n_skills']:2} | {s['primary_name'][:40]:40} | {s['file']}")

# optionally: save leaderboard CSV for inspection
import csv
with open(ROOT / "leaderboard.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["score","sim","contact_status","n_skills","primary_name","file","top_skills","preview"])
    writer.writeheader()
    for r in scores_sorted:
        writer.writerow({k: r.get(k,"") for k in writer.fieldnames})

print("Saved resume_scores.json and leaderboard.csv in", ROOT)


  from .autonotebook import tqdm as notebook_tqdm


Loading model: all-mpnet-base-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Embedding resumes: 100%|██████████| 500/500 [05:25<00:00,  1.54it/s]


Saved embeddings: 500 -> ..\data\resumes_preocr\embeddings

Top 10 resumes by baseline score:
 72.619 | sim=0.557 | contact=found   | skills= 9 | Python                                   | sample_00396.txt
 70.418 | sim=0.517 | contact=found   | skills= 9 | NGUYEN VAN HUONG
Day                     | sample_00261.txt
 68.397 | sim=0.480 | contact=found   | skills=11 | Trần Dưỡng                               | sample_00444.txt
 67.514 | sim=0.464 | contact=found   | skills= 7 | Yen Hoa                                  | sample_00075.txt
 67.091 | sim=0.456 | contact=found   | skills= 9 | Duy Duc Thien                            | sample_00289.txt
 66.588 | sim=0.447 | contact=found   | skills= 9 | Marital Status                           | sample_00419.txt
 65.818 | sim=0.433 | contact=found   | skills= 6 | Ho Chi Minh                              | sample_00066.txt
 65.018 | sim=0.418 | contact=found   | skills= 8 | Shipworks                                | sample_00174.txt
 64.756 | 

In [1]:
# === Improved scoring: name_quality, language_match, rough_experience, sim normalization ===
import json, re, csv
from pathlib import Path
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

ROOT = Path("../data/resumes_preocr")
TXT_DIR = ROOT / "txt"
NER_ENH = ROOT / "ner_enhanced.json"
EMB_INDEX = ROOT / "emb_index.json"   # created by previous embedding run
OUT_JSON = ROOT / "resume_scores_refined.json"
OUT_CSV = ROOT / "leaderboard_refined.csv"

# load data
with open(NER_ENH, "r", encoding="utf-8") as f:
    ner = json.load(f)
with open(EMB_INDEX, "r", encoding="utf-8") as f:
    emb_index = json.load(f)

file_to_entry = {e["file"]: e for e in ner}
file_to_npy = {e["file"]: e["npy"] for e in emb_index}

# load spaCy for name quality check
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    print("spaCy not available — name quality check will be skipped. Install en_core_web_sm to enable it.")

# langdetect for JD language
try:
    from langdetect import detect, DetectorFactory
    DetectorFactory.seed = 0
    LANGDET_AVAILABLE = True
except Exception:
    LANGDET_AVAILABLE = False
    print("langdetect not available — language match will be skipped. pip install langdetect to enable it.")

# helper: load text
def load_text(fname):
    p = TXT_DIR / fname
    return p.read_text(encoding="utf-8", errors="ignore") if p.exists() else ""

# rough experience extractor: looks for "X years" or year ranges like 2016-2021, and counts span
YEARS_RANGE_RE = re.compile(r"(\b(19|20)\d{2})\s*[\-–—]\s*(\b(19|20)\d{2})")
YEARS_PHRASE_RE = re.compile(r"(\d+)\s+(?:years|yrs)\b", re.I)

def estimate_experience(text):
    # try "X years" phrase
    m = YEARS_PHRASE_RE.search(text)
    if m:
        try:
            val = int(m.group(1))
            return min(val, 40)  # cap at 40
        except:
            pass
    # try year ranges
    m = YEARS_RANGE_RE.search(text)
    if m:
        try:
            start = int(m.group(1))
            end = int(m.group(3))
            span = max(0, end - start)
            return min(span, 40)
        except:
            pass
    return 0

# helper: check if primary_name is likely a PERSON using spaCy
def name_quality(name):
    if not name or not SPACY_AVAILABLE:
        return 0
    doc = nlp(name)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return 1
    # fallback: heuristic — at least two words and letters
    if len(name.split()) >= 2 and re.search(r"[A-Za-z\u00C0-\u017F]", name):
        return 0.6
    return 0

# detect JD language (use English as fallback)
JOB_DESC = """
We are hiring an NLP Engineer with strong Python skills, proven experience with PyTorch and Transformers,
solid foundations in NLP methods (tokenization, attention, sequence models), and experience deploying models to production.
Experience with ML pipelines, REST APIs, and cloud deployment (AWS/GCP) is a plus.
"""
JD_LANG = "en"
if LANGDET_AVAILABLE:
    try:
        JD_LANG = detect(JOB_DESC[:2000])
    except Exception:
        JD_LANG = "en"

# load embeddings model (for job embedding)
MODEL_NAME = "all-mpnet-base-v2"
print("Loading embedding model:", MODEL_NAME)
model = SentenceTransformer(MODEL_NAME)
job_emb = model.encode(JOB_DESC)

# scoring hyperparams (tweakable)
WEIGHT_CONTACT = 8          # smaller contact bonus
WEIGHT_SKILL = 5            # per matched skill
WEIGHT_SIM = 45             # semantic sim weight
WEIGHT_NAME_QUALITY = 6     # boost if name looks like a person
WEIGHT_LANG_MATCH = 6       # bonus if resume language == JD language
WEIGHT_EXP_PER_YEAR = 1.2   # per estimated year of experience (capped)
CAP_EXP = 15
CAP_SKILLS = 6

# compute all scores
results = []
for rec in emb_index:
    fname = rec["file"]
    npy = rec["npy"]
    if fname not in file_to_entry:
        continue
    entry = file_to_entry[fname]
    # load emb
    try:
        emb = np.load(npy)
    except Exception as e:
        continue
    # raw cosine sim
    raw_sim = float(cosine_similarity([emb], [job_emb])[0,0])
    # normalize sim from [-1,1] to [0,1]
    sim_norm = (raw_sim + 1) / 2.0
    # contact
    contact_bonus = WEIGHT_CONTACT if entry.get("contact_status") == "found" else 0
    # skills
    n_skills = len(entry.get("skills", []))
    skill_score = min(CAP_SKILLS, n_skills) * WEIGHT_SKILL
    # name quality
    nq = name_quality(entry.get("primary_name",""))
    name_score = nq * WEIGHT_NAME_QUALITY
    # language match
    lang = entry.get("language","unknown")
    lang_bonus = WEIGHT_LANG_MATCH if (LANGDET_AVAILABLE and lang and JD_LANG and lang == JD_LANG) else 0
    # rough experience
    text = load_text(fname)
    exp_years = estimate_experience(text)
    exp_score = min(exp_years, CAP_EXP) * WEIGHT_EXP_PER_YEAR
    # final score
    sim_score = sim_norm * WEIGHT_SIM
    total = contact_bonus + skill_score + sim_score + name_score + lang_bonus + exp_score
    results.append({
        "file": fname,
        "primary_name": entry.get("primary_name",""),
        "contact_status": entry.get("contact_status","missing"),
        "n_skills": n_skills,
        "sim_raw": round(raw_sim,4),
        "sim_norm": round(sim_norm,4),
        "name_quality": round(nq,3),
        "lang": lang,
        "exp_years": exp_years,
        "score": round(total,3),
        "top_skills": entry.get("skills", [])[:8],
        "preview": entry.get("original_preview","")[:300].replace("\n","\\n")
    })

results_sorted = sorted(results, key=lambda x: x["score"], reverse=True)

# save results
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(results_sorted, f, indent=2)

# save CSV
with open(OUT_CSV, "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["score","sim_raw","sim_norm","name_quality","exp_years","lang","contact_status","n_skills","primary_name","file","top_skills","preview"])
    writer.writeheader()
    for r in results_sorted:
        writer.writerow({k: r.get(k,"") for k in writer.fieldnames})

# print top 10
print("\nRefined Top 10 resumes:")
for s in results_sorted[:10]:
    print(f"{s['score']:7.3f} | sim={s['sim_raw']:.3f} ({s['sim_norm']:.3f}) | nameQ={s['name_quality']:.2f} | exp={s['exp_years']:2} | contact={s['contact_status']:<7} | skills={s['n_skills']:2} | {s['primary_name'][:40]:40} | {s['file']}")
print("\nSaved:", OUT_JSON, "and", OUT_CSV)


  from .autonotebook import tqdm as notebook_tqdm


Loading embedding model: all-mpnet-base-v2

Refined Top 10 resumes:
100.559 | sim=0.447 (0.724) | nameQ=1.00 | exp=25 | contact=found   | skills= 9 | Marital Status                           | sample_00419.txt
 99.809 | sim=0.414 (0.707) | nameQ=1.00 | exp=17 | contact=found   | skills=10 | Quang Hưng                               | sample_00218.txt
 98.979 | sim=0.377 (0.688) | nameQ=1.00 | exp=40 | contact=found   | skills= 8 | Ly Van Sam                               | sample_00220.txt
 97.977 | sim=0.332 (0.666) | nameQ=1.00 | exp=17 | contact=found   | skills= 9 | Nguyen Ngoc Dang                         | sample_00159.txt
 96.126 | sim=0.517 (0.758) | nameQ=1.00 | exp=10 | contact=found   | skills= 9 | NGUYEN VAN HUONG
Day                     | sample_00261.txt
 92.210 | sim=0.432 (0.716) | nameQ=1.00 | exp=16 | contact=missing | skills=11 | Chung Vi Huy                             | sample_00267.txt
 90.922 | sim=0.374 (0.687) | nameQ=1.00 | exp=15 | contact=missing | skills= 6 

### Step 5: Giving summary to users using the score of the resume via Gemini API (To be completed)

In [8]:
# Jupyter cell: Google GenAI (Gemini) quick examples
# 1) install (run once in your env / notebook). Uncomment if needed.
# !pip install -q -U google-genai pydantic

import os
from google import genai
from pydantic import BaseModel
from typing import List, Optional
from dotenv import load_dotenv
load_dotenv()  # load .env if present

# --- AUTH: either set environment variable GEMINI_API_KEY (recommended)
# In a terminal / OS:
#   export GEMINI_API_KEY="your_api_key_here"      # linux / mac
#   setx GEMINI_API_KEY "your_api_key_here"       # windows (restart shell)
# Or pass the key explicitly to Client(api_key=...)
#
# The client will pick GEMINI_API_KEY from env automatically:
client = genai.Client()  # uses os.environ['GEMINI_API_KEY'] if present

# -----------------------
# Example A: simple text generation
# -----------------------
resp = client.models.generate_content(
    model="gemini-2.5-flash",               # recommended model from quickstart
    contents="Explain how an NLP resume parser should work in 3 sentences."
)
print("=== Text generation ===")
print(resp.text)   # concise plain text result

# -----------------------
# Example B: Structured output (JSON) using a Pydantic schema
# Useful for extracting structured fields from a resume text.
# NOTE: structured output requires `response_mime_type="application/json"`,
# and we pass a Pydantic model as response_schema.
# -----------------------
class ResumeSchema(BaseModel):
    name: Optional[str]
    email: Optional[str]
    phone: Optional[str]
    skills: List[str] = []
    years_experience: Optional[int]

resume_prompt = """
Extract the main contact information and skills from this resume text.
Return only JSON that conforms to the schema: name, email, phone, skills (array of skill strings),
years_experience (approximate integer).
Resume text:
---
Software engineer with 6 years experience in backend and cloud, expert in Python, Docker, and SQL.
Contact: john.doe@example.com, +1 (555) 123-4567
Worked at Acme Corp and Beta Systems.
---
"""

# call with 'config' specifying response_mime_type and response_schema (Pydantic model)
resp_json = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=resume_prompt,
    config={
        "response_mime_type": "application/json",   # ask for JSON output
        "response_schema": ResumeSchema,            # pydantic model to validate/parse
    },
)

print("\n=== Structured (parsed) output ===")
# .parsed (if available) gives the parsed Pydantic instance; else fallback to text
try:
    parsed = resp_json.parsed  # a ResumeSchema instance (if SDK parsed successfully)
    print(parsed.json(indent=2))
except Exception:
    # fallback: print model text (raw JSON string from model)
    print(resp_json.text)


=== Text generation ===
An NLP resume parser first extracts text from various document formats (PDF, DOCX). It then uses Natural Language Processing techniques like Named Entity Recognition (NER) and text classification to identify and categorize key information such as skills, experience, education, and contact details. Finally, it structures this extracted data into a machine-readable format, making it easy for Applicant Tracking Systems (ATS) to store, search, and analyze candidate profiles.

=== Structured (parsed) output ===
{
  "name": null,
  "email": "john.doe@example.com",
  "phone": "+1 (555) 123-4567",
  "skills": [
    "Python",
    "Docker",
    "SQL"
  ],
  "years_experience": 6
}


C:\Users\abhis\AppData\Local\Temp\ipykernel_18112\2897953912.py:70: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  print(parsed.json(indent=2))


In [9]:
# === Gemini structured summarization for scored resumes (ready-to-run) ===
# Requirements:
# pip install -U google-genai pydantic pandas
#
# Set env var: GEMINI_API_KEY (or use genai.Client(api_key="...") below)
#
import os, json, re, time, csv
from pathlib import Path
from typing import List, Optional
from pydantic import BaseModel
import google.genai as genai  # updated name in SDK

# ---------- Config ----------
DATA_DIR = Path("../data/resumes_preocr")
SCORES_FILE = DATA_DIR / "resume_scores_refined_clean.json"
OUT_JSON = DATA_DIR / "summaries_gemini.json"
OUT_CSV = DATA_DIR / "summaries_gemini.csv"
MODEL = "gemini-2.5-flash"   # change if needed (use model you have access to)
BATCH_SIZE = 6               # lower to respect rate limits
TOP_N = 200                  # how many resumes to summarize (set smaller to test)
MAX_RETRIES = 2
BACKOFF_BASE = 1.5

# ---------- Pydantic schema for structured output ----------
class ResumeSummary(BaseModel):
    resume_id: str
    name: Optional[str] = None
    best_role: Optional[str] = None
    years_experience: Optional[int] = None
    top_skills: List[str] = []
    key_achievements: List[str] = []
    education: List[str] = []
    contact: dict = {}  # {email:..., phone:..., linkedin:...}
    fit_score: Optional[float] = None
    summary: Optional[str] = None

# ---------- Helpers ----------
def mask_pii(text: str) -> str:
    if not text:
        return text
    t = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>', text)
    t = re.sub(r'\b(?:\+?\d[\d\s\-\(\)]{3,}\d)\b', '<PHONE>', t)
    return t

def extract_json_from_text(s: str):
    """Naive JSON extraction; we prefer SDK's parsed output but keep fallback."""
    if not s:
        return None
    try:
        return json.loads(s)
    except Exception:
        start = s.find('{')
        if start == -1:
            return None
        depth = 0
        for i in range(start, len(s)):
            if s[i] == '{': depth += 1
            elif s[i] == '}':
                depth -= 1
                if depth == 0:
                    try:
                        return json.loads(s[start:i+1])
                    except Exception:
                        return None
    return None

SYSTEM_PROMPT = (
    "You are an assistant that reads a compact resume context and returns a strict JSON summary. "
    "Do not invent facts. If a field is not present, return null or an empty list. Output valid JSON matching the schema exactly."
)

USER_SCHEMA_AND_INSTR = """
Input fields:
- resume_id, name, snippet, skills, orgs, years_experience, contact_status, resume_score

Return JSON matching the ResumeSummary schema exactly:
{
 "resume_id": "<same>",
 "name": "<string or null>",
 "best_role": "<one-line role/title or null>",
 "years_experience": <int or null>,
 "top_skills": ["..."],
 "key_achievements": ["..."],
 "education": ["..."],
 "contact": {"email": null, "phone": null, "linkedin": null},
 "fit_score": <float 0-100>,
 "summary": "<2-4 sentence summary>"
}

Rules:
- Only use facts present in snippet, skills, or orgs. If uncertain, use null/empty.
- Fit_score should be consistent with resume_score (you may rescale directly).
- summary must be concise (2-4 short sentences), factual and use provided text.
- Return JSON only.
"""

# ---------- Prepare client ----------
API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise RuntimeError("Set GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable with your API key.")
client = genai.Client(api_key=API_KEY)

# ---------- Load scored resumes ----------
with open(SCORES_FILE, "r", encoding="utf-8") as f:
    scored = json.load(f)

scored = scored[:TOP_N] if TOP_N and len(scored) > TOP_N else scored
print(f"Preparing to summarize {len(scored)} resumes with model {MODEL}...")

summaries = []
errors = []

# ---------- Loop and call Gemini ----------
for idx, rec in enumerate(scored, start=1):
    resume_id = rec.get("file")
    name = rec.get("primary_name") or rec.get("primary_name_clean") or None
    snippet = rec.get("preview", "")[:1200]  # short excerpt; adjust length if needed
    snippet_masked = mask_pii(snippet)
    skills = rec.get("top_skills") or rec.get("skills") or []
    orgs = rec.get("orgs") or []
    years = rec.get("exp_years") or rec.get("exp_years", None) or None
    contact_status = rec.get("contact_status", "missing")
    resume_score = float(rec.get("score", 0.0))

    input_json = {
        "resume_id": resume_id,
        "name": name,
        "snippet": snippet_masked,
        "skills": skills,
        "orgs": orgs,
        "years_experience": years,
        "contact_status": contact_status,
        "resume_score": resume_score
    }

    prompt_text = SYSTEM_PROMPT + "\n\n" + USER_SCHEMA_AND_INSTR + "\n\n" + "Input JSON:\n" + json.dumps(input_json, ensure_ascii=False, indent=2)

    parsed_obj = None
    last_err = None
    for attempt in range(MAX_RETRIES + 1):
        try:
            resp = client.models.generate_content(
                model=MODEL,
                contents=prompt_text,
                config={
                    "response_mime_type": "application/json",
                    "response_schema": ResumeSummary,  # Pydantic model, SDK will validate/parse
                    # You can add other options supported by SDK if needed
                },
            )
            # SDK tries to parse into Pydantic model; access .parsed
            parsed = getattr(resp, "parsed", None)
            if parsed is not None:
                # parsed is a Pydantic model instance or list; ensure dict form
                if isinstance(parsed, list):
                    parsed_obj = parsed[0]
                else:
                    parsed_obj = parsed
            else:
                # fallback: attempt to extract JSON from resp.text
                raw_text = getattr(resp, "text", None) or str(resp)
                j = extract_json_from_text(raw_text)
                if j is None:
                    raise ValueError("No JSON parsed by SDK and fallback failed.")
                parsed_obj = ResumeSummary(**j)
            # ensure resume_id and fit_score
            if not parsed_obj.resume_id:
                parsed_obj.resume_id = resume_id
            if parsed_obj.fit_score is None:
                parsed_obj.fit_score = round(min(100.0, resume_score), 2)
            summaries.append(parsed_obj.model_dump())  # convert to plain dict
            last_err = None
            break
        except Exception as e:
            last_err = str(e)
            time.sleep(BACKOFF_BASE * (2 ** attempt))
    if last_err:
        errors.append({"resume_id": resume_id, "error": last_err})
        # fallback minimal summary
        fallback = ResumeSummary(
            resume_id=resume_id,
            name=name,
            best_role=None,
            years_experience=years,
            top_skills=skills[:6],
            key_achievements=[],
            education=[],
            contact={"email": None, "phone": None, "linkedin": None},
            fit_score=round(min(100.0, resume_score),2),
            summary=f"Auto-fallback summary (resume_score={resume_score})"
        )
        summaries.append(fallback.model_dump())

    # polite pacing
    if idx % BATCH_SIZE == 0:
        time.sleep(1.0)

# ---------- Save outputs ----------
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump({"summaries": summaries, "errors": errors}, f, ensure_ascii=False, indent=2)

# CSV quick summary
csv_fields = ["resume_id","name","best_role","years_experience","top_skills","fit_score","summary"]
with open(OUT_CSV, "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_fields)
    writer.writeheader()
    for s in summaries:
        writer.writerow({
            "resume_id": s.get("resume_id"),
            "name": s.get("name"),
            "best_role": s.get("best_role"),
            "years_experience": s.get("years_experience"),
            "top_skills": ",".join(s.get("top_skills") or []),
            "fit_score": s.get("fit_score"),
            "summary": (s.get("summary") or "")[:300]
        })

print("Wrote JSON ->", OUT_JSON)
print("Wrote CSV  ->", OUT_CSV)
if errors:
    print("Completed with errors for", len(errors), "resumes. Check JSON 'errors' field.")
else:
    print("All summaries produced successfully.")


Preparing to summarize 200 resumes with model gemini-2.5-flash...
Wrote JSON -> ..\data\resumes_preocr\summaries_gemini.json
Wrote CSV  -> ..\data\resumes_preocr\summaries_gemini.csv
Completed with errors for 200 resumes. Check JSON 'errors' field.


In [15]:
# Aggressive, token-safe full cell for Gemini batch retry
# Very small batches and tiny snippets to avoid MAX_TOKENS issues.

import os, json, re, time, math
from pathlib import Path
from typing import List, Optional
import google.genai as genai

# ---------- Config ----------
DATA_DIR = Path("../data/resumes_preocr")
IN_SUMMARIES = DATA_DIR / "summaries_gemini.json"
SCORES_FILE = DATA_DIR / "resume_scores_refined_clean.json"
OUT_RETRY = DATA_DIR / "summaries_gemini_batch_retry.json"
RAW_DIR = DATA_DIR / "gemini_batch_raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

MODEL = "gemini-2.5-flash-lite"
BATCH_SIZE = 10           # tiny batch to be safe
TEST_ONLY = True
MAX_BATCH_RETRIES = 2
SLEEP_BETWEEN_BATCHES = 0.8
SNIPPET_MAX = 120        # very short snippet
MAX_OUTPUT_TOKENS = 800

API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise RuntimeError("Set GEMINI_API_KEY or GOOGLE_API_KEY in environment.")
client = genai.Client(api_key=API_KEY)

# ---------- Helpers ----------
def flatten_json_array_text(text: str) -> Optional[list]:
    if not text:
        return None
    try:
        obj = json.loads(text)
        if isinstance(obj, list):
            return obj
    except Exception:
        pass
    a = text.find('[')
    if a != -1:
        depth = 0
        for i in range(a, len(text)):
            ch = text[i]
            if ch == '[':
                depth += 1
            elif ch == ']':
                depth -= 1
                if depth == 0:
                    candidate = text[a:i+1]
                    candidate = re.sub(r',\s*]', ']', candidate)
                    candidate = re.sub(r',\s*}', '}', candidate)
                    try:
                        parsed = json.loads(candidate)
                        if isinstance(parsed, list):
                            return parsed
                    except Exception:
                        pass
                    break
    objs = []
    start = text.find('{')
    while start != -1:
        depth = 0; end = None
        for i in range(start, len(text)):
            if text[i] == '{':
                depth += 1
            elif text[i] == '}':
                depth -= 1
                if depth == 0:
                    end = i; break
        if end is None: break
        candidate = text[start:end+1]
        cand2 = re.sub(r',\s*}', '}', candidate).replace("'", '"')
        try:
            parsed = json.loads(cand2); objs.append(parsed)
        except Exception:
            pass
        start = text.find('{', end+1)
    return objs if objs else None

def extract_raw_text_from_resp(resp) -> str:
    # Try resp.text
    raw = getattr(resp, "text", None)
    if raw:
        return raw
    # Try candidates
    cand_list = getattr(resp, "candidates", None)
    if cand_list and len(cand_list) > 0:
        first = cand_list[0]
        # content may be dict-like or object
        content = getattr(first, "content", None)
        if isinstance(content, dict):
            if "text" in content and content["text"]:
                return content["text"]
            if "parts" in content and isinstance(content["parts"], list):
                return "".join(content["parts"])
            return json.dumps(content)
        txt = getattr(content, "text", None)
        if txt:
            return txt
        parts = getattr(content, "parts", None)
        if parts:
            return "".join(parts)
        t = getattr(first, "text", None)
        if t:
            return t
        return repr(first)
    # older choices
    choices = getattr(resp, "choices", None)
    if choices and len(choices) > 0:
        ch = choices[0]
        if isinstance(ch, dict) and "text" in ch:
            return ch["text"]
        return getattr(ch, "text", None) or repr(ch)
    return repr(resp)

def clean_and_shorten_snippet(raw: str, max_len: int = SNIPPET_MAX) -> str:
    if not raw: return ""
    s = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', raw)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>', s)
    s = re.sub(r'\b(?:\+?\d[\d\s\-\(\)]{3,}\d)\b', '<PHONE>', s)
    return s[:max_len]

def build_batch_prompt(inputs: List[dict]) -> str:
    # Minimal inline example (single-line) to save tokens
    example_line = (
        '[{"resume_id":"sample_EX.txt","name":"Jane Doe","best_role":"Backend Engineer",'
        '"years_experience":6,"top_skills":["python","sql"],'
        '"key_achievements":["Built ETL"],"education":["BSc"],'
        '"contact":{"email":null,"phone":null,"linkedin":null},"fit_score":78.3,"summary":"Backend engineer."}]'
    )
    # Fallback object instruction ensures a parseable array
    prompt = (
        "Return JSON ARRAY ONLY. No commentary. Do NOT think out loud.\n\n"
        "Schema for each element (must contain exactly these keys):\n"
        "resume_id,name,best_role,years_experience,top_skills,key_achievements,education,contact,fit_score,summary\n\n"
        "Example output (single-line):\n" + example_line + "\n\n"
        "If you cannot produce a valid entry for an input, return for that item:\n"
        '{"resume_id":"<same id>","error":"cannot_parse","best_role":null,"years_experience":null,'
        '"top_skills":[],"key_achievements":[],"education":[],"contact":{"email":null,"phone":null,"linkedin":null},'
        '"fit_score":0.0,"summary":null}\n\n'
        "Now process the inputs below and return a JSON array where i-th element corresponds to the i-th input.\n\n"
        "Inputs:\n" + json.dumps(inputs, ensure_ascii=False)
    )
    return prompt

# ---------- Load summaries and failed set ----------
with open(IN_SUMMARIES, "r", encoding="utf-8") as f:
    prev = json.load(f)
summaries = prev.get("summaries", prev) if isinstance(prev, dict) else prev
failed = [s for s in summaries if isinstance(s.get("summary",""), str) and s["summary"].startswith("Auto-fallback summary")]
print("Failed count:", len(failed))
if not failed:
    print("Nothing to retry. Exiting.")
else:
    all_scored = {}
    if SCORES_FILE.exists():
        sf = json.load(open(SCORES_FILE, "r", encoding="utf-8"))
        all_scored = {r.get("file"): r for r in sf}

    inputs = []
    for fentry in failed:
        rid = fentry["resume_id"]
        scored_rec = all_scored.get(rid, {})
        raw_snip = scored_rec.get("preview") or fentry.get("summary") or ""
        snippet = clean_and_shorten_snippet(raw_snip, max_len=SNIPPET_MAX)
        inp = {
            "resume_id": rid,
            "name": (scored_rec.get("primary_name") or fentry.get("name") or "")[:80],
            "snippet": snippet,
            "skills": scored_rec.get("top_skills") or fentry.get("top_skills") or [],
            "years_experience": scored_rec.get("exp_years") or fentry.get("years_experience") or None,
            "resume_score": float(scored_rec.get("score") or fentry.get("fit_score") or 0.0)
        }
        inputs.append(inp)

    num_batches = math.ceil(len(inputs) / BATCH_SIZE)
    print(f"Batches to process: {num_batches} (BATCH_SIZE={BATCH_SIZE}, SNIPPET_MAX={SNIPPET_MAX})")
    outputs_by_id = {}
    raw_logs = {}

    batches_to_run = 1 if TEST_ONLY else num_batches
    for b in range(batches_to_run):
        batch_inputs = inputs[b*BATCH_SIZE:(b+1)*BATCH_SIZE]
        if not batch_inputs:
            continue
        prompt = build_batch_prompt(batch_inputs)

        parsed_list = None
        last_err = None
        for attempt in range(MAX_BATCH_RETRIES + 1):
            try:
                resp = client.models.generate_content(
                    model=MODEL,
                    contents=prompt,
                    config={"max_output_tokens": MAX_OUTPUT_TOKENS, "temperature": 0.0}
                )
                raw_text = extract_raw_text_from_resp(resp)
                raw_logs[f"batch_{b}"] = raw_text
                # Save a small repr of resp for debugging token reasons
                raw_logs[f"batch_{b}_repr"] = repr(resp)[:1000]
                parsed_list = flatten_json_array_text(raw_text)
                if parsed_list is None:
                    raise ValueError("No JSON array or objects parsed from model output.")
                break
            except Exception as e:
                last_err = str(e)
                wait = 1.2 * (2 ** attempt)
                print(f" Batch {b} attempt {attempt} failed: {last_err} — sleeping {wait}s")
                time.sleep(wait)
        if parsed_list is None:
            print(f"Batch {b} failed after retries: {last_err}. Saving raw output for debug.")
            raw_logs[f"batch_{b}_error"] = last_err
            for inp in batch_inputs:
                outputs_by_id[inp["resume_id"]] = None
        else:
            if len(parsed_list) != len(batch_inputs):
                for obj in parsed_list:
                    rid = obj.get("resume_id")
                    if rid:
                        outputs_by_id[rid] = obj
                for i, inp in enumerate(batch_inputs):
                    if inp["resume_id"] not in outputs_by_id:
                        if i < len(parsed_list) and isinstance(parsed_list[i], dict):
                            outputs_by_id[inp["resume_id"]] = parsed_list[i]
                        else:
                            outputs_by_id[inp["resume_id"]] = None
            else:
                for inp, out in zip(batch_inputs, parsed_list):
                    outputs_by_id[inp["resume_id"]] = out

        print(f"Batch {b+1}/{batches_to_run} done.")
        time.sleep(SLEEP_BETWEEN_BATCHES)

    # Merge results
    merged = []
    for s in summaries:
        rid = s.get("resume_id")
        if rid in outputs_by_id and outputs_by_id[rid]:
            out = outputs_by_id[rid]
            if out.get("fit_score") is None:
                out["fit_score"] = round(min(100.0, out.get("resume_score") or s.get("fit_score") or 0.0), 2)
            merged.append(out)
        else:
            merged.append(s)

    with open(OUT_RETRY, "w", encoding="utf-8") as f:
        json.dump({"summaries": merged}, f, ensure_ascii=False, indent=2)
    with open(RAW_DIR / "batch_raw_logs.json", "w", encoding="utf-8") as f:
        json.dump(raw_logs, f, ensure_ascii=False, indent=2)

    print("Done. Saved:", OUT_RETRY)
    if TEST_ONLY:
        print("Processed one batch only (TEST_ONLY=True). Inspect batch_0 raw logs then set TEST_ONLY=False to continue.")


Failed count: 200
Batches to process: 20 (BATCH_SIZE=10, SNIPPET_MAX=120)
Batch 1/1 done.
Done. Saved: ..\data\resumes_preocr\summaries_gemini_batch_retry.json
Processed one batch only (TEST_ONLY=True). Inspect batch_0 raw logs then set TEST_ONLY=False to continue.


In [13]:
from google.genai import Client
import os
from dotenv import load_dotenv
load_dotenv()  # load .env if present

client = Client(api_key=os.getenv("GEMINI_API_KEY"))
resp = client.models.generate_content(
    model="gemini-2.5-flash-lite",
    contents="Say the word OK.",
    config={"max_output_tokens": 10}
)
print(resp)


sdk_http_response=HttpResponse(
  headers=<dict len=11>
) candidates=[Candidate(
  content=Content(
    parts=[
      Part(
        text='OK'
      ),
    ],
    role='model'
  ),
  finish_reason=<FinishReason.STOP: 'STOP'>,
  index=0
)] create_time=None model_version='gemini-2.5-flash-lite' prompt_feedback=None response_id='XecZaeXjJNCBqfkPgJqF6Qs' usage_metadata=GenerateContentResponseUsageMetadata(
  candidates_token_count=1,
  prompt_token_count=6,
  prompt_tokens_details=[
    ModalityTokenCount(
      modality=<MediaModality.TEXT: 'TEXT'>,
      token_count=6
    ),
  ],
  total_token_count=7
) automatic_function_calling_history=[] parsed=None
