In [1]:
import sys
import os

# get project root path (parent of 'notebooks' directory)
root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
if root_path not in sys.path:
    sys.path.append(root_path)

print(sys.path)

['C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\python311.zip', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\DLLs', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\Lib', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0', 'd:\\Work\\Capstone_Project\\resume-nlp\\venv', '', 'd:\\Work\\Capstone_Project\\resume-nlp\\venv\\Lib\\site-packages', 'd:\\Work\\Capstone_Project\\resume-nlp']


In [2]:
import re
from src.ingest import extract_text_pymupdf


text = extract_text_pymupdf('../data/Abhishek_Singh_Resume.pdf')
print(text[:400])
print('email found:', bool(re.search(r"[\w\.-]+@[\w\.-]+", text)))
print('phone found:', bool(re.search(r"\+?\d[\d\s\-()]{6,}\d", text)))
print('education header:', 'Education' in text or 'EDUCATION' in text)

Abhishek Singh
8010852459 | abhisheksingh.vizag@gmail.com | LinkedIn | GitHub | LeetCode
Education
•
VIT Bhopal University | CGPA 9.01
Oct 2022 – Present
Bachelor of Technology in Computer Science and Engineering
Bhopal, Madhya Pradesh
•
Higher Secondary Education | Grade: 95.4%
July 2021
Navy Children School, Goa
Vasco Da Gama, Goa
Technical Skills
• Languages/Databases: C++, Python, SQL
• Framew
email found: True
phone found: True
education header: True


In [3]:
# CONFIG
from pathlib import Path
DATA_ROOT = Path('../data/resumes_raw_pdf')
PDF_DIR = DATA_ROOT / 'pdfs'
TXT_DIR = DATA_ROOT / 'txt'
FAIL_DIR = DATA_ROOT / 'failures'
REPORT_JSON = DATA_ROOT / 'extraction_report.json'
REPORT_CSV = DATA_ROOT / 'extraction_report.csv'
for d in [PDF_DIR, TXT_DIR, FAIL_DIR]:
    d.mkdir(parents=True, exist_ok=True)


# How many resumes to download for testing (start small: 50-100)
N_SAMPLES = 100


# Toggle OCR fallback (requires system Tesseract and pytesseract)
ENABLE_OCR = False
OCR_LANGUAGE = 'eng'


# thresholds for checks
MIN_TEXT_CHARS = 200
MIN_ALPHA_RATIO = 0.2

In [None]:
# Option 1: set token for this notebook session and re-run the download cell
import os
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env file

token = os.getenv("HUGGINGFACE_HUB_TOKEN")   # <-- paste your token here
os.environ["HUGGINGFACE_HUB_TOKEN"] = token

# optional check who you are
from huggingface_hub import whoami
print("Authenticated as:", whoami(token=token).get("name"))

  from .autonotebook import tqdm as notebook_tqdm


Authenticated as: AbhishekProgrammer22


In [25]:
# Use cached HF snapshot to copy PDFs -> run extraction and produce report
from huggingface_hub import list_repo_files, hf_hub_download
from pathlib import Path
import os, shutil, json, csv, traceback
from tqdm.auto import tqdm

# your extractors
from src.ingest import extract_text_pymupdf, extract_text_pdfplumber

REPO_ID = "d4rk3r/resumes-raw-pdf"
OUT_ROOT = Path("../data/resumes_raw_pdf_direct")
PDF_DIR = OUT_ROOT / "pdfs"
TXT_DIR = OUT_ROOT / "txt"
FAIL_DIR = OUT_ROOT / "failures"
REPORT_JSON = OUT_ROOT / "extraction_report.json"
REPORT_CSV  = OUT_ROOT / "extraction_report.csv"

for d in (OUT_ROOT, PDF_DIR, TXT_DIR, FAIL_DIR):
    d.mkdir(parents=True, exist_ok=True)

N_SAMPLES = 100          # change if you want fewer
MIN_TEXT_CHARS = 200
MIN_ALPHA_RATIO = 0.2

token = os.getenv("HUGGINGFACE_HUB_TOKEN")
if not token:
    raise RuntimeError("HUGGINGFACE_HUB_TOKEN missing - set it before running.")

# list files (we already saw this works)
all_files = list_repo_files(REPO_ID, repo_type="dataset", token=token)
pdf_files = [f for f in all_files if f.lower().endswith(".pdf")]
print(f"Found {len(pdf_files)} pdf files in the repo. Will copy/process first {min(N_SAMPLES, len(pdf_files))}.")

# helper: hf_hub_download returns a local cached path when available
def get_cached_path(fname):
    try:
        local = hf_hub_download(repo_id=REPO_ID, filename=fname, repo_type="dataset", token=token)
        return Path(local)
    except Exception as e:
        print("hf_hub_download failed for", fname, "->", type(e).__name__, str(e)[:200])
        return None

report = []
to_process = pdf_files[:min(N_SAMPLES, len(pdf_files))]

for idx, fname in enumerate(tqdm(to_process, desc="files")):
    rec = {"filename": Path(fname).name, "repo_path": fname, "downloaded": False,
           "extraction_method": None, "ocr_used": False, "error": None, "checks": None,
           "failure_reasons": [], "failure_file": None}
    try:
        cached = get_cached_path(fname)
        if cached is None:
            rec["error"] = "cache_lookup_failed"
            report.append(rec)
            continue

        # copy cached file to our PDF_DIR with normalized name
        tgt = PDF_DIR / f"resume_{idx:05d}.pdf"
        shutil.copyfile(cached, tgt)
        rec["downloaded"] = True

        # extract text: pymupdf primary, pdfplumber fallback
        txt = ""
        try:
            txt = extract_text_pymupdf(str(tgt))
            rec["extraction_method"] = "pymupdf"
            if len(txt) < MIN_TEXT_CHARS // 2:
                txt2 = extract_text_pdfplumber(str(tgt))
                if len(txt2) > len(txt):
                    txt = txt2
                    rec["extraction_method"] += "+pdfplumber"
        except Exception as e1:
            try:
                txt = extract_text_pdfplumber(str(tgt))
                rec["extraction_method"] = "pdfplumber"
            except Exception as e2:
                rec["error"] = f"both_extractors_failed: {e1} | {e2}"
                badf = FAIL_DIR / f"downloaded_but_extractfail_{idx:05d}.txt"
                with open(badf, "w", encoding="utf-8") as f:
                    f.write(f"Cached path: {cached}\\nErrors:\\n{e1}\\n{e2}")
                rec["failure_file"] = str(badf)
                report.append(rec)
                continue

        txt = (txt or "").replace("\r", "\n").strip()
        # save text
        with open(TXT_DIR / (tgt.stem + ".txt"), "w", encoding="utf-8") as f:
            f.write(txt)

        # checks
        import re
        EMAIL_RE = re.compile(r"[\w\.-]+@[\w\.-]+\.\w+")
        PHONE_RE = re.compile(r"\+?\d[\d\s\-\(\)]{6,}\d")
        SECTION_KEYWORDS = ['education','experience','skills','projects','certifications','publications','summary','objective']

        email = bool(EMAIL_RE.search(txt))
        phone = bool(PHONE_RE.search(txt))
        txt_low = txt.lower()
        sections = {kw: (kw in txt_low) for kw in SECTION_KEYWORDS}
        any_section = any(sections.values())
        letters = sum(c.isalpha() for c in txt)
        alpha_ratio = letters / max(1, len(txt))

        checks = {'email': email, 'phone': phone, 'sections': sections, 'any_section': any_section,
                  'len_chars': len(txt), 'alpha_ratio': alpha_ratio, 'preview': txt[:800].replace("\n","\\n")}
        rec["checks"] = checks

        failures = []
        if rec["error"]:
            failures.append("extractor_error")
        if checks['len_chars'] < MIN_TEXT_CHARS:
            failures.append("short_text")
        if checks['alpha_ratio'] < MIN_ALPHA_RATIO:
            failures.append("low_alpha_ratio")
        if not checks['any_section']:
            failures.append("no_key_section")
        if not (checks['email'] and checks['phone']):
            failures.append("missing_contact")

        rec["failure_reasons"] = failures
        if failures:
            fname_fail = FAIL_DIR / (tgt.stem + "_failure.txt")
            with open(fname_fail, "w", encoding="utf-8") as f:
                f.write(f"FILENAME: {tgt.name}\\nREPO_PATH: {fname}\\nEXTRACTION_METHOD: {rec['extraction_method']}\\nFAILURE_REASONS: {failures}\\n\\n---PREVIEW---\\n\\n")
                f.write(txt)
            rec["failure_file"] = str(fname_fail)

    except Exception as e:
        rec["error"] = f"fatal:{type(e).__name__}:{e}"
        rec["failure_file"] = None
    report.append(rec)

# save reports
with open(REPORT_JSON, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)
with open(REPORT_CSV, "w", newline="", encoding="utf-8") as csvf:
    writer = csv.writer(csvf)
    writer.writerow(['filename','repo_path','downloaded','extraction_method','len_chars','alpha_ratio','email','phone','any_key_section','failure_reasons','failure_file','error'])
    for r in report:
        ch = r.get('checks') or {}
        writer.writerow([r.get('filename'), r.get('repo_path'), r.get('downloaded'), r.get('extraction_method'),
                         ch.get('len_chars'), round(ch.get('alpha_ratio',0),3), ch.get('email'), ch.get('phone'), ch.get('any_section'),
                         ';'.join(r.get('failure_reasons',[])), r.get('failure_file',''), r.get('error','')])

# pretty summary
from collections import Counter
total = len(report)
failures = [r for r in report if r.get('failure_reasons')]
success = total - len(failures)
fail_reasons = Counter()
for r in failures:
    fail_reasons.update(r.get('failure_reasons',[]))

print("\\n=== SUMMARY ===")
print("Total files processed:", total)
print("Success (no detected failure reasons):", success, f"({round(100*success/total if total else 0,2)}%)")
print("Files with failures:", len(failures))
print("Failure reasons counts:", dict(fail_reasons))
print("Report saved ->", REPORT_JSON)
print("PDFs saved  ->", PDF_DIR)
print("Text saved  ->", TXT_DIR)
print("Failures    ->", FAIL_DIR)


Found 1940 pdf files in the repo. Will copy/process first 100.


files:  29%|██▉       | 29/100 [00:27<01:48,  1.53s/it]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
files:  31%|███       | 31/100 [00:32<02:08,  1.87s/it]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
files:  32%|███▏      | 32/100 [00:35<02:26,  2.15s/it]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
files:  36%|███▌      | 36/100 [00:41<01:42,  1.60s/it]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Fal

\n=== SUMMARY ===
Total files processed: 100
Success (no detected failure reasons): 1 (1.0%)
Files with failures: 99
Failure reasons counts: {'missing_contact': 99, 'no_key_section': 67, 'short_text': 24, 'low_alpha_ratio': 24}
Report saved -> ..\data\resumes_raw_pdf_direct\extraction_report.json
PDFs saved  -> ..\data\resumes_raw_pdf_direct\pdfs
Text saved  -> ..\data\resumes_raw_pdf_direct\txt
Failures    -> ..\data\resumes_raw_pdf_direct\failures





In [9]:
# Load HF dataset properly and write out text files per example
# Paste & run in the same notebook environment

from datasets import load_dataset
from pathlib import Path
import json, csv, shutil
from tqdm.auto import tqdm
import os

OUT_ROOT = Path("../data/resumes_preocr")
TXT_DIR = OUT_ROOT / "txt"
PDF_DIR = OUT_ROOT / "pdfs"   # may remain empty for this dataset
REPORT = OUT_ROOT / "report_preocr.json"
SUMMARY_CSV = OUT_ROOT / "summary_preocr.csv"

OUT_ROOT.mkdir(parents=True, exist_ok=True)
TXT_DIR.mkdir(exist_ok=True)
PDF_DIR.mkdir(exist_ok=True)

REPO_ID = "lhoestq/resumes-raw-pdf-for-ocr"
SPLIT = "train"   # dataset split
N_SAMPLES = 500   # adjust to how many you want to extract (max ~ full dataset size ~ 1585)

print(f"Loading dataset {REPO_ID} split={SPLIT} (this may take a minute)...")
ds = load_dataset(REPO_ID, split=SPLIT)

print("Dataset loaded. Columns:", ds.column_names, "Num examples:", len(ds))

# Which field contains text? Common names: 'text'
text_field = None
for candidate in ("text", "ocr", "page_text", "raw_text"):
    if candidate in ds.column_names:
        text_field = candidate
        break
# fallback: look for any string column
if text_field is None:
    for c in ds.column_names:
        # sample few rows to check if column is string-like and non-empty
        try:
            sample = ds[0].get(c)
            if isinstance(sample, str):
                text_field = c
                break
        except Exception:
            pass

if text_field is None:
    raise RuntimeError(f"Couldn't find a text column in dataset. Columns: {ds.column_names}")

print("Using text field:", text_field)

report = []
count = 0
for i, ex in enumerate(tqdm(ds, desc="writing text")):
    if count >= N_SAMPLES:
        break
    txt = ex.get(text_field) or ""
    # sometimes the text may be empty (filter as desired)
    if not txt or not txt.strip():
        # skip empty text entries (optionally you can save them)
        continue
    fname = f"sample_{count:05d}.txt"
    tgt = TXT_DIR / fname
    with open(tgt, "w", encoding="utf-8") as f:
        f.write(txt)
    report.append({"index": i, "filename": fname, "text_len": len(txt), "preview": txt[:500].replace("\n","\\n")})
    count += 1

# save the report json and a CSV summary with basic checks
with open(REPORT, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

import re
EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{6,12}")
SECTIONS = ['education','experience','skills','projects','certifications','publications','summary','objective','work experience']

rows = []
for r in report:
    txt = (TXT_DIR / r['filename']).read_text(encoding="utf-8", errors="ignore")
    email = bool(EMAIL_RE.search(txt))
    phone = bool(PHONE_RE.search(txt))
    any_section = any(k in txt.lower() for k in SECTIONS)
    rows.append({
        "file": r['filename'],
        "chars": len(txt),
        "email": email,
        "phone": phone,
        "any_section": any_section,
        "preview": txt[:400].replace("\n","\\n")
    })

# write CSV
import csv
with open(SUMMARY_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()) if rows else ["file"])
    writer.writeheader()
    for row in rows:
        writer.writerow(row)

print(f"Saved {len(report)} text samples to {TXT_DIR}")
print("Report:", REPORT)
print("Summary CSV:", SUMMARY_CSV)


Loading dataset lhoestq/resumes-raw-pdf-for-ocr split=train (this may take a minute)...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating train split: 100%|██████████| 1585/1585 [00:00<00:00, 3149.88 examples/s]


Dataset loaded. Columns: ['label', 'images', 'text'] Num examples: 1585
Using text field: text


writing text:  32%|███▏      | 500/1585 [00:03<00:07, 139.17it/s]


Saved 500 text samples to ..\data\resumes_preocr\txt
Report: ..\data\resumes_preocr\report_preocr.json
Summary CSV: ..\data\resumes_preocr\summary_preocr.csv


In [10]:
# Step 3 (run): SpaCy NER + rule-based extraction
import json, re, sys
from pathlib import Path
from tqdm.auto import tqdm

# ensure spaCy model installed
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
except Exception as e:
    print("spaCy model en_core_web_sm not found. Install it and re-run:")
    print("    python -m spacy download en_core_web_sm")
    raise

ROOT = Path("../data/resumes_preocr")
TXT_DIR = ROOT / "txt"
OUT_EXTRACT = ROOT / "ner_extract.json"

EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{6,12}")

# small initial skills vocabulary — expand with your domain
SKILLS = {
    "python","java","c++","c","sql","nlp","natural language processing","deep learning",
    "machine learning","tensorflow","pytorch","react","docker","kubernetes","git",
    "aws","gcp","azure","linux","pandas","numpy","scikit-learn","keras"
}

def extract_skills(text):
    tl = text.lower()
    found = []
    for s in SKILLS:
        if s in tl:
            found.append(s)
    return sorted(found)

results = []
files = sorted(TXT_DIR.glob("*.txt"))
print("Processing", len(files), "text files with spaCy NER...")

for p in tqdm(files, desc="NER"):
    txt = p.read_text(encoding="utf-8", errors="ignore")
    doc = nlp(txt)
    names = []
    orgs = []
    gpes = []
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            names.append(ent.text)
        elif ent.label_ in ("ORG","NORP"):
            orgs.append(ent.text)
        elif ent.label_ == "GPE":
            gpes.append(ent.text)
    emails = EMAIL_RE.findall(txt)
    phones = PHONE_RE.findall(txt)
    skills = extract_skills(txt)
    results.append({
        "file": p.name,
        "names": list(dict.fromkeys(names))[:3],
        "orgs": list(dict.fromkeys(orgs))[:6],
        "locations": list(dict.fromkeys(gpes))[:4],
        "emails": emails,
        "phones": phones,
        "skills": skills,
        "chars": len(txt),
        "preview": txt[:600].replace("\n","\\n")
    })

# save results
with open(OUT_EXTRACT, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

# print quick stats
total = len(results)
emails_found = sum(1 for r in results if r["emails"])
phones_found = sum(1 for r in results if r["phones"])
skills_found = sum(1 for r in results if r["skills"])
avg_len = sum(r["chars"] for r in results) / total if total else 0

print("\n=== NER SUMMARY ===")
print(f"Files processed: {total}")
print(f"Emails found: {emails_found}")
print(f"Phones found: {phones_found}")
print(f"Files with detected skills: {skills_found}")
print(f"Average text length: {avg_len:.0f} chars")
print("Saved ner extract ->", OUT_EXTRACT)
print("\nSample entry (first file):")
if results:
    import pprint
    pprint.pprint(results[0], compact=True, width=120)
else:
    print("No text files found in", TXT_DIR)


Processing 500 text files with spaCy NER...


NER: 100%|██████████| 500/500 [03:54<00:00,  2.13it/s]


=== NER SUMMARY ===
Files processed: 500
Emails found: 1
Phones found: 43
Files with detected skills: 499
Average text length: 3762 chars
Saved ner extract -> ..\data\resumes_preocr\ner_extract.json

Sample entry (first file):
{'chars': 2812,
 'emails': [],
 'file': 'sample_00000.txt',
 'locations': ['Hanoi', 'Robot', 'Tkinter', 'Taiwan'],
 'names': ['Nguyen Dang Binh', 'Luster LightTech', 'Debug'],
 'orgs': ['AI/Computer Vision Engineer\nAddress', 'Vision Software Senior', 'BacNinh', 'AOI', 'Luxshare-ICT',
          'Medical Image Segmentation'],
 'phones': [],
 'preview': 'Nguyen Dang Binh – AI/Computer Vision Engineer\\nAddress: Bac Ninh City\\nE '
            'mail:\\nMobile/Zalo:\\nBirthday: Jan 8th 1986\\nK EY STRENGTHS\\nMachine vision \uf020Team '
            'Leader\\n\uf0a7\uf020\\n\uf0a7\uf020AI engineer Automation software\\nP ROFESSIONAL EXPERIENCE\\nVision '
            'Software Senior (Aug 2023 – Now): Luster LightTech, BacNinh, VN:\\n\uf0d8 Develop the low-code softwa




In [11]:
# Post-process NER outputs: normalize names, improve phone/email detection, expand skills, detect language
import json, re, os
from pathlib import Path
from collections import Counter
from pprint import pprint

ROOT = Path("../data/resumes_preocr")
NER_IN = ROOT / "ner_extract.json"
NER_OUT = ROOT / "ner_enhanced.json"
TXT_DIR = ROOT / "txt"

# load NER results
with open(NER_IN, "r", encoding="utf-8") as f:
    ner = json.load(f)

# stronger regexes
EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
# phone: look for sequences of digits with common separators, allow country code, require 7-15 digits total
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{1,4}\)?[-.\s]?)?(?:\d[-.\s]?){6,14}\d")
LINKEDIN_RE = re.compile(r"(linkedin\.com/[A-Za-z0-9_\-./]+|linkedin:[A-Za-z0-9_\-/]+)", re.I)
GITHUB_RE = re.compile(r"(github\.com/[A-Za-z0-9_.\-]+|github:[A-Za-z0-9_.\-]+)", re.I)

# expanded skills vocabulary (extendable)
MORE_SKILLS = [
    "python","java","c++","c","c#","sql","nlp","natural language processing","deep learning",
    "machine learning","tensorflow","pytorch","keras","scikit-learn","spark","hadoop",
    "pandas","numpy","matplotlib","seaborn","docker","kubernetes","aws","gcp","azure",
    "react","node.js","express","flask","django","rest api","graphql","git","linux",
    "bash","mlops","computer vision","opencv","pandas","communication","leadership",
    "excel","tableau","power bi","spark","hive","bigquery","seo","marketing","sales"
]
# normalize to lowercase for substring matching
MORE_SKILLS = list(dict.fromkeys([s.lower() for s in MORE_SKILLS]))

# optional fuzzy matching: use rapidfuzz if installed
try:
    from rapidfuzz import process, fuzz
    FUZZY_AVAILABLE = True
except Exception:
    FUZZY_AVAILABLE = False

# helper: load raw text for a sample file if present
def load_text_for_file(filename):
    txt_path = TXT_DIR / filename
    if txt_path.exists():
        return txt_path.read_text(encoding="utf-8", errors="ignore")
    return ""

# helper: clean orgs (remove entries that look like addresses or job titles)
JOB_TITLE_KEYWORDS = set(["engineer","developer","manager","senior","lead","intern","assistant","consultant",
                          "officer","analyst","specialist","architect","director","president","coordinator",
                          "supervisor"])
def clean_org_list(orgs):
    cleaned = []
    for o in orgs:
        s = o.strip()
        # skip if empty or obviously a sentence fragment or contains newline markers
        if not s or len(s) < 2:
            continue
        # skip if it contains 'address' or 'mobile' or 'email' (likely not org)
        low = s.lower()
        if any(x in low for x in ("address","mobile","email","phone","birthday","birth","cv","c.v","objective","profile")):
            continue
        # skip if it's too long garbage with many punctuation characters
        punct_ratio = sum(1 for ch in s if not ch.isalnum() and not ch.isspace()) / max(1,len(s))
        if punct_ratio > 0.25 and len(s) < 50:
            continue
        # optionally skip entries that are likely job titles (we want organizations)
        if any(jk in low for jk in JOB_TITLE_KEYWORDS):
            # allow if it contains a comma and an apparent company name later
            if "," not in s and len(s.split()) < 6:
                # likely a job title, skip
                continue
        cleaned.append(s)
    # dedupe keeping order
    seen = set()
    out = []
    for o in cleaned:
        if o not in seen:
            out.append(o); seen.add(o)
    return out

# helper: expand skills by substring (and fuzzy if available)
def extract_skills_from_text(text):
    tl = text.lower()
    found = set()
    for s in MORE_SKILLS:
        if s in tl:
            found.add(s)
    # fuzzy: if available, match tokens
    if FUZZY_AVAILABLE and not found:
        # take top fuzzy matches for single-word tokens
        tokens = set(re.findall(r"[A-Za-z0-9+#\.\-]+", tl))
        for tok in tokens:
            best = process.extractOne(tok, MORE_SKILLS, scorer=fuzz.partial_ratio)
            if best and best[1] >= 90:
                found.add(best[0])
    return sorted(found)

# try langdetect if available
try:
    from langdetect import detect, DetectorFactory
    DetectorFactory.seed = 0
    LANGDET_AVAILABLE = True
except Exception:
    LANGDET_AVAILABLE = False

# process and enhance
enhanced = []
counters = Counter()
for entry in ner:
    fname = entry.get("file")
    raw_text = load_text_for_file(fname)
    # combine original fields and raw text for better detection
    emails = entry.get("emails", []) or EMAIL_RE.findall(raw_text)
    phones = entry.get("phones", []) or PHONE_RE.findall(raw_text)
    linkedin = LINKEDIN_RE.findall(raw_text) or LINKEDIN_RE.findall(" ".join(entry.get("orgs",[])))
    github = GITHUB_RE.findall(raw_text) or GITHUB_RE.findall(" ".join(entry.get("orgs",[])))
    # choose primary name: prefer first PERSON entity; if no person, try first line of text
    primary_name = None
    if entry.get("names"):
        # pick first name-like token but avoid when it's obviously organization/job (heuristic)
        for nm in entry["names"]:
            if nm and len(nm) > 1 and not any(t.lower() in nm.lower() for t in ("engineer","developer","company","address","mobile")):
                primary_name = nm.strip()
                break
        if primary_name is None:
            primary_name = entry["names"][0].strip()
    if not primary_name and raw_text:
        # take the first non-empty line up to 80 chars as a fallback
        for ln in raw_text.splitlines():
            ln = ln.strip()
            if ln and len(ln) < 80:
                primary_name = ln
                break
    # clean orgs
    orgs_clean = clean_org_list(entry.get("orgs",[]))
    # expand skills
    text_for_skills = " ".join([raw_text, entry.get("preview","")])
    skills = sorted(set(entry.get("skills",[])) | set(extract_skills_from_text(text_for_skills)))
    # detect language
    lang = "unknown"
    if LANGDET_AVAILABLE and raw_text.strip():
        try:
            lang = detect(raw_text[:2000])
        except Exception:
            lang = "unknown"
    # contact status
    if emails or phones:
        contact_status = "found"
    else:
        # can't reliably detect redaction here (we didn't run redaction detector on this dataset)
        contact_status = "missing"
    # assemble enhanced entry
    e = {
        "file": fname,
        "primary_name": primary_name or "",
        "orgs": orgs_clean,
        "emails": emails,
        "phones": phones,
        "linkedin": linkedin,
        "github": github,
        "skills": skills,
        "chars": entry.get("chars", 0),
        "language": lang,
        "contact_status": contact_status,
        "original_preview": entry.get("preview","")
    }
    enhanced.append(e)
    counters.update([contact_status])
    if emails: counters.update(["has_email"])
    if phones: counters.update(["has_phone"])
    counters.update(["lang:"+lang])

# save
with open(NER_OUT, "w", encoding="utf-8") as f:
    json.dump(enhanced, f, indent=2)

# print a concise summary
total = len(enhanced)
print("Wrote enhanced NER ->", NER_OUT)
print("Total entries:", total)
print("Contact found:", counters["found"] if "found" in counters else sum(1 for e in enhanced if e["contact_status"]=="found"))
print("States:", {k:v for k,v in counters.items() if k.startswith("lang:") or k in ('has_email','has_phone')})
print("\nSample enhanced entry (first):")
pprint(enhanced[0])


Wrote enhanced NER -> ..\data\resumes_preocr\ner_enhanced.json
Total entries: 500
Contact found: 152
States: {'lang:en': 183, 'lang:vi': 315, 'has_phone': 152, 'lang:nl': 1, 'lang:zh-cn': 1, 'has_email': 1}

Sample enhanced entry (first):
{'chars': 2812,
 'contact_status': 'missing',
 'emails': [],
 'file': 'sample_00000.txt',
 'github': [],
 'language': 'en',
 'linkedin': [],
 'orgs': ['BacNinh', 'AOI', 'Luxshare-ICT', 'Medical Image Segmentation'],
 'original_preview': 'Nguyen Dang Binh – AI/Computer Vision '
                     'Engineer\\nAddress: Bac Ninh City\\nE '
                     'mail:\\nMobile/Zalo:\\nBirthday: Jan 8th 1986\\nK EY '
                     'STRENGTHS\\nMachine vision \uf020Team '
                     'Leader\\n\uf0a7\uf020\\n\uf0a7\uf020AI engineer '
                     'Automation software\\nP ROFESSIONAL EXPERIENCE\\nVision '
                     'Software Senior (Aug 2023 – Now): Luster LightTech, '
                     'BacNinh, VN:\\n\uf0d8 Develop th