1. Import all required modules from src/

In [12]:
import sys, os
from pathlib import Path

# The notebook normally runs with notebooks/ as the working directory, so the
# parent directory is taken as the repo root and put at the front of sys.path
# (only when it is not already listed) to make the `src.*` imports resolvable.
repo_root = Path("..").resolve()
if all(entry != str(repo_root) for entry in sys.path):
    sys.path.insert(0, str(repo_root))

# Now import our modules
from src.pipeline import process_pdf_bytes
from src.extract import extract_text_from_pdf_bytes, sniff_is_text_pdf
from src.parse import parse_basic_fields
from src.score import enriched_score
from src.summarize import summarize_with_model
from src.review import generate_review_from_summary
from src.io import save_json, load_json

print("Imported pipeline helpers from src/")

Imported pipeline helpers from src/


In [15]:
# Patch src/score.py to make enriched_score accept optional summary, reload modules, and re-run smoke test.
from pathlib import Path
import importlib, sys, traceback

# Paths are resolved relative to the working directory (expected to be notebooks/).
repo_root = Path("..").resolve()
src_dir = repo_root / "src"
score_path = src_dir / "score.py"
# Only patch a file that already exists, so a wrong working directory fails here
# instead of creating a stray score.py somewhere else.
assert score_path.exists(), f"Expected {score_path} to exist"

# === 1) Overwrite src/score.py with corrected content ===
# NOTE: this replaces the whole file on disk with the text below.
# Raw string so the embedded source is written out exactly as typed.
score_code = r'''
from typing import Any, Dict, Optional

def baseline_score(parsed: Dict[str, Any]) -> float:
    """
    Simple baseline score combining:
      - presence of contact details (any email or phone): 30 points
      - number of skills, capped at 10: 5 points each (up to 50 points)
      - length of the `preview` text: 5 / 10 / 20 points above 800 / 2000 / 4000 characters
    Fields that are missing or set to None are treated as empty.
    Returns score in 0-100 (not calibrated), rounded to 3 decimals.
    """
    score = 0.0
    has_contact = bool(parsed.get("emails") or parsed.get("phones"))
    score += 30.0 if has_contact else 0.0
    # `or []` / `or ""` cover keys that are present but hold None, which would
    # otherwise make len() raise TypeError.
    n_skills = len(parsed.get("skills") or [])
    score += min(n_skills, 10) * 5.0   # up to 50 points
    chars = len(parsed.get("preview") or "")
    if chars > 4000:
        score += 20.0
    elif chars > 2000:
        score += 10.0
    elif chars > 800:
        score += 5.0
    return round(min(100.0, score), 3)

def enriched_score(parsed: Dict[str, Any], summary: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Compute base score and return a dict with details.

    The `summary` argument is optional to support callers that pass it or not;
    it is accepted for signature compatibility and is not used in the score.

    Returns {"score": <baseline score>, "reasons": {"has_contact": bool, "n_skills": int}}.
    """
    base = baseline_score(parsed)
    return {
        "score": base,
        "reasons": {
            "has_contact": bool(parsed.get("emails") or parsed.get("phones")),
            "n_skills": len(parsed.get("skills") or []),
        },
    }
'''
score_path.write_text(score_code, encoding="utf-8")
print(f"Patched {score_path}")

# === 2) Reload modules so changes are visible in this running kernel ===
# ensure repo root is on sys.path
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

# Import/reload errors are intentionally NOT caught here. If either module fails
# to load, the cell must stop: continuing would run the smoke test below against
# stale code, or hit a NameError on `pipeline_mod` that hides the real failure.
# src.score is reloaded before src.pipeline (presumably src.pipeline imports from
# src.score - verify), so the pipeline re-executes against the patched scoring code.
import src.score as score_mod
score_mod = importlib.reload(score_mod)
print("Reloaded src.score")

import src.pipeline as pipeline_mod
pipeline_mod = importlib.reload(pipeline_mod)
print("Reloaded src.pipeline")

# === 3) Re-run the same smoke test you ran before (adjust path if necessary) ===
from pathlib import Path
from pprint import pprint

sample_pdf = Path("../data/resumes_preocr/pdfs/sample_00001.pdf")
if not sample_pdf.exists():
    # Fall back to the first PDF in the same folder. The listing is sorted so the
    # fallback is the same file on every run instead of depending on directory order.
    candidates = sorted(sample_pdf.parent.glob("*.pdf")) if sample_pdf.parent.exists() else []
    if candidates:
        sample_pdf = candidates[0]
    else:
        raise FileNotFoundError("No sample PDF found at ../data/resumes_preocr/pdfs/*.pdf")

pdf_bytes = sample_pdf.read_bytes()
print("Processing:", sample_pdf.name)
# Call through the freshly reloaded module object so the patched code is exercised.
out = pipeline_mod.process_pdf_bytes(pdf_bytes, resume_id=sample_pdf.name, model_fn=None)

# Show a compact view of the result; long text fields are cut to 300 characters.
pprint({
    "parsed_keys": list(out["parsed"].keys()),
    "emails": out["parsed"].get("emails"),
    "phones": out["parsed"].get("phones"),
    "skills": out["parsed"].get("skills"),
    "score": out["score"],
    "summary_preview": out["summary"]["summary"][:300],
    "short_review": out["review"]["short_review"][:300]
})


Patched D:\Work\Capstone_Project\resume-nlp\src\score.py
Reloaded src.score
Reloaded src.pipeline
Processing: Abhishek_Singh_Resume.pdf
{'emails': ['abhisheksingh.vizag@gmail.com'],
 'parsed_keys': ['primary_name',
                 'emails',
                 'phones',
                 'skills',
                 'orgs',
                 'preview',
                 'education',
                 'resume_id'],
 'phones': ['8010852459'],
 'score': {'reasons': {'has_contact': True, 'n_skills': 5}, 'score': 60.0},
 'short_review': 'Detected skills: python, c++, sql, docker, git.',
 'skills': ['python', 'c++', 'sql', 'docker', 'git'],
 'summary_preview': 'Abhishek Singh — 5 skills detected: python, c++, sql, '
                    'docker, git.'}


In [16]:
# Cell C: smoke test — process a single PDF using the new pipeline
from pathlib import Path
from pprint import pprint

sample_pdf = Path("../data/resumes_preocr/pdfs/Abhishek_Singh_Resume.pdf")
if not sample_pdf.exists():
    # Attempt to find any PDF in that folder. The listing is sorted so the fallback
    # is the same file on every run instead of depending on directory order.
    candidates = sorted(sample_pdf.parent.glob("*.pdf")) if sample_pdf.parent.exists() else []
    if candidates:
        sample_pdf = candidates[0]
    else:
        raise FileNotFoundError("No sample PDF found. Put a sample PDF at ../data/resumes_preocr/pdfs/ or update path.")

pdf_bytes = sample_pdf.read_bytes()
print("Processing:", sample_pdf.name)
out = process_pdf_bytes(pdf_bytes, resume_id=sample_pdf.name, model_fn=None)   # model_fn=None => rule-based summary

# Quick look at the result; long text fields are cut to 300 characters.
pprint({
    "parsed_keys": list(out["parsed"].keys()),
    "emails": out["parsed"].get("emails"),
    "phones": out["parsed"].get("phones"),
    "skills": out["parsed"].get("skills"),
    "score": out["score"],
    "summary_preview": out["summary"]["summary"][:300],
    "short_review": out["review"]["short_review"][:300]
})


Processing: Abhishek_Singh_Resume.pdf
{'emails': ['abhisheksingh.vizag@gmail.com'],
 'parsed_keys': ['primary_name',
                 'emails',
                 'phones',
                 'skills',
                 'orgs',
                 'preview',
                 'education',
                 'resume_id'],
 'phones': ['8010852459'],
 'score': {'reasons': {'has_contact': True, 'n_skills': 5}, 'score': 60.0},
 'short_review': 'Detected skills: python, c++, sql, docker, git.',
 'skills': ['python', 'c++', 'sql', 'docker', 'git'],
 'summary_preview': 'Abhishek Singh — 5 skills detected: python, c++, sql, '
                    'docker, git.'}


In [17]:
# Cell D: run the modular pipeline over the first N resumes and write one JSON file per resume
from pathlib import Path

DATA_DIR = Path("../data/resumes_preocr")
PDF_DIR = DATA_DIR / "pdfs"
OUT_DIR = DATA_DIR / "modular_outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Upper bound on how many PDFs are processed; keep it small for a trial run and
# raise it to cover the whole folder.
N = 20
pdf_paths = sorted(PDF_DIR.glob("*.pdf"))[:N]

# One record per input file: {"file", "ok", "score"} on success,
# {"file", "ok", "error"} when anything in the pipeline or the save step raises.
results = []
for pdf_path in pdf_paths:
    try:
        res = process_pdf_bytes(pdf_path.read_bytes(), resume_id=pdf_path.name, model_fn=None)
        save_json(res, OUT_DIR / f"{pdf_path.stem}.json")
        outcome = {"file": pdf_path.name, "ok": True, "score": res["score"]["score"]}
    except Exception as exc:
        # A failing resume is recorded and the batch moves on to the next file.
        outcome = {"file": pdf_path.name, "ok": False, "error": str(exc)}
    results.append(outcome)
    print(f"Processed {pdf_path.name} -> {outcome}")
print("Done. Saved per-resume JSON to", OUT_DIR)


Processed Abhishek_Singh_Resume.pdf -> {'file': 'Abhishek_Singh_Resume.pdf', 'ok': True, 'score': 60.0}
Done. Saved per-resume JSON to ..\data\resumes_preocr\modular_outputs
