In [1]:
import pdfplumber
import re

def extract_pdf_text(pdf_path: str) -> str:
    """
    Return the text of every page of the PDF at ``pdf_path`` as one string.

    Each page is extracted with ``layout=False`` (plain reading-order text
    rather than a visually spaced layout); switch the flag to ``True`` if the
    document's spacing/columns need to be preserved instead.

    Pages for which the extractor returns a falsy value (nothing extracted)
    are skipped. The remaining page texts are joined with a single newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        per_page = (page.extract_text(layout=False) for page in pdf.pages)
        # The generator is consumed by join() while the PDF is still open.
        return "\n".join(chunk for chunk in per_page if chunk)

def is_section_header(line: str) -> bool:
    """
    Decide whether ``line`` looks like a major section heading
    (for example "EXPERIENCE" or "WORK EXPERIENCE").

    After stripping surrounding whitespace the line must:
      * be uppercase according to ``str.isupper``,
      * be between 3 and 40 characters long (inclusive), and
      * not be a bullet according to ``is_bullet``.
    """
    candidate = line.strip()

    # Guard clauses, evaluated in the same order as the combined condition.
    if not candidate.isupper():
        return False
    if len(candidate) < 3 or len(candidate) > 40:
        return False
    return not is_bullet(candidate)

def is_bullet(line: str) -> bool:
    """
    Report whether ``line`` (ignoring surrounding whitespace) begins with a
    bullet marker: "•", "●", or a hyphen followed by a space.
    """
    return line.strip().startswith(("•", "- ", "●"))

# Compiled once; used by is_date_range.
#   1. Month name/abbreviation (optionally with a trailing ".") followed by a
#      4-digit number, e.g. "Jun 2025", "Sept. 2021", "October 2024".
#   2. The standalone word "Present" (word-bounded so that "presented" or
#      "representation" do not count).
#   3. A standalone 4-digit year 1900-2099. The digit lookarounds keep longer
#      numbers such as "15000" or a zip code from being read as a year, while
#      still accepting "2024-2025" and "Sep2020".
_DATE_RANGE_RE = re.compile(
    r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{4}(?!\d)"
    r"|\bPresent\b"
    r"|(?<!\d)(?:19|20)\d{2}(?!\d)",
    re.IGNORECASE,
)


def is_date_range(line: str) -> bool:
    """
    Detect whether ``line`` contains a date or date range, which the parser
    uses as a signal for the start of a new entry.

    Matches (case-insensitively), anywhere in the line:
        "Jun 2025 - Present", "2024-2025", "Oct 2024", "Sep 2020-Dec 2024"

    Deliberately does NOT match:
        * words that merely contain "present" ("presented", "representation"),
        * digit runs that are not a standalone 19xx/20xx year ("15000", "10027").

    Args:
        line: A single line of resume text.

    Returns:
        True if a date-like token is found, otherwise False.
    """
    return _DATE_RANGE_RE.search(line) is not None

def parse_resume_entries(text: str) -> list:
    """
    Walk resume text line by line and attach every bullet to the section and
    entry (job / project title) it appears under.

    The parser keeps two pieces of state while scanning:
      * ``current_section`` - the last line accepted by ``is_section_header``
        (starts as "UNCATEGORIZED").
      * ``current_entry``   - the last detected title (starts as "General" and
        is reset whenever a new section header is seen).

    A non-bullet line inside one of the entry-bearing sections is treated as a
    title when the line after it is either a bullet or a date line. A date
    line is folded into the entry label as ``"<title> (<date line>)"`` and
    consumed; a bullet line is left in place so it is captured normally.

    Bullet text that wraps onto following lines is merged until the next
    bullet, section header, or date-like line.

    Args:
        text: Raw resume text with newline-separated lines.

    Returns:
        A list of dicts, one per bullet, each with the keys
        ``"section"``, ``"entry"`` and ``"text"``. Empty input yields ``[]``.
    """
    # Drop whitespace-only lines; keep leading indentation until each line is
    # stripped at its point of use.
    lines = [raw.rstrip() for raw in text.split("\n") if raw.strip()]

    results = []
    current_section = "UNCATEGORIZED"
    current_entry = "General"

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # ---------------------------------------------------------
        # 1. Detect SECTION HEADER
        # ---------------------------------------------------------
        if is_section_header(line):
            current_section = line
            current_entry = None  # Reset entry on new section
            i += 1
            continue

        # ---------------------------------------------------------
        # 2. Detect NEW ENTRY (Job or Project Title)
        # Only inside entry-bearing sections, only for non-bullet lines, and
        # only when there is a following line to look at.
        # ---------------------------------------------------------
        if (
            current_section in ["EXPERIENCE", "WORK EXPERIENCE", "PROJECTS", "LEADERSHIP EXPERIENCE"]
            and not is_bullet(line)
            and i + 1 < len(lines)
        ):
            next_line = lines[i + 1].strip()

            # Check "bullet" BEFORE "date": a bullet's text may itself contain
            # a year or number that is_date_range accepts. Such a bullet must
            # stay in the stream to be captured below, not be pasted into the
            # entry label and skipped.
            if is_bullet(next_line):
                current_entry = line
                i += 1  # Just skip Title
                continue

            # A genuine date line directly under the title: keep it as
            # metadata in the entry label and consume both lines.
            if is_date_range(next_line):
                current_entry = f"{line} ({next_line})"
                i += 2  # Skip Title and Date
                continue

        # ---------------------------------------------------------
        # 3. Capture BULLETS
        # ---------------------------------------------------------
        if is_bullet(line):
            # Clean the bullet marker
            bullet_text = re.sub(r"^[•\-●]\s*", "", line).strip()

            # Merge wrapped continuation lines. Stop at the next bullet, the
            # next section header, or a date-like line (start of a new entry).
            j = i + 1
            while j < len(lines):
                nxt = lines[j].strip()
                if is_bullet(nxt) or is_section_header(nxt) or is_date_range(nxt):
                    break
                bullet_text += " " + nxt
                j += 1

            results.append({
                "section": current_section,
                "entry": current_entry or "General",
                "text": bullet_text
            })

            i = j  # Resume scanning at the line that ended the bullet
            continue

        # Line fits no category (address, contact info, noise, ...): skip it.
        i += 1

    return results

In [2]:
# ==========================================
# Execution Block
# ==========================================
if __name__ == "__main__":
    # Location of the resume to parse; point this at your own PDF.
    pdf_path = "../user_data/raw/resume_ML/ChungYeh_Yang_resume.pdf"

    # Step 1: pull the raw text out of the PDF.
    raw_text = extract_pdf_text(pdf_path)

    # Step 2: turn the raw text into one record per bullet.
    parsed_data = parse_resume_entries(raw_text)

    # Step 3: report how many bullets were found, then preview them.
    print(f"Found {len(parsed_data)} bullets.\n")

    # Only Projects/Experience sections are previewed, to keep the output short.
    for item in parsed_data:
        if item['section'] not in ("PROJECTS", "WORK EXPERIENCE", "EXPERIENCE"):
            continue
        print(f"[{item['section']}] {item['entry']}")
        print(f"   -> {item['text'][:100]}...")  # first 100 characters only
        print("-" * 50)

Found 22 bullets.

[WORK EXPERIENCE] PalAI New York, NY (AI/ML Intern Sep 2025 – Present)
   -> Engineered and prototyped a core ML and AI-driven college application and essay feedback platform...
--------------------------------------------------
[WORK EXPERIENCE] PalAI New York, NY (AI/ML Intern Sep 2025 – Present)
   -> Designed a data pipeline to standardize and augment 100+ essay/feedback pairs and fine-tuning the mo...
--------------------------------------------------
[WORK EXPERIENCE] PalAI New York, NY (AI/ML Intern Sep 2025 – Present)
   -> Collaborated with cross-functional backend and frontend engineers to deploy ML models into a product...
--------------------------------------------------
[WORK EXPERIENCE] Research Assistant in Applied AI in Business Aug 2023 - Jan 2025
   -> Collaborated with 5+ academic researchers to design and implement four AI solutions tailored to spec...
--------------------------------------------------
[WORK EXPERIENCE] Research Assistant in Appl