In [None]:
import spacy
from spacy.tokens import Doc
import re

# Load the `en_core_web_sm` spaCy pipeline once at import time; the shared
# `nlp` object is what extract_entities() calls to run entity recognition.
nlp = spacy.load("en_core_web_sm")

# Lower-case titles of the resume sections the parser knows how to recognise.
SECTION_HEADERS = [
    "education",
    "work experience",
    "skills",
    "certifications",
    "projects",
    "publications",
    "summary",
    "objective",
    "interests",
]

# A single case-insensitive alternation with one word-bounded branch per
# header, in the same order as SECTION_HEADERS.
SECTION_REGEX = re.compile(
    "|".join(map(r"\b{}\b".format, SECTION_HEADERS)),
    flags=re.IGNORECASE,
)

def preprocess_text(text):
    """Clean raw resume text while keeping its line structure.

    Non-ASCII characters are replaced with spaces, runs of whitespace inside
    each line are collapsed to a single space, every line is stripped, and
    empty lines are dropped. Line breaks are normalised to "\\n".

    Line boundaries are preserved on purpose: segment_sections() splits the
    text on newlines to find section headers, so flattening everything onto
    one line (as a global ``\\s+`` substitution does) would leave it with
    nothing to segment.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text, one non-empty line per "\\n"-separated entry; an
        empty string if nothing but whitespace/non-ASCII was supplied.
    """
    # Replace non-ASCII first so the spaces it introduces are collapsed by the
    # per-line whitespace pass below instead of leaving double spaces behind.
    text = re.sub(r"[^\x00-\x7F]+", " ", text)

    cleaned_lines = []
    for line in text.splitlines():
        line = re.sub(r"\s+", " ", line).strip()
        if line:
            cleaned_lines.append(line)

    return "\n".join(cleaned_lines)

def segment_sections(text):
    """Split text into sections keyed by their header line.

    The text is processed line by line (split on "\\n"); blank lines are
    ignored. A line is treated as a header when SECTION_REGEX matches at its
    start, and the whole stripped, lower-cased line becomes the section key.
    Every following non-header line is added to that section. Lines that
    appear before the first header are discarded.

    If the same header line occurs more than once, the later content is
    appended to the existing section rather than replacing it.

    Args:
        text: Newline-separated text to segment.

    Returns:
        A dict mapping each section key to its content lines joined with a
        single space, in order of first appearance.
    """
    collected = {}
    current_section = None

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        if SECTION_REGEX.match(line):
            current_section = line.lower()
            # setdefault (not plain assignment) so a repeated header does not
            # wipe out content already gathered under the same key.
            collected.setdefault(current_section, [])
        elif current_section is not None:
            collected[current_section].append(line)

    return {section: " ".join(content) for section, content in collected.items()}

def extract_entities(section_text):
    """Run the shared spaCy pipeline over a section and bucket its entities.

    Args:
        section_text: Text of a single resume section.

    Returns:
        A dict with the keys "ORG", "PERSON", "DATE", "SKILL" and "TITLE",
        each mapping to the list of entity texts carrying that label, in
        document order. Entities with any other label are ignored.
    """
    # NOTE(review): "SKILL" and "TITLE" are presumably custom labels; confirm
    # the loaded pipeline actually emits them, otherwise they stay empty.
    tracked_labels = ("ORG", "PERSON", "DATE", "SKILL", "TITLE")
    found = {label: [] for label in tracked_labels}

    for span in nlp(section_text).ents:
        bucket = found.get(span.label_)
        if bucket is not None:
            bucket.append(span.text)

    return found

def parse_resume(resume_text):
    """Parse a resume into per-section content and extracted entities.

    The text is cleaned with preprocess_text(), split with
    segment_sections(), and each section's content is passed through
    extract_entities().

    Args:
        resume_text: Raw resume text.

    Returns:
        A dict mapping each section key to
        ``{"content": <section text>, "entities": <entity dict>}``.
    """
    cleaned = preprocess_text(resume_text)
    return {
        name: {"content": body, "entities": extract_entities(body)}
        for name, body in segment_sections(cleaned).items()
    }

# Example usage: parse one resume file and print each detected section.
if __name__ == "__main__":
    # Raw string is required for this Windows path: in a normal literal "\U"
    # starts a \UXXXXXXXX escape (a SyntaxError here) and "\r" would silently
    # become a carriage return.
    resume_path = r"C:\Users\Asura\Documents\Portfolio\resume_ds_2.pdf"

    # NOTE(review): the target is a .pdf but it is opened in text mode; a PDF
    # is a binary container, so read() is unlikely to yield the resume's
    # text. Confirm the intended input and extract the text with a PDF
    # reader before calling parse_resume() if it really is a PDF.
    with open(resume_path, "r") as file:
        resume_text = file.read()

    parsed_resume = parse_resume(resume_text)

    for section, data in parsed_resume.items():
        print(f"\n[{section.upper()}]")
        print(f"Content: {data['content']}")
        print(f"Entities: {data['entities']}")


FileNotFoundError: [Errno 2] No such file or directory: 'resume_ds_2.pdf'

In [1]:
import pandas