In [12]:
import spacy
import re
import pdfplumber  # For PDF text extraction
from pypdf import PdfReader 

# Load spaCy's language model
nlp = spacy.load("en_core_web_sm")

# Define key section headers commonly found in resumes
SECTION_HEADERS = [
    "education",
    "work experience",
    "skills",
    "certifications",
    "projects",
    "publications",
    "summary",
    "objective",
    "interests"
]



In [22]:
# Compile regex for matching section headers
SECTION_REGEX = re.compile(r"|".join([rf"\\b{header}\\b" for header in SECTION_HEADERS]), re.IGNORECASE)

def preprocess_text(text):
    """
    Cleans and preprocesses text for parsing.
    - Removes excessive whitespace.
    - Retains essential symbols and characters.
    - Converts text to consistent formatting where appropriate.
    """
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)  # Replace multiple spaces/newlines with a single space
    # Retain ASCII and common printable characters while removing control characters
    text = re.sub(r"[^\x20-\x7E]", "", text)
    # Strip leading and trailing whitespace
    return text.strip()

# def extract_text_from_pdf(pdf_path):
#     """Extracts text from a PDF file."""
#     text = ""
#     try:
#         with pdfplumber.open(pdf_path) as pdf:
#             for page in pdf.pages:
#                 text += page.extract_text() + "\n"
#     except Exception as e:
#         print(f"Error reading PDF: {e}")
#     return text

def segment_sections(text):
    """Segments text into sections based on headers."""
    sections = {}
    lines = text.split("\\n")
    print(len(lines))

    current_section = None
    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Check if the line matches a section header
        if SECTION_REGEX.match(line):
            current_section = line.lower()
            sections[current_section] = []
        elif current_section:
            sections[current_section].append(line)

    # Join section content
    for section, content in sections.items():
        sections[section] = " ".join(content)

    return sections

def extract_entities(section_text):
    """Uses spaCy to extract entities from section text."""
    doc = nlp(section_text)
    entities = {
        "ORG": [],
        "PERSON": [],
        "DATE": [],
        "SKILL": [],
        "TITLE": []
    }

    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)

    return entities

In [23]:

def parse_resume(resume_path):
    # """Main function to parse a resume and extract structured data."""
    # if file_format == "pdf":
    #     raw_text = extract_text_from_pdf(resume_path)
    # elif file_format == "txt":
    #     with open(resume_path, "r") as file:
    #         raw_text = file.read()
    # else:
    #     raise ValueError("Unsupported file format. Please use 'txt' or 'pdf'.")

    reader = PdfReader(resume_path) 
    raw_text = ""

    for page_no in range(len(reader.pages)):
        page = reader.pages[page_no] 
        raw_text += page.extract_text()

    processed_text = preprocess_text(raw_text)
    sections = segment_sections(processed_text)
    print(sections)


    parsed_data = {}
    for section, content in sections.items():
        parsed_data[section] = {
            "content": content,
            "entities": extract_entities(content)
        }

    return parsed_data

# Example usage
if __name__ == "__main__":
    resume_path = "resume_ds_2.pdf"
    file_format = "pdf"  # Specify "pdf" or "txt"

    parsed_resume = parse_resume(resume_path)
    print(parsed_resume)
    for section, data in parsed_resume.items():
        print(f"\\n[{section.upper()}]")
        print(f"Content: {data['content']}")
        print(f"Entities: {data['entities']}")
    


1
{}
{}
