In [1]:
# Resume_Parser.ipynb

# Step 1: Install required packages
!pip install pdfplumber python-docx spacy nltk

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
     ---------------------------------------- 0.0/42.8 kB ? eta -:--:--
     ------------------------------------ - 41.0/42.8 kB 653.6 kB/s eta 0:00:01
     -------------------------------------- 42.8/42.8 kB 524.4 kB/s eta 0:00:00
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl.metadata (48 kB)
     ---------------------------------------- 0.0/48.2 kB ? eta -:--:--
     ---------------------------------------- 48.2/48.2 kB 1.2 MB/s eta 0:00:00
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
   ---------------------------------------- 0.0/60.2 kB ? eta -:--:--
   ---------------------------------------- 60.2/60.2 kB 3.3 MB/s eta 0:

In [3]:
import os
import re
import spacy
import pdfplumber
import docx
import nltk
import ipywidgets as widgets
from IPython.display import display
from nltk.tokenize import sent_tokenize



In [4]:
# Download necessary NLTK data.
# NOTE(review): recent NLTK releases load sentence-tokenizer data from the
# 'punkt_tab' package rather than 'punkt'; fetching both keeps `sent_tokenize`
# usable regardless of which NLTK version the unpinned install resolved to.
nltk.download('punkt_tab')
nltk.download('punkt')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 660.6 kB/s eta 0:00:20
     ---------------------------------------- 0.1/12.8 MB 1.5 MB/s eta 0:00:09
      --------------------------------------- 0.2/12.8 MB 2.0 MB/s eta 0:00:07
      --------------------------------------- 0.3/12.8 MB 2.0 MB/s eta 0:00:07
     - -------------------------------------- 0.4/12.8 MB 2.3 MB/s eta 0:00:06
     - -------------------------------------- 0.4/12.8 MB 2.4 MB/s eta 0:00:06
     - -------------------------------------- 0.6/12.8 MB 2.4 MB/s eta 0:00:06
     - -------------------------------------- 0.6/12.8 MB 2.0 MB/s eta 0:00:06
     - -------------------------------------- 0.6/12.8 MB 1.9 MB/s eta 0:00:07
     - ---------------------------------

In [7]:
# Load the small English spaCy pipeline installed by the download cell
nlp = spacy.load(name='en_core_web_sm')

In [11]:
# 🧠 Step 2: Define Text Extraction Functions

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, one page after another.

    Args:
        file_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A single string in which each page's extracted text is followed by a
        newline. Pages for which ``extract_text()`` returns nothing (``None``
        or an empty string) contribute nothing to the result.
    """
    page_chunks = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                # Nothing extractable on this page.
                continue
            page_chunks.append(page_text + "\n")
    return "".join(page_chunks)

def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Args:
        file_path: Location of the document, passed to ``docx.Document``.

    Returns:
        The ``text`` of each entry in ``document.paragraphs`` joined with
        newlines. Only ``paragraphs`` is read; nothing else in the document
        is consulted.
    """
    document = docx.Document(file_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)


In [41]:
# Ask the user for the resume location.
# Paths pasted from a file manager are often wrapped in quotes and may carry
# stray whitespace, so both are removed before the path is used.
resume_path = input("Enter the full path to your resume (.pdf or .docx): ").strip().strip('"\'')
print(resume_path)

# Pick the extractor from the extension; the comparison is case-insensitive
# so that e.g. "Resume.PDF" is accepted as well.
resume_path_lower = resume_path.lower()
if resume_path_lower.endswith(".pdf"):
    resume_text = extract_text_from_pdf(resume_path)
elif resume_path_lower.endswith(".docx"):
    resume_text = extract_text_from_docx(resume_path)
else:
    # Stop here instead of continuing: otherwise `resume_text` is either
    # undefined (NameError in the next cell) or silently left over from a
    # previous run of this cell.
    raise ValueError(
        f"Unsupported file type: {resume_path!r} (expected a .pdf or .docx file)"
    )


# "D:\OneDrive\Desktop\AbhinavGuptaResume2025.pdf"
# D:\\OneDrive\\Desktop\\AbhinavGuptaResume2025.pdf
# "D:\OneDrive\Desktop\Arya_SinghResume___.pdf"
# D:\\OneDrive\\Desktop\\Blue Simple Professional CV Resume.pdf

Enter the full path to your resume (.pdf or .docx):  D:\\OneDrive\\Desktop\\Blue Simple Professional CV Resume.pdf


D:\\OneDrive\\Desktop\\Blue Simple Professional CV Resume.pdf


In [42]:
resume_text

"RICHARD SANCHEZ\nMARKETING MANAGER\nCONTACT PROFILE\n+123-456-7890 Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor\nincididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam quis\nhello@reallygreatsite.com nostrud exercitation. Lorem ipsum dolor sit amet, consectetur adipiscing elit,\nsed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad\n123 Anywhere St., Any City minim veniam quis nostrud exercitation. Ut enim ad minim veniam quis nostrud\nexercitation.\nwww.reallygreatsite.com\nWORK EXPERIENCE\nEDUCATION\nBorcelle Studio 2030 - PRESENT\nMarketing Manager & Specialist\n2029 - 2030\nWARDIERE UNIVERSITY\nDevelop and execute comprehensive marketing strategies and\nMaster of Business campaigns that align with the company's goals and objectives.\nManagement Lead, mentor, and manage a high-performing marketing team,\nfostering a collaborative and results-driven work environment.\nMonitor brand consistency across marketing c

In [43]:
import re

def extract_section_by_heading(text, heading_keywords):
    """Split text into sections keyed by the heading lines found in it.

    A line is treated as a heading when, after stripping surrounding
    whitespace, it consists solely of one of ``heading_keywords`` (compared
    case-insensitively), optionally followed by a colon. A line that merely
    contains a keyword next to other words is NOT a heading. Every non-blank
    line after a heading is collected into that section until the next
    heading line; lines before the first heading are discarded.

    Args:
        text: The text to split; it is processed line by line.
        heading_keywords: Iterable of heading names. Each name is matched
            literally, so names containing regex metacharacters (such as
            "C++") are safe. The iterable is consumed once up front, so a
            one-shot iterator works too.

    Returns:
        A dict mapping each lower-cased heading keyword to its section
        content, with the stripped content lines joined by newlines.
        Headings with no content lines are omitted. If a heading occurs more
        than once, a later non-empty occurrence replaces the earlier one.
    """
    # Compile once instead of per line; re.escape keeps keywords literal.
    heading_patterns = [
        (heading.lower(), re.compile(rf"^{re.escape(heading)}\s*:?\s*$", re.IGNORECASE))
        for heading in heading_keywords
    ]

    sections = {}
    current_section = None
    current_content = []

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # First keyword (in the given order) whose pattern matches the line.
        matched_section = next(
            (key for key, pattern in heading_patterns if pattern.match(line)),
            None,
        )
        if matched_section is not None:
            # A new heading closes the section collected so far.
            if current_section and current_content:
                sections[current_section] = "\n".join(current_content)
            current_section = matched_section
            current_content = []
        elif current_section:
            current_content.append(line)

    # Store whatever was collected under the final heading.
    if current_section and current_content:
        sections[current_section] = "\n".join(current_content)

    return sections


# Headings to pull out of the resume text.
# NOTE(review): a heading must occupy a whole line to be recognised, so a
# line such as "WORK EXPERIENCE" is not picked up by the "EXPERIENCE" keyword.
headings = ["SKILLS", "EXPERIENCE", "PROJECTS"]
extracted_sections = extract_section_by_heading(
    text=resume_text,
    heading_keywords=headings,
)


In [44]:
# Clean SKILLS into a list
def clean_skills(skills_text):
    """Break a raw skills section into a flat list of entries.

    The text is split on colons, commas, bullet characters ("•") and
    newlines; each piece is stripped of surrounding whitespace and empty
    pieces are dropped.

    Args:
        skills_text: Raw text of the skills section.

    Returns:
        List of non-empty, stripped pieces in their original order.
    """
    cleaned = []
    for piece in re.split(r'[:,•\n]', skills_text):
        trimmed = piece.strip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned

# Extract links from a string
def extract_links(text):
    """Find web links (http://, https:// or www.) in a piece of text.

    Candidates are runs of non-whitespace characters starting with
    ``http://``, ``https://`` or ``www.``. Because such a run also swallows
    punctuation that belongs to the surrounding sentence, trailing
    ``. , ; : ! ?`` and quote characters are removed, as are closing
    brackets that have no matching opening bracket inside the link (so
    ``(www.foo.com).`` yields ``www.foo.com`` while a URL ending in a
    balanced ``(...)`` is kept intact).

    Args:
        text: Text to scan.

    Returns:
        List of links in order of appearance; empty if none are found.
    """
    link_pattern = r'https?://\S+|www\.\S+'
    opening_for = {")": "(", "]": "[", "}": "{", ">": "<"}

    links = []
    for candidate in re.findall(link_pattern, text):
        while candidate:
            last_char = candidate[-1]
            if last_char in ".,;:!?\"'":
                candidate = candidate[:-1]
            elif (
                last_char in opening_for
                and candidate.count(last_char) > candidate.count(opening_for[last_char])
            ):
                # Unbalanced closing bracket: it closes something outside the link.
                candidate = candidate[:-1]
            else:
                break
        # Drop candidates that were nothing but a scheme/prefix plus punctuation.
        if re.fullmatch(link_pattern, candidate):
            links.append(candidate)
    return links

# Clean PROJECTS into structured format
def clean_projects(projects_text):
    """Turn a raw projects section into a list of structured entries.

    The text is split wherever a newline is followed by an uppercase ASCII
    letter, on the assumption that each project starts on such a line.
    Blank entries are skipped.

    For each entry the title is the text after an optional leading bullet
    ("•" or "-") up to the first "(" / "[" or the first colon. A colon that
    is part of "://" is ignored so a URL on the title line does not cut the
    title short. When no such delimiter exists, the first line of the entry
    (without its leading bullet) is used as the title.

    Args:
        projects_text: Raw text of the projects section.

    Returns:
        List of dicts with keys ``"title"``, ``"description"`` (the whole
        stripped entry) and ``"links"`` (result of ``extract_links`` on the
        entry).
    """
    projects = []
    # A new project likely starts with a capitalized line.
    for entry in re.split(r'\n(?=[A-Z])', projects_text):
        description = entry.strip()
        if not description:
            continue

        title_match = re.match(r'^[•-]?\s*(.+?)(?:\s*[\(\[]|:(?!//))', description)
        if title_match:
            title = title_match.group(1).strip()
        else:
            # Same bullet handling as the regex path, applied to the first line.
            first_line = description.split("\n")[0]
            title = re.sub(r'^[•-]?\s*', '', first_line).strip()

        projects.append({
            "title": title,
            "description": description,
            "links": extract_links(entry)
        })
    return projects

# Clean EXPERIENCE into a list of roles/descriptions
def clean_experience(exp_text):
    """Split a raw experience section into individual entries.

    A new entry begins wherever a newline is immediately followed by an
    uppercase ASCII letter; lines starting with anything else stay attached
    to the preceding entry.

    Args:
        exp_text: Raw text of the experience section.

    Returns:
        List of stripped, non-empty entries in their original order.
    """
    roles = []
    for chunk in re.split(r'\n(?=[A-Z])', exp_text):
        trimmed = chunk.strip()
        if trimmed:
            roles.append(trimmed)
    return roles


In [45]:
# Clean each extracted section; a section that was not found falls back to
# an empty string, which every cleaner turns into an empty list.
skills, experience, projects = (
    clean_skills(extracted_sections.get("skills", "")),
    clean_experience(extracted_sections.get("experience", "")),
    clean_projects(extracted_sections.get("projects", "")),
)


In [47]:
# Skills are shown as a single list.
print("🔧 Skills:\n", skills)

# One bullet line per experience entry.
print("\n💼 Experience:")
for exp in experience:
    print("-", exp)

# Each project is rendered as a title line followed by its details.
print("\n🚀 Projects:")
for proj in projects:
    print(
        f"- {proj['title']}",
        f"  Description: {proj['description']}",
        f"  Links: {proj['links']}",
        sep="\n",
    )


🔧 Skills:
 []

💼 Experience:

🚀 Projects:
