In [None]:
from PyPDF2 import PdfReader
import json
import re

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        A single string made of each page's extracted text, with no
        separator inserted between pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        reader = PdfReader(pdf_file)
        # Extraction stays inside the ``with`` block, as before, so the
        # reader still has an open file handle while pages are processed.
        # ``or ''`` guards against a page whose extraction yields nothing
        # (None), which would otherwise break the concatenation.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    return ''.join(page_texts)

def parse_resume(resume_text):
    """Parse plain resume text into a JSON-serializable dictionary.

    All sections are located purely with regular expressions over the raw
    text. The education, experience, project and certification patterns are
    structural (consecutive lines, optionally followed by a date) rather than
    anchored to section headings, so the same lines may be reported in more
    than one section.

    Args:
        resume_text: The resume as a single string with newline-separated
            lines.

    Returns:
        A dict with the keys ``name``, ``contact_info`` (``email`` and
        ``phone``, each ``""`` when not found), ``education``,
        ``experience``, ``skills``, ``projects`` and ``certifications``.
        ``skills`` is an empty list when no "Skills" heading is present.
    """
    # Regex patterns for extracting specific information.
    email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    phone_pattern = r'(\+\d{1,2}\s?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    # Either a numeric date (1/2/2020, 01-02-20) or "Month 1st, 2020".
    date_pattern = r'(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\w+\s\d{1,2}(?:st|nd|rd|th)?,\s\d{4})'

    email = re.search(email_pattern, resume_text)
    phone = re.search(phone_pattern, resume_text)
    name = "Your Name"  # Placeholder: the name is not extracted from the text

    # Education: a degree line, a university line, then a date.
    education_pattern = (
        r'(?P<degree>[^\n]+)\n(?P<university>[^\n]+)\n(?P<graduation_date>'
        + date_pattern + r')'
    )
    education = [
        {
            "degree": match.group('degree').strip(),
            "university": match.group('university').strip(),
            "graduation_date": match.group('graduation_date').strip(),
        }
        for match in re.finditer(education_pattern, resume_text)
    ]

    # Experience: title, company, location, a "start - end" date line, then
    # responsibilities up to the next blank line. The two dates are captured
    # as separate groups instead of splitting the date line on '-', because
    # the dates themselves may contain hyphens (e.g. 01-02-2020).
    experience_pattern = (
        r'(?P<title>[^\n]+)\n(?P<company>[^\n]+)\n(?P<location>[^\n]+)\n'
        r'(?P<start_date>' + date_pattern + r')\s?-?\s?'
        r'(?P<end_date>' + date_pattern + r')\n'
        r'(?P<responsibilities>[\s\S]+?)(?=\n\n|\Z)'
    )
    experience = [
        {
            "title": match.group('title').strip(),
            "company": match.group('company').strip(),
            "location": match.group('location').strip(),
            "start_date": match.group('start_date').strip(),
            "end_date": match.group('end_date').strip(),
            "responsibilities": match.group('responsibilities').strip().split('\n'),
        }
        for match in re.finditer(experience_pattern, resume_text)
    ]

    # Skills: everything after the first "Skills" heading up to the next
    # blank line, given as a comma-separated list and/or one item per line
    # (optionally bulleted). A resume without the heading yields [].
    skills = []
    skills_match = re.search(
        r'Skills[:\s\n]*(?P<skills>[\s\S]+?)(?=\n\n|\Z)',
        resume_text,
        re.IGNORECASE,
    )
    if skills_match:
        # '-' and '*' count as bullets only when followed by whitespace so
        # that entries such as "*nix" are left intact.
        bullet_prefix = re.compile(r'^(?:[-*]\s+|[•·●▪]\s*)')
        for raw_item in re.split(r'[,\n]', skills_match.group('skills')):
            skill = bullet_prefix.sub('', raw_item.strip()).strip()
            if skill:
                skills.append(skill)

    # Projects: a title line followed by text up to the next blank line.
    projects = [
        {
            "title": match.group('title').strip(),
            "description": match.group('description').strip(),
        }
        for match in re.finditer(
            r'(?P<title>[^\n]+)\n(?P<description>[\s\S]+?)(?=\n\n|\Z)',
            resume_text,
        )
    ]

    # Certifications: a title line immediately followed by a date.
    certifications_pattern = r'(?P<title>[^\n]+)\n(?P<date>' + date_pattern + r')'
    certifications = [
        {
            "title": match.group('title').strip(),
            "date": match.group('date').strip(),
        }
        for match in re.finditer(certifications_pattern, resume_text)
    ]

    return {
        "name": name,
        "contact_info": {
            "email": email.group() if email else "",
            "phone": phone.group() if phone else "",
        },
        "education": education,
        "experience": experience,
        "skills": skills,
        "projects": projects,
        "certifications": certifications,
    }

def save_to_json(data, output_file):
    """Serialize *data* as JSON into the file at *output_file*.

    The file is opened in text write mode, so an existing file is
    overwritten. Output is indented by four spaces.

    Args:
        data: The object to serialize with ``json.dump``.
        output_file: Path of the file to write.
    """
    with open(output_file, mode='w') as handle:
        json.dump(data, fp=handle, indent=4)

# Script entry point: ask for a PDF path, parse the resume it contains, and
# write the parsed result next to the script as JSON.
if __name__ == "__main__":
    output_file = 'resume_data.json'

    # Surrounding whitespace is stripped from the typed path before use.
    pdf_path = input("Enter the path to your resume PDF file: ").strip()

    resume_text = extract_text_from_pdf(pdf_path)
    parsed_data = parse_resume(resume_text)
    save_to_json(parsed_data, output_file)

    print(f"Resume data parsed and saved to {output_file}")

In [2]:
pip install PyPDF2

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Users/dipeshkumar/anaconda3/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
