In [1]:
!pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.7-cp312-none-win_amd64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.6 (from pymupdf)
  Downloading PyMuPDFb-1.24.6-py3-none-win_amd64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.7-cp312-none-win_amd64.whl (3.2 MB)
   ---------------------------------------- 0.0/3.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.2 MB ? eta -:--:--
    --------------------------------------- 0.0/3.2 MB 667.8 kB/s eta 0:00:05
   --- ------------------------------------ 0.3/3.2 MB 2.8 MB/s eta 0:00:02
   --------- ------------------------------ 0.7/3.2 MB 5.7 MB/s eta 0:00:01
   --------------- ------------------------ 1.3/3.2 MB 7.3 MB/s eta 0:00:01
   --------------------- ------------------ 1.8/3.2 MB 8.0 MB/s eta 0:00:01
   --------------------------- ------------ 2.2/3.2 MB 8.3 MB/s eta 0:00:01
   ------------------------------------- -- 3.0/3.2 MB 9.1 MB/s eta 0:00:01
   ---------------------------------------- 3.2/3.2 MB 8.9 MB/


[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import fitz  # PyMuPDF
import json

def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    return text

def parse_resume(text):
    lines = text.split('\n')
    
    resume_data = {
        "name": "",
        "contact_information": {
            "email": "",
            "phone": "",
            "location": ""
        },
        "summary": "",
        "education": [],
        "experience": [],
        "projects": [],
        "skills": [],
        "languages": [],
        "links": []
    }
    
    current_section = ""
    for i, line in enumerate(lines):
        line = line.strip()
        if "@" in line:
            resume_data["contact_information"]["email"] = line
        elif any(char.isdigit() for char in line) and len(line) > 9 and len(line) < 15:
            resume_data["contact_information"]["phone"] = line
        elif line.lower().startswith("chennai"):
            resume_data["contact_information"]["location"] = line
        elif "Summary" in line:
            current_section = "summary"
            continue
        elif "Education" in line:
            current_section = "education"
            continue
        elif "Experience" in line:
            current_section = "experience"
            continue
        elif "Projects" in line:
            current_section = "projects"
            continue
        elif "Skills" in line:
            current_section = "skills"
            continue
        elif "Languages" in line:
            current_section = "languages"
            continue
        elif "Links" in line:
            current_section = "links"
            continue
        
        if current_section == "summary":
            resume_data["summary"] += line + " "
        elif current_section == "education":
            if "B.Tech" in line or "12" in line or "10" in line:
                education_entry = {
                    "institution": lines[i-1].strip() if i-1 >= 0 else "",
                    "degree": "",
                    "field_of_study": "",
                    "location": lines[i+1].strip() if i+1 < len(lines) else "",
                    "dates": "",
                    "gpa": ""
                }
                parts = line.split("·")
                if len(parts) > 0:
                    education_entry["degree"] = parts[0].strip()
                if len(parts) > 1:
                    education_entry["field_of_study"] = parts[1].strip()
                if len(parts) > 2:
                    education_entry["location"] = parts[2].strip()
                if len(parts) > 3:
                    education_entry["gpa"] = parts[3].strip()
                resume_data["education"].append(education_entry)
        elif current_section == "experience":
            if "Intern" in line:
                experience_entry = {
                    "company": lines[i-1].strip() if i-1 >= 0 else "",
                    "role": "",
                    "location": "",
                    "dates": "",
                    "responsibilities": []
                }
                parts = line.split("·")
                if len(parts) > 0:
                    experience_entry["role"] = parts[0].strip()
                if len(parts) > 1:
                    experience_entry["location"] = parts[1].strip()
                if len(parts) > 2:
                    experience_entry["dates"] = parts[2].strip()
                for j in range(1, 4):
                    if i+j < len(lines):
                        experience_entry["responsibilities"].append(lines[i+j].strip())
                resume_data["experience"].append(experience_entry)
        elif current_section == "projects":
            if "|" in line:
                project_entry = {
                    "name": line.split("|")[0].strip(),
                    "dates": line.split("|")[1].strip() if len(line.split("|")) > 1 else "",
                    "description": " ".join(lines[i+1:i+5]).strip(),
                    "link": lines[i+5].strip() if i+5 < len(lines) and "http" in lines[i+5] else None
                }
                resume_data["projects"].append(project_entry)
        elif current_section == "skills":
            resume_data["skills"] = line.split(", ")
        elif current_section == "languages":
            if "[" in line:
                language_entry = {
                    "language": line.split("[")[0].strip(),
                    "proficiency": line.split("[")[1].replace("]", "").strip()
                }
                resume_data["languages"].append(language_entry)
        elif current_section == "links":
            if "GitHub" in line or "CodeChef" in line:
                link_entry = {
                    "type": line.split(":")[0].strip(),
                    "url": line.split(":")[1].strip() if len(line.split(":")) > 1 else ""
                }
                resume_data["links"].append(link_entry)

    return resume_data

def main():
    pdf_path = 'flash 3.pdf'  # Update this path to the location of your PDF file
    text = extract_text_from_pdf(pdf_path)
    resume_data = parse_resume(text)
    json_output = json.dumps(resume_data, indent=4)
    print(json_output)

if __name__ == "__main__":
    main()


{
    "name": "",
    "contact_information": {
        "email": "ak8957@srmist.edu.in",
        "phone": "May 2024 -",
        "location": "Chennai,India"
    },
    "summary": "Innovative Computer Science and Engineering student from SRM KTR, with a strong foundation in Data Science, Deep Learning, Generative AI, and Full Stack Development. My expertise in Machine Learning fuels my pursuit of creating cutting edge technological solutions. Looking for a job to gain more skills and experience in the tech world ",
    "education": [
        {
            "institution": "Sep 2021 - Present",
            "degree": "B.Tech",
            "field_of_study": "Computer Science and Engineering",
            "location": "Chennai",
            "dates": "",
            "gpa": ""
        },
        {
            "institution": "Mar 2020 - Feb 2021",
            "degree": "12",
            "field_of_study": "CBSE",
            "location": "Bilaspur",
            "dates": "",
            "gpa": ""
    