In [52]:
import spacy
import re
import fitz  # PyMuPDF for PDF text extraction
from docx import Document  # python-docx for DOCX handling
import requests
import json, pdfplumber
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from tempfile import NamedTemporaryFile

In [45]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [46]:
#Upload File link
resume_drive_link = "https://drive.google.com/file/d/11n3y9ApkUybZpjkM4Qs8r8xR2rRIejJS/view?usp=sharing"

In [5]:
resume_drive_link

'https://drive.google.com/file/d/11n3y9ApkUybZpjkM4Qs8r8xR2rRIejJS/view?usp=sharing'

In [6]:
file_id = resume_drive_link.split("/")[-2]
file_id

'11n3y9ApkUybZpjkM4Qs8r8xR2rRIejJS'

In [7]:
download_resume = f"https://drive.google.com/uc?export=download&id={file_id}"
download_resume

'https://drive.google.com/uc?export=download&id=11n3y9ApkUybZpjkM4Qs8r8xR2rRIejJS'

In [8]:
req_response = requests.get(download_resume)
if req_response.status_code != 200:
    print("Error downloading the file.")
else:
    resume_content = req_response.content

In [9]:
print(resume_content)

b'%PDF-1.7\r\n%\xb5\xb5\xb5\xb5\r\n1 0 obj\r\n<</Type/Catalog/Pages 2 0 R/Lang(en) /StructTreeRoot 46 0 R/MarkInfo<</Marked true>>/Metadata 281 0 R/ViewerPreferences 282 0 R>>\r\nendobj\r\n2 0 obj\r\n<</Type/Pages/Count 1/Kids[ 3 0 R] >>\r\nendobj\r\n3 0 obj\r\n<</Type/Page/Parent 2 0 R/Resources<</Font<</F1 5 0 R/F2 9 0 R/F3 11 0 R/F4 20 0 R/F5 25 0 R/F6 27 0 R/F7 32 0 R/F8 34 0 R>>/ExtGState<</GS7 7 0 R/GS8 8 0 R>>/XObject<</Image40 40 0 R/Image44 44 0 R>>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI] >>/Annots[ 16 0 R 17 0 R 18 0 R 19 0 R 39 0 R 41 0 R 42 0 R 43 0 R] /MediaBox[ 0 0 612 792] /Contents 4 0 R/Group<</Type/Group/S/Transparency/CS/DeviceRGB>>/Tabs/S/StructParents 0>>\r\nendobj\r\n4 0 obj\r\n<</Filter/FlateDecode/Length 10281>>\r\nstream\r\nx\x9c\xcd][s\xdb\xc6\x92~w\x95\xff\x03^\xb6\x8a<k\xc1\x98+\x80\xd4)\xd7\xca\x92\xed8\xb1\x15\xdbr\xe2\xad:\xd9\x07\x9a\x82%\xc4\x14\xa5\x90\x94}\x9c_\xbf\xd3=\x83\x0b93$\x0c\x0c\x94\x93\xaa\xc8$\x08\xe0\x9bkw\xcf7\xdd=\xd1\xe3\xf3\xdb\xd92\x

In [53]:
# Save the PDF content to a temporary file
with NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
    temp_file.write(resume_content)
    temp_pdf_path = temp_file.name

# Open the PDF using pdfplumber
with pdfplumber.open(temp_pdf_path) as pdf:
    resume_text = ""
    for page in pdf.pages:
        resume_text += page.extract_text()


In [54]:
resume_gen = nlp(resume_text)
resume_gen

Rahul Bhowmick
Aspiring Data Analyst /Scientist
Kolkata, West Bengal, India ⋄ Ph. +91 97483 22716
Email: rahulbhowmick2002@gmail.com
LinkedIn: Rahul Bhowmick ⋄ GitHub: 0205Rahul
PROFILE
I have a keen interest in working in the field of Data Science/Data Analytics. I am enthusiastic and dedicated to developing strong
analytical skills, having worked on real-time projects. I have the ability to work well in a team and am always ready to upskill myself.
EDUCATION
• B. Tech (Computer Science and Engineering), KIIT University, Bhubaneswar – CGPA: 9.74 2020-Present
• Senior Secondary (12th CBSE), South Point High School, Kolkata – Percentage: 88.80% 2020
• Secondary (10th CBSE), South Point High School, Kolkata – Percentage: 84.80% 2018
TRAINING EXPERIENCE
• Training, Metvy, Data Analytics Training, Remote. 2023
o Gained insights, and skills: Acquired knowledge of working with data using MS Excel, Power BI, Python, and SQL.
o Accomplishments: Built multiple real-data projects, and awarded si

In [55]:
# Regular expression patterns for extracting different sections
name_pattern = r"^([A-Za-z\s]+)\s*$"
email_pattern = r"Email:\s*(\S+)\s*"
phone_pattern = r"Ph.\s*([\d\s]+)"
address_pattern = r"([^⋄]+),"
profile_pattern = r"PROFILE\s*([\s\S]+?)\s*EDUCATION"
education_pattern = r"•\s*(.*?)\s*–\s*(.*?)\s*–\s*(.*?)\n"
experience_pattern = r"•\s*(.*?)\s*(\d{4}\s*–\s*.*?)\n"
skills_set_pattern = r"SKILLS\s+•\s*(.*?)\s+"

In [56]:
parsed_resume = {
    "NAME": None,
    "EMAIL-ID": None,
    "PHONE": None,
    "ADDRESS": None,
    "EDUCATION": [],
    "EXPERIENCE": [],
    "PROJECTS": None,
    "CERTIFICATIONS": None,
    "TECHNICAL SKILLS": None,
    "SKILLS SET": None,
}

In [58]:
# Extract information using NER
for ent in resume_gen.ents:
    if ent.label_ == "PERSON":
        parsed_resume["NAME"] = ent.text
    elif ent.label_ == "EMAIL":
        parsed_resume["EMAIL-ID"] = ent.text
    elif ent.label_ == "PHONE":
        parsed_resume["PHONE"] = ent.text
    elif ent.label_ == "GPE":
        parsed_resume["ADDRESS"] = ent.text

In [63]:
# Define training data for section classification
section_text = [
    "EDUCATION", "EXPERIENCE", "PROJECTS", "CERTIFICATIONS",
    "TECHNICAL SKILLS", "SKILLS SET"
]
section_labels = ["EDUCATION"] * len(parsed_resume["EDUCATION"]) + \
                 ["EXPERIENCE"] * len(parsed_resume["EXPERIENCE"]) + \
                 ["PROJECTS"] * len(parsed_resume["PROJECTS"]) + \
                 ["CERTIFICATIONS"] * len(parsed_resume["CERTIFICATIONS"]) + \
                 ["TECHNICAL SKILLS"] * len(parsed_resume["TECHNICAL SKILLS"]) + \
                 ["SKILLS SET"] * len(parsed_resume["SKILLS SET"])

TypeError: object of type 'NoneType' has no len()

In [67]:
parsed_resume

{'NAME': 'Jupyter NB',
 'EMAIL-ID': None,
 'PHONE': None,
 'ADDRESS': 'Metvy',
 'PROFILE': None,
 'EDUCATION': [],
 'EXPERIENCE': [],
 'PROJECTS': None,
 'CERTIFICATIONS': None,
 'TECHNICAL SKILLS': None,
 'SKILLS SET': None}

In [60]:
# Build a machine learning pipeline for section classification
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

In [61]:
# Fit the pipeline on the training data
pipeline.fit(section_text, section_labels)

NameError: name 'section_labels' is not defined

In [None]:

# ... (Rest of the section_labels creation)


# Predict sections for each piece of text
predicted_sections = pipeline.predict(section_text)

# Assign predicted sections to the parsed_resume dictionary
for section, predicted_section in zip(section_text, predicted_sections):
    parsed_resume[section] = predicted_section

# Print the parsed resume data
print(json.dumps(parsed_resume, indent=4))