In [1]:
!pip install pdfminer.six #If you need to work with PDF documents in your Python script or projects,

Collecting pdfminer.six
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20231228


In [3]:
import re  # Import the regular expression module
from pdfminer.high_level import extract_text  # Import text extraction function from pdfminer library
import spacy  # Import spaCy library for natural language processing
from spacy.matcher import Matcher  # Import Matcher class from spaCy's matcher module

def extract_text_from_pdf(pdf_path):
    """Function to extract text from a PDF file."""
    return extract_text(pdf_path)

def extract_contact_number_from_resume(text):
    """Function to extract contact number from a text using regex."""
    contact_number = None

    # Regular expression pattern to find a potential contact number
    pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
    match = re.search(pattern, text)
    if match:
        contact_number = match.group()

    return contact_number

def extract_email_from_resume(text):
    """Function to extract email address from a text using regex."""
    email = None

    # Regular expression pattern to find a potential email address
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    match = re.search(pattern, text)
    if match:
        email = match.group()

    return email

def extract_skills_from_resume(text, skills_list):
    """Function to extract skills from a text using a list of predefined skills."""
    skills = []

    for skill in skills_list:
        pattern = r"\b{}\b".format(re.escape(skill))  # Create a regex pattern for each skill in the skills list
        match = re.search(pattern, text, re.IGNORECASE)  # Perform case-insensitive search
        if match:
            skills.append(skill)  # Add the skill to the list if found

    return skills

def extract_education_from_resume(text):
    """Function to extract education information from a text using regex."""
    education = []

    # Regular expression pattern to find education information
    pattern = r"(?i)(?:Bsc|\bB\.\w+|\bM\.\w+|\bPh\.D\.\w+|\bBachelor(?:'s)?|\bMaster(?:'s)?|\bPh\.D)\s(?:\w+\s)*\w+"
    matches = re.findall(pattern, text)  # Find all matches in the text based on the pattern
    for match in matches:
        education.append(match.strip())  # Strip whitespace and append to education list

    return education

def extract_name(resume_text):
    """Function to extract name from resume text using spaCy's Matcher."""
    nlp = spacy.load('en_core_web_sm')  # Load English language model
    matcher = Matcher(nlp.vocab)  # Initialize Matcher with spaCy's vocabulary

    # Define name patterns for Matcher
    patterns = [
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name, Middle name, and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]  # First name, Middle name, Middle name, and Last name
        # Add more patterns as needed
    ]

    for pattern in patterns:
        matcher.add('NAME', patterns=[pattern])  # Add each pattern to the Matcher

    doc = nlp(resume_text)  # Process the resume text with spaCy
    matches = matcher(doc)  # Apply the Matcher to find name patterns

    for match_id, start, end in matches:
        span = doc[start:end]  # Get the matched span of text
        return span.text  # Return the matched text (name)

    return None  # Return None if no name is found

if __name__ == '__main__':
    resume_paths = [r"/content/Untitled-resume.pdf"]  # List of resume file paths to process

    for resume_path in resume_paths:
        text = extract_text_from_pdf(resume_path)  # Extract text from the PDF resume

        print("Resume:", resume_path)  # Print the resume file path

        name = extract_name(text)  # Extract name from the resume text
        if name:
            print("Name:", name)  # Print the extracted name
        else:
            print("Name not found")  # Print if name extraction failed

        contact_number = extract_contact_number_from_resume(text)  # Extract contact number from the resume text
        if contact_number:
            print("Contact Number:", contact_number)  # Print the extracted contact number
        else:
            print("Contact Number not found")  # Print if contact number extraction failed

        email = extract_email_from_resume(text)  # Extract email address from the resume text
        if email:
            print("Email:", email)  # Print the extracted email address
        else:
            print("Email not found")  # Print if email extraction failed

        skills_list = ['Python', 'Data Analysis', 'Machine Learning', 'Communication', 'Project Management', 'Deep Learning', 'SQL', 'Tableau']
        extracted_skills = extract_skills_from_resume(text, skills_list)  # Extract skills from the resume text
        if extracted_skills:
            print("Skills:", extracted_skills)  # Print the extracted skills
        else:
            print("No skills found")  # Print if no skills were found

        extracted_education = extract_education_from_resume(text)  # Extract education information from the resume text
        if extracted_education:
            print("Education:", extracted_education)  # Print the extracted education information
        else:
            print("No education information found")  # Print if no education information was found

        print()  # Print an empty line for better readability between resumes


Resume: /content/Untitled-resume.pdf
Name: Sanket Sarwade
Contact Number: 7798248452
Email: sanketsarwade111@gmail.com
Skills: ['Python', 'Data Analysis', 'Machine Learning', 'Communication', 'Deep Learning', 'SQL', 'Tableau']
Education: ['Bsc Microbiology']



#  **Code when multiple  resumes are  to be parsed.**



1.   PDF Text Extraction:



The code uses pdfminer to extract text from PDFs, allowing for the extraction of contact details, email addresses, skills, education, and names from resume files.
2.   Regex Patterns:

Regular expressions are employed to identify and extract structured data such as contact numbers, email addresses, and educational qualifications from the extracted text.

3.  NLP for Name Extraction:

The spaCy library is used with the Matcher class to detect names based on part-of-speech (POS) patterns. It leverages NLP techniques to match patterns of proper nouns indicating names.  

4.  Handling Multiple Resumes:

Google Colab's file upload widget is utilized for uploading a zip file containing multiple resumes. The code extracts all PDFs from the zip file and processes each resume individually, extracting and printing relevant information using a predefined list of skills and NLP techniques.

In [None]:
# Importing necessary libraries
import re  # For regular expressions
import os  # For interacting with the operating system
import zipfile  # For handling ZIP files
from pdfminer.high_level import extract_text  # For extracting text from PDFs
import spacy  # For natural language processing
from spacy.matcher import Matcher  # For pattern matching in text
from google.colab import files  # For handling file uploads in Google Colab

def load_skills_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        skills = [line.strip() for line in file if line.strip()]
    return skills

# Example usage:
skills_list = load_skills_from_file('/content/all_skills.txt')
print("Total skills loaded:", len(skills_list))
print("Example skills:", skills_list[:10])  # Print first 10 skills as an example

def load_skills_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        skills = [line.strip() for line in file if line.strip()]
    return skills

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)  # Use pdfminer to extract text from the PDF

# Function to extract contact number from resume text
def extract_contact_number_from_resume(text):
    contact_number = None  # Initialize contact number as None

    # Regex pattern to find contact numbers
    pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
    match = re.search(pattern, text)  # Search for the pattern in the text
    if match:
        contact_number = match.group()  # If found, extract the contact number

    return contact_number  # Return the contact number

# Function to extract email address from resume text
def extract_email_from_resume(text):
    email = None  # Initialize email as None

    # Regex pattern to find email addresses
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    match = re.search(pattern, text)  # Search for the pattern in the text
    if match:
        email = match.group()  # If found, extract the email address

    return email  # Return the email address

# Function to extract skills from resume text based on a provided skills list
def extract_skills_from_resume(text, skills_list):
    skills = []

    for skill in skills_list:
        pattern = r"\b{}\b".format(re.escape(skill))
        if re.search(pattern, text, re.IGNORECASE):
            skills.append(skill)

    return skills

# Function to extract education information from resume text
def extract_education_from_resume(text):
    education = []

    # Use regex pattern to find education information
    pattern = r"(?i)(?:BSc|B\.\w+|M\.\w+|Ph\.D\.?\w*|Bachelor(?:'s)?|Master(?:'s)?|Ph\.D)\s(?:\w+\s)*\w+"
    matches = re.findall(pattern, text)
    for match in matches:
        education.append(match.strip())

    return education

# Function to extract the name from resume text
def extract_name(resume_text):
    nlp = spacy.load('en_core_web_sm')  # Load the small English language model from spacy
    matcher = Matcher(nlp.vocab)  # Initialize the Matcher with the vocabulary of the model

    # Define patterns to match names (sequences of proper nouns)
    patterns = [
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name, Middle name, and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]  # First name, Middle name, Middle name, and Last name
    ]

    for pattern in patterns:
        matcher.add('NAME', patterns=[pattern])  # Add each pattern to the matcher

    doc = nlp(resume_text)  # Process the resume text with the NLP model
    matches = matcher(doc)  # Use the matcher to find matches in the document

    for match_id, start, end in matches:
        span = doc[start:end]  # Get the span of the matched text
        return span.text  # Return the first matched name

    return None  # Return None if no matches are found

# Function to process a resume and extract relevant information
def process_resume(resume_path, skills_list):
    text = extract_text_from_pdf(resume_path)  # Extract text from the PDF file

    # Create a dictionary with the extracted information
    resume_data = {
        'Resume Path': resume_path,
        'Name': extract_name(text),
        'Contact Number': extract_contact_number_from_resume(text),
        'Email': extract_email_from_resume(text),
        'Skills': extract_skills_from_resume(text, skills_list),
        'Education': extract_education_from_resume(text)
    }

    return resume_data  # Return the dictionary with the resume data

# Main execution block
if __name__ == '__main__':
    import sys  # Import sys for system-specific parameters and functions
    import zipfile  # Import zipfile for handling ZIP files

    # Use Google Colab's file upload widget to upload files
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded. Exiting...")  # Print message if no file is uploaded
        sys.exit(1)  # Exit the program with status 1 (error)

    zip_path = list(uploaded.keys())[0]  # Get the path of the first uploaded file

    extract_dir = "extracted_resumes"  # Directory to extract the resumes

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)  # Extract all files in the ZIP archive to the directory

    # List of skills to search for in the resumes
    skills_list = ['Python', 'Data Analysis', 'Machine Learning', 'Communication', 'Project Management', 'Deep Learning', 'SQL', 'Tableau']
    all_resumes_data = []  # Initialize an empty list to store data from all resumes

    for root, _, files in os.walk(extract_dir):
        for file in files:
            if file.endswith(".pdf"):
                resume_path = os.path.join(root, file)  # Get the full path of the PDF file
                resume_data = process_resume(resume_path, skills_list)  # Process the resume
                all_resumes_data.append(resume_data)  # Add the extracted data to the list

    # Print the extracted information from each resume
    for resume_data in all_resumes_data:
        print("Resume:", resume_data['Resume Path'])
        print("Name:", resume_data['Name'] or "Name not found")
        print("Contact Number:", resume_data['Contact Number'] or "Contact Number not found")
        print("Email:", resume_data['Email'] or "Email not found")
        print("Skills:", resume_data['Skills'] or "No skills found")
        print("Education:", resume_data['Education'] or "No education information found")
        print()


Total skills loaded: 37340
Example skills: ["'05", "'06", "'08", "'09", "'11", '(ISC)2', '.10', '.17', '.NET', '.NET CLR']


Saving suyash.zip to suyash (4).zip
Resume: extracted_resumes/New folder (4)/New folder (4)/Shreyas-Anil-Chore.pdf
Name: Shreyas Anil
Contact Number: 9356634711
Email: shreyas.chore20@pccoepune.org
Skills: ['Python', 'Machine Learning', 'Communication']
Education: No education information found

Resume: extracted_resumes/New folder (4)/New folder (4)/GiramResume.pdf
Name: Sudarshan Giram
Contact Number: 91 7558373273
Email: giramsudarshan@gmail.com
Skills: ['Python']
Education: No education information found

Resume: extracted_resumes/New folder (4)/New folder (4)/Pranav Pedhekar Resume.pdf
Name: Pranav Pedhekar
Contact Number: 91 9156155835
Email: pranav.pedhekar20@pccoepune.org
Skills: ['Python']
Education: No education information found

Resume: extracted_resumes/New folder (4)/New folder (4)/TEJAS SANDEEP GAIKWAD RESUME.pdf
Name: SANDEEP GAIKWAD
Contact Number: 7498087850
Email: tgaikwad966@gmail.com
Skills: ['Python', 'Data Analysis', 'Machine Learning', 'Communication']
Education