# Step 1: Setting up the Environment

Note:- First,ensure you have the necessary libraries installed. You can install them using pip:

In [1]:
!pip install spacy pandas sklearn
!python -m spacy download en_core_web_sm


Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[

In [2]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.7 PyMuPDFb-1.24.6


In [3]:
import spacy
import pandas as pd
import sklearn

print("spaCy version:", spacy.__version__)
print("pandas version:", pd.__version__)
print("scikit-learn version:", sklearn.__version__)


spaCy version: 3.7.5
pandas version: 2.0.3
scikit-learn version: 1.2.2


In [4]:
!pip install chardet



In [5]:
!pip install pdfminer.six


Collecting pdfminer.six
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20231228


# Step 2: Parsing Resumes

We'll use 'spaCy' to extract key information from resumes.

2.1 Import Libraries

In [6]:
import re
from pdfminer.high_level import extract_text
import spacy
from spacy.matcher import Matcher
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load spacy model
nlp = spacy.load('en_core_web_sm')

2.2 Define Extraction Functions

Create functions to extract the name, contact no, email, education and skills.

In [7]:
def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)

def extract_contact_number_from_resume(text):
    contact_number = None
    pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
    match = re.search(pattern, text)
    if match:
        contact_number = match.group()
    return contact_number

def extract_email_from_resume(text):
    email = None
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    match = re.search(pattern, text)
    if match:
        email = match.group()
    return email

def extract_skills_from_resume(text, skills_list):
    skills = []
    for skill in skills_list:
        pattern = r"\b{}\b".format(re.escape(skill))
        if re.search(pattern, text, re.IGNORECASE):
            skills.append(skill)
    return skills

def extract_education_from_resume(text):
    education = []
    pattern = r"(?i)(?:Bsc|\bB\.\w+|\bM\.\w+|\bPh\.D\.\w+|\bBachelor(?:'s)?|\bMaster(?:'s)?|\bPh\.D)\s(?:\w+\s)*\w+"
    matches = re.findall(pattern, text)
    for match in matches:
        education.append(match.strip())
    return education

def extract_name(resume_text):
    matcher = Matcher(nlp.vocab)
    patterns = [
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}],
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}],
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]
    ]
    for pattern in patterns:
        matcher.add('NAME', patterns=[pattern])
    doc = nlp(resume_text)
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        return span.text
    return None

def preprocess_text(text):
    text = text.lower()  #lowercase conversion
    text = re.sub(r'\s+', ' ', text)  # whitespace removal
    return text.strip()


# Step 3: Matching Candidates with Job Descriptions

Develop an algorithm to match resumes with job descriptions.

3.1 Define Matching Functions

Create functions to calculate the match score between a resume and a job description.

In [8]:
def calculate_similarity(text1, text2):
    vectorizer = CountVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    cosine_sim = cosine_similarity(vectors)
    return cosine_sim[0][1]

def match_resume_with_job(resume_text, job_description):
    resume_skills = extract_skills_from_resume(resume_text, job_description['skills'])
    resume_education = extract_education_from_resume(resume_text)
    candidate_name = extract_name(resume_text) #candidate name is being extracted

    # Combining resume skills and education into a single string for matching
    resume_combined = ' '.join(resume_skills) + ' ' + ' '.join(resume_education)
    job_combined = ' '.join(job_description['skills']) + ' ' + ' '.join(job_description['education'])

    # Preprocess text
    resume_combined = preprocess_text(resume_combined)
    job_combined = preprocess_text(job_combined)

    # Calculate similarity
    similarity_score = calculate_similarity(resume_combined, job_combined)

    return {
        'name': candidate_name,
        'skills': resume_skills,
        'education': resume_education,
        'similarity_score': similarity_score
    }

3.2 Sample Job Description and Match Resumes with Job Description

Define a sample job description to test the matching function.

In [11]:
if __name__ == '__main__':
    # Sample job description and resumes
    job_description = {
        'title': 'Data Scientist',
        'skills': ['Python', 'Machine Learning', 'Data Visualization', 'Leadership'],
        'education': ['Bachelor', 'Master']
    }

    resume_paths = [
        r"/content/Resume 1.pdf",
        r"/content/Resume 2.pdf",
        r"/content/Resume 3.pdf",
        r"/content/Untitled-resume 3.pdf"
        r"/content/Anisha Dsouza - Resume.pdf"
    ]
    # Match resumes with job description and find the best match
    best_match = None
    best_score = -1
    for resume_path in resume_paths:
        try:
            resume_text = extract_text_from_pdf(resume_path)
            match_info = match_resume_with_job(resume_text, job_description)
            score = match_info['similarity_score']
            # Update best match if current score is higher
            if score > best_score:
                best_score = score
                best_match = {
                    'resume_path': resume_path,
                    'name': match_info['name'],
                    'skills': match_info['skills'],
                    'education': match_info['education'],
                    'score': best_score
                }
        except Exception as e:
            print(f"Error processing {resume_path}: {e}")
    # Output detailed information of the best matching resume
    if best_match:
        print(f"Best Matching Resume: {best_match['resume_path']}")
        print(f"Candidate Name: {best_match['name']}")
        print(f"Match Score: {best_match['score']:.2f}")
        print(f"Skills: {', '.join(best_match['skills'])}")
        print(f"Education: {', '.join(best_match['education'])}")
    else:
        print("No valid resumes found or all failed to match the job description.")

Error processing /content/Untitled-resume 3.pdf/content/Anisha Dsouza - Resume.pdf: [Errno 20] Not a directory: '/content/Untitled-resume 3.pdf/content/Anisha Dsouza - Resume.pdf'
Best Matching Resume: /content/Resume 3.pdf
Candidate Name: Rachelle Beaudry
Match Score: 0.27
Skills: Leadership
Education: Master of Business Administration, Bachelor of Science in Accounting
City College
Aug 2015
