In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Load a pre-trained BERT model

In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

### Resume parsing Functions

In [3]:
%pip install pdfplumber docx2txt

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import pdfplumber
import docx2txt

In [15]:
# Extract the text from .pdf files
def extract_text_from_pdf(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# Extract the text from .doc or .docx files
def extract_text_from_docx(file_path):
    return docx2txt.process(file_path)

# Function to extract the file and return the text
def extract_resume_text(file_path):
    ext = file_path.split('.')[-1].lower()
    if ext == 'pdf':
        return extract_text_from_pdf(file_path)
    elif ext in ['docx', 'doc']:
        return extract_text_from_docx(file_path)
    else :
        raise ValueError(f"Unidentified file type ({ext})")

### Sample resume and job description

In [16]:
# sample job description
job_description = """
We are seeking a self-driven and passionate Junior Machine Learning Engineer to join our team 
and contribute to the development of intelligent systems. This role is ideal for a student or 
early-career professional with a strong foundation in programming and a growing skill set in AI/ML, 
especially in Deep Learning, Reinforcement Learning, and Computer Vision.
"""

In [17]:
# Load all resumes from a directory
resume_folder = 'resumes/'
resume_texts = []
resume_files = []

# go through the files, extract the text and append it to the list resume_texts
for filename in os.listdir(resume_folder):
    file_path = os.path.join(resume_folder, filename)
    try:
        text = extract_resume_text(file_path)
        print(f"{filename} - Length: {len(text.strip())} characters")

        if text.strip():  # Only add non-empty text
            resume_texts.append(text)
            resume_files.append(filename)
        else:
            print(f"{filename} is empty or unreadable.")
    except Exception as e:
        print(f"Failed to process {filename}: {e}")

Failed to process .ipynb_checkpoints: Unidentified file type (ipynb_checkpoints)
Document 21.pdf - Length: 2957 characters
Document 22.pdf - Length: 1737 characters
Document 23.pdf - Length: 1577 characters
sample_resume.pdf - Length: 2160 characters


### Encode Resume and Job description into BERT embeddings

In [18]:
# Encoding the texts into BERT embeddings
resume_embeddings = model.encode(resume_texts)
jd_embeddings = model.encode([job_description])

### Calculate cosine similarities

This is to calculate the similarities between the job description and resume

In [11]:
similarities = cosine_similarity(jd_embeddings, resume_embeddings)[0]

### Rank Resumes

In [12]:
ranked_indices = np.argsort(similarities)[::-1]

In [14]:
print("Top Resume Matches:\n")
for idx in ranked_indices:
    print(f"🔹 File: {resume_files[idx]}")
    print(f"   Score: {similarities[idx]:.4f}")
    print("-" * 80)

Top Resume Matches:

🔹 File: sample_resume.pdf
   Score: 0.4673
--------------------------------------------------------------------------------
🔹 File: Document 21.pdf
   Score: 0.2689
--------------------------------------------------------------------------------
🔹 File: Document 22.pdf
   Score: 0.2413
--------------------------------------------------------------------------------
🔹 File: Document 23.pdf
   Score: 0.1069
--------------------------------------------------------------------------------
