First approach: text extraction and tokenization, as described in the task description

In [23]:
!pip install PyPDF2



In [24]:

!pip install fpdf



In [25]:
!pip install pdfplumber



In [26]:
!pip install transformers



In [27]:
!pip install transformers



In [28]:
import os

import numpy as np
import pandas as pd
import pdfplumber
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizer, DistilBertModel

In [29]:
# Path to the PDF file that holds the job descriptions; its text is extracted and
# embedded as the reference that the resumes are compared against.
# NOTE(review): both paths below are absolute and look Colab-specific ('/content',
# a mounted Drive folder) — confirm / adjust before running in another environment.
job_descriptions_pdf_path = '/content/job_descriptions.pdf'

# Folder that is scanned for candidates' resumes in PDF format
pdf_folder = '/content/drive/MyDrive/capital task/data/PUBLIC-RELATIONS'



In [30]:
# Define a function to extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        str: The text of all pages that yielded any text, joined with newlines.
        An empty string is returned when no page has extractable text.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # extract_text() may give None (or '') for a page without a text layer,
            # e.g. a scanned image; concatenating None to a str would raise TypeError.
            if page_text:
                page_texts.append(page_text)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return '\n'.join(page_texts)

In [31]:
# Load the pretrained DistilBERT tokenizer and encoder from one shared checkpoint name
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = DistilBertModel.from_pretrained(PRETRAINED_CHECKPOINT)

In [32]:
# Pull the raw text out of the job descriptions PDF
job_descriptions_text = extract_text_from_pdf(pdf_path=job_descriptions_pdf_path)

In [33]:
# Tokenize the job-description text and embed it with DistilBERT.
# NOTE: truncation=True cuts the input at the model's maximum input length, so for a
# long PDF only the beginning of the text contributes to the embedding.
inputs = tokenizer(job_descriptions_text, return_tensors="pt", padding=True, truncation=True)

# Inference only: disable autograd so no computation graph is built for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Mean-pool the token vectors into one embedding vector and drop the batch dimension
job_description_embeddings = outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()

In [34]:
# Containers for the resume loop: one embedding vector and one filename per CV,
# kept in matching order
cv_embeddings, cv_filenames = [], []

In [35]:
# Embed every candidate resume (PDF) found in pdf_folder.
# The result lists are (re)initialised here so that re-running this cell does not
# append duplicate entries to lists left over from a previous run.
cv_embeddings = []
cv_filenames = []

# sorted() makes the processing order deterministic (os.listdir order is arbitrary)
for filename in sorted(os.listdir(pdf_folder)):
    # Case-insensitive match so files named '*.PDF' are not silently ignored
    if not filename.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(pdf_folder, filename)
    text = extract_text_from_pdf(pdf_path)

    # A resume without extractable text (e.g. scanned pages) would only yield an
    # embedding of the special tokens, which is meaningless for ranking.
    if not text or not text.strip():
        print(f"Skipping {filename}: no extractable text")
        continue

    # Tokenize the resume; truncation=True cuts it at the model's maximum input length
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Inference only: disable autograd so no computation graph is built per resume
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean-pool the token vectors into one embedding vector per resume
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()

    cv_embeddings.append(embeddings)
    cv_filenames.append(filename)  # Store the filename for later reference

In [36]:

# Cosine similarity between every resume embedding (rows) and the job-description
# embedding (single column); the result has one row per CV.
cv_matrix = np.asarray(cv_embeddings)
job_matrix = np.atleast_2d(job_description_embeddings)
similarity_scores = cosine_similarity(cv_matrix, job_matrix)

In [37]:
# For each job description (one column of similarity_scores), print the five CVs
# with the highest similarity, best match first.
for job_number, scores_for_job in enumerate(similarity_scores.T, start=1):
    print(f"Top 5 CVs for Job Description {job_number}:")
    # Negating the scores turns the ascending argsort into a descending ranking
    best_first = np.argsort(-scores_for_job)[:5]
    for position in best_first:
        print(f"Candidate: {cv_filenames[position]}, Similarity Score: {scores_for_job[position]}")

Top 5 CVs for Job Description 1:
Candidate: 11902276.pdf, Similarity Score: 0.9530705213546753
Candidate: 11160414.pdf, Similarity Score: 0.9324125051498413
Candidate: 12237267.pdf, Similarity Score: 0.9296835660934448
Candidate: 11850315.pdf, Similarity Score: 0.9280699491500854
Candidate: 12191094.pdf, Similarity Score: 0.9244903922080994
