# Resume Matching Project

## Importing Libraries

In [121]:
import PyPDF2 as ppd
import re
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizer, DistilBertModel
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
from datasets import load_dataset
import pandas as pd
import spacy
import torch


## Processing the text

In [122]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  
    text = text.lower()  
    return text

## Extracting the text from pdf

In [123]:
def extract_text_from_pdf(pdf_file_path):
    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = ppd.PdfReader(pdf_file)
        text = ""
        for page_number in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_number]
            text += page.extract_text().lower()
            # text += preprocess_text(page.extract_text())
        return text

## Loading the sample dataset

In [124]:
resumes=[]
root_directory = 'Dataset/data/data'

folder_name = 'ENGINEERING'
folder_path = os.path.join(root_directory, folder_name)

if os.path.exists(folder_path) and os.path.isdir(folder_path):
    for pdf_file_name in os.listdir(folder_path):
        pdf_file_path = os.path.join(folder_path, pdf_file_name)
        if pdf_file_name.endswith(".pdf"):
            extracted_text = extract_text_from_pdf(pdf_file_path)
            resumes.append(extracted_text)

In [125]:
resumes

['engineering lab technician\ncareer focus\nmy main objective in seeking employment with triumph actuation systems inc. is to work in a professional atmosphere where i can utilize my\nskills and continue to gain experience in the aerospace industry to advance in my career.\nprofessional experience\nengineering lab technician\n \noct 2016\n \nto \ncurrent\n \ncompany name\n \nï¼\u200b \ncity\n \n, \nstate\nresponsible for testing various seat structures to meet specific certification requirements. â \nmaintain and calibrate test instruments to ensure testing capabilities are maintained.\nensure data is captured and recorded correctly for certification test reports.\nduties also dynamic test set-up and static suite testing. \nengineering lab technician, sr. specialist\n \napr 2012\n \nto \noct 2016\n \ncompany name\n \nï¼\u200b \ncity\n \n, \nstate\nutilized skills learned from labview course 1 training to construct and maintain labview vi programs.\nresponsible for fabricating and maint

### Extracted skills

In [126]:
nlp = spacy.load('en_core_web_sm')

def extract_skills(resume_text):
    nlp_text = nlp(resume_text)

    tokens = [token.text for token in nlp_text if not token.is_stop]
    
    data = pd.read_csv("skills.csv") 
    
    skills = list(data.columns.values)
    
    skillset = []
    
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)
    
    for chunk in nlp_text.noun_chunks:
        chunk_text = chunk.text.lower().strip()
        if chunk_text in skills:
            skillset.append(chunk_text)
    
    return [i.capitalize() for i in set([i.lower() for i in skillset])]


### Extracted job role

In [127]:
def extract_job_role(resume):
    job_role = ''
    for i in resume:
        if i == '\n':
            break
        job_role += i
    return job_role

### Extracted education details

In [128]:
def extract_education(text):
    education_pattern = r"(\b[\w\s]+[\.,]?\s+(?:University|College|School|Institute)[\w\s]*[\.,]?)\s+(\b[\w\s]+[\.,]?\s+(?:Degree|Diploma|Certificate)[\w\s]*[\.,]?)"

    education_matches = re.findall(education_pattern, text, re.IGNORECASE)

    education_details = [match for match in education_matches]

    return education_details


In [129]:
job_role = []
skills = []
education_details = []

for resume in resumes:
    job_role.append(extract_job_role(resume))
    skills.append(extract_skills(resume))
    education_details.append(extract_education(resume))

print(job_role)
print(job_role)
print(job_role)

['engineering lab technician', 'equipment engineering technician', 'engineering operations director', 'engineering services manager', 'mechanical engineering intern', 'senior engineering manager', 'software engineering manager', 'engineering and quality technician', 'engineering intern', 'qa engineering team lead', 'senior engineering program manager', 'engineering manager', 'regional engineering manager', 'engineering technician', 'director of engineering', 'engineering intern', 'engineering technician', 'engineering technician v', 'engineering assistant', 'self-sustaining engineering technician', 'electrical engineering lab technician', 'process engineering technician iii', 'engineering assistant', 'biomedical engineering technician ii', 'clinical engineering manager', 'multi-skilled engineering manager', 'mechanical engineering intern', 'engineering manager/quality manager', 'engineering associate', 'engineering project manager iii', 'engineering planning manager', 'industrial engin

## Loading job descriptions dataset

In [130]:
def embed_text(text):
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    outputs = model(**tokens)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy()

In [131]:
def calculate_similarity(job_description_embedding, cv_embedding):
    similarity_scores = cosine_similarity([job_description_embedding], cv_embedding)
    return similarity_scores

In [132]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

### Loadind job description dataset from hugging face

In [133]:
dataset = load_dataset("jacob-hugging-face/job-descriptions")
job_descriptions = dataset["train"]["job_description"][:10]

## Tokenizing the job description

In [134]:

tokenizer_data_directory = "tokenizer_data"

if not os.path.exists(tokenizer_data_directory):
    os.makedirs(tokenizer_data_directory)

tokenizer_job_desc = Tokenizer(models.BPE())
tokenizer_job_desc.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer_job_desc.decoder = decoders.ByteLevel()

training_files = []

for i, description in enumerate(job_descriptions):
    file_name = os.path.join(tokenizer_data_directory, f"job_description_{i}.txt")
    
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(description)
    
    training_files.append(file_name)

trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])
tokenizer_job_desc.train(training_files, trainer=trainer)

def tokenize_job_description(job_description):
    encoding = tokenizer_job_desc.encode(job_description)
    return encoding.tokens

In [135]:
sample_job_description = job_descriptions[0]
tokens = tokenize_job_description(sample_job_description)
print(tokens)

['Ġminimum', 'Ġqualifications', 'Ċ', 'b', 'ach', 'elors', 'Ġdegree', 'Ġor', 'Ġequivalent', 'Ġpractical', 'Ġexperience', 'Ġyears', 'Ġof', 'Ġexperience', 'Ġin', 'Ġsaas', 'Ġor', 'Ġproductivity', 'Ġtools', 'Ġbusinessexperience', 'Ġmanaging', 'Ġenterprise', 'Ġaccounts', 'Ġwith', 'Ġsales', 'Ġcycles', 'Ċ', 'pre', 'fer', 'red', 'Ġqualifications', 'Ċ', 'Ġyears', 'Ġof', 'Ġexperience', 'Ġbuilding', 'Ġstrategic', 'Ġbusiness', 'Ġpartnerships', 'Ġwith', 'Ġenterprise', 'Ġcustomersability', 'Ġto', 'Ġwork', 'Ġthrough', 'Ġand', 'Ġwith', 'Ġa', 'Ġreseller', 'Ġecosystem', 'Ġto', 'Ġscale', 'Ġthe', 'Ġbusinessability', 'Ġto', 'Ġplan', 'Ġpitch', 'Ġand', 'Ġexecute', 'Ġa', 'Ġterritory', 'Ġbusiness', 'Ġstrategyability', 'Ġto', 'Ġbuild', 'Ġrelationships', 'Ġand', 'Ġto', 'Ġdeliver', 'Ġresults', 'Ġin', 'Ġa', 'Ġcrossfunctionalmatrixed', 'Ġenvironmentability', 'Ġto', 'Ġidentify', 'Ġcrosspromoting', 'Ġand', 'Ġuppromoting', 'Ġopportunities', 'Ġwithin', 'Ġthe', 'Ġexisting', 'Ġaccount', 'Ġbaseexcellent', 'Ġaccount', 'Ġman

## Tokenize and convert cv details to embeddings

In [136]:
def embedding_cv_details(job_description):
    tokens = tokenizer(job_description, padding=True, truncation=True, return_tensors="pt")
    outputs = model(**tokens)
    resume_details_embedding = outputs.last_hidden_state
    return resume_details_embedding

In [145]:
def embedding_job_description(job_description):
    tokens = tokenizer_job_desc.encode(job_description)
    tokens = torch.tensor(tokens.ids).unsqueeze(0)  # Convert to PyTorch tensor and add batch dimension
    
    with torch.no_grad():
        outputs = model(input_ids=tokens)
    
    job_description_embedding = outputs.last_hidden_state.mean(dim=1)  

    return job_description_embedding

## Matching job description and resume details

In [146]:
all_similarity_scores = []

job_description_embedding = embedding_job_description(sample_job_description)

job_description_embedding = job_description_embedding.squeeze().detach().numpy()

for i in range(len(resumes)):
    job_role_text = f"Job Role: {job_role[i]}"
    
    if isinstance(skills[i], str):
        skills_text = f"Skills: {skills[i]}"
    elif isinstance(skills[i], list):
        skills_text = f"Skills: {', '.join(skills[i])}"
    else:
        skills_text = "Skills: "  
    
    if isinstance(education_details[i], list):
        education_text = f"Education: {', '.join([' - '.join(ed) for ed in education_details[i]])}"
    else:
        education_text = "Education: "  
    cv_details = f"{job_role_text}. {skills_text}. {education_text}"

    cv_embedding = embedding_cv_details(cv_details)

    cv_embedding = cv_embedding.squeeze().detach().numpy()
    cv_embedding_mean = cv_embedding.mean(axis=0)

    similarity_scores = cosine_similarity([job_description_embedding], [cv_embedding_mean])

    all_similarity_scores.append(similarity_scores[0][0])

top_candidates_indices = np.argsort(all_similarity_scores)[-5:]


## Top 5 matching resumes

In [147]:
for idx in top_candidates_indices:
    print(f"Resume {idx + 1} - Similarity Score: {all_similarity_scores[idx]}")

Resume 25 - Similarity Score: 0.3456060588359833
Resume 21 - Similarity Score: 0.34724247455596924
Resume 26 - Similarity Score: 0.35182416439056396
Resume 83 - Similarity Score: 0.35966843366622925
Resume 22 - Similarity Score: 0.3613538444042206
