In [1]:
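# One-time environment setup (uncomment and run once in a fresh environment):
# !pip3 install transformers
# !pip3 install torch
# !pip3 install PyMuPDF
# !pip3 install torch torchvision torchaudio
# !pip3 install spacy
# !python3 -m spacy download en_core_web_sm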

In [4]:
import spacy
import torch
from transformers import BertTokenizer, BertModel
import fitz  # PyMuPDF
import re

In [9]:
class ResumeMatcher:
    """Score how well a resume matches a job description, attribute by attribute.

    Both texts are reduced to the same five attributes (``job_title``,
    ``location``, ``industry``, ``education``, ``skills``) by
    :meth:`extract_attributes`. :meth:`calculate_matching_score` then embeds
    each pair of attribute values with BERT and turns their cosine similarity
    into a weighted percentage.

    The spaCy pipeline and the BERT tokenizer/model are loaded lazily and
    cached on the class, so they are read from disk at most once per kernel
    instead of once per call.
    """

    _SPACY_MODEL = "en_core_web_sm"
    _BERT_MODEL = 'bert-base-uncased'
    # bert-base-uncased only has 512 position embeddings; longer inputs must be truncated.
    _MAX_TOKENS = 512

    # Matched against the WHOLE text: the pattern needs "<degree> in <field>",
    # which can never occur inside a single spaCy token. The leading \b keeps
    # words that merely end in a degree abbreviation ("items in ...") out.
    _EDUCATION_RE = re.compile(
        r'(?i)\b(Ph\.?D\.?|MS|Masters?|B(?:Tech)?|BS)\s+in\s+(.*?)(?=\n|$)'
    )
    # Lookarounds instead of \b: there is no word boundary after the "+" of
    # "C++", so a trailing \b would prevent "C++" from ever matching.
    _SKILL_RE = re.compile(
        r'(?i)(?<!\w)(?:NLP|AI|statistics|probability|python|c\+\+|keras|pytorch|machine learning)(?!\w)'
    )

    # Lazily populated caches shared by all instances.
    _nlp = None
    _tokenizer = None
    _model = None

    def __init__(self, job_description, resume):
        """Store both texts and extract their attributes.

        Args:
            job_description: Plain text of the job posting.
            resume: Plain text of the candidate's resume.
        """
        # No trailing comma here: it used to turn the description into a 1-tuple.
        self.job_description = job_description
        self.resume = resume
        self.job_attributes = self.extract_attributes(job_description)
        self.resume_attributes = self.extract_attributes(resume)

    @classmethod
    def _get_nlp(cls):
        """Return the shared spaCy pipeline, loading it on first use."""
        if cls._nlp is None:
            cls._nlp = spacy.load(cls._SPACY_MODEL)
        return cls._nlp

    @classmethod
    def _get_bert(cls):
        """Return the shared ``(tokenizer, model)`` pair, loading it on first use."""
        if cls._tokenizer is None or cls._model is None:
            cls._tokenizer = BertTokenizer.from_pretrained(cls._BERT_MODEL)
            cls._model = BertModel.from_pretrained(cls._BERT_MODEL)
            cls._model.eval()  # inference only: make sure dropout is disabled
        return cls._tokenizer, cls._model

    def create_bert_embedding(self, text):
        """Embed ``text`` as a single BERT vector.

        Args:
            text: String to embed. Inputs longer than ``_MAX_TOKENS`` word
                pieces are truncated.

        Returns:
            The mean over the token axis of the model's last hidden state
            (one row per input text).
        """
        tokenizer, model = self._get_bert()

        # Tokenize and convert to tensors, truncating to what BERT can accept.
        inputs = tokenizer(
            text, return_tensors="pt", truncation=True, max_length=self._MAX_TOKENS
        )

        # Pure inference: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**inputs)

        # Mean-pool the last hidden layer over the token dimension.
        return outputs.last_hidden_state.mean(dim=1)

    @classmethod
    def _extract_education(cls, text):
        """Return the field of study from the last "<degree> in <field>" phrase, or None."""
        field = None
        for match in cls._EDUCATION_RE.finditer(text):
            field = match.group(2).strip()  # later mentions overwrite earlier ones
        return field

    @classmethod
    def _extract_skills(cls, text):
        """Return the known skill keywords found in ``text`` joined by ', ', or None."""
        found = [hit.strip() for hit in cls._SKILL_RE.findall(text)]
        return ', '.join(found) if found else None

    def extract_attributes(self, text):
        """Extract the matching attributes from a job description or resume.

        Named entities are read as whole spans (so "New York" is kept intact
        rather than reduced to its last token); education and skills are found
        by regular expressions over the full text, independent of entity tags.
        When an attribute occurs several times, the last occurrence wins.

        Args:
            text: Document text. ``None`` or an empty string yields a dict
                whose values are all ``None``.

        Returns:
            Dict with keys ``job_title``, ``location``, ``industry``,
            ``education`` and ``skills``; each value is a string or ``None``.
        """
        job_title = location = industry = education = skills = None

        if text:
            doc = self._get_nlp()(text)
            for ent in doc.ents:
                label = ent.label_
                # NOTE(review): "JOB_TITLE" looks like a custom label; the stock
                # en_core_web_sm pipeline is not expected to emit it — confirm.
                if "JOB_TITLE" in label:
                    job_title = ent.text.strip()
                elif "GPE" in label:
                    location = ent.text.strip()
                elif "ORG" in label:
                    industry = ent.text.strip()

            education = self._extract_education(text)
            skills = self._extract_skills(text)

        return {'job_title': job_title, 'location': location, 'industry': industry, 'education': education, 'skills': skills}

    def calculate_matching_score(self, weights):
        """Compute a weighted similarity score for every attribute.

        Args:
            weights: Mapping of attribute name to weight. Attributes that are
                not listed get a weight of 1.0. ``None`` is treated as ``{}``.

        Returns:
            Dict mapping each attribute to
            ``cosine_similarity * weight / total_weight * 100``, where
            ``total_weight`` is the sum of the weights actually applied to the
            scored attributes. The score is 0 when either side lacks the
            attribute or when the total weight is zero.
        """
        print('Job description attributes:', self.job_attributes)
        print('\nResume attributes:', self.resume_attributes)

        weights = weights or {}
        # Normalize by the weights that are really used (including the 1.0
        # default), so the denominator always matches the numerators.
        applied_weights = {
            attribute: weights.get(attribute, 1.0) for attribute in self.job_attributes
        }
        total_weight = sum(applied_weights.values())
        matching_scores = {}

        print('\nRunning matcher...\n')
        for attribute, job_attribute_value in self.job_attributes.items():
            resume_attribute_value = self.resume_attributes.get(attribute)

            if (
                job_attribute_value is None
                or resume_attribute_value is None
                or total_weight == 0  # avoid ZeroDivisionError on all-zero weights
            ):
                matching_scores[attribute] = 0
                continue

            job_embedding = self.create_bert_embedding(job_attribute_value)
            resume_embedding = self.create_bert_embedding(resume_attribute_value)

            # Similarity of the two attribute embeddings.
            similarity_score = torch.nn.functional.cosine_similarity(job_embedding, resume_embedding, dim=1)
            # Scale by the attribute's weight, then express as a percentage of the total weight.
            weighted_score = similarity_score.item() * applied_weights[attribute]
            matching_scores[attribute] = (weighted_score / total_weight) * 100

        return matching_scores