In [2]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

class BertRelevanceScorer:
    """Score how relevant a resume is to a job description.

    Each text is embedded by mean-pooling the last hidden state of a
    pre-trained BERT model over its non-padding tokens; relevance is the
    cosine similarity between the two resulting vectors.
    """

    def __init__(self, model_name='bert-base-uncased', device=None):
        """Load the tokenizer and model.

        Args:
            model_name: Name or path of the BERT checkpoint to load.
                Defaults to 'bert-base-uncased'.
            device: Optional device (anything accepted by
                ``torch.nn.Module.to``) to place the model on. When None
                the model stays wherever ``from_pretrained`` put it.
        """
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.model = BertModel.from_pretrained(self.model_name)

        if device is not None:
            self.model.to(device)

        # Evaluation mode disables dropout so embeddings are deterministic.
        self.model.eval()

    def get_embedding(self, text):
        """Embed text as one vector per input via masked mean pooling.

        Args:
            text: A string (or a list of strings, which the tokenizer
                batches and pads). Inputs longer than 512 tokens are
                truncated, so only the beginning of a long document
                contributes to the embedding.

        Returns:
            A tensor of shape [batch_size, hidden_dim], located on the same
            device as the model.
        """
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=512,
        )

        # The tokenizer always produces CPU tensors; move them next to the
        # model so the forward pass also works when the model is on a GPU.
        device = self.model.device
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**encoded)

        # Shape: [batch_size, seq_len, hidden_dim]
        token_embeddings = outputs.last_hidden_state

        # [batch_size, seq_len, 1]; broadcasts over hidden_dim and zeroes
        # out the vectors that belong to padding positions.
        mask = encoded['attention_mask'].unsqueeze(-1).float()

        summed = (token_embeddings * mask).sum(dim=1)
        # Clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)

        return summed / token_counts

    def calculate_relevance(self, resume_text, job_description):
        """Return the cosine similarity between a resume and a job posting.

        Args:
            resume_text: Resume text to score.
            job_description: Job description to compare against.

        Returns:
            A Python float in [-1.0, 1.0]; higher means more similar.
            Cosine similarity is not bounded below by 0, so callers that
            need a [0, 1] score must rescale or clip it themselves.
        """
        resume_vec = self.get_embedding(resume_text)
        job_vec = self.get_embedding(job_description)

        # Compare the first (normally the only) row of each batch. Staying
        # in torch avoids a NumPy conversion, which fails for tensors that
        # do not live on the CPU.
        similarity = torch.nn.functional.cosine_similarity(
            resume_vec[0], job_vec[0], dim=0
        )

        return similarity.item()

# --- Example usage: score two sample resumes against one job posting ---
if __name__ == "__main__":
    posting = "Looking for a software engineer with Python developer and Machine Learning experience."
    candidates = (
        # Overlaps with the posting; expected to rank higher.
        "I am a Python developer with 3 years of experience in AI and deep learning.",
        # Unrelated field; expected to rank lower.
        "I am a graphic designer skilled in Photoshop and Illustrator.",
    )

    relevance_scorer = BertRelevanceScorer()

    # Score every candidate first, then report, one line per resume.
    score_1, score_2 = (
        relevance_scorer.calculate_relevance(resume, posting)
        for resume in candidates
    )

    print(f"Resume 1 Relevance: {score_1:.4f}")
    print(f"Resume 2 Relevance: {score_2:.4f}")

Resume 1 Relevance: 0.8349
Resume 2 Relevance: 0.6166
