In [11]:
## BLEU score for evaluating a model's response against ground-truth references
import math 
from collections import Counter 
from typing import List 


In [None]:
class BLEUScore:
    """
    BLEU (Bilingual Evaluation Understudy) metric implementation.

    BLEU is precision-oriented: it measures how much of the generated text
    (as 1..max_n-grams) also appears in at least one reference, and multiplies
    the geometric mean of those precisions by a brevity penalty so that very
    short candidates are not rewarded.

    Tokenisation is deliberately simple: text is lower-cased and split on
    whitespace.
    """

    def __init__(self, max_n: int = 4):
        """
        Args:
            max_n: Highest n-gram order to score (1..max_n are used).

        Raises:
            ValueError: If max_n is smaller than 1; with no n-gram orders
                the geometric mean of precisions is undefined.
        """
        if max_n < 1:
            raise ValueError(f"max_n must be >= 1, got {max_n}")
        self.max_n = max_n

    def _get_ngrams(self, tokens: List[str], n: int) -> Counter:
        """Return a Counter of the n-grams (as tuples) found in `tokens`.

        An empty Counter is returned when n is not positive or when the token
        list is shorter than n.
        """
        if n < 1 or n > len(tokens):
            return Counter()
        return Counter(
            tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
        )

    def _modified_precision(self, candidate: List[str],
                            references: List[List[str]], n: int) -> float:
        """Calculate the modified (clipped) n-gram precision.

        Each candidate n-gram count is clipped to the largest count of that
        n-gram in any single reference; the clipped total is divided by the
        total number of candidate n-grams.

        Returns:
            A value in [0, 1]; 0.0 when the candidate has no n-grams of
            order n.
        """
        candidate_ngrams = self._get_ngrams(candidate, n)
        total_count = sum(candidate_ngrams.values())
        if total_count == 0:
            return 0.0

        # Counter union keeps, per n-gram, the maximum count seen so far,
        # i.e. the maximum count across all references.
        max_ref_counts = Counter()
        for reference in references:
            max_ref_counts |= self._get_ngrams(reference, n)

        # Counter intersection takes the per-n-gram minimum, which is exactly
        # the clipped count min(candidate_count, max_reference_count).
        clipped_count = sum((candidate_ngrams & max_ref_counts).values())
        return clipped_count / total_count

    def _brevity_penalty(self, candidate_len: int,
                         reference_len: List[int]) -> float:
        """Calculate the brevity penalty.

        The reference length closest to the candidate length is used; when
        two reference lengths are equally close, the shorter one is chosen
        (the usual BLEU convention), so the result does not depend on the
        order of the references.

        Returns:
            1.0 if the candidate is at least as long as the chosen reference,
            exp(1 - r/c) otherwise, and 0.0 for an empty candidate.

        Raises:
            ValueError: If `reference_len` is empty.
        """
        if not reference_len:
            raise ValueError("at least one reference length is required")
        if candidate_len <= 0:
            return 0.0

        closest_ref_len = min(
            reference_len, key=lambda r: (abs(r - candidate_len), r)
        )
        if candidate_len >= closest_ref_len:
            return 1.0
        return math.exp(1 - closest_ref_len / candidate_len)

    def calculate(self, candidate: str, references: List[str]) -> dict:
        """
        Calculate BLEU score.

        Args:
            candidate: Generated text
            references: List of reference texts. A single string is accepted
                and treated as one reference.

        Returns:
            Dictionary with the same keys in every case:
                'blue': the BLEU score (key spelling kept for existing callers)
                'precisions': modified precision for n = 1..max_n
                'candidate_len': number of candidate tokens
                'reference_len': number of tokens in each reference
                'brevity_penalty': the brevity penalty that was applied
            An empty candidate yields a score, precisions and penalty of 0.

        Raises:
            ValueError: If the candidate is non-empty and no references are
                given.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(references, str):
            references = [references]

        candidate_tokens = candidate.lower().split()
        references_tokens = [ref.lower().split() for ref in references]
        candidate_len = len(candidate_tokens)
        reference_len = [len(ref) for ref in references_tokens]

        if not candidate_tokens:
            return {
                'blue': 0.0,
                'precisions': [0.0] * self.max_n,
                'candidate_len': candidate_len,
                'reference_len': reference_len,
                'brevity_penalty': 0.0,
            }

        if not references_tokens:
            raise ValueError("at least one reference text is required")

        precisions = [
            self._modified_precision(candidate_tokens, references_tokens, n)
            for n in range(1, self.max_n + 1)
        ]
        bp = self._brevity_penalty(candidate_len, reference_len)

        # The geometric mean is taken in log space; a single zero precision
        # makes the (unsmoothed) score zero, and log(0) must be avoided.
        if all(p > 0 for p in precisions):
            mean_log = sum(math.log(p) for p in precisions) / len(precisions)
            blue = bp * math.exp(mean_log)
        else:
            blue = 0.0

        return {
            'blue': blue,
            'precisions': precisions,
            'candidate_len': candidate_len,
            'reference_len': reference_len,
            'brevity_penalty': bp,
        }

In [10]:
## Example: score one model response against two ground-truth references
candidate = "The cat sat on the mat and looked around"  # model response being evaluated
references_bleu = ["The cat sat on the mat", "A cat was sitting on the mat"]  # ground-truth reference texts

# max_n=3: the score combines 1-gram, 2-gram and 3-gram precisions
bleu_scorer = BLEUScore(3)
bleu_results = bleu_scorer.calculate(candidate, references_bleu)

# prints the result dictionary (score plus its component values)
print(bleu_results)

{'blue': 0.6197980942410934, 'precisions': [0.6666666666666666, 0.625, 0.5714285714285714], 'candidate_len': 9, 'reference_len': [6, 7]}
