In [2]:
from collections import Counter, defaultdict
from typing import Union, List
import math

In [3]:
class ROUGEScore:
    """
    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) metric implementation.
    Focuses on recall - how much of reference content is captured in generated text.
    """
    def _get_ngrams(self, tokens:List[str], n:int):
        if n > len(tokens):
            return Counter() 
        
        ngrams = []
        for i in range(len(tokens)-n+1):
            ngrams.append(tuple(tokens[i:i+n])) 
        return Counter(ngrams) 
    
    def _lcs_length(self, x:List[str], y:List[str]):
        """Calculate length of longest common string"""
        m,n = len(x), len(y) 
        dp = [[0]*(n+1) for _ in range(m+1)] 

        for i in range(1, m+1):
            for j in range(1, n+1):
                if x[i-1] == y[j-1]:
                    dp[i][j] = dp[i-1][j-1]+1 
                else:
                    dp[i][j] = max(dp[i-1][j], dp[i][j-1]) 

        return dp[m][n]
    
    def rouge_n(self, candidate:str, reference:str, n:int=1):
        """candidate: generated text,
           references: reference text
           n = n grams(default = 1)

           Returns:
                Dictionary with precision, recall and F1 scores
        """
        candidate_tokens = candidate.lower().split() 
        reference_tokens = reference.lower().split()

        candidate_ngrams = self._get_ngrams(candidate_tokens, n)
        reference_ngrams = self._get_ngrams(reference_tokens, n) 

        if not reference_ngrams:
            return {'precision':0.0, 'recall':0.0, 'f1': 0.0}

        ## calculate the overlap
        overlap = 0 
        for ngram in candidate_ngrams:
            overlap+= min(candidate_ngrams[ngram], reference_ngrams[ngram])

        ## calculate - precision, recall, and f1 score
        precision = overlap / sum(candidate_ngrams.values()) if candidate_ngrams else 0.0 # true positive out of predicted positive
        recall = overlap / sum(reference_ngrams.values()) if reference_ngrams else 0.0  # true positive out of actual positive
        f1 = (precision*recall) / (precision+recall) if (precision+recall) >0 else 0.0


        return {'precision': precision, 'recall':recall, 'f1':f1} 
    
    def rouge_l(self, candidate:str, reference:str):
        """ Calculate rouge l using longest common string""" 
        candidate_tokens = candidate.lower().split() 
        reference_tokens = reference.lower().split() 

        if not reference_tokens:
            return {'precision':0.0, 'recall':0.0, 'f1':0.0} 
        
        lcs_length = self._lcs_length(candidate_tokens, reference_tokens)
        precision = lcs_length / len(candidate_tokens) if candidate_tokens else 0.0 
        recall = lcs_length / len(reference_tokens) if reference_tokens else 0.0 
        f1 = (precision*recall) / (recall+precision) if (precision+recall) > 0 else 0.0 

        return {'precision': precision, 'recall':recall, 'f1':f1}
    
    def calculate_all(self,candidate:str, reference:str):
        return{
            'rouge-1' : self.rouge_n(candidate, reference, 1),
            'rouge-2' : self.rouge_n(candidate, reference, 2),
            'rouge-l' : self.rouge_l(candidate, reference) 
        }



In [4]:
## example test
candidate = "The cat sat on the mat and looked around"
reference_rouge = "The cat sat on the mat and was very comfortable" 

rouge_scorer = ROUGEScore()
rouge_results = rouge_scorer.calculate_all(candidate, reference_rouge) 
print(rouge_results)


{'rouge-1': {'precision': 0.7777777777777778, 'recall': 0.7, 'f1': 0.36842105263157887}, 'rouge-2': {'precision': 0.75, 'recall': 0.6666666666666666, 'f1': 0.35294117647058826}, 'rouge-l': {'precision': 0.7777777777777778, 'recall': 0.7, 'f1': 0.36842105263157887}}
