# Metric 1 : BLEU

## I- Loading the libraries

In [1]:
import math
import sys
import warnings
from collections import Counter
from fractions import Fraction
from nltk.util import ngrams

In [None]:
# Import NLTK and fetch its 'punkt' resource through NLTK's downloader.
# NOTE(review): neither `tokenize` nor the punkt data appears to be used by the
# cells visible in this notebook, which operate on already-tokenised word
# lists — confirm whether this cell is still needed.
import nltk
nltk.download('punkt')
from nltk import tokenize

## I- Loading the datasets

In [17]:
# Toy corpus: every sentence is written as plain text and split on whitespace
# into its list of word tokens.

# First segment: one hypothesis with three reference translations.
hyp1 = "It is a guide to action which ensures that the military always obeys the commands of the party".split()
ref1a = "It is a guide to action that ensures that the military will forever heed Party commands".split()
ref1b = "It is the guiding principle which guarantees the military forces always being under the command of the Party".split()
ref1c = "It is the practical guide for the army always to heed the directions of the party".split()

# Second segment: one hypothesis with a single reference translation.
hyp2 = "he read the book because he was interested in world history".split()
ref2a = "he was interested in world history because he read the book".split()

# hypotheses[k] is scored against every reference in list_of_references[k].
hypotheses = [hyp1, hyp2]
list_of_references = [
    [ref1a, ref1b, ref1c],
    [ref2a],
]

In [26]:
# One weight per n-gram order (1-grams to 4-grams), all equal to 0.25.
weights = (0.25,) * 4

## II- Implementation of the metric

In [32]:
# Corpus-level BLEU computed from `list_of_references`, `hypotheses` and
# `weights` (defined in the cells above). The result is stored in `bleu_score`.

# Validate the inputs up front. Raising (instead of printing and carrying on)
# avoids silently truncating the corpus in zip() below or hitting an opaque
# ZeroDivisionError later on.
if len(list_of_references) != len(hypotheses):
    raise ValueError("Error, you need the same number of ref and hyp")
if len(list_of_references) == 0:
    raise ValueError("Error, you need references")
if len(hypotheses) == 0:
    raise ValueError("Error, you need hypotheses")

# weights[0] applies to unigrams, weights[1] to bigrams, and so on; the number
# of weights therefore sets the highest n-gram order that is evaluated.
weight_length = len(weights)

# Corpus-level totals keyed by n-gram order: clipped matches (numerators) and
# number of hypothesis n-grams (denominators).
numerators_precision = Counter()
denominators_precision = Counter()
hyp_lengths, ref_lengths = 0, 0

# Iterate through each hypothesis and its corresponding references.
for references, hypothesis in zip(list_of_references, hypotheses):
    if len(references) == 0:
        raise ValueError("Error, every hypothesis needs at least one reference")

    # Accumulate the numerator and denominator of the modified precision
    # for each n-gram order.
    for i in range(1, weight_length + 1):
        # Occurrences of every n-gram of order i in the hypothesis.
        counts = Counter(ngrams(hypothesis, i))

        # For each hypothesis n-gram, the largest number of times it occurs
        # in any single reference.
        max_counts = {}
        for reference in references:
            reference_counts = Counter(ngrams(reference, i))
            for ngram in counts:
                max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

        # Clip each hypothesis count by that maximum, so a reference n-gram
        # cannot be matched more often than it actually occurs.
        intersection_counts = {ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()}

        numerator = sum(intersection_counts.values())
        # Keep the denominator at 1 or more: the hypothesis has no n-gram of
        # order i when i > len(hypothesis).
        denominator = max(1, sum(counts.values()))

        # Sum the raw (unreduced) counts. This is what the corpus-level
        # precision needs, and it avoids Fraction's private `_normalize`
        # keyword, which recent Python versions no longer accept.
        numerators_precision[i] += numerator
        denominators_precision[i] += denominator

    # Hypothesis length and closest reference length, both needed for the
    # corpus-level brevity penalty. Ties on distance go to the shorter reference.
    hyp_len = len(hypothesis)
    hyp_lengths += hyp_len
    ref_lens = (len(reference) for reference in references)
    ref_lengths += min(ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len))

# Corpus-level brevity penalty.
if hyp_lengths > ref_lengths:
    brevity_penalty = 1
elif hyp_lengths == 0:
    # Every hypothesis is empty: avoid dividing by zero, the score is 0.
    brevity_penalty = 0
else:
    brevity_penalty = math.exp(1 - ref_lengths / hyp_lengths)

# Corpus-level modified precision for each n-gram order (1 to weight_length).
p_n = [Fraction(numerators_precision[i], denominators_precision[i]) for i in range(1, weight_length + 1)]

if numerators_precision[1] == 0:
    # Not a single unigram matched: the score is 0 by definition.
    bleu_score = 0
else:
    # Weighted geometric mean of the precisions, each weight paired with its
    # own n-gram order. Orders whose precision is 0 are skipped because
    # log(0) is undefined.
    info = (weight * math.log(precision_i) for weight, precision_i in zip(weights, p_n) if precision_i > 0)
    bleu_score = brevity_penalty * math.exp(math.fsum(info))

In [31]:
# Display the corpus-level BLEU score computed in the implementation cell above.
bleu_score

0.5920778868801042

Explanations : 

modified_precision : 
    
    The normal precision method may lead to some wrong translations with
    high-precision, e.g., the translation, in which a word of reference
    repeats several times, has very high precision.

    This function only returns the Fraction object that contains the numerator
    and denominator necessary to calculate the corpus-level precision.
    To calculate the modified precision for a single pair of hypothesis and
    references, cast the Fraction object into a float.

    The famous "the the the ... " example shows that you can inflate plain
    n-gram precision simply by duplicating high-frequency words.
    
    In the modified n-gram precision, a reference word will be considered
    exhausted after a matching hypothesis word is identified

closest_ref_length : 
    
    This function finds the reference that is the closest length to the
    hypothesis. The closest reference length is the *r* variable
    from the brevity penalty formula in Papineni et al. (2002).

brevity_penalty : 

    Because the modified n-gram precision on its own still favours overly
    short hypotheses, the brevity penalty is used to adjust the overall BLEU
    score according to length.

    An example from the paper. There are three references with length 12, 15
    and 17. And a concise hypothesis of the length 12. The brevity penalty is 1.
    
    In case a hypothesis translation is shorter than the references, penalty is
    applied.
    
    The length of the closest reference is used to compute the penalty. If the
    length of a hypothesis is 12, and the reference lengths are 13 and 2, the
    penalty is applied because the hypothesis length (12) is less than the
    closest reference length (13).
    
    The brevity penalty doesn't depend on reference order. More importantly,
    when two reference sentences are at the same distance, the shortest
    reference sentence length is used.

Inspired from https://www.nltk.org/_modules/nltk/translate/bleu_score.html