In [1]:
# %pip install rouge-score   (uncomment if the package is not installed)

from rouge_score import rouge_scorer

# ROUGE variants to report: unigram overlap, bigram overlap, and longest
# common subsequence.
ROUGE_TYPES = ['rouge1', 'rouge2', 'rougeL']

# Stemming is enabled so that inflected forms are compared by their stems.
scorer = rouge_scorer.RougeScorer(ROUGE_TYPES, use_stemmer=True)

reference = "The cat sat on the mat."
candidate = "The cat is sitting on the mat."

# Argument order matters: the reference (target) comes first, then the
# candidate (prediction).
scores = scorer.score(reference, candidate)

# Report precision / recall / F1 for every metric, one blank line between them.
for metric_name, result in scores.items():
    print(
        f"{metric_name}:\n"
        f"  Precision: {result.precision:.4f}\n"
        f"  Recall:    {result.recall:.4f}\n"
        f"  F1:        {result.fmeasure:.4f}\n"
    )

rouge1:
  Precision: 0.7143
  Recall:    0.8333
  F1:        0.7692

rouge2:
  Precision: 0.5000
  Recall:    0.6000
  F1:        0.5455

rougeL:
  Precision: 0.7143
  Recall:    0.8333
  F1:        0.7692



In [2]:
!pip install nltk

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Example reference and generated text
reference = ["the cat is on the mat".split()]
candidate = "the cat sat on the mat".split()

# Compute BLEU score
smooth_fn = SmoothingFunction().method1  # helps avoid zero scores
score = sentence_bleu(reference, candidate, smoothing_function=smooth_fn)

print("BLEU Score:", round(score, 4))

BLEU Score: 0.2541


In [3]:
!pip install evaluate

import evaluate

# Load BLEU metric
bleu = evaluate.load("bleu")

# Example data
predictions = ["the cat sat on the mat"]
references = [["the cat is on the mat"]]

# Compute BLEU score
results = bleu.compute(predictions=predictions, references=references)

print("BLEU Score:", results["bleu"])



  from .autonotebook import tqdm as notebook_tqdm


BLEU Score: 0.0


In [4]:
### Perplexity ######
import math

# Tokens of the test sentence being scored. Shown for illustration only: the
# computation below works purely from the per-token probabilities.
sentence = ["I", "love", "machine", "learning"]

# Made-up conditional probabilities P(w_i | context), one per token.
# (A real language model would take these from its softmax output.)
probs = [0.5, 0.4, 0.1, 0.2]

# Perplexity is exp of the average negative log-probability per token.
N = len(probs)
log_sum = sum(map(math.log, probs))
avg_neg_log_prob = -log_sum / N
perplexity = math.exp(avg_neg_log_prob)

print("Perplexity:", perplexity)


Perplexity: 3.9763536438352527


In [5]:
### chrF++ #####

# %pip install sacrebleu   (uncomment if the package is not installed)
from sacrebleu.metrics import CHRF

# A single hypothesis and its single reference translation.
refs = ["The cat is on the mat"]
hyps = ["The cat is sitting on the mat"]

# word_order=2 adds word n-grams on top of the character n-grams, which is
# what turns plain chrF into chrF++.
chrf = CHRF(word_order=2)

# corpus_score expects a list of reference streams, each parallel to `hyps`,
# so the single stream is wrapped in an outer list.
reference_streams = [refs]
score = chrf.corpus_score(hyps, reference_streams)
print("chrF++ score: {:.2f}".format(score.score))

chrF++ score: 76.01


In [6]:
from sacrebleu.metrics import BLEU, CHRF

# Reference (ground truth translation)
refs = ["The cat is on the mat"]

# 3 candidate system outputs, from exact match to loose paraphrase
hyps = [
    "The cat is on the mat",              # Perfect match
    "The cat is sitting on the mat",      # Slight variation
    "A cat lies over a rug"               # Paraphrased version
]

# Initialize metrics.
# effective_order=True is sacreBLEU's recommended setting for sentence-level
# BLEU: without it, sentence_score logs a warning on every call, and a
# hypothesis shorter than four tokens is penalised for n-gram orders it cannot
# contain. Every hypothesis here has at least four tokens.
bleu = BLEU(effective_order=True)
chrf = CHRF(word_order=2)  # chrF++ (word_order=0 is chrF)

# sentence_score takes the list of references for ONE hypothesis; it is the
# same for every hypothesis, so build it once outside the loop.
sentence_refs = [refs[0]]

# Compute and print both scores
for hyp in hyps:
    bleu_score = bleu.sentence_score(hyp, sentence_refs).score
    chrf_score = chrf.sentence_score(hyp, sentence_refs).score
    print(f"\nHypothesis: {hyp}")
    print(f"→ BLEU: {bleu_score:.2f}")
    print(f"→ chrF++: {chrf_score:.2f}")





Hypothesis: The cat is on the mat
→ BLEU: 100.00
→ chrF++: 100.00

Hypothesis: The cat is sitting on the mat
→ BLEU: 41.11
→ chrF++: 76.01

Hypothesis: A cat lies over a rug
→ BLEU: 8.12
→ chrF++: 12.51
