# LLM Response evaluation using ROUGE SCORE

# Precision: Fraction of matched n-grams in the generated text.
# Recall: Fraction of matched n-grams in the reference text.
# F1-Score: Harmonic mean of precision and recall.

In [14]:
from rouge_score import rouge_scorer

# Reference and generated summaries
reference = "The cat sat on the mat, watching tom and jerry cartoon in television"
generated = "The cat is sitting on the mat, watching tom and jerry"

# Initialize the ROUGE scorer
#scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
#scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rouge3'], use_stemmer=True)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rouge3', 'rougeL'],use_stemmer=True)

# Compute ROUGE scores
scores = scorer.score(reference, generated)

# Print results
for rouge_type, score in scores.items():
    print(f"{rouge_type}: Precision: {score.precision:.4f}, Recall: {score.recall:.4f}, F1: {score.fmeasure:.4f}")


rouge1: Precision: 0.8182, Recall: 0.6923, F1: 0.7500
rouge2: Precision: 0.7000, Recall: 0.5833, F1: 0.6364
rouge3: Precision: 0.5556, Recall: 0.4545, F1: 0.5000
rougeL: Precision: 0.8182, Recall: 0.6923, F1: 0.7500


In [15]:
reference1 = "The dog chased the ball."
generated1 = "A ball was chased by the dog."

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rouge3','rougeL'])

scores = scorer.score(reference1, generated1)

for rouge_type, score in scores.items():
    print(f"{rouge_type}: Precision: {score.precision:.4f}, Recall: {score.recall:.4f}, F1: {score.fmeasure:.4f}")

rouge1: Precision: 0.5714, Recall: 0.8000, F1: 0.6667
rouge2: Precision: 0.1667, Recall: 0.2500, F1: 0.2000
rouge3: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
rougeL: Precision: 0.2857, Recall: 0.4000, F1: 0.3333


In [16]:
reference2 = "She loves reading books."
generated2 = "She enjoys reading novels and books."

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rouge3','rougeL'])

scores = scorer.score(reference2, generated2)

for rouge_type, score in scores.items():
    print(f"{rouge_type}: Precision: {score.precision:.4f}, Recall: {score.recall:.4f}, F1: {score.fmeasure:.4f}")

rouge1: Precision: 0.5000, Recall: 0.7500, F1: 0.6000
rouge2: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
rouge3: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
rougeL: Precision: 0.5000, Recall: 0.7500, F1: 0.6000


In [17]:
reference3 = "The quick brown fox jumped over the lazy dog that was sleeping peacefully under the shade of the big oak tree."
generated3 = "Under the large oak tree, a lazy dog was sleeping when a brown fox quickly jumped over it."

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rouge3','rougeL'])

scores = scorer.score(reference3, generated3)

for rouge_type, score in scores.items():
    print(f"{rouge_type}: Precision: {score.precision:.4f}, Recall: {score.recall:.4f}, F1: {score.fmeasure:.4f}")

rouge1: Precision: 0.6667, Recall: 0.5714, F1: 0.6154
rouge2: Precision: 0.3529, Recall: 0.3000, F1: 0.3243
rouge3: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
rougeL: Precision: 0.2778, Recall: 0.2381, F1: 0.2564


In [18]:
reference4 = "The quick brown fox jumped over the lazy dog that was sleeping peacefully under the shade of the big oak tree."
generated4 = "Under the large oak tree, a lazy dog was sleeping when a brown fox quickly jumped over it."

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rouge3','rougeL'])

scores = scorer.score(reference4, generated4)

for rouge_type, score in scores.items():
    print(f"{rouge_type}: Precision: {score.precision:.4f}, Recall: {score.recall:.4f}, F1: {score.fmeasure:.4f}")

rouge1: Precision: 0.6667, Recall: 0.5714, F1: 0.6154
rouge2: Precision: 0.3529, Recall: 0.3000, F1: 0.3243
rouge3: Precision: 0.0000, Recall: 0.0000, F1: 0.0000
rougeL: Precision: 0.2778, Recall: 0.2381, F1: 0.2564


**Explanation**

* Reference 1,2,3,4 & Generated 1,2,3,4 sentences are taken form the AI generated text.
* For the output of the Reference 1,2,3,4 & Generated 1,2,3,4 -> Paraphrasing and Rewording does't change the ROUGE-SCORE.

In [19]:
reference5 = "Summarization is cool, I love Machine Learning, Good night."
generated5 = "Summarization is beneficial and cool, Summarization saves time, People are getting used to Machine Learning, I think i love Machine Learning, Good night everyone!, Night!."

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rouge3','rougeL'])

scores = scorer.score(reference5, generated5)

for rouge_type, score in scores.items():
    print(f"{rouge_type}: Precision: {score.precision:.4f}, Recall: {score.recall:.4f}, F1: {score.fmeasure:.4f}")

rouge1: Precision: 0.3600, Recall: 1.0000, F1: 0.5294
rouge2: Precision: 0.2500, Recall: 0.7500, F1: 0.3750
rouge3: Precision: 0.1739, Recall: 0.5714, F1: 0.2667
rougeL: Precision: 0.3600, Recall: 1.0000, F1: 0.5294


In [20]:
reference6 = "Cybercrime is one of the most known crime and taking money of most the people which is leading to loss of income"
generated6 = "Cybercrime is one of the most prevalent crimes, affecting many individuals and leading to financial losses"

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rouge3','rougeL'])

scores = scorer.score(reference5, generated5)

for rouge_type, score in scores.items():
    print(f"{rouge_type}: Precision: {score.precision:.4f}, Recall: {score.recall:.4f}, F1: {score.fmeasure:.4f}")

rouge1: Precision: 0.3600, Recall: 1.0000, F1: 0.5294
rouge2: Precision: 0.2500, Recall: 0.7500, F1: 0.3750
rouge3: Precision: 0.1739, Recall: 0.5714, F1: 0.2667
rougeL: Precision: 0.3600, Recall: 1.0000, F1: 0.5294


* In the above example Semantic similarity are high, but direct matches in sequence are low.