<a href="https://colab.research.google.com/github/AoShuang92/PhD_tutorial/blob/main/NLP_Evaluation_Metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Evaluating Natural Language Understanding (NLU) with a focus on semantic understanding involves a variety of metrics, each of which offers a unique perspective on how well a model comprehends and interprets language. Here are some of the best metrics for this purpose:

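**BLEU** (Bilingual Evaluation Understudy): Originally designed for machine translation, BLEU evaluates the quality of generated text by comparing it to one or more reference texts. It measures n-gram precision - the fraction of n-grams (short word sequences) in the generated text that also appear in the reference texts - and applies a brevity penalty so that very short outputs are not rewarded.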

**ROUGE** (Recall-Oriented Understudy for Gisting Evaluation): ROUGE is used primarily in summarization tasks. It measures recall - the fraction of words from the reference summaries that appear in the generated summaries.

**METEOR** (Metric for Evaluation of Translation with Explicit Ordering): This metric, also used in machine translation, evaluates generated text against reference texts. It considers word order and synonymy, offering a more nuanced view than BLEU.

**BERTScore**: Utilizing BERT embeddings, BERTScore computes the similarity of two sentences as a sum of cosine similarities between their tokens' embeddings. It's sensitive to semantic and syntactic nuances.

**Word Mover’s Distance (WMD)**: WMD measures the distance between two text documents as the minimum amount of distance that the embedded words of one document need to "travel" to reach the embedded words of another document. It's effective in capturing semantic similarity.

**Semantic Textual Similarity (STS)**: STS measures the degree of semantic equivalence between two sentences. It's often used in tasks where understanding the similarity or difference in meaning is crucial.

**Perplexity**: While traditionally used in language modeling, perplexity can also be a metric for NLU. Lower perplexity indicates a higher probability of the text being semantically and syntactically coherent.

# Using nltk library

In [None]:
!pip -q install nltk==3.5

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m1.1 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m0.8/1.4 MB[0m [31m7.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m1.2/1.4 MB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.4/1.4 MB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for nltk (setup.py) ..

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.chrf_score import chrf_precision_recall_fscore_support
from nltk.translate.meteor_score import single_meteor_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


# Reference ("label") and hypothesis ("prediction"), whitespace-tokenised:
# sentence_bleu and the chrF helper below are both fed token sequences.
label = 'this is a small test'.split()
prediction = 'this is test'.split()

# Cumulative BLEU-n is the brevity-penalised *geometric mean* of the 1..n-gram
# precisions, so the first n weights must be 1/n each (summing to 1).
# Weights such as (1, 1, 0, 0) multiply the precisions instead of averaging
# them and under-report the score: for this pair they give 0.2567, whereas
# the uniform weights below give BLEU_2 = 0.3630. BLEU_3 and BLEU_4 remain
# 0.0000 because the prediction shares no 3-gram or 4-gram with the label.
BLEU_1 = sentence_bleu([label], prediction, weights=(1, 0, 0, 0))
BLEU_2 = sentence_bleu([label], prediction, weights=(1 / 2, 1 / 2, 0, 0))
BLEU_3 = sentence_bleu([label], prediction, weights=(1 / 3, 1 / 3, 1 / 3, 0))
BLEU_4 = sentence_bleu([label], prediction, weights=(1 / 4, 1 / 4, 1 / 4, 1 / 4))
print('BLEU_1:%.4f, BLEU_2:%.4f, BLEU_3:%.4f, BLEU_4:%.4f'%(BLEU_1, BLEU_2, BLEU_3, BLEU_4))

# Token-level precision / recall / F-score over n-grams (n=1 -> unigrams).
# The helper's F-score defaults to beta=3.0 (recall-weighted); beta=1.0 is
# passed explicitly so that the value printed as "f1" really is the F1 score
# (0.7500 for this pair, instead of the beta=3 value 0.6250).
prec, rec, f1, tp = chrf_precision_recall_fscore_support(label, prediction, n=1, beta=1.0)
print('prec:%.4f, rec:%.4f, f1:%.4f, tp:%.4f'%(prec, rec, f1, tp))


# METEOR is given the raw, untokenised sentences here.
# NOTE(review): this matches the nltk==3.5 pin used by this notebook; newer
# NLTK releases appear to expect pre-tokenised input - confirm before upgrading.
label = 'this is a small test'
prediction = 'this is test'
meteor = single_meteor_score(label, prediction)
print('Meteor:%.4f'%meteor)

BLEU_1:0.5134, BLEU_2:0.2567, BLEU_3:0.0000, BLEU_4:0.0000
prec:1.0000, rec:0.6000, f1:0.6250, tp:3.0000
Meteor:0.5324


# Using python coco captioning library
Limitations:<br>
- Labels and predictions must each contain more than one sample
- Labels and predictions must each be a list of lists (one inner list of sentences per sample)

In [None]:
!pip -q install pycocoevalcap

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.spice.spice import Spice

import numpy as np

# Input format used with the pycocoevalcap scorers below:
#   * labels and predictions each hold more than one sample
#   * each sample is itself a list of sentences (a list inside a list)

labels = [['this is a small test'],['a large jetliner flying over a traffic filled street']]
predictions = [['this is test'],['plane is flying through the sky']]

# The scorers take {sample_id: [sentences]} dicts whose key sets match, so
# both lists are keyed by the same float ids 0.0, 1.0, ...
# `np.float` was only an alias of the builtin `float`; it was deprecated in
# NumPy 1.20 and later removed, so the builtin is used to get the same keys
# without the deprecation warning / AttributeError.
labels = dict(zip(np.arange(len(labels)).astype(float), labels))
predictions = dict(zip(np.arange(len(predictions)).astype(float), predictions))

# Every compute_score call is unpacked as (corpus-level score, per-sample scores).
(bleu1_avg, bleu1_per_sentence) = Bleu(n=1).compute_score(labels, predictions) # n = n-gram
(bleu4_avg, bleu4_per_sentence) = Bleu(n=4).compute_score(labels, predictions)
(cider_avg, cider_per_sentence) = Cider().compute_score(labels, predictions)
(meteor_avg, meteor_per_sentence) = Meteor().compute_score(labels, predictions)
(rouge_avg, rouge_per_sentence) = Rouge().compute_score(labels, predictions)

# Bound to its own name so the CIDEr per-sentence scores above are not overwritten.
(spice_avg, spice_per_sentence) = Spice().compute_score(labels, predictions)

# bleu1_avg is indexed with [0] to pick out the 1-gram entry.
print('BLEU_1:%.4f, CIDEr:%.4f, METEOR:%.4f, ROUGE:%.4f, SPICE:%.4f'
        %(bleu1_avg[0], cider_avg, meteor_avg, rouge_avg, spice_avg))



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  labels = dict(zip(np.arange(len(labels)).astype(np.float), labels))
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  predictions = dict(zip(np.arange(len(predictions)).astype(np.float), predictions))


{'testlen': 9, 'reflen': 14, 'guess': [9], 'correct': [4]}
ratio: 0.6428571428112246
{'testlen': 9, 'reflen': 14, 'guess': [9, 7, 5, 3], 'correct': [4, 1, 0, 0]}
ratio: 0.6428571428112246
Downloading stanford-corenlp-3.6.0 for SPICE ...
Progress: 384.5M / 384.5M (100.0%)
Extracting stanford-corenlp-3.6.0 ...
Done.
BLEU_1:0.2550, CIDEr:1.6123, METEOR:0.1570, ROUGE:0.4232, SPICE:0.3333


# semantic-text-similarity metrics
Source: https://github.com/AndriyMulyar/semantic-text-similarity

In [None]:
!pip install -q semantic-text-similarity

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/416.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/416.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m409.6/416.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m416.0/416.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.1/158.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   

In [None]:
from semantic_text_similarity.models import WebBertSimilarity
from semantic_text_similarity.models import ClinicalBertSimilarity

import torch

# Web-domain similarity model, run on the CPU (the library defaults to GPU prediction).
web_model = WebBertSimilarity(device='cpu', batch_size=10)

# The clinical model was hard-wired to 'cuda', which would fail on a runtime
# without a GPU; use the GPU when one is present and fall back to the CPU otherwise.
clinical_device = 'cuda' if torch.cuda.is_available() else 'cpu'
clinical_model = ClinicalBertSimilarity(device=clinical_device, batch_size=10)

# predict() takes a list of (sentence_a, sentence_b) pairs; it is left as the
# last expression of the cell so the notebook displays the returned scores.
web_model.predict([("She won an olympic gold medal","The women is an olympic champion")])

Downloading model: web-bert-similarity from https://github.com/AndriyMulyar/semantic-text-similarity/releases/download/v1.0.0/web_bert_similarity.tar.gz


100%|██████████| 405359924/405359924 [00:08<00:00, 50512744.25B/s]


Downloading model: clinical-bert-similarity from https://github.com/AndriyMulyar/semantic-text-similarity/releases/download/v1.0.0/clinical_bert_similarity.tar.gz


100%|██████████| 401555686/401555686 [00:07<00:00, 53907128.67B/s]


array([3.0079896], dtype=float32)

# BERTScore

In [None]:
!pip install -q transformers bert_score

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m41.0/61.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m925.0 kB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from bert_score import score, plot_example
# One candidate sentence and its reference, index-aligned across the two lists.
preds = ["She won an olympic gold medal"]
gts = ["The women is an olympic champion"]

# score() is unpacked into precision, recall and F1; with a single sentence
# pair F1 holds one value, which .item() converts to a plain Python number.
bert_scores = score(preds, gts, lang="en", verbose=True)
P, R, F1 = bert_scores
print(f'Bert Score: F1 =  {F1.item()}')

# Optional visualisation of the same pair (left disabled):
# plot_example(preds[0], gts[0], lang="en")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.97 seconds, 1.03 sentences/sec
Bert Score: F1 =  0.9326457381248474


# Hugging Face evaluate library
src: https://huggingface.co/docs/evaluate/choosing_a_metric

In [None]:
!pip install -q -U evaluate rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


BLEU, ROUGE, METEOR

In [None]:
import evaluate

# Load the three metrics once; they are reused for every example below.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load('meteor')

# Each entry: (header line to print, predictions, references).
# Example 1 differs from its reference by a single word; example 2 is a
# differently worded sentence.
examples = (
    ('Examples1:',
     ["She won an olympic gold medal"],
     ["She won an olympic silver medal"]),
    ('\nExamples2:',
     ["She won an olympic gold medal"],
     ["The women is an olympic champion"]),
)

for header, preds, gts in examples:
    # Compute all three metrics first, then report them together.
    results_bleu = bleu.compute(predictions=preds, references=gts)
    results_rouge = rouge.compute(predictions=preds, references=gts)
    results_meteor = meteor.compute(predictions=preds, references=gts)

    print(header)
    print('results_bleu:', results_bleu['bleu'])
    print('results_rouge1:', results_rouge['rouge1'])
    print('results_rougeL:', results_rouge['rougeL'])
    # The METEOR result is printed as the whole result dict, not a single key.
    print('results_meteor:', results_meteor)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Examples1:
results_bleu: 0.537284965911771
results_rouge1: 0.8333333333333334
results_rougeL: 0.8333333333333334
results_meteor: {'meteor': 0.8066666666666668}

Examples2:
results_bleu: 0.0
results_rouge1: 0.3333333333333333
results_rougeL: 0.3333333333333333
results_meteor: {'meteor': 0.3125}


# Metric: perplexity using the Hugging Face evaluate library

Perplexity of a generated sentence

<img src="https://global.discourse-cdn.com/hellohellohello/original/2X/0/060e08c9f09a8b84a6aec6ce99878265d0300b38.gif">

In [None]:
import evaluate

# Sentences whose perplexity is measured under the chosen language model.
input_texts = ["She won an olympic gold medal", "The women is an olympic champion"]

# Load the perplexity metric from the evaluate library.
perplexity = evaluate.load("perplexity", module_type="metric")

# Score every sentence with GPT-2, without adding a start token.
results = perplexity.compute(
    predictions=input_texts,
    model_id='gpt2',
    add_start_token=False,
)
print(results)

  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [27.879850387573242, 99.88091278076172], 'mean_perplexity': 63.88038158416748}
