<a href="https://colab.research.google.com/github/DmitriyValetov/nlp_course_project/blob/master/metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np

# <pad>  - 0
# <unk>  - 1 
# <sos>  - 2 
# <eos>  - 3
# word A - 4
# word B - 5
eos = 3
# batch targets (batch size, batch length)
bts = np.array([
[2, 1, 4, 1, 0],  # target 1
[2, 1, 4, 5, 3],  # target 2
[2, 4, 3, 0, 0]   # target 3
])
# batch predictions (batch size, batch length)
bps = np.array([
[2, 1, 5, 1, 0],  # prediction 1
[2, 1, 5, 4, 3],  # prediction 2
[2, 4, 3, 0, 0]   # prediction 3
])

In [48]:
!pip install rouge_score
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], 
                                  use_stemmer=False)
target = 'The quick brown dog jumps on the log.'
prediction = 'The quick brown fox jumps over the lazy dog'
scores = scorer.score(target, prediction)
print(scores)

{'rouge1': Score(precision=0.6666666666666666, recall=0.75, fmeasure=0.7058823529411765), 'rouge2': Score(precision=0.25, recall=0.2857142857142857, fmeasure=0.26666666666666666), 'rougeL': Score(precision=0.5555555555555556, recall=0.625, fmeasure=0.5882352941176471)}


In [49]:
!pip install rouge
from rouge import Rouge 

hypothesis = 'The quick brown dog jumps on the log.'
reference = 'The quick brown fox jumps over the lazy dog'
rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)
print(scores)

[{'rouge-1': {'f': 0.7058823479584776, 'p': 0.75, 'r': 0.6666666666666666}, 'rouge-2': {'f': 0.266666661688889, 'p': 0.2857142857142857, 'r': 0.25}, 'rouge-l': {'f': 0.5882352891349482, 'p': 0.625, 'r': 0.5555555555555556}}]


In [58]:
from pprint import pprint
# set (unique) metrics (without order)
scores = {}
for sp, st in zip(bts, bps):
  # target to set
  st_eos_i = np.where(st==eos)[0][0] if eos in st else len(st)
  st_eos = st[:st_eos_i]
  t_set = set(st_eos) #- {0, 1, 2, 3}
  # prediction to set
  sp_eos_i = np.where(sp==eos)[0][0] if eos in sp else len(sp)
  sp_eos = sp[:sp_eos_i]
  p_set = set(sp_eos) #- {0, 1, 2, 3}
  # metrics
  i_set = t_set.intersection(p_set)
  pre = len(i_set)/len(p_set) if len(p_set) != 0 else 0
  rec = len(i_set)/len(t_set) if len(t_set) != 0 else 0
  f1 = 2*pre*rec/(pre + rec) if pre + rec != 0 else 0
  rl = len(sp_eos)/len(st_eos) if len(st_eos) != 0 else 0
  print(f'st: {st}, sp: {sp}')
  print(f'eos in st?: {eos in st}, sp_eos_i: {st_eos_i}')
  print(f'eos in sp?: {eos in sp}, sp_eos_i: {sp_eos_i}')
  print(f'st_eos: {st_eos}, sp_eos: {sp_eos}')
  print(f't_set: {t_set}, p_set: {p_set}')
  t_str = ' '.join(map(str, st_eos))
  p_str = ' '.join(map(str, sp_eos))
  print(f't_str: "{t_str}", p_str: "{p_str}"')
  rs_1 = scorer.score(t_str, p_str)
  rs_2 = rouge.get_scores(p_str, t_str)
  pprint(rs_1)
  pprint(rs_2)
  print(f'precision: {pre}, recall: {rec}, f1: {f1}. rl: {rl}\n')
  scores.setdefault('set_precision', []).append(pre)
  scores.setdefault('set_recall', []).append(rec)
  scores.setdefault('set_f1', []).append(f1)
  scores.setdefault('set_relen', []).append(rl)
  for k, v in rs_1.items():
    scores.setdefault(f'{k}_precision', []).append(v.precision)
    scores.setdefault(f'{k}_recall', []).append(v.recall)
    scores.setdefault(f'{k}_f1', []).append(v.fmeasure)
  # pprint(scores)

st: [2 1 5 1 0], sp: [2 1 4 1 0]
eos in st?: False, sp_eos_i: 5
eos in sp?: False, sp_eos_i: 5
st_eos: [2 1 5 1 0], sp_eos: [2 1 4 1 0]
t_set: {0, 1, 2, 5}, p_set: {0, 1, 2, 4}
t_str: "2 1 5 1 0", p_str: "2 1 4 1 0"
{'rouge1': Score(precision=0.8, recall=0.8, fmeasure=0.8000000000000002),
 'rouge2': Score(precision=0.5, recall=0.5, fmeasure=0.5),
 'rougeL': Score(precision=0.8, recall=0.8, fmeasure=0.8000000000000002)}
[{'rouge-1': {'f': 0.7999999950000002, 'p': 0.8, 'r': 0.8},
  'rouge-2': {'f': 0.4999999950000001, 'p': 0.5, 'r': 0.5},
  'rouge-l': {'f': 0.749999995, 'p': 0.75, 'r': 0.75}}]
precision: 0.75, recall: 0.75, f1: 0.75. rl: 1.0

{'rouge1_f1': [0.8000000000000002],
 'rouge1_precision': [0.8],
 'rouge1_recall': [0.8],
 'rouge2_f1': [0.5],
 'rouge2_precision': [0.5],
 'rouge2_recall': [0.5],
 'rougeL_f1': [0.8000000000000002],
 'rougeL_precision': [0.8],
 'rougeL_recall': [0.8],
 'set_f1': [0.75],
 'set_precision': [0.75],
 'set_recall': [0.75],
 'set_relen': [1.0]}
st: [2 1 5

In [0]:
# accuracy (ordered)
pad_mask = bts != 0  # mask <pad> at target
m_bts = bts[pad_mask]  # mask targets
m_bps = bps[pad_mask]  # mask predictions like targets
comp = m_bts == m_bps  # compare
acc = np.sum(comp) / len(m_bts)
print(f'targets:\n{bts}')
print(f'predictions:\n{bps}')
print(f'pad_mask:\n{pad_mask}')
print(f'masked targets:\n{m_bts}')
print(f'masked predictions:\n{m_bps}')
print(f'comparison\n{comp}')
print(f'accuracy: {acc}')

targets:
[[2 1 4 3 0]
 [2 1 4 5 3]
 [2 4 3 0 0]]
predictions:
[[2 1 5 1 0]
 [2 1 5 4 3]
 [2 4 3 0 0]]
pad_mask:
[[ True  True  True  True False]
 [ True  True  True  True  True]
 [ True  True  True False False]]
masked targets:
[2 1 4 3 2 1 4 5 3 2 4 3]
masked predictions:
[2 1 5 1 2 1 5 4 3 2 4 3]
comparison
[ True  True False False  True  True False False  True  True  True  True]
accuracy: 0.6666666666666666
