<a href="https://colab.research.google.com/github/Dimildizio/system_design/blob/main/NLLB_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install the Hugging Face `transformers` library and `rouge-score`

In [1]:
%%capture
!pip install transformers
!pip install rouge-score

## Imports

In [28]:
import nltk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

In [29]:
# Fetch the WordNet corpus; the METEOR metric used below relies on it for synonym matching.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Specify huggingface access token to download model

In [6]:
# Hugging Face access token, passed as `token=` to every `from_pretrained` call below.
# NOTE(review): keep a real token out of any committed copy of this notebook.
access_token ='' #Put your huggingface token here

## Download tokenizers for the Russian and English corpora

In [7]:
# Both tokenizers are loaded from the same NLLB checkpoint. Only the Russian
# one sets `src_lang` explicitly; the English one keeps the tokenizer's default
# source language (presumably eng_Latn -- TODO confirm).
eng_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", token=access_token)
rus_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="rus_Cyrl", token=access_token)

Downloading (…)okenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

# Trying the out-of-the-box model

## Create model

In [8]:
# Load the seq2seq translation model from the same checkpoint as the tokenizers.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", token=access_token)

Downloading (…)lve/main/config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]



Downloading (…)neration_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

## Create example data

In [68]:
# Russian source sentence fed to the model.
doc = 'Шустрая бурая лисица прыгает через ленивого пса!'
# Gold English translation; also bound as the default `reference` argument of
# the metric helpers defined further down.
reference = 'The quick brown fox jumps over the lazy dog!'
# Alternative English translation used as a comparison point in the TER test
# (presumably a Google Translate output -- TODO confirm).
g_trans = 'The nimble brown fox jumps over the lazy dog!'

## Tokenize

In [10]:
# Encode the Russian sentence as PyTorch tensors ('pt'); the result is unpacked into model.generate.
rus_tok = rus_tokenizer(doc, return_tensors='pt')

## Translate

In [11]:
# Force the first generated token to be the English language code so the model
# decodes into English. The id is looked up with `convert_tokens_to_ids`
# because the `lang_code_to_id` mapping is no longer available on NLLB
# tokenizers in newer transformers releases.
translated_tokens = model.generate(
    **rus_tok,
    forced_bos_token_id=rus_tokenizer.convert_tokens_to_ids("eng_Latn"),
    max_length=30,
)

In [12]:
# batch_decode returns one string per generated sequence; a single sentence was
# translated, so keep only the first entry.
translated = rus_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

## Metrics

In [13]:
# Shared scorer for the three ROUGE variants reported by get_rouge
# (rouge1, rouge2, rougeL), with stemming enabled.
Rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

#### **BLEU** - BiLingual Evaluation Understudy
>cares more about word overlap.

>Precision is more important.

> Uses n-grams for evaluation.

>Normalizes scores for text length.

>Typical for machine translation.

>Rewards the model for producing words that match the reference.

> Applies a brevity penalty to translations shorter than the reference (overly long translations already lose precision).


#### **ROUGE** - Recall-Oriented Understudy for Gisting Evaluation

>Focused on capturing context (Gisting Evaluation).

>Recall is more important. (Recall-Oriented)

> ROUGE-N uses n-gram overlap; ROUGE-L uses the longest common subsequence.

> Doesn't normalize scores for text length.

>Typical for text summarization.

>Rewards the model if the generated text, in general, represents the context of the reference.

> Longer texts have advantage for recall.


#### **METEOR** - Metric for Evaluation of Translation with Explicit ORdering

> Takes word order into account.

> Uses stemming and other techniques for synonyms and paraphrasing

> Combines precision and recall into an F-score (harmonic mean), with recall weighted more heavily.

> Uses unigrams (1 word) along with synonyms with preloaded WordNet synonym dictionary.

> More robust in variations

> Doesn't penalize longer texts

> Typical for machine translation and text summarization.

> Again: more flexible due to use of synonyms (more complex than word overlap)


#### **TER** - Translation Edit Rate

> Represents the **number of edits** needed to get from the hypothesis to the reference sentence. Lower is better.

> Basically quantifies the dissimilarity of the reference and translation

> Possible changes include: deletions, substitution, insertion, shifting

> Used in machine translation and text summarization.

> Not included in commonly-used libs like nltk

> More complex

> No percentage score. Difficult to interpret the result. 0 edits is the best.




In [59]:
def round_perc(num: float) -> float:
  '''Turn a fraction into a percentage rounded to two decimal places.'''
  percentage = num*100
  return round(percentage, 2)

def get_sent_bleu(sentence: str, reference: str=reference) -> float:
  '''Sentence-level BLEU of `sentence` against `reference`, as a percentage.

  Both strings are split on whitespace. Three weights are supplied, so
  n-grams up to length 3 are scored: 0.25 for unigrams, 0.5 for bigrams and
  0.25 for trigrams.
  '''
  ngram_weights = (0.25, 0.5, 0.25)
  reference_tokens = reference.split()
  hypothesis_tokens = sentence.split()
  bleu = sentence_bleu([reference_tokens], hypothesis_tokens, weights=ngram_weights)
  return round_perc(bleu)

def get_bleu(sentence: str, reference: str=reference) -> float:
  '''Corpus-level BLEU, as a percentage, for a one-sentence corpus.

  The hypothesis and its single reference are split on whitespace and wrapped
  in the nested lists corpus_bleu expects. No weights are passed, so the
  library defaults apply.
  '''
  references_per_hypothesis = [[reference.split()]]
  hypotheses = [sentence.split()]
  bleu = corpus_bleu(references_per_hypothesis, hypotheses)
  return round_perc(bleu)

def get_meteor(sentence: str, reference: str=reference) -> float:
  '''METEOR score of `sentence` against a single `reference`, as a percentage.

  Both strings are split on whitespace before being handed to meteor_score.
  '''
  reference_tokens = reference.split()
  hypothesis_tokens = sentence.split()
  meteor = meteor_score([reference_tokens], hypothesis_tokens)
  return round_perc(meteor)

def get_rouge(sentence: str, reference: str=reference) -> dict:
  '''ROUGE precision/recall/F1 of `sentence` against `reference`.

  rouge-1 unigrams, individual words
  rouge-2 bigrams, word pairs
  rouge-L longest sequence

  Returns a dict keyed by ROUGE variant name. Each value is a dict with the
  keys 'precision', 'recall' and 'f1', whose values are percentage strings
  such as '66.67%' (not floats).
  '''
  scores = Rouge.score(reference, sentence)
  # Build the result directly; the earlier intermediate dict and its final
  # copy produced exactly this mapping.
  return {
      name: {
          'precision': f'{round_perc(score.precision)}%',
          'recall': f'{round_perc(score.recall)}%',
          'f1': f'{round_perc(score.fmeasure)}%',
      }
      for name, score in scores.items()
  }

In [70]:
def ter(hypothesis, reference=reference):
    '''Return the edit distance between `hypothesis` and `reference`.

    Counts the minimum number of insertions, deletions and substitutions
    needed to turn one sequence into the other. Elements are compared with
    `==`, so passing strings gives a per-character distance, while passing
    token lists (e.g. `text.split()`) gives a per-word distance. Shifts are
    not modelled and the count is not normalized by the reference length.

    hypothesis: candidate sequence (str or list of tokens)
    reference: sequence to compare against
    Returns the number of edits as an int; 0 means the sequences are equal.
    '''
    # Only the previous row of the DP table is needed to compute the next one,
    # so keep two rows instead of the full (n + 1) x (m + 1) matrix.
    # Row 0: turning the empty reference prefix into hypothesis[:j] takes j edits.
    previous = list(range(len(hypothesis) + 1))

    for i, ref_item in enumerate(reference, start=1):
        # Column 0: matching reference[:i] with an empty hypothesis takes i edits.
        current = [i]
        for j, hyp_item in enumerate(hypothesis, start=1):
            cost = 0 if ref_item == hyp_item else 1
            current.append(min(previous[j] + 1,          # skip a reference element
                               current[j - 1] + 1,       # skip a hypothesis element
                               previous[j - 1] + cost))  # match or substitute
        previous = current
    return previous[-1]

### Test metrics

In [76]:
# Sanity-check ter() on three candidates: the alternative translation, the
# model output, and the reference itself.
print(f'Reference: {reference}\n\n')
for translation in (g_trans, translated, reference):
  score = ter(translation, reference)
  print(f"Translation: {translation}\nScore: {score}\n")

Reference: The quick brown fox jumps over the lazy dog!


Translation: The nimble brown fox jumps over the lazy dog!
Score: 6

Translation: A shrewd brown fox jumps over a lazy dog!
Score: 12

Translation: The quick brown fox jumps over the lazy dog!
Score: 0



## Flow

In [81]:
class MachineTranslation:
  '''
  Translate sentences with a seq2seq model and print quality metrics.

  Wraps a model/tokenizer pair, forces generation into `target_lang` and
  scores a translation against a reference with the module-level metric
  helpers (TER, BLEU, METEOR, ROUGE).
  '''

  def __init__(self, model, tokenizer, target_lang='eng_Latn', sent_len=300):
    '''
    model: object whose `generate` method produces the translated token ids
    tokenizer: tokenizer used both to encode the input and decode the output
    target_lang: language-code token forced as the first generated token
    sent_len: value passed to `generate` as `max_length`
    '''
    self.model = model
    self.tokenizer = tokenizer
    self.to_lang = target_lang
    self.sent_len = sent_len
    # Display name -> callable(translation, reference) returning the score.
    self.metrics_dict = {'TER': ter,
                         'Bleu corpus': get_bleu,
                         'Bleu sentence': get_sent_bleu,
                         'Meteor': get_meteor,
                         'Rouge': get_rouge,
                         }


  def tokenize(self, sent: str):
    '''Tokenize input sentence into PyTorch tensors'''
    return self.tokenizer(sent, return_tensors='pt')


  def translate(self, inputs):
    '''
    Generate translation token ids for already tokenized `inputs`
    '''
    # Look the language code up as a regular token: the `lang_code_to_id`
    # mapping is no longer available on NLLB tokenizers in newer transformers
    # releases, while `convert_tokens_to_ids` works across versions.
    target_lang_id = self.tokenizer.convert_tokens_to_ids(self.to_lang)
    return self.model.generate(
      **inputs, forced_bos_token_id=target_lang_id,
      max_length=self.sent_len)


  def get_decoded(self, toks) -> list:
    '''
    Convert generated token ids into sentences, dropping special tokens
    '''
    return self.tokenizer.batch_decode(toks, skip_special_tokens=True)


  def generate_metrics(self, translation: str, reference: str) -> None:
    '''
    Print every metric in `metrics_dict` for the translated sentence
    compared to the reference translation
    '''
    print(f'Reference: {reference}\nTranslation:{translation}\n\n')
    for name, func in self.metrics_dict.items():
      score = func(translation, reference)
      self.print_metrics(translation, name, score)


  def print_metrics(self, translation, metrics_name, score) -> None:
    '''
    Print one metric line with the unit that fits the score.

    `translation` is unused and kept only so existing callers keep working.
    '''
    if metrics_name == 'TER':
      unit = ' edits'
    elif isinstance(score, dict):
      # Dict scores (ROUGE) already carry '%' on each value, so adding one
      # after the closing brace would be wrong.
      unit = ''
    else:
      unit = '%'
    print(f"{metrics_name} score: {score}{unit}")


  def process_sentence(self, sent: str) -> list:
    '''
    main process for translation: tokenize, generate, decode.
    Returns the list of decoded sentences.
    '''
    tokens = self.tokenize(sent)
    translated_tokens = self.translate(tokens)
    return self.get_decoded(translated_tokens)


  def infer(self, sent: str, reference: str) -> None:
    ''' TO BE CHANGED
    Translate `sent` and compare the first decoded sentence to the reference
    '''
    translation = self.process_sentence(sent)
    self.generate_metrics(translation[0], reference)

In [83]:
# Pair the model with the Russian-source tokenizer; the target language is left at its default.
MT = MachineTranslation(model, rus_tokenizer)

In [84]:
# Translate the example sentence and print every metric against the reference.
MT.infer(doc, reference)

Reference: The quick brown fox jumps over the lazy dog!
Translation:A shrewd brown fox jumps over a lazy dog!


TER score: 12 edits
Bleu corpus score: 35.49%
Bleu sentence score: 46.71%
Meteor score: 65.43%
Rouge score: {'rouge1': {'precision': '66.67%', 'recall': '66.67%', 'f1': '66.67%'}, 'rouge2': {'precision': '50.0%', 'recall': '50.0%', 'f1': '50.0%'}, 'rougeL': {'precision': '66.67%', 'recall': '66.67%', 'f1': '66.67%'}}%
