<a href="https://colab.research.google.com/github/Dimildizio/system_design/blob/main/NLLB_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install the Hugging Face `transformers` library

In [9]:
%%capture
!pip install transformers

## Imports

In [84]:
import os

from nltk.translate.bleu_score import corpus_bleu
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

## Specify huggingface access token to download model

In [5]:
# Read the Hugging Face token from the HF_TOKEN environment variable so the
# secret is never saved inside the notebook. Falls back to '' when unset,
# which is the value this cell used before.
access_token = os.environ.get('HF_TOKEN', '')

## Download tokenizers for the Russian and English corpora

In [8]:
# Both tokenizers are loaded from the same NLLB checkpoint; the Russian one
# additionally declares its source language.
eng_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    token=access_token,
)
rus_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="rus_Cyrl",
    token=access_token,
)

# Trying the out-of-the-box model

## Create model

In [10]:
# Seq2seq translation model from the same checkpoint as the tokenizers.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    token=access_token,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]



Downloading (…)neration_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

## Create example data

In [45]:
# Source sentence (Russian) that will be translated.
doc = "Шустрая бурая лисица прыгает через ленивого пса!"
# Reference English translation used as the BLEU target.
reference = "The quick brown fox jumps over the lazy dog!"
# Alternative English translation scored for comparison
# (presumably Google Translate output, judging by the name - confirm).
g_trans = "The nimble brown fox jumps over the lazy dog!"

## Tokenize

In [19]:
# Encode the source sentence; 'pt' asks the tokenizer for PyTorch tensors.
rus_tok = rus_tokenizer(
    doc,
    return_tensors="pt",
)

## Translate

In [17]:
# Force the decoder to start with the English language token. The id is looked
# up with convert_tokens_to_ids because `lang_code_to_id` is not available on
# NLLB tokenizers in recent transformers releases.
translated_tokens = model.generate(
    **rus_tok,
    forced_bos_token_id=rus_tokenizer.convert_tokens_to_ids("eng_Latn"),
    max_length=30,
)

In [56]:
# batch_decode returns a list with one entry per generated sequence;
# a single sentence was translated, so only the first entry is kept.
translated = rus_tokenizer.batch_decode(
    translated_tokens,
    skip_special_tokens=True,
)[0]

## Metrics

In [65]:
def get_bleu(sentence: str, reference: str=reference) -> float:
    '''
    Score one candidate translation against one reference with corpus BLEU.

    Both strings are tokenized by whitespace splitting. The score is returned
    as a percentage rounded to two decimals.

    Note: the default for `reference` is the value the notebook-level
    `reference` variable held when this cell was executed; re-run the cell
    after changing that variable, or pass `reference` explicitly.
    '''
    # corpus_bleu expects a list of reference sets (one set per hypothesis)
    # and a parallel list of tokenized hypotheses.
    reference_sets = [[reference.split()]]
    hypotheses = [sentence.split()]
    bleu = corpus_bleu(reference_sets, hypotheses)
    return round(bleu * 100, 2)

### Test metrics

In [88]:
# Show the reference once, then score each candidate against it.
# The reference itself is included as a sanity check for the metric.
print(f'Reference: {reference}\n\n')
for translation in (g_trans, translated, reference):
    # get_bleu is called with its default reference argument.
    score = get_bleu(translation)
    print(f"Translation: {translation}\nScore: {score}%\n")

Reference: The quick brown fox jumps over the lazy dog!


Translation: The nimble brown fox jumps over the lazy dog!
Score: 75.06%

Translation: A shrewd brown fox jumps over a lazy dog!
Score: 35.49%

Translation: The quick brown fox jumps over the lazy dog!
Score: 100.0%



## Flow

In [85]:
class MachineTranslation:
  '''
  Chains tokenization, generation and decoding for a seq2seq translation model.

  model: object whose `generate` accepts the tokenizer output as keyword
    arguments plus `forced_bos_token_id` and `max_length`
  tokenizer: callable tokenizer that also provides `convert_tokens_to_ids`
    and `batch_decode`
  target_lang: language-code token forced as the first generated token
  sent_len: value passed to `generate` as `max_length`
  '''

  def __init__(self, model, tokenizer, target_lang='eng_Latn', sent_len=300):
    self.model = model
    self.tokenizer = tokenizer
    self.to_lang = target_lang
    self.sent_len = sent_len


  def _target_lang_id(self) -> int:
    '''
    Resolve the target language code to its token id.
    Raises KeyError when the tokenizer does not know the code.
    '''
    # `lang_code_to_id` is not available on NLLB tokenizers in recent
    # transformers releases; the Hugging Face NLLB docs look the language
    # code up with convert_tokens_to_ids instead.
    lang_id = self.tokenizer.convert_tokens_to_ids(self.to_lang)
    unk_id = getattr(self.tokenizer, 'unk_token_id', None)
    if lang_id is None or lang_id == unk_id:
      # Keep the KeyError that the former dict lookup raised for unknown codes
      # instead of silently forcing the unknown token.
      raise KeyError(self.to_lang)
    return lang_id


  def tokenize(self, sent: str):
    '''Tokenize input sentence, requesting PyTorch tensors'''
    return self.tokenizer(sent, return_tensors='pt')


  def translate(self, inputs):
    '''
    Generate translation token ids for already tokenized `inputs`.
    The output is forced to start with the target language token and is
    limited to `sent_len` tokens via `max_length`.
    '''
    return self.model.generate(
      **inputs, forced_bos_token_id=self._target_lang_id(),
      max_length=self.sent_len)


  def get_decoded(self, toks) -> list:
    '''
    Convert generated token ids into sentences, dropping special tokens
    '''
    return self.tokenizer.batch_decode(toks, skip_special_tokens=True)


  def print_metrics(self, translation: str, reference: str) -> None:
    '''
    Print the reference, the translation and its BLEU score.
    Relies on the notebook-level `get_bleu` function.
    '''
    print(f'Reference: {reference}\n\n')
    score = get_bleu(translation, reference)
    print(f"Translation: {translation}\nScore: {score}%\n")


  def process_sentence(self, sent: str) -> list:
    '''
    Main process for translation: tokenize, generate, decode.
    Returns the list produced by `get_decoded`.
    '''
    tokens = self.tokenize(sent)
    translated_tokens = self.translate(tokens)
    result = self.get_decoded(translated_tokens)
    return result


  def infer(self, sent: str, reference: str) -> None:
    ''' TO BE CHANGED
    Translate `sent` and compare the first decoded sentence to the reference
    '''
    translation = self.process_sentence(sent)
    # Only the first decoded entry is scored.
    self.print_metrics(translation[0], reference)

In [86]:
# Russian-source tokenizer with the class defaults for target language and length.
MT = MachineTranslation(model=model, tokenizer=rus_tokenizer)

In [87]:
# Translate the example sentence and print its BLEU score against the reference.
MT.infer(sent=doc, reference=reference)

Reference: The quick brown fox jumps over the lazy dog!


Translation: A shrewd brown fox jumps over a lazy dog!
Score: 35.49%

