<a href="https://colab.research.google.com/github/Dimildizio/system_design/blob/main/create_translations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers sentencepiece googletrans==4.0.0-rc1
# googletrans is deliberately pinned to exactly 4.0.0-rc1; keep this pin.
# sentencepiece is installed because the Marian model requires it.

In [2]:
import os
from typing import List

import pandas as pd
import sentencepiece
from google.colab import files
from googletrans import Translator
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, MarianTokenizer, MarianMTModel

In [3]:
# Hugging Face access token handed to `from_pretrained(..., token=...)`.
# It is read from the HF_TOKEN environment variable so the secret never has to
# be pasted into (and committed with) the notebook; when the variable is not
# set the value falls back to the original empty string.
access_token = os.environ.get('HF_TOKEN', '')

In [55]:
# Hub identifiers of the pretrained checkpoints loaded further down.
# NLLB checkpoint (used with AutoTokenizer / AutoModelForSeq2SeqLM).
NLLB_NAME = "facebook/nllb-200-distilled-600M"
# Marian checkpoint (used with MarianTokenizer / MarianMTModel).
MARIAN_NAME = "Helsinki-NLP/opus-mt-ru-en"

## Upload text

In [4]:
# Read the evaluation spreadsheet from the working directory and preview
# its first five rows.
df = pd.read_excel(io='eval_data.xlsx')
df.head(n=5)

Unnamed: 0,ru,en_expert,en_auto
0,На основе данной модели реализован эксперимент...,An experimental method is implemented based on...,Based on this model is implemented experimenta...
1,Прецизионное уточнение атомной структуры минер...,Accurate crystal structure refinement of natro...,Precise refinement of atomic structure mineral...
2,Здесь и далее: точки – экспериментальные значе...,"Hereinafter, circles are experimental data, so...","Hereinafter,: point - experimental values, sol..."
3,Об образовании метастабильных фаз при кристалл...,On the formation of metastable phases during c...,About formation metastable phases at crystalli...
4,Магнитоэлектрический эффект в трехслойных асим...,Magnetoelectric Effect in Three-Layer Asymmetr...,Magnetoelectric effect in a three-layer asymme...


## Google translate

In [53]:
class G_Translator:
  """Russian-to-English sentence translator backed by a googletrans client."""

  def __init__(self):
    # A single client is created up front and reused for every sentence.
    self.translator = Translator()

  def translate(self, sent: str) -> str:
    """Translate one sentence from 'ru' to 'en' and return the `.text` of the reply."""
    reply = self.translator.translate(sent, src='ru', dest='en')
    return reply.text

## NLLB model

In [None]:
%%capture
rus_tokenizer = AutoTokenizer.from_pretrained(
    NLLB_NAME, src_lang="rus_Cyrl", token=access_token)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_NAME,
                                                   token=access_token)

In [43]:
class NLLB_Translator:
  """Translate single sentences with an NLLB sequence-to-sequence model.

  Args:
    model: object whose ``generate`` accepts the tokenizer output as keyword
      arguments plus ``forced_bos_token_id`` and ``max_length``.
    tokenizer: callable tokenizer that also provides ``convert_tokens_to_ids``
      and ``batch_decode``.
    target_lang: language code of the output language (default 'eng_Latn').
    sent_len: value passed to ``generate`` as ``max_length``.
  """

  def __init__(self, model, tokenizer, target_lang: str = 'eng_Latn', sent_len: int = 300):
    self.model = model
    self.tokenizer = tokenizer
    self.to_lang = target_lang
    self.max_length = sent_len

  def tokenize(self, sent: str):
    """Encode one sentence, asking the tokenizer for PyTorch ('pt') tensors."""
    return self.tokenizer(sent, return_tensors='pt')

  def _target_lang_id(self):
    """Return the vocabulary id of the target-language code token."""
    # `tokenizer.lang_code_to_id` is deprecated and missing from recent
    # transformers releases; looking the language code up as a regular token
    # works with both old and new versions.
    return self.tokenizer.convert_tokens_to_ids(self.to_lang)

  def generate_translation(self, inputs):
    """Run ``model.generate`` forcing the target-language token as the first output token."""
    return self.model.generate(
      **inputs, forced_bos_token_id=self._target_lang_id(),
      max_length=self.max_length)

  def get_decoded(self, toks) -> List:
    """Decode generated ids to text with special tokens stripped; returns a list."""
    return self.tokenizer.batch_decode(toks, skip_special_tokens=True)

  def translate(self, sent: str):
    """Translate one sentence.

    Returns:
      The ``batch_decode`` output: a list of decoded strings, not a bare string.
    """
    tokens = self.tokenize(sent)
    translated = self.generate_translation(tokens)
    result = self.get_decoded(translated)
    return result

## Marian model

In [46]:
%%capture
marian_tokenizer = MarianTokenizer.from_pretrained(MARIAN_NAME)
marian_pretrained = MarianMTModel.from_pretrained(MARIAN_NAME)

In [41]:
class Marian_Translator:
  """Translate single sentences with a Marian model and its tokenizer.

  Args:
    tokenizer: object providing ``encode`` and ``decode``.
    model: object providing ``generate``.
    maxlen: value passed to ``generate`` as ``max_length``.
    tokenlist: literal substrings removed from the decoded text. Special
      tokens are stripped this way, rather than through the tokenizer, so
      that unknown-token markers stay visible in the output.
  """

  def __init__(self, tokenizer, model,  maxlen = 300, tokenlist = ('</s>', '<pad> ')):
    self.tokenizer = tokenizer
    self.model = model
    self.list_of_tokens = tokenlist  #tokens to skip other than UNK
    self.maxlen = maxlen


  def remove_specials(self, sent: str) -> str:
    """Return ``sent`` with every substring listed in ``list_of_tokens`` removed."""
    for token in self.list_of_tokens:
      sent = sent.replace(token, '')
    return sent


  def get_tokens(self, sent: str):
    """Encode one sentence, asking the tokenizer for PyTorch ('pt') tensors."""
    return self.tokenizer.encode(sent, return_tensors='pt')


  def generate_translation(self, ids):
    """Generate with 4-beam search and return the first sequence of the batch."""
    return self.model.generate(ids, max_length=self.maxlen, num_beams=4,
                               early_stopping=True)[0]


  def decode(self, tokens, skip=False):
    """Decode generated ids to text; ``skip`` asks the tokenizer to drop special tokens."""
    # The keyword used to be misspelt `skip_special_token`, so the flag never
    # reached the tokenizer. The default (False) keeps special tokens in the text.
    return self.tokenizer.decode(tokens, skip_special_tokens=skip)


  def translate(self, sent: str) -> str:
    """Translate one sentence and return the cleaned-up text."""
    ru_ids = self.get_tokens(sent)
    translated_ids = self.generate_translation(ru_ids)
    decoded = self.decode(translated_ids)
    # The cleaned text was previously computed but never returned, so every
    # call yielded None.
    return self.remove_specials(decoded)

## Create models

In [51]:
google_model = G_Translator()
# `nllb_model` is rebound from the raw pretrained model to its translator
# wrapper. Guard the rebinding so that re-running this cell does not wrap an
# already-wrapped translator a second time.
if not isinstance(nllb_model, NLLB_Translator):
  nllb_model = NLLB_Translator(nllb_model, rus_tokenizer)
marian_model = Marian_Translator(marian_tokenizer, marian_pretrained)

### Save translations

In [56]:
def save_n_download(df, name = 'translations.tsv', sep = '\t'):
  """Write ``df`` to a delimited text file, then trigger a Colab download of it.

  Args:
    df: frame to export; its index is not written.
    name: path of the output file.
    sep: field delimiter (a tab by default).
  """
  df.to_csv(path_or_buf=name, sep=sep, index=False)
  files.download(name)

## Create a class to run the translators

In [57]:
class Datasetter:
  """Run several translators over the 'ru' column of a dataset.

  Args:
    dataset: table with a 'ru' column; it is copied, so the caller's object
      is left untouched.
    g: translator used for the 'en_google' column.
    nllb_m: translator used for the 'en_nllb' column.
    marian: translator used for the 'en_marian' column.

  Every translator must expose ``translate(sentence)``.
  """

  def __init__(self, dataset, g, nllb_m, marian):

    self.df = dataset.copy()

    self.g_translator = g
    self.nllb_translator = nllb_m
    self.marian_translator = marian

    # Output column name -> method that produces that column's values.
    self.translations = {'en_google':self.g_trans,
                         'en_nllb':self.nllb_trans,
                         'en_marian': self.marian_trans
                         }


  def g_trans(self, line:str) -> str:
    """Translate one lower-cased line with the Google translator."""
    return self.g_translator.translate(line.lower())


  def nllb_trans(self, line: str) -> str:
    """Translate one lower-cased line with the NLLB translator."""
    # The translator is stored as `self.nllb_translator`; the previous
    # `self.nllb_model` was never assigned and raised AttributeError.
    translated = self.nllb_translator.translate(line.lower())
    # NLLB_Translator.translate returns the batch_decode list; join it so the
    # column holds plain strings like the other translators' columns.
    if isinstance(translated, (list, tuple)):
      return ' '.join(translated)
    return translated


  def marian_trans(self, line: str) -> str:
    """Translate one lower-cased line with the Marian translator."""
    return self.marian_translator.translate(line.lower())


  def translate(self):
    """Add one translated column per translator and return the working copy."""
    for col_name, func in self.translations.items():
      self.df[col_name] = self.df['ru'].apply(func)
    return self.df

In [58]:
# Bundle the evaluation frame with the three translators built above.
ensemble = Datasetter(
    dataset=df,
    g=google_model,
    nllb_m=nllb_model,
    marian=marian_model,
)