<a href="https://colab.research.google.com/github/Dimildizio/system_design/blob/main/create_translations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers sentencepiece googletrans==4.0.0-rc1
# Pin googletrans to a specific version (4.0.0-rc1).
# The Marian model requires sentencepiece.

In [3]:
import pandas as pd
import sentencepiece

from google.colab import files
from googletrans import Translator
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, MarianTokenizer, MarianMTModel
from typing import List

In [5]:
# Hugging Face access token, passed as `token=` to the NLLB
# from_pretrained() calls later in this notebook. Left empty here.
access_token =''

In [6]:
# Checkpoint identifiers and the input file shared by the cells below.
MARIAN_NAME = "Helsinki-NLP/opus-mt-ru-en"  # Marian checkpoint id
NLLB_NAME = "facebook/nllb-200-distilled-600M"  # NLLB checkpoint id
FILENAME = "eval_data.xlsx"  # Excel file loaded with pd.read_excel

## Upload text

In [None]:
# Read the evaluation spreadsheet into a DataFrame and preview its first rows.
df = pd.read_excel(FILENAME)
df.head()

## Google translate

In [10]:
class G_Translator:
  """Thin wrapper around a googletrans ``Translator`` for ru -> en requests."""

  def __init__(self):
    # One client instance is created up front and reused for every call.
    self.translator = Translator()

  def translate(self, sent: str) -> str:
    """Translate ``sent`` from Russian to English and return the text.

    Args:
      sent: sentence to translate.

    Returns:
      The ``text`` attribute of the object returned by the client.
    """
    response = self.translator.translate(sent, src='ru', dest='en')
    return response.text

### NLLB model

In [25]:
class NLLB_Translator:
  """Translate single sentences with an NLLB sequence-to-sequence model.

  Args:
    tokenizer: callable tokenizer that also provides
      ``convert_tokens_to_ids`` and ``batch_decode``.
    model: object exposing ``generate(**inputs, ...)``.
    target_lang: language code of the output language; used to look up the
      token forced as the first generated token.
    sent_len: value passed to ``generate`` as ``max_length``.
  """

  def __init__(self, tokenizer, model, target_lang: str = 'eng_Latn', sent_len: int = 300):
    self.tokenizer = tokenizer
    self.model = model
    self.to_lang = target_lang
    self.max_length = sent_len

  def tokenize(self, sent: str):
    """Encode ``sent`` with the tokenizer, requesting PyTorch tensors."""
    return self.tokenizer(sent, return_tensors='pt')

  def generate_translation(self, inputs):
    """Run ``model.generate`` on the encoded inputs, forcing the target language.

    Args:
      inputs: mapping produced by ``tokenize``; unpacked as keyword arguments.

    Returns:
      Whatever ``model.generate`` returns (the generated token ids).
    """
    # Resolve the language code through convert_tokens_to_ids rather than
    # the tokenizer's ``lang_code_to_id`` mapping, which is deprecated and
    # missing from recent transformers releases.
    target_id = self.tokenizer.convert_tokens_to_ids(self.to_lang)
    return self.model.generate(
      **inputs, forced_bos_token_id=target_id,
      max_length=self.max_length)

  def get_decoded(self, toks) -> str:
    """Decode generated ids and return the first (only) decoded sentence."""
    return self.tokenizer.batch_decode(toks, skip_special_tokens=True)[0]

  def translate(self, sent: str) -> str:
    """Translate one sentence: tokenize, generate, then decode."""
    tokens = self.tokenize(sent)
    translated = self.generate_translation(tokens)
    return self.get_decoded(translated)

## Marian model

In [20]:
class Marian_Translator:
  """Translate single sentences with a Marian model.

  Args:
    tokenizer: object providing ``encode`` and ``decode``.
    model: object exposing ``generate(ids, ...)``.
    maxlen: value passed to ``generate`` as ``max_length``.
    tokenlist: literal substrings stripped from the decoded text
      (special tokens to drop other than UNK).
  """

  def __init__(self, tokenizer, model,  maxlen = 300, tokenlist = ('</s>', '<pad> ')):
    self.tokenizer = tokenizer
    self.model = model
    self.list_of_tokens = tokenlist  #tokens to skip other than UNK
    self.maxlen = maxlen


  def remove_specials(self, sent: str) -> str:
    """Return ``sent`` with every substring in ``list_of_tokens`` removed."""
    for token in self.list_of_tokens:
      sent = sent.replace(token, '')
    return sent


  def get_tokens(self, sent: str):
    """Encode ``sent`` into token ids, requesting PyTorch tensors."""
    return self.tokenizer.encode(sent, return_tensors='pt')


  def generate_translation(self, ids):
    """Generate with 4-beam search and return the first output sequence."""
    return self.model.generate(ids, max_length=self.maxlen, num_beams=4,
                               early_stopping=True)[0]


  def decode(self, tokens, skip=False):
    """Decode token ids to text.

    Args:
      tokens: ids of one generated sequence.
      skip: forwarded as ``skip_special_tokens``. The default keeps special
        tokens in the text so ``remove_specials`` can strip only the ones
        listed in ``list_of_tokens``.
    """
    # The keyword was previously misspelled (``skip_special_token``), so the
    # ``skip`` argument never reached the tokenizer's real option.
    return self.tokenizer.decode(tokens, skip_special_tokens=skip)


  def translate(self, sent: str) -> str:
    """Translate one sentence: encode, generate, decode, strip listed tokens."""
    ru_ids = self.get_tokens(sent)
    translated_ids = self.generate_translation(ru_ids)
    decoded = self.decode(translated_ids)
    return self.remove_specials(decoded)

### Save translations

In [13]:
def save_n_download(df, name='translations.tsv', sep='\t'):
  """Write ``df`` to a delimited text file, then trigger a Colab download.

  Args:
    df: DataFrame to export; the index is not written.
    name: output file name, also the file handed to ``files.download``.
    sep: field delimiter (tab by default).
  """
  # Export first so the file exists when the download is requested.
  df.to_csv(name, sep=sep, index=False)
  files.download(name)

## Create a class to run the translators

In [21]:
class Datasetter:
  """Run several translators over the 'ru' column of a dataset.

  The Excel file is loaded at construction time. ``translate`` adds one
  column per translator ('en_google', 'en_nllb', 'en_marian').

  Args:
    g: translator object with ``translate(str)`` used for 'en_google'.
    nllb_m: translator object with ``translate(str)`` used for 'en_nllb'.
    marian: translator object with ``translate(str)`` used for 'en_marian'.
    original_df: path of the Excel file to load.
    name_to_save: file name used by ``save_df`` and ``download_df``.
    sep: field delimiter used by ``save_df`` (tab by default).
  """

  def __init__(self, g, nllb_m, marian, original_df=FILENAME,
               name_to_save='translations.tsv', sep='\t'):

    self.filename = original_df
    self.name_to_save = name_to_save
    self.sep = sep
    self.df = self.load_data()

    self.g_translator = g
    self.nllb_translator = nllb_m
    self.marian_translator = marian

    # Output column name -> bound method producing that column's values.
    self.translations = {'en_google':self.g_trans,
                         'en_nllb':self.nllb_trans,
                         'en_marian': self.marian_trans
                         }   #could create two separate lists and a for cycle


  def lower_dec(func):
    """Decorator: lower-case the input line before passing it to ``func``."""
    def wrapper(self, line):
      return func(self, line.lower())
    return wrapper

  @lower_dec
  def g_trans(self, line: str) -> str:
    """Translate ``line`` (lower-cased by the decorator) with the Google translator."""
    return self.g_translator.translate(line)

  @lower_dec
  def nllb_trans(self, line: str) -> str:
    """Translate ``line`` (lower-cased by the decorator) with the NLLB translator."""
    return self.nllb_translator.translate(line)

  @lower_dec
  def marian_trans(self, line: str) -> str:
    """Translate ``line`` (lower-cased by the decorator) with the Marian translator."""
    return self.marian_translator.translate(line)


  def translate(self) -> pd.DataFrame:
    """Add one translated column per translator and return the DataFrame."""
    for col_name, func in self.translations.items():
      self.df[col_name] = self.df['ru'].apply(func)
    return self.df


  def change_name_to_save(self, name: str) -> None:
    """Set the file name used by ``save_df`` and ``download_df``."""
    self.name_to_save = name


  def change_separator(self, sep: str) -> None:
    """Set the field delimiter used by ``save_df``."""
    self.sep = sep


  def load_data(self) -> pd.DataFrame:
    """Read ``self.filename`` as an Excel file and return the DataFrame."""
    df = pd.read_excel(self.filename)
    return df


  def load_csv(self, filename, separator) -> None:
    """Replace ``self.df`` with the contents of a delimited text file."""
    self.df = pd.read_csv(filename, sep=separator)


  def df_head(self, n=5) -> None:
    """Show the first ``n`` rows of the DataFrame."""
    # ``display`` is not imported in this file; presumably the built-in
    # provided by the IPython/Colab runtime.
    display(self.df.head(n))


  def save_df(self) -> None:
    """Write the DataFrame to ``self.name_to_save`` without the index."""
    # Use the instance's separator; the bare name ``sep`` is not defined in
    # this scope and raised NameError.
    self.df.to_csv(self.name_to_save, index=False, sep=self.sep)


  def download_df(self) -> None:
    """Trigger a Colab download of the file named ``self.name_to_save``."""
    files.download(self.name_to_save)

## Check result

#### Load NLLB

In [15]:
%%capture
# Load the NLLB tokenizer with "rus_Cyrl" set as the source language.
rus_tokenizer = AutoTokenizer.from_pretrained(
    NLLB_NAME, src_lang="rus_Cyrl", token=access_token)

# Load the seq2seq model from the same NLLB checkpoint.
nllb_pretrained = AutoModelForSeq2SeqLM.from_pretrained(NLLB_NAME,
                                                   token=access_token)

#### Load Marian

In [16]:
%%capture
# Load the Marian tokenizer and model from the MARIAN_NAME checkpoint.
marian_tokenizer = MarianTokenizer.from_pretrained(MARIAN_NAME)
marian_pretrained = MarianMTModel.from_pretrained(MARIAN_NAME)

#### Create models

In [26]:
# Wrap each backend in its translator class; all three expose translate(sent).
google_model = G_Translator()
nllb_model = NLLB_Translator(rus_tokenizer, nllb_pretrained)
marian_model = Marian_Translator(marian_tokenizer, marian_pretrained)

#### Create general use instance

In [29]:
# Build the runner; its constructor loads FILENAME (the default original_df)
# with pd.read_excel.
ensemble = Datasetter(google_model, nllb_model, marian_model)

In [None]:
# Add the en_google, en_nllb and en_marian columns and return the DataFrame.
ensemble.translate()