<a href="https://colab.research.google.com/github/Dimildizio/system_design/blob/main/create_translations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers sentencepiece googletrans==4.0.0-rc1
#set a PARTICULAR version of google trans omg
#Marian model requires sentence piece

In [2]:
import os
from typing import List

import pandas as pd
import sentencepiece
from googletrans import Translator
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, MarianTokenizer, MarianMTModel

In [3]:
# Hugging Face access token. It is read from the HF_TOKEN environment variable so the
# secret never has to be pasted into (and committed with) the notebook.
# Falls back to None, i.e. anonymous access, when the variable is unset or empty.
access_token = os.environ.get('HF_TOKEN') or None

## Load the evaluation data

In [4]:
# Read the evaluation spreadsheet into a DataFrame and preview its first rows.
df = pd.read_excel('eval_data.xlsx')
df.head()

Unnamed: 0,ru,en_expert,en_auto
0,На основе данной модели реализован эксперимент...,An experimental method is implemented based on...,Based on this model is implemented experimenta...
1,Прецизионное уточнение атомной структуры минер...,Accurate crystal structure refinement of natro...,Precise refinement of atomic structure mineral...
2,Здесь и далее: точки – экспериментальные значе...,"Hereinafter, circles are experimental data, so...","Hereinafter,: point - experimental values, sol..."
3,Об образовании метастабильных фаз при кристалл...,On the formation of metastable phases during c...,About formation metastable phases at crystalli...
4,Магнитоэлектрический эффект в трехслойных асим...,Magnetoelectric Effect in Three-Layer Asymmetr...,Magnetoelectric effect in a three-layer asymme...


## Google translate the text

In [5]:
# Shared googletrans client; g_trans calls it once per sentence.
translator = Translator()

In [6]:
def g_trans(sent: str) -> str:
  """Translate a Russian sentence to English with the shared googletrans client.

  The sentence is lower-cased before it is sent to the service; the text of
  the returned translation object is handed back to the caller.
  """
  lowered = sent.lower()
  response = translator.translate(lowered, src='ru', dest='en')
  return response.text

In [22]:
# Translate every Russian sentence with Google Translate, one call per row.
# NOTE: this adds the 'en_google' column to df itself.
df['en_google'] = df['ru'].apply(g_trans)

## NLLB model

In [7]:
%%capture
# Load the tokenizer of the facebook/nllb-200-distilled-600M checkpoint,
# configured with 'rus_Cyrl' as the source language.
rus_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="rus_Cyrl", token=access_token)

In [8]:
%%capture
# Load the seq2seq model from the same NLLB checkpoint as the tokenizer.
# NOTE(review): the variable is spelled `nnlb_model` (not `nllb_model`); code that uses
# the model has to refer to it by this exact name.
nnlb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", token=access_token)

In [9]:
class NLLB_Translate:
  """Small wrapper that translates text with an NLLB seq2seq model.

  Args:
    model: object whose ``generate`` accepts the tokenizer output as keyword
      arguments plus ``forced_bos_token_id`` and ``max_length``.
    tokenizer: tokenizer matching ``model``; it is called on the input text
      and must know ``target_lang`` as a vocabulary token.
    target_lang: language code of the translation output.
    sent_len: value passed to ``generate`` as ``max_length``.
  """

  def __init__(self, model, tokenizer, target_lang: str = 'eng_Latn', sent_len: int = 300):
    self.model = model
    self.tokenizer = tokenizer
    self.to_lang = target_lang
    self.max_length = sent_len

  def tokenize(self, sent: str):
    """Encode ``sent`` with the tokenizer, requesting PyTorch tensors."""
    return self.tokenizer(sent, return_tensors='pt')

  def translate(self, inputs):
    """Generate output token ids for already tokenized ``inputs``."""
    # Look the target-language id up with convert_tokens_to_ids: the language code is a
    # regular vocabulary token. The older `tokenizer.lang_code_to_id` mapping is deprecated
    # and missing from recent transformers releases, so relying on it breaks on a fresh install.
    target_lang_id = self.tokenizer.convert_tokens_to_ids(self.to_lang)
    return self.model.generate(
      **inputs, forced_bos_token_id=target_lang_id,
      max_length=self.max_length)

  def get_decoded(self, toks) -> List[str]:
    """Decode generated token ids to strings, dropping special tokens."""
    return self.tokenizer.batch_decode(toks, skip_special_tokens=True)

  def process_sentence(self, sent: str) -> List[str]:
    """Run tokenize -> translate -> decode and return the list of decoded strings."""
    tokens = self.tokenize(sent)
    translated = self.translate(tokens)
    result = self.get_decoded(translated)
    return result

In [None]:
# Wrap the loaded NLLB model and tokenizer. The model variable in this notebook is
# `nnlb_model`; a bare `model` is never defined, so using it fails on a fresh kernel.
nllb_trans_model = NLLB_Translate(nnlb_model, rus_tokenizer)

In [48]:
# Smoke test: translate the first Russian sentence of the frame.
# process_sentence returns a list of decoded strings, so show its first element.
sentence = nllb_trans_model.process_sentence(df['ru'][0])
sentence[0]

In [None]:
def nllb_trans(sent: str) -> str:
  """Translate one sentence with the shared NLLB wrapper.

  The wrapper returns a list of decoded strings; only the first one is returned.
  """
  decoded = nllb_trans_model.process_sentence(sent)
  return decoded[0]

In [90]:
# Work on a copy so the columns added next do not end up in df.
df1=df.copy()

In [92]:
# Translate every Russian sentence with NLLB, one call per row, into a new column.
df1['en_nllb'] = df1['ru'].apply(nllb_trans)

In [None]:
# Preview the first rows, now including the added translation columns.
df1.head()

Unnamed: 0,ru,en_expert,en_auto,en_google,en_nllb
0,На основе данной модели реализован эксперимент...,An experimental method is implemented based on...,Based on this model is implemented experimenta...,"Based on this model, an experimental method is...","Based on this model, an experimental method ha..."
1,Прецизионное уточнение атомной структуры минер...,Accurate crystal structure refinement of natro...,Precise refinement of atomic structure mineral...,The precision clarification of the atomic stru...,Precision refinement of the atomic structure o...
2,Здесь и далее: точки – экспериментальные значе...,"Hereinafter, circles are experimental data, so...","Hereinafter,: point - experimental values, sol...","Hereinafter: points - experimental values, sol...","Here and there: points experimental values, a..."
3,Об образовании метастабильных фаз при кристалл...,On the formation of metastable phases during c...,About formation metastable phases at crystalli...,On the formation of metastable phases for crys...,About the formation of metastable phases in th...
4,Магнитоэлектрический эффект в трехслойных асим...,Magnetoelectric Effect in Three-Layer Asymmetr...,Magnetoelectric effect in a three-layer asymme...,The magnetoelectric effect in three -layer asy...,Magnetic effect in three-layer asymmetrical st...


## Marian model

In [20]:
%%capture
# Load the tokenizer and model of the Helsinki-NLP/opus-mt-ru-en checkpoint.
m_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
m_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
# Strings that marian_translate removes from the decoded text by hand.
# Note the trailing space in '<pad> '.
list_of_tokens = ['</s>', '<pad> ']  # special tokens to strip; the unknown-token marker is intentionally not listed

In [21]:
def marian_translate(model, tokenizer, sentence, maxlen=300):
  """Translate ``sentence`` with a Marian model and return the cleaned text.

  The sentence is encoded to PyTorch tensors, decoded with 4-beam search
  (early stopping, at most ``maxlen`` tokens), and the first generated sequence
  is converted back to text. Special tokens are kept during decoding; the
  strings in the module-level ``list_of_tokens`` are then removed one by one.
  """
  source_ids = tokenizer.encode(sentence, return_tensors="pt")
  beam_search_options = {'max_length': maxlen, 'num_beams': 4, 'early_stopping': True}
  best_sequence = model.generate(source_ids, **beam_search_options)[0]
  # Decode with special tokens left in; only the listed ones are stripped below.
  text = tokenizer.decode(best_sequence, skip_special_tokens=False)
  for marker in list_of_tokens:
    text = text.replace(marker, '')
  return text

def apply_marian(sent: str) -> str:
  """Translate one sentence with the notebook's Marian model and tokenizer.

  Convenience one-argument adapter around ``marian_translate``.
  """
  return marian_translate(model=m_model, tokenizer=m_tokenizer, sentence=sent)

In [None]:
# Copy the frame once more, then add the Marian translations (one call per row) as a new column.
df_all = df1.copy()
df_all['en_marian'] = df_all['ru'].apply(apply_marian)