In [1]:
%%capture
!pip install transformers sentencepiece googletrans==4.0.0-rc1
# googletrans is deliberately pinned to the exact version 4.0.0-rc1.
# The Marian tokenizer requires the sentencepiece package.

In [53]:
import os
from typing import List, Dict, Tuple, Callable

import pandas as pd
import sentencepiece

from google.colab import files
from googletrans import Translator
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, MarianTokenizer, MarianMTModel

In [5]:
# Hugging Face access token. It is read from the HF_TOKEN environment variable
# so the secret never has to be typed into (and saved with) the notebook.
# When the variable is unset or empty the value is None rather than '',
# because an empty string is not a usable token.
access_token = os.environ.get('HF_TOKEN') or None

In [6]:
# Spreadsheet that is read with pd.read_excel further down.
FILENAME = "eval_data.xlsx"

# Checkpoint identifiers handed to from_pretrained() when the models are loaded.
NLLB_NAME = "facebook/nllb-200-distilled-600M"
MARIAN_NAME = "Helsinki-NLP/opus-mt-ru-en"

## Upload text

In [None]:
# Quick look at the input spreadsheet. Datasetter.load_data() reads FILENAME
# again on its own, so this frame serves as a preview.
df = pd.read_excel(FILENAME)
df.head()

## Google translate

In [56]:
class G_Translator:
  def __init__(self):
    """
    Create the googletrans client that every translate() call reuses.
    """
    self.translator = Translator()


  def translate(self, sent: str) -> str:
    """
    Translate one sentence with the request fixed to Russian -> English.

    :param sent: Sentence to translate (Russian).
    :return: The ``text`` attribute of the client's response (English).
    """
    response = self.translator.translate(sent, src='ru', dest='en')
    return response.text

### NLLB model

In [57]:
class NLLB_Translator:
  """
  Thin wrapper that runs tokenize -> generate -> decode on an NLLB
  tokenizer/model pair for one sentence at a time.
  """
  def __init__(self, tokenizer, model, target_lang: str = 'eng_Latn', sent_len: int = 300):
    """
    Initialize the NLLB Translator.

    :param tokenizer: The tokenizer for the NLLB model.
    :param model: The NLLB model for translation.
    :param target_lang: Target language code. Default is 'eng_Latn'.
    :param sent_len: Maximum length for generated translations. Default is 300.
    """
    self.tokenizer = tokenizer
    self.model = model
    self.to_lang = target_lang
    self.max_length = sent_len

  def tokenize(self, sent: str):
    """
    Tokenize a sentence using the NLLB tokenizer.

    :param sent: The input sentence.
    :return: Tokenized inputs (requested as PyTorch tensors).
    """
    return self.tokenizer(sent, return_tensors='pt')

  def generate_translation(self, inputs):
    """
    Generate a translation from tokenized inputs.

    The target language is selected by forcing its language-code token as the
    first generated token.

    :param inputs: Tokenized inputs, unpacked as keyword arguments to generate().
    :return: Token IDs of the generated translation.
    """
    # Look the language code up through convert_tokens_to_ids(): the older
    # `tokenizer.lang_code_to_id` mapping is deprecated/removed in recent
    # transformers releases, and the install cell does not pin transformers.
    target_lang_id = self.tokenizer.convert_tokens_to_ids(self.to_lang)
    return self.model.generate(
      **inputs, forced_bos_token_id=target_lang_id,
      max_length=self.max_length)


  def get_decoded(self, toks) -> str:
    """
    Decode token IDs to text.

    :param toks: Batch of generated token IDs.
    :return: Decoded text of the first sequence, special tokens skipped.
    """
    return self.tokenizer.batch_decode(toks, skip_special_tokens=True)[0]

  def translate(self, sent: str) -> str:
    """
    Perform all necessary actions to translate a sentence.

    :param sent: The input sentence.
    :return: The translated sentence.
    """
    tokens = self.tokenize(sent)
    translated = self.generate_translation(tokens)
    result = self.get_decoded(translated)
    return result

## Marian model

In [54]:
class Marian_Translator:
  """
  Thin wrapper that runs encode -> generate -> decode on a Marian
  tokenizer/model pair and then strips the configured marker strings
  from the decoded text.
  """
  def __init__(self, tokenizer, model,  maxlen: int=300, tokenlist: Tuple[str, ...]=('</s>', '<pad> ')):
    """
    Initialize the Marian Translator.

    :param tokenizer: The tokenizer for the Marian model.
    :param model: The Marian model for translation.
    :param maxlen: Maximum length for generated translations. Default is 300.
    :param tokenlist: Substrings removed from the decoded text by
      remove_specials(). Default is ('</s>', '<pad> '); note the trailing
      space in '<pad> ', which is removed together with the marker.
    """
    self.tokenizer = tokenizer
    self.model = model
    self.list_of_tokens = tokenlist  #tokens to skip other than UNK
    self.maxlen = maxlen


  def remove_specials(self, sent: str) -> str:
    """
    Remove special tokens from a sentence.

    :param sent: The input sentence.
    :return: The sentence with every entry of ``list_of_tokens`` removed.
    """
    for token in self.list_of_tokens:
      sent = sent.replace(token, '')
    return sent


  def get_tokens(self, sent: str):
    """
    Tokenize a sentence with the Marian tokenizer.

    :param sent: The input sentence.
    :return: The encoded sentence (requested as a PyTorch tensor).
    """
    return self.tokenizer.encode(sent, return_tensors='pt')


  def generate_translation(self, ids):
    """
    Generate a translation from token IDs using beam search (4 beams).

    :param ids: Token IDs of the input sentence.
    :return: Token IDs of the first generated sequence.
    """
    return self.model.generate(ids, max_length=self.maxlen, num_beams=4,
                               early_stopping=True)[0]


  def decode(self, tokens, skip: bool=False) -> str:
    """
    Decode token IDs to text.

    :param tokens: Token IDs of one sequence.
    :param skip: Whether the tokenizer should skip special tokens. Defaults to
      False because translate() filters them with remove_specials() instead.
    :return: The decoded text.
    """
    # The keyword is `skip_special_tokens` (plural). The previous spelling,
    # `skip_special_token`, never reached the tokenizer's real option.
    return self.tokenizer.decode(tokens, skip_special_tokens=skip)


  def translate(self, sent: str):
    """
    Perform all necessary actions to translate a sentence.

    :param sent: The input sentence in Russian.
    :return: The translated sentence in English.
    """
    ru_ids = self.get_tokens(sent)
    translated_ids = self.generate_translation(ru_ids)
    decoded = self.decode(translated_ids)
    result = self.remove_specials(decoded)
    return result

### Save translations

In [52]:
def save_n_download(df: pd.DataFrame, name='translations.tsv', sep='\t'):
  """
  Write a DataFrame to a delimited text file, then start a Colab download of it.

  :param df: Frame to write; the index is not included in the file.
  :param name: Output file name. Default is 'translations.tsv'.
  :param sep: Field separator. Default is a tab character.
  """
  df.to_csv(path_or_buf=name, sep=sep, index=False)
  files.download(name)

## Create a class to run the translators

In [58]:
class Datasetter:
  """
  A class for translating and managing datasets.

  It loads a spreadsheet into ``self.df``, adds one column per translation
  backend (``en_google``, ``en_nllb``, ``en_marian``) computed from the
  ``ru`` column, and can save and download the result.
  """
  def __init__(self, g, nllb_m, marian, original_df: str=FILENAME,
               name_to_save: str ='translations.tsv', sep: str='\t'):
    """
    Initialize the Datasetter and load the dataset immediately.

    :param g: Google Translator instance.
    :param nllb_m: NLLB Translator instance.
    :param marian: Marian Translator instance.
    :param original_df: Path to the original dataset (Excel format).
    :param name_to_save: Name of the file to save translated data to.
    :param sep: Separator for saving the file (default is tab).
    """
    self.filename = original_df
    self.name_to_save = name_to_save
    self.sep = sep
    self.df = self.load_data()

    self.g_translator = g
    self.nllb_translator = nllb_m
    self.marian_translator = marian

    # Output column name -> bound method that translates one cell.
    self.translations: Dict[str, Callable]= {'en_google':self.g_trans,
                         'en_nllb':self.nllb_trans,
                         'en_marian': self.marian_trans
                         }   #could create two separate lists and a for cycle


  def lower_dec(func):
    """
    A decorator to convert input text to lowercase before translation.

    Non-string values are returned unchanged instead of being translated.

    :param func: The translation function.
    :return: Wrapped translation function.
    """
    import functools  # local import: used only while the class body is built

    @functools.wraps(func)  # keep the wrapped method's name and docstring
    def wrapper(self, line):
      if not isinstance(line, str):
        # Empty spreadsheet cells are loaded by pandas as NaN (a float), which
        # has no .lower(); pass such values through rather than crash mid-run.
        return line
      return func(self, line.lower())
    return wrapper


  @lower_dec
  def g_trans(self, line: str) -> str:
    """
    Translate a line using Google Translator.

    :param line: Input text in Russian.
    :return: Translated text in English.
    """
    return self.g_translator.translate(line)


  @lower_dec
  def nllb_trans(self, line: str) -> str:
    """
    Translate a line using NLLB Translator.

    :param line: Input text in Russian.
    :return: Translated text in English.
    """
    return self.nllb_translator.translate(line)


  @lower_dec
  def marian_trans(self, line: str) -> str:
    """
    Translate a line using Marian Translator.

    :param line: Input text in Russian.
    :return: Translated text in English.
    """
    return self.marian_translator.translate(line)


  def translate(self) -> pd.DataFrame:
    """
    Translate the 'ru' column in the DataFrame using all methods.

    Each entry of ``self.translations`` adds (or overwrites) one column.

    :return: DataFrame with new columns for each translation method.
    """
    for col_name, func in self.translations.items():
      # Could add a check here whether we want to translate to existing cols
      self.df[col_name] = self.df['ru'].apply(func)
    return self.df


  def change_name_to_save(self, name: str) -> None:
    """
    Change the name of the file to save.

    :param name: New file name.
    """
    self.name_to_save = name


  def change_separator(self, sep: str) -> None:
    """
    Change the separator used when saving the file.

    :param sep: New separator.
    """
    self.sep = sep


  def load_data(self) -> pd.DataFrame:
    """
    Load data from the Excel file at ``self.filename`` into a DataFrame.

    :return: Loaded DataFrame.
    """
    df = pd.read_excel(self.filename)
    return df


  def load_csv(self, filename: str, separator: str) -> None:
    """
    Load data from a CSV/TSV file, replacing the current DataFrame.

    :param filename: Path to the CSV/TSV file.
    :param separator: Separator used in the CSV/TSV file.
    """
    self.df = pd.read_csv(filename, sep=separator)


  def df_head(self, n: int=5) -> None:
    """
    Displays in colab(only) the first 'n' rows of the DataFrame.

    Relies on the notebook-provided ``display`` function.

    :param n: Number of rows to display (default is 5).
    """
    display(self.df.head(n))


  def save_df(self) -> None:
    """
    Save the DataFrame to a CSV/TSV file with the specified name and separator.
    """
    self.df.to_csv(self.name_to_save, index=False, sep=self.sep)


  def download_df(self) -> None:
    """
    Download the saved DataFrame as a file.

    The file must already exist, i.e. save_df() has to run first.
    """
    files.download(self.name_to_save)


  def pipe(self) -> None:
    """
    Translate the data, save it to a file, and download it.
    """
    self.translate()
    self.save_df()
    self.download_df()

## Check result

#### Load NLLB

In [15]:
%%capture
# Tokenizer for the NLLB checkpoint, with Russian ("rus_Cyrl") set as the source language.
rus_tokenizer = AutoTokenizer.from_pretrained(
    NLLB_NAME, src_lang="rus_Cyrl", token=access_token)

# Seq2seq model loaded from the same NLLB checkpoint.
nllb_pretrained = AutoModelForSeq2SeqLM.from_pretrained(NLLB_NAME,
                                                   token=access_token)

#### Load Marian

In [16]:
%%capture
# Tokenizer and model for the Marian checkpoint named by MARIAN_NAME.
marian_tokenizer = MarianTokenizer.from_pretrained(MARIAN_NAME)
marian_pretrained = MarianMTModel.from_pretrained(MARIAN_NAME)

#### Create models

In [60]:
# Wrap each backend in its translator class; all three expose translate(sent).
google_model = G_Translator()
nllb_model = NLLB_Translator(rus_tokenizer, nllb_pretrained)
marian_model = Marian_Translator(marian_tokenizer, marian_pretrained)

#### Create general use instance

In [62]:
# Constructing the Datasetter also reads FILENAME, because __init__ calls load_data().
ensemble = Datasetter(google_model, nllb_model, marian_model)

In [63]:
# Translate the 'ru' column with every backend, write the output file, and start the download.
ensemble.pipe()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>