In [None]:
! pip install sacrebleu

In [None]:

import numpy as np 
import pandas as pd
import transformers

from transformers import MarianMTModel, MarianTokenizer
import torch
import time
import sacrebleu

## Read data

In [None]:
data_path = '/kaggle/input/translations/translation_merged.parquet'
translations_df = pd.read_parquet(data_path)

# Every distinct (src_lang, tgt_lang) combination found in the data,
# as a list of [src, tgt] lists.
language_pairs = (
    translations_df[['src_lang', 'tgt_lang']]
    .drop_duplicates()
    .values
    .tolist()
)

## Load translation models

In [None]:
def load_models(language_pair: tuple[str, str], cache_path: "str | None" = None) -> dict:
    """Loads the pretrained MarianMT tokenizer and model for one language pair

    Any error raised by ``from_pretrained`` (for example when the checkpoint
    cannot be found) is not handled here and propagates to the caller.

    :param language_pair: tuple containing src language and tgt language, used
        to build the ``Helsinki-NLP/opus-mt-{src}-{tgt}`` checkpoint name
    :type language_pair: tuple[str, str]
    :param cache_path: directory in which the downloaded files are cached;
        ``None`` (default) keeps the library's default cache location
    :type cache_path: str, optional
    :return: dictionary with the keys ``"tokenizer"`` and ``"model"``
    :rtype: dict
    """
    # Get src and tgt language pairs
    src, tgt = language_pair
    model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"

    # Load from huggingface or cache
    tokenizer = MarianTokenizer.from_pretrained(model_name, cache_dir=cache_path)
    model = MarianMTModel.from_pretrained(model_name, cache_dir=cache_path)

    return {"tokenizer": tokenizer, "model": model}


In [None]:
# Try to load one model per language pair; pairs that cannot be loaded are
# recorded with the value None.
mt_models = {}
for src, tgt in language_pairs:
    print(f'Fetching {src}-{tgt} model')
    pair_key = f'{src}_{tgt}'

    # If the pair as written cannot be loaded, retry with 'no' in place of
    # 'nb' (Norwegian). The retry is skipped when it would request exactly the
    # same checkpoint again.
    src_mod = 'no' if src == 'nb' else src
    tgt_mod = 'no' if tgt == 'nb' else tgt
    candidates = [(src, tgt)]
    if (src_mod, tgt_mod) != (src, tgt):
        candidates.append((src_mod, tgt_mod))

    mt_models[pair_key] = None
    last_error = None
    for candidate in candidates:
        try:
            mt_models[pair_key] = load_models(candidate)
            break
        except Exception as err:
            # "except Exception" (instead of a bare "except") lets
            # KeyboardInterrupt / SystemExit stop the loop as expected.
            last_error = err

    if mt_models[pair_key] is None:
        print(f'Model for {src}_{tgt} not found.')
        print(f'  last error: {type(last_error).__name__}')

In [None]:
# Language pairs whose entry in mt_models is None, i.e. no model was loaded
models_not_supported = [pair_key for pair_key in mt_models if mt_models[pair_key] is None]
models_not_supported

## Translate

Since the data is large, for each language pair we sample about 5K translations. We will compute statistics for the BLEU metric, using bootstrap.

In [None]:
sample_size = 5_000
random_state = 12

translations_df['lang_pair'] = translations_df['src_lang'] + '_' + translations_df['tgt_lang']
translations_filtered_df = translations_df.loc[~translations_df['lang_pair'].isin(models_not_supported)]
translations_filtered_df = translations_filtered_df.reset_index(drop=True)

# Sample by each language pair.
# GroupBy.sample(n=...) raises ValueError when a group holds fewer than n rows,
# so pairs with fewer than sample_size rows are kept whole instead of sampled.
rows_in_pair = translations_filtered_df.groupby('lang_pair')['lang_pair'].transform('size')
has_enough_rows = rows_in_pair >= sample_size

sampled_pairs = (translations_filtered_df
                 .loc[has_enough_rows]
                 .groupby('lang_pair')
                 .sample(n=sample_size, random_state=random_state)
                )
small_pairs = translations_filtered_df.loc[~has_enough_rows]

translations_filtered_sample = (pd.concat([sampled_pairs, small_pairs])
                                .reset_index(drop=True)
                               )

In [None]:
def create_lang_pair_dataframe(translations:pd.DataFrame, lang_pair:str) -> pd.DataFrame:
    """Returns the rows of one language pair with a fresh 0-based index

    :param translations: dataframe with a ``lang_pair`` column
    :type translations: pd.DataFrame
    :param lang_pair: value of ``lang_pair`` to keep
    :type lang_pair: str
    :return: rows whose ``lang_pair`` equals ``lang_pair``, index reset
    :rtype: pd.DataFrame
    """
    is_requested_pair = translations['lang_pair'] == lang_pair
    rows_for_pair = translations.loc[is_requested_pair]
    return rows_for_pair.reset_index(drop=True)

# One dataframe per sampled language pair, keyed by the lang_pair string
lang_pair_samples = dict(
    (pair_name, create_lang_pair_dataframe(translations_filtered_sample, pair_name))
    for pair_name in translations_filtered_sample['lang_pair'].unique()
)

Now we tokenize the `src_text` column of a sample dataframe, one batch at a time.

In [None]:
def tokenize_lang_pair_dataframe(lang_pair_df:pd.DataFrame, lang_pair: str, tokenizer, batch_size:int):
    """Tokenizes the ``src_text`` column in consecutive batches

    :param lang_pair_df: dataframe with a ``src_text`` column
    :type lang_pair_df: pd.DataFrame
    :param lang_pair: language pair identifier (not used by this function)
    :type lang_pair: str
    :param tokenizer: callable applied to each list of source texts; its
        result must provide ``.to(device)``
    :param batch_size: number of texts per batch, must be at least 1
    :type batch_size: int
    :return: one tokenized batch per ``batch_size`` texts, each moved to CUDA
        when available and to CPU otherwise
    :rtype: list
    :raises ValueError: if ``batch_size`` is smaller than 1
    """
    # range() rejects a step of 0 but silently yields nothing for a negative
    # step, which would return no batches at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    src_text = lang_pair_df['src_text'].tolist()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # truncation=True caps every input at the tokenizer's maximum length, so
    # an unusually long source text cannot exceed what the model accepts.
    tokenized_text = [
        tokenizer(
            src_text[start:(start + batch_size)],
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
        for start in range(0, len(src_text), batch_size)
    ]

    return tokenized_text

Finally, we generate the translations, add them to a copy of the sample dataframe, and score each one with sentence-level BLEU.

In [None]:
def translate_src_text(lang_pair_df, lang_pair, model, tokenized_text, tokenizer=None):
    """Translates pre-tokenized batches and scores every row with sentence BLEU

    :param lang_pair_df: dataframe with a ``tgt_text`` column and one row per
        tokenized source text, in the same order as the batches
    :param lang_pair: language pair identifier (not used by this function)
    :param model: object providing ``eval()`` and ``generate(**batch)``
    :param tokenized_text: iterable of batches passed to ``model.generate``
        as keyword arguments
    :param tokenizer: object providing ``decode(ids, skip_special_tokens=True)``;
        when omitted, the notebook-level variable ``tokenizer`` is used so
        that existing four-argument calls keep working
    :return: copy of ``lang_pair_df`` with the added columns ``translation``
        and ``bleu``
    :raises NameError: if no tokenizer is passed and no notebook-level
        ``tokenizer`` exists
    """
    if tokenizer is None:
        # Backward compatibility: earlier callers relied on a global tokenizer.
        tokenizer = globals().get('tokenizer')
        if tokenizer is None:
            raise NameError(
                "translate_src_text needs a tokenizer: pass tokenizer=... "
                "or define a notebook-level variable named 'tokenizer'"
            )

    language_pair_df = lang_pair_df.copy()
    model.eval()

    # Make translations
    translated_text = []
    t1 = time.time()
    for i, batch in enumerate(tokenized_text):
        translated = model.generate(**batch)
        # Collect one flat list so it lines up row by row with the dataframe.
        translated_text.extend(
            tokenizer.decode(t, skip_special_tokens=True) for t in translated
        )

        if i%10 == 0:
            print(f"{i} done")

    t2 = time.time()

    print(f'Translation done in {(t2-t1)/60} mins')

    # Create translation column and compute BLEU score
    language_pair_df['translation'] = translated_text
    # Double quotes are removed from hypothesis and reference before scoring.
    # A plain list (instead of a row-wise apply) also works for an empty frame.
    language_pair_df['bleu'] = [
        sacrebleu.sentence_bleu(hypothesis.replace('"', '').strip(),
                                [reference.replace('"', '')]).score
        for hypothesis, reference in zip(language_pair_df['translation'],
                                         language_pair_df['tgt_text'])
    ]

    return language_pair_df
    
    

In [None]:
batch_size = 32
output_dir = "/kaggle/working"

# The model is moved to `device` below, so it has to exist at notebook level.
# It is chosen the same way as inside tokenize_lang_pair_dataframe, which puts
# the model and its inputs on the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Translate every language pair for which a sample dataframe exists.
lang_pair_list = list(lang_pair_samples)

for lang_pair in lang_pair_list:
    print(f'translating {lang_pair}')
    df = lang_pair_samples[lang_pair]

    # Get model, modifying for Norwegian
    src, tgt = lang_pair.split('_')
    src_mod = 'no' if src == 'nb' else src
    tgt_mod = 'no' if tgt == 'nb' else tgt

    model_name = f"Helsinki-NLP/opus-mt-{src_mod}-{tgt_mod}"

    # translate_src_text reads the notebook-level `tokenizer` when it is not
    # given one explicitly, so this name must stay as it is.
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    # Tokenize text
    tokenized_text = tokenize_lang_pair_dataframe(df, lang_pair, tokenizer, batch_size)
    result = translate_src_text(df, lang_pair, model, tokenized_text)

    # Save
    result.to_csv(f'{output_dir}/{lang_pair}_bleu_sample.csv', index=False)
    print(f"{lang_pair} BLEU sample saved with mean {result['bleu'].mean()}")
