# Translation of the MT-GenEval Development Data Set from English to Swedish

In [None]:
pip install transformers sentencepiece


In [None]:
!huggingface-cli login

In [None]:
import os

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# MT-GenEval dev-set inputs: English (.en) and German (.de) sentence files for
# the feminine and masculine variants, plus the CSV file written later on.
file_en_fem = '/content/geneval-sentences-feminine-dev.en_de.en'
file_de_fem = '/content/geneval-sentences-feminine-dev.en_de.de'
file_en_masc = '/content/geneval-sentences-masculine-dev.en_de.en'
file_de_masc = '/content/geneval-sentences-masculine-dev.en_de.de'
output_csv = 'MTGenEvalDev.csv'


def _read_stripped_lines(path):
    """Return every line of the UTF-8 text file at *path*, whitespace-stripped."""
    with open(path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]


en_f_lines = _read_stripped_lines(file_en_fem)
de_f_lines = _read_stripped_lines(file_de_fem)
en_m_lines = _read_stripped_lines(file_en_masc)
de_m_lines = _read_stripped_lines(file_de_masc)

# The four lists are consumed in lockstep further down, so all of them must
# contain the same number of lines.
line_counts = {len(en_f_lines), len(de_f_lines), len(en_m_lines), len(de_m_lines)}
if len(line_counts) != 1:
    raise ValueError("Check file line length!!")


# Hugging Face access token (HF_token). It is read from the HF_TOKEN
# environment variable so that no credential is hard-coded in the notebook.
# An unset or empty variable yields None, in which case the hub client uses
# whatever credentials `huggingface-cli login` has cached.
token = os.environ.get("HF_TOKEN") or None

# English -> Swedish translation checkpoint used for both tokenizer and model.
model_name = "Helsinki-NLP/opus-mt-en-sv"

# `token=` is the current keyword; `use_auth_token=` is deprecated.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=token).to(device)


def translate_to_swedish(sentences, batch_size=8):
    """Translate *sentences* with the module-level tokenizer and model.

    The input is processed in consecutive slices of ``batch_size`` items so
    that only one slice is tokenized and generated at a time.

    Args:
        sentences: Sliceable sequence of source-language strings.
        batch_size: Number of sentences handed to the model per step.

    Returns:
        A list with one decoded string per generated sequence, in input order.
    """
    translated = []
    for start in range(0, len(sentences), batch_size):
        stop = start + batch_size
        encoded = tokenizer(
            sentences[start:stop],
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        # Move the encoded batch to the same device the model lives on.
        generated = model.generate(**encoded.to(device))
        for token_ids in generated:
            translated.append(tokenizer.decode(token_ids, skip_special_tokens=True))
    return translated

# Interleave the English sentences as fem[0], masc[0], fem[1], masc[1], ...
# so translation 2*i belongs to feminine sentence i and 2*i + 1 to the
# masculine one.
all_en_lines = [
    sentence
    for fem_sentence, masc_sentence in zip(en_f_lines, en_m_lines)
    for sentence in (fem_sentence, masc_sentence)
]
sv_translations = translate_to_swedish(all_en_lines)

In [None]:
import csv

# Write the results as CSV with the columns DE, EN, SV. sv_translations holds
# the feminine and masculine outputs alternately, so sentence pair i occupies
# positions 2*i and 2*i + 1.
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['DE', 'EN', 'SV'])

    aligned = zip(de_f_lines, en_f_lines, de_m_lines, en_m_lines)
    for pair_no, (de_f, en_f, de_m, en_m) in enumerate(aligned):
        fem_pos = 2 * pair_no
        writer.writerow([de_f, en_f, sv_translations[fem_pos]])
        writer.writerow([de_m, en_m, sv_translations[fem_pos + 1]])

In [None]:
# Ask Colab to download the generated CSV from the runtime.
# NOTE(review): the file name is repeated here as a literal; it has to stay in
# sync with `output_csv` — confirm whenever that value changes.
from google.colab import files
files.download('MTGenEvalDev.csv')