In [1]:
from google.colab import drive

# Attach Google Drive; later cells read the fine-tuned checkpoint from it and
# write the evaluation outputs back to it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets sentencepiece transformers evaluate sacrebleu



In [3]:
from datasets import load_dataset
from IPython.display import display
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm_notebook
import evaluate

# Apply seaborn's default plot styling.
sns.set()

# Translation metrics loaded through the `evaluate` library.
bleu = evaluate.load("bleu")
sacrebleu = evaluate.load("sacrebleu")

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: %s" % device)

Using device: cuda


In [4]:
# Load the 'alt' dataset and keep handles on its train and test splits.
dataset = load_dataset('alt')
train_dataset, test_dataset = dataset['train'], dataset['test']

# Display the first training record to show the schema.
train_dataset[0]

{'SNT.URLID': '80188',
 'SNT.URLID.SNTID': '1',
 'url': 'http://en.wikinews.org/wiki/2007_Rugby_World_Cup:_Italy_31_-_5_Portugal',
 'translation': {'bg': 'ফ্রান্সের প্যারিসের পার্ক দি প্রিন্সেস-এ হওয়া ২০০৭-এর রাগবি বিশ্বকাপের পুল সি-তে ইটালি পর্তুগালকে ৩১-৫ গোলে হারিয়েছে।',
  'en': 'Italy have defeated Portugal 31-5 in Pool C of the 2007 Rugby World Cup at Parc des Princes, Paris, France.',
  'en_tok': 'Italy have defeated Portugal 31-5 in Pool C of the 2007 Rugby World Cup at Parc des Princes , Paris , France .',
  'fil': 'Natalo ng Italya ang Portugal sa puntos na 31-5 sa Grupong C noong 2007 sa Pandaigdigang laro ng Ragbi sa Parc des Princes, Paris, France.',
  'hi': '2007 में फ़्रांस, पेरिस के पार्क डेस प्रिंसेस में हुए रग्बी विश्व कप के पूल C में इटली ने पुर्तगाल को 31-5 से हराया।',
  'id': 'Italia berhasil mengalahkan Portugal 31-5 di grup C dalam Piala Dunia Rugby 2007 di Parc des Princes, Paris, Perancis.',
  'ja': 'フランスのパリ、パルク・デ・プランスで行われた2007年ラグビーワールドカップのプールCで、イタリアは31対5でポルトガルを

In [5]:
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration

# Base checkpoint shared by the model and its tokenizer.
PRETRAINED_CHECKPOINT = "google/mt5-base"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED_CHECKPOINT)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
# Every encoded sequence is padded / truncated to this many tokens.
max_seq_len = 128

# Tag prepended to a sentence before it is tokenized, keyed by language code.
# Earlier experiments (since dropped) used '<lo>' as the Lao tag, and
# prompt-style prefixes such as 'translate Lao to Vietnamese: '.
# Insertion order matters: the values are registered as special tokens in this order.
LANG_TOKEN_MAPPING = dict(vi='<vi>', lo='<lao>')

In [7]:
# Register the language tags as special tokens and resize the embedding matrix
# to the new vocabulary size (presumably matching the vocabulary the checkpoint
# was fine-tuned with — the state dict would not load otherwise).
special_tokens_dict = {'additional_special_tokens': list(LANG_TOKEN_MAPPING.values())}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from GPU tensors still loads when the notebook falls back to the CPU.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
finetuned_state = torch.load("/content/drive/MyDrive/mt5_translation.pt", map_location=device)
model.load_state_dict(finetuned_state)
model.to(device)
model.eval()

MT5ForConditionalGeneration(
  (shared): Embedding(250102, 768)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250102, 768)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
         

In [8]:
def encode_str(text, text_target, tokenizer, seq_len, lang_token_map=LANG_TOKEN_MAPPING):
    """Encode one sentence pair as two language-tagged encoder inputs.

    `text` is prefixed with ``lang_token_map['vi']`` and `text_target` with
    ``lang_token_map['lo']``; both tagged strings are tokenized, then padded /
    truncated to exactly `seq_len` tokens and moved to the notebook-level `device`.

    Args:
        text: Sentence that receives the 'vi' tag (callers in this notebook pass
            the Lao sentence here).
        text_target: Sentence that receives the 'lo' tag (callers in this
            notebook pass the Vietnamese sentence here).
        tokenizer: Hugging Face tokenizer that already knows the tag tokens.
        seq_len: Fixed length of every returned tensor.
        lang_token_map: Mapping with at least the keys 'lo' and 'vi'.

    Returns:
        Tuple ``(tagged_text_ids, tagged_text_target_ids, tagged_text_attention_mask)``,
        each a 1-D tensor of length `seq_len`. Only the attention mask of the
        first sequence is returned.
    """
    tagged_text = lang_token_map['vi'] + text
    tagged_text_target = lang_token_map['lo'] + text_target

    # One batched call replaces two calls that each also tokenized a
    # `text_target` whose labels were never read. With padding='max_length'
    # every row is padded independently to seq_len, so the ids and mask are
    # the same as when each string is tokenized on its own.
    encoded = tokenizer(
        text=[tagged_text, tagged_text_target],
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=seq_len).to(device)

    return encoded['input_ids'][0], encoded['input_ids'][1], encoded['attention_mask'][0]


def format_translation_data(translations, tokenizer, input_lang = 'lo', target_lang = 'vi', seq_len = max_seq_len):
    """Encode the `input_lang` / `target_lang` pair of one translation record.

    Args:
        translations: Mapping from language code to sentence (or None).
        tokenizer: Tokenizer forwarded to `encode_str`.
        input_lang: Key of the sentence passed to `encode_str` as `text`.
        target_lang: Key of the sentence passed to `encode_str` as `text_target`.
        seq_len: Fixed token length forwarded to `encode_str`.

    Returns:
        None when either sentence is None; otherwise the three tensors produced
        by `encode_str` (input ids, target ids, attention mask).
    """
    source_sentence = translations[input_lang]
    reference_sentence = translations[target_lang]

    # Records lacking either side of the pair cannot be encoded; signal "skip".
    both_present = source_sentence is not None and reference_sentence is not None
    if not both_present:
        return None

    source_ids, reference_ids, source_mask = encode_str(
        source_sentence, reference_sentence, tokenizer, seq_len)
    return source_ids, reference_ids, source_mask


def transform_batch(batch, tokenizer, input_lang, target_lang):
    """Encode every usable translation record of a raw batch into stacked tensors.

    Args:
        batch: Mapping with a 'translation' entry holding one translation
            record per example.
        tokenizer: Tokenizer forwarded to `format_translation_data`.
        input_lang: Language key of the input side.
        target_lang: Language key of the target side.

    Returns:
        Tuple ``(batch_input_ids, batch_target_ids, attention_mask)``; each tensor
        has one row per record that `format_translation_data` did not skip and
        lives on the notebook-level `device`.

    Note:
        If every record is skipped, the lists are empty and `torch.cat` raises.
    """
    inputs = []
    targets = []
    attention_masks = []

    for translation_set in batch['translation']:
        formatted_data = format_translation_data(
            translation_set, tokenizer, input_lang, target_lang, max_seq_len)

        # None marks a record with a missing sentence; leave it out of the batch.
        if formatted_data is None:
            continue

        input_ids, target_ids, attention_mask = formatted_data
        inputs.append(input_ids.unsqueeze(0))
        targets.append(target_ids.unsqueeze(0))
        attention_masks.append(attention_mask.unsqueeze(0))

    # Use the notebook's `device` rather than a hard-coded .cuda(), which raises
    # on CPU-only runtimes even though `device` falls back to the CPU there.
    batch_input_ids = torch.cat(inputs).to(device)
    batch_target_ids = torch.cat(targets).to(device)
    batch_attention_mask = torch.cat(attention_masks).to(device)

    return batch_input_ids, batch_target_ids, batch_attention_mask


def get_data_generator(dataset, input_lang, target_lang, tokenizer, batch_size = 32, seed = None):
    """Yield encoded batches from a shuffled copy of `dataset`.

    Args:
        dataset: Object offering `.shuffle()`, `len()` and slice indexing
            (a Hugging Face `Dataset` in this notebook).
        input_lang: Language key of the input side.
        target_lang: Language key of the target side.
        tokenizer: Tokenizer forwarded to `transform_batch`.
        batch_size: Number of raw records per slice; the last slice may be shorter.
        seed: Optional shuffle seed. With the default None the shuffle is
            unseeded exactly as before; pass an int to make the batch order
            reproducible across runs.

    Yields:
        The tuple returned by `transform_batch` for each slice.
    """
    # Keep the original no-argument call on the default path so any dataset-like
    # object that worked before still works.
    shuffled = dataset.shuffle() if seed is None else dataset.shuffle(seed=seed)

    for start in range(0, len(shuffled), batch_size):
        raw_batch = shuffled[start:start + batch_size]
        yield transform_batch(raw_batch, tokenizer, input_lang, target_lang)


def _generate_and_decode(model, tokenizer, encoder_input_ids, num_beams):
    """Beam-search one output per row of `encoder_input_ids` and decode it to text."""
    generated_ids = model.generate(
        encoder_input_ids,
        num_beams = num_beams,
        num_return_sequences = 1,
        max_new_tokens = max_seq_len)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens = True)


def eval_model(model, tokenizer, gdataset, input_lang = 'lo', target_lang = 'vi', batch_size = 6, num_beams = 20):
    """Translate `gdataset` in both directions and collect outputs with references.

    Each batch from `get_data_generator` supplies two tagged encoder inputs.
    The model generates from the first (Lo->Vi lists) and from the second
    (Vi->Lo lists); the decoded text of the *other* tensor, with special tokens
    stripped, serves as the reference.

    Args:
        model: Seq2seq model exposing `.eval()` and `.generate()`.
        tokenizer: Tokenizer used for encoding and for `batch_decode`.
        gdataset: Dataset forwarded to `get_data_generator`.
        input_lang: Language key of the input side.
        target_lang: Language key of the target side.
        batch_size: Records per generation batch (previously fixed at 6).
        num_beams: Beam width for `generate` (previously fixed at 20).

    Returns:
        Tuple ``(trueSentenceListLoVi, outputSentenceListLoVi,
        trueSentenceListViLo, outputSentenceListViLo)`` of equally ordered
        lists of strings. The four list lengths are also printed.
    """
    model.eval()
    batches = get_data_generator(gdataset, input_lang, target_lang, tokenizer, batch_size = batch_size)

    trueSentenceListLoVi = []
    outputSentenceListLoVi = []

    trueSentenceListViLo = []
    outputSentenceListViLo = []

    # The attention mask in each batch is not used by generation here.
    for input_batch, label_batch, _attention_mask_batch in batches:

        # extend() appends in place; `lst = lst + new` copied the whole
        # accumulator on every batch.
        outputSentenceListLoVi.extend(_generate_and_decode(model, tokenizer, input_batch, num_beams))
        trueSentenceListLoVi.extend(tokenizer.batch_decode(label_batch, skip_special_tokens = True))

        outputSentenceListViLo.extend(_generate_and_decode(model, tokenizer, label_batch, num_beams))
        trueSentenceListViLo.extend(tokenizer.batch_decode(input_batch, skip_special_tokens = True))

    print(len(trueSentenceListLoVi))
    print(len(outputSentenceListLoVi))
    print(len(trueSentenceListViLo))
    print(len(outputSentenceListViLo))

    return (trueSentenceListLoVi, outputSentenceListLoVi, trueSentenceListViLo, outputSentenceListViLo)


In [9]:
# Single-sentence smoke test: only the Vietnamese sentence is supplied; the
# other side of the pair is left empty.
inputText = ""
outputText = "Đã có thông tin khẳng định rằng tám chú ngựa đua thuần chủng tại Trường đua Randwick ở Sydney đã bị nhiễm cúm ngựa."

encodedInputs = encode_str(inputText, outputText, tokenizer, max_seq_len)

# encodedInputs[0] is built from inputText and encodedInputs[1] from outputText;
# change this index for vi-lao and lao-vi.
direction_index = 1
candidate_ids = model.generate(
    encodedInputs[direction_index].unsqueeze(0),
    num_beams = 50,
    num_return_sequences = 3,
    max_new_tokens = max_seq_len,
)
outputSent = tokenizer.batch_decode(candidate_ids, skip_special_tokens = True)
print(outputSent)

['ມີການຢືນຢັນວ່າ ງົວແປດໂຕ ທີ່ໂຮງຮຽນການແຂ່ງຂັນ ແຣນນວິກ ໃນຊິດນີ ໄດ້ຮັບການຕິດເຊື້ອໄຂ້ຫວັດຫມູຈາກໄຂ້ຫວັດຫມູ.', 'ມີການຢືນຢັນວ່າ ງົວແປດໂຕ ທີ່ໂຮງຮຽນການແຂ່ງຂັນ ລັງເວັກສ ໃນຊິດນີ ໄດ້ຮັບການຕິດເຊື້ອໄຂ້ຫວັດຫມູຈາກໄຂ້ຫວັດຫມູ.', 'ມີການຢືນຢັນວ່າ ງົວແປດໂຕ ທີ່ໂຮງຮຽນການແຂ່ງຂັນ ແຣນນວິດັກ ໃນຊິດນີ ໄດ້ຮັບການຕິດເຊື້ອໄຂ້ຫວັດຫມູຈາກໄຂ້ຫວັດຫມູ.']


In [10]:
# Run the full bidirectional evaluation on the test split; eval_model also
# prints the length of each of the four lists.
(
    trueSentenceListLoVi,
    outputSentenceListLoVi,
    trueSentenceListViLo,
    outputSentenceListViLo,
) = eval_model(model, tokenizer, test_dataset)

1016
1016
1016
1016


In [11]:
# Corpus-level BLEU per direction, scoring the generated sentences against the
# decoded references collected by eval_model.
# NOTE(review): Lao is normally written without spaces between words, so the
# default tokenization of this metric may understate Vi->Lo quality. Consider
# the already-loaded `sacrebleu` metric with a suitable tokenizer — TODO confirm.
resultsLoVi = bleu.compute(
    references = trueSentenceListLoVi,
    predictions = outputSentenceListLoVi,
)
print(resultsLoVi)

resultsViLo = bleu.compute(
    references = trueSentenceListViLo,
    predictions = outputSentenceListViLo,
)
print(resultsViLo)

{'bleu': 0.16683088387053444, 'precisions': [0.44981065803194115, 0.2241573985208604, 0.11944202266782912, 0.06432293226328083], 'brevity_penalty': 1.0, 'length_ratio': 1.0580072000929044, 'translation_length': 36442, 'reference_length': 34444}
{'bleu': 0.023738192726294953, 'precisions': [0.1761455525606469, 0.03935185185185185, 0.011550768750487786, 0.003965910049784829], 'brevity_penalty': 1.0, 'length_ratio': 1.4407766990291262, 'translation_length': 14840, 'reference_length': 10300}


In [14]:
import json

# Each evaluation list is written to its own JSON file on Drive, named after
# the variable that holds it.
sentence_lists = {
    "outputSentenceListLoVi": outputSentenceListLoVi,
    "outputSentenceListViLo": outputSentenceListViLo,
    "trueSentenceListLoVi": trueSentenceListLoVi,
    "trueSentenceListViLo": trueSentenceListViLo,
}

for list_name, sentences in sentence_lists.items():
    # ensure_ascii=False emits the raw Lao / Vietnamese characters, so the file
    # encoding is pinned to UTF-8 instead of relying on the platform default.
    with open("/content/drive/MyDrive/%s.json" % list_name, 'w', encoding = 'utf-8') as f:
        json.dump(sentences, f, indent = 2, ensure_ascii = False)