In [None]:
# Mount Google Drive in this Colab runtime; the fine-tuned checkpoint and the
# JSON result files used further down live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets sentencepiece transformers evaluate sacrebleu



In [None]:
from datasets import load_dataset
from IPython.display import display
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm_notebook
import evaluate

# Apply seaborn's default plot styling.
sns.set()

# Load both metric implementations up front (bleu first, then sacrebleu).
bleu, sacrebleu = (evaluate.load(metric_name) for metric_name in ("bleu", "sacrebleu"))

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: %s" % (device))

Using device: cuda


In [None]:
# Load the 'alt' dataset and keep handles on the two splits used below.
dataset = load_dataset('alt')
train_dataset, test_dataset = dataset['train'], dataset['test']

# Show one record: each row carries a 'translation' dict keyed by language code.
train_dataset[0]

{'SNT.URLID': '80188',
 'SNT.URLID.SNTID': '1',
 'url': 'http://en.wikinews.org/wiki/2007_Rugby_World_Cup:_Italy_31_-_5_Portugal',
 'translation': {'bg': 'ফ্রান্সের প্যারিসের পার্ক দি প্রিন্সেস-এ হওয়া ২০০৭-এর রাগবি বিশ্বকাপের পুল সি-তে ইটালি পর্তুগালকে ৩১-৫ গোলে হারিয়েছে।',
  'en': 'Italy have defeated Portugal 31-5 in Pool C of the 2007 Rugby World Cup at Parc des Princes, Paris, France.',
  'en_tok': 'Italy have defeated Portugal 31-5 in Pool C of the 2007 Rugby World Cup at Parc des Princes , Paris , France .',
  'fil': 'Natalo ng Italya ang Portugal sa puntos na 31-5 sa Grupong C noong 2007 sa Pandaigdigang laro ng Ragbi sa Parc des Princes, Paris, France.',
  'hi': '2007 में फ़्रांस, पेरिस के पार्क डेस प्रिंसेस में हुए रग्बी विश्व कप के पूल C में इटली ने पुर्तगाल को 31-5 से हराया।',
  'id': 'Italia berhasil mengalahkan Portugal 31-5 di grup C dalam Piala Dunia Rugby 2007 di Parc des Princes, Paris, Perancis.',
  'ja': 'フランスのパリ、パルク・デ・プランスで行われた2007年ラグビーワールドカップのプールCで、イタリアは31対5でポルトガルを

In [None]:
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration

# Build the model from the pretrained M2M100 418M checkpoint; the fine-tuned
# weights are loaded on top of it afterwards.
BASE_MODEL_NAME = "facebook/m2m100_418M"
model = M2M100ForConditionalGeneration.from_pretrained(BASE_MODEL_NAME)

In [None]:
# Replace the pretrained weights with the fine-tuned checkpoint stored on Drive.
# map_location='cpu' makes the load independent of the device the checkpoint was
# saved from, so it also works on a CPU-only runtime; the model is moved to
# `device` right after.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(
    torch.load(
        "/content/drive/MyDrive/m2m/m2m100_418M_FineTunedEpoch14_Early.pt",
        map_location = 'cpu'))
model.to(device)
model.eval()
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

In [None]:
# Sequence length shared by the rest of the notebook: it is the tokenizer's
# padding/truncation length in encode_str and the max_new_tokens budget passed
# to model.generate. The value is read from the model config's max_length field.
max_seq_len = model.config.max_length

In [None]:
def encode_str(text, text_target, tokenizer, seq_len):
    """Tokenize one source/target pair into fixed-length tensors on `device`.

    Both sides are padded to and truncated at `seq_len` tokens. The tokenizer
    is called with `return_tensors = 'pt'`, the result is moved to the
    module-level `device`, and row 0 of each field is returned (the callers in
    this notebook pass a single string per side).

    Args:
        text: source text handed to the tokenizer as `text`.
        text_target: target text handed to the tokenizer as `text_target`.
        tokenizer: tokenizer to call; its current language settings apply.
        seq_len: padding/truncation length.

    Returns:
        Tuple `(input_ids, labels, attention_mask)`, each taken from row 0.
    """
    encoded = tokenizer(
        text = text,
        text_target = text_target,
        padding = 'max_length',
        truncation = True,
        max_length = seq_len,
        return_tensors = 'pt')
    encoded = encoded.to(device)

    source_ids = encoded['input_ids'][0]
    target_ids = encoded['labels'][0]
    source_mask = encoded['attention_mask'][0]
    return source_ids, target_ids, source_mask


def format_translation_data(translations, tokenizer, input_lang, target_lang, seq_len = max_seq_len):
    """Encode one translation record for the given language direction.

    Args:
        translations: mapping from language code to sentence for one record.
        tokenizer: tokenizer whose `src_lang`/`tgt_lang` are set here for the
            lo<->vi directions before encoding.
        input_lang: key of the source sentence in `translations`.
        target_lang: key of the target sentence in `translations`.
        seq_len: padding/truncation length forwarded to `encode_str`.

    Returns:
        `None` when either sentence is missing (is `None`); otherwise the
        `(input_ids, labels, attention_mask)` tuple from `encode_str`.
    """
    source_sentence = translations[input_lang]
    target_sentence = translations[target_lang]

    # Records lacking either side cannot be encoded; the caller skips them.
    if source_sentence is None or target_sentence is None:
        return None

    # Only the two Lao/Vietnamese directions are configured. For any other pair
    # a warning is printed and the tokenizer's language settings stay as they were.
    if (input_lang, target_lang) in (('lo', 'vi'), ('vi', 'lo')):
        tokenizer.src_lang = input_lang
        tokenizer.tgt_lang = target_lang
    else:
        print('WARNING: SOMETHING WRONG WHEN RANDOMIZING LANG')

    return encode_str(source_sentence, target_sentence, tokenizer, seq_len)


def transform_batch(batch, tokenizer, input_lang, target_lang):
    """Encode a raw dataset slice into batched tensors on `device`.

    Args:
        batch: mapping with a 'translation' entry holding one translation
            record per example.
        tokenizer: tokenizer forwarded to `format_translation_data`.
        input_lang: source language key.
        target_lang: target language key.

    Returns:
        Tuple `(batch_input_ids, batch_target_ids, attentionMask)`; each stacks
        the per-example tensors along a new leading dimension.

    Raises:
        ValueError: if no record in `batch` could be encoded (every record was
            skipped, or the batch was empty).
    """
    encoded_rows = []
    for translation_set in batch['translation']:
        formatted_data = format_translation_data(
            translation_set, tokenizer, input_lang, target_lang, max_seq_len)

        # Records with a missing source or target sentence come back as None.
        if formatted_data is not None:
            encoded_rows.append(formatted_data)

    # torch.cat on an empty list fails with an opaque error; fail clearly instead.
    if not encoded_rows:
        raise ValueError(
            "transform_batch: no encodable %s->%s pairs in batch" % (input_lang, target_lang))

    def _stack(position):
        # Give each per-example tensor a leading batch axis, concatenate, and
        # place the result on the notebook's selected device (not a hard-coded
        # .cuda(), which would break the CPU fallback).
        return torch.cat([row[position].unsqueeze(0) for row in encoded_rows]).to(device)

    batch_input_ids = _stack(0)
    batch_target_ids = _stack(1)
    attentionMask = _stack(2)

    return batch_input_ids, batch_target_ids, attentionMask


def get_data_generator(dataset, input_lang, target_lang, tokenizer, batch_size = 32,
                       shuffle = True, seed = None):
    """Yield encoded batches covering `dataset` once.

    Args:
        dataset: object supporting `shuffle(seed=...)`, `len()` and slicing;
            each slice is handed to `transform_batch`.
        input_lang: source language key.
        target_lang: target language key.
        tokenizer: tokenizer forwarded to `transform_batch`.
        batch_size: number of records per slice (the last slice may be shorter).
        shuffle: when True (default, the original behaviour) the dataset is
            shuffled before batching; pass False to keep dataset order.
        seed: forwarded to `dataset.shuffle` so a run can be reproduced;
            None (default) keeps the previous unseeded behaviour.

    Yields:
        The `transform_batch` result for each consecutive slice.
    """
    if shuffle:
        dataset = dataset.shuffle(seed = seed)

    for start in range(0, len(dataset), batch_size):
        raw_batch = dataset[start:start + batch_size]
        yield transform_batch(raw_batch, tokenizer, input_lang, target_lang)


def eval_model(model, tokenizer, gdataset, input_lang = 'lo', target_lang = 'vi'):
    """Translate `gdataset` in both directions and collect references and outputs.

    For every batch the model translates input_lang -> target_lang from the
    encoded source, and target_lang -> input_lang by feeding the encoded
    target (labels) back in as the source. Reference sentences are obtained by
    decoding the token ids, so they reflect tokenizer truncation at
    `max_seq_len`.

    The variable names say "LoVi"/"ViLo" because of the default arguments;
    "LoVi" means input_lang -> target_lang and "ViLo" the reverse.

    Args:
        model: seq2seq model exposing `generate`; it is put in eval mode.
        tokenizer: tokenizer providing `get_lang_id`, `pad_token_id` and
            `batch_decode`.
        gdataset: dataset forwarded to `get_data_generator`.
        input_lang: source language key of the forward direction.
        target_lang: target language key of the forward direction.

    Returns:
        Tuple `(trueSentenceListLoVi, outputSentenceListLoVi,
        trueSentenceListViLo, outputSentenceListViLo)` of decoded string lists.
    """
    model.eval()
    test_generator = get_data_generator(gdataset, input_lang, target_lang, tokenizer, batch_size = 4)

    # Loop-invariant lookups.
    forward_bos_id = tokenizer.get_lang_id(target_lang)
    backward_bos_id = tokenizer.get_lang_id(input_lang)
    pad_token_id = tokenizer.pad_token_id

    trueSentenceListLoVi = []
    outputSentenceListLoVi = []

    trueSentenceListViLo = []
    outputSentenceListViLo = []

    for input_batch, label_batch, attention_mask_batch in test_generator:

        # Forward direction. Inputs are padded to max_seq_len, so pass the
        # attention mask explicitly instead of relying on generate() to infer it.
        generated_forward = model.generate(
            input_batch,
            attention_mask = attention_mask_batch,
            num_beams = 20,
            num_return_sequences = 1,
            max_new_tokens = max_seq_len,
            forced_bos_token_id = forward_bos_id)
        outputSentenceListLoVi.extend(
            tokenizer.batch_decode(generated_forward, skip_special_tokens = True))
        trueSentenceListLoVi.extend(
            tokenizer.batch_decode(label_batch, skip_special_tokens = True))

        # Reverse direction: the encoded target sentence is the source. The
        # tokenizer returns no mask for labels, so derive one from the padding.
        label_attention_mask = (label_batch != pad_token_id).long()
        generated_backward = model.generate(
            label_batch,
            attention_mask = label_attention_mask,
            num_beams = 20,
            num_return_sequences = 1,
            max_new_tokens = max_seq_len,
            forced_bos_token_id = backward_bos_id)
        outputSentenceListViLo.extend(
            tokenizer.batch_decode(generated_backward, skip_special_tokens = True))
        trueSentenceListViLo.extend(
            tokenizer.batch_decode(input_batch, skip_special_tokens = True))

    # Sanity check: all four lists should have the same length.
    print(len(trueSentenceListLoVi))
    print(len(outputSentenceListLoVi))
    print(len(trueSentenceListViLo))
    print(len(outputSentenceListViLo))

    return (trueSentenceListLoVi, outputSentenceListLoVi, trueSentenceListViLo, outputSentenceListViLo)


In [None]:
# Single-sentence check of the fine-tuned model.
# Direction is Vietnamese -> Lao; swap the two codes (and the input sentence)
# to try Lao -> Vietnamese.
srcLang = "vi"
tgtLang = "lo"

inputText = "Neil Armstrong là người đầu tiên bước chân lên mặt trăng."
outputText = ""

tokenizer.src_lang = srcLang
tokenizer.tgt_lang = tgtLang

# encode_str returns (input_ids, labels, attention_mask); there is no reference
# translation here, so the labels (built from the empty target) are not used.
encodedInputs = encode_str(inputText, outputText, tokenizer, max_seq_len)

# The input is padded to max_seq_len, so hand generate() the attention mask
# explicitly. The three best of 50 beams are returned.
outputSent = model.generate(
    encodedInputs[0].unsqueeze(0),
    attention_mask = encodedInputs[2].unsqueeze(0),
    num_beams = 50,
    num_return_sequences = 3,
    max_new_tokens = max_seq_len,
    forced_bos_token_id = tokenizer.get_lang_id(tgtLang))
outputSent = tokenizer.batch_decode(outputSent, skip_special_tokens = True)
print(outputSent)

In [None]:
# Run the two-way evaluation over the test split (default direction lo -> vi
# plus the reverse) and unpack references/outputs for each direction.
evalResults = eval_model(model, tokenizer, test_dataset)
(trueSentenceListLoVi,
 outputSentenceListLoVi,
 trueSentenceListViLo,
 outputSentenceListViLo) = evalResults

1016
1016
1016
1016


In [None]:
import json

# Persist model outputs and references for both directions on Drive.
# Files are opened as UTF-8 explicitly: ensure_ascii=False writes the Lao and
# Vietnamese characters verbatim, which must not depend on the locale default.
jsonDumps = [
    ("/content/drive/MyDrive/outputSentenceListLoVi.json", outputSentenceListLoVi),
    ("/content/drive/MyDrive/outputSentenceListViLo.json", outputSentenceListViLo),
    ("/content/drive/MyDrive/trueSentenceListLoVi.json", trueSentenceListLoVi),
    ("/content/drive/MyDrive/trueSentenceListViLo.json", trueSentenceListViLo),
]

for jsonPath, sentences in jsonDumps:
    with open(jsonPath, 'w', encoding = 'utf-8') as f:
        json.dump(sentences, f, indent = 2, ensure_ascii = False)