In [1]:
from transformers import (
       MT5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer, MT5TokenizerFast,  MT5Tokenizer
)
import torch
import os
from torch.utils.data import random_split
import datasets

In [2]:
# Expose only GPUs 0, 1 and 2 to CUDA in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(3)))

In [3]:
# Alternative starting point (public base model), kept for reference:
# checkpoint = "google/mt5-small"
# Local checkpoint directory; the tokenizer here and the model later in the
# notebook are both loaded from this path.
checkpoint = './pretrained_models/mT5/checkpoint-80125'
# Tokenizer loaded from the same checkpoint directory.
tokenizer =  MT5Tokenizer.from_pretrained(checkpoint)

In [4]:
# Read the parallel corpus: every sub-folder of `path` contains `.hi` (Hindi)
# and `.te` (Telugu) files whose lines are expected to pair up one-to-one.
path = './Datasets/tenlgu-hindi/fulldata'
hindi = []
telugu = []
# os.listdir() returns entries in arbitrary order. Both levels are sorted so
# the .hi and .te files are visited in the same relative order on every run;
# otherwise the two lists can end up the same length but misaligned.
# (Assumes paired files share a stem, e.g. `x.hi` / `x.te` — TODO confirm.)
for folder in sorted(os.listdir(path)):
    subpath = os.path.join(path, folder)
    if not os.path.isdir(subpath):
        # Skip stray files sitting next to the corpus folders.
        continue
    for file in sorted(os.listdir(subpath)):
        if file.endswith('.hi'):
            # Explicit encoding: do not depend on the platform default for
            # Devanagari text (corpus assumed to be UTF-8).
            with open(os.path.join(subpath, file), 'r', encoding='utf-8') as hindifile:
                hindi.extend(hindifile.readlines())
        elif file.endswith('.te'):
            with open(os.path.join(subpath, file), 'r', encoding='utf-8') as telugufile:
                telugu.extend(telugufile.readlines())
    # Checked per folder so a mismatch is reported at the first offending folder.
    assert len(hindi) == len(telugu), (
        f'line count mismatch after {subpath}: {len(hindi)} hi vs {len(telugu)} te'
    )

In [5]:
def prepareData(hindi, telugu, max_words=150):
    """Build a `datasets.Dataset` of Hindi-Telugu translation pairs.

    Args:
        hindi: Sequence of Hindi sentences, one per entry.
        telugu: Sequence of Telugu sentences, aligned index-by-index with
            `hindi`.
        max_words: A pair is dropped when either side has more than this many
            whitespace-separated words. Defaults to 150.

    Returns:
        A dataset with one row per kept pair:
        ``{"id": <index in the input lists>, "translation": {"hi": ..., "te": ...}}``
        where both texts have surrounding whitespace stripped.

    Raises:
        ValueError: If `hindi` and `telugu` differ in length (the pairing
            would otherwise be silently wrong or fail with an IndexError).

    Side effects:
        Prints the number of kept pairs.
    """
    if len(hindi) != len(telugu):
        raise ValueError(
            f'hindi and telugu must be the same length, got {len(hindi)} and {len(telugu)}'
        )

    data = []
    for i, (hi_line, te_line) in enumerate(zip(hindi, telugu)):
        # Strip once and reuse for both the length filter and the stored text.
        hi_text = hi_line.strip()
        te_text = te_line.strip()
        if len(hi_text.split()) > max_words or len(te_text.split()) > max_words:
            continue
        data.append({
            'id': i,  # index in the unfiltered input, so ids may have gaps
            "translation": {
                "hi": hi_text,
                "te": te_text
            }
        })
    print(f'Total Data Size : {len(data)}')
    return datasets.Dataset.from_list(data)
# Build the Hindi-Telugu dataset from the two line lists read above.
hi_te_books = prepareData(hindi=hindi, telugu=telugu)

Total Data Size : 549409


In [6]:
# Maximum number of tokens kept (longer sequences are truncated) for the
# source and target side respectively.
max_input_length = max_target_length = 256

# Keys of each "translation" dict used as source and target text.
source_lang, target_lang = "hi", "te"

# Task prefix prepended to every source sentence
# (Hindi for "Translate from Hindi to Telugu:").
prefix = "हिंदी से तेलुगू में अनुवाद करें:"

def preprocess_function(examples):
    """Tokenize one batch of translation pairs for seq2seq training.

    `examples["translation"]` is a list of dicts keyed by language code. The
    source text, with `prefix` prepended, is tokenized as the model input; the
    token ids of the target text are stored under "labels". Both sides are
    truncated to their respective maximum length.
    """
    source_texts = []
    target_texts = []
    for pair in examples["translation"]:
        source_texts.append(prefix + pair[source_lang])
        target_texts.append(pair[target_lang])

    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)
    target_encoding = tokenizer(target_texts, max_length=max_target_length, truncation=True)
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [7]:
# Tokenize the whole dataset; with batched=True preprocess_function receives a
# batch of rows (dict of lists) per call rather than a single row.
tokenized_hi_te_books = hi_te_books.map(preprocess_function, batched=True)

Map:   0%|          | 0/549409 [00:00<?, ? examples/s]

In [8]:
# 70% of the data for training, the remaining 30% held out (fixed seed).
tokenized_hi_te_split = tokenized_hi_te_books.train_test_split(train_size=0.7, shuffle=True, seed = 0)
tokenized_hi_te_train = tokenized_hi_te_split['train']
# The held-out 30% is split in half again. Despite its name this is a
# DatasetDict: ['train'] is the pool the validation subset is drawn from and
# ['test'] is the final test set used for the last evaluation.
tokenized_hi_te_test = tokenized_hi_te_split['test'].train_test_split(train_size=0.5, seed = 1)

In [9]:
# Pads inputs and labels per batch. The collator's `model` argument expects a
# model object (it is only used to call the model's
# `prepare_decoder_input_ids_from_labels`); a checkpoint path string has no such
# method and is ignored, so it is not passed here.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [10]:
import evaluate
# SacreBLEU scorer loaded through the `evaluate` library; compute_metrics reads
# its "score" field.
metric = evaluate.load("sacrebleu")

In [11]:
import numpy as np


def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Returns:
        A pair ``(preds, labels)`` where `preds` is a flat list of stripped
        strings and every stripped label is wrapped in its own one-element
        list (one reference per prediction).
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation run.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair of token-id arrays.
            `predictions` may itself be a tuple, in which case its first
            element is used.

    Returns:
        ``{"bleu": <metric score>, "gen_len": <mean non-pad tokens per
        prediction>}`` with both values rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be replaced by the pad id before decoding. Predictions are cleaned
    # as well as labels: they can carry -100 padding too, which would break
    # decoding and be counted as real tokens in gen_len.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length of each prediction, not counting pad tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [12]:
# Load the mT5 weights from the same local checkpoint directory as the tokenizer.
# NOTE(review): `resume_download` relates to resuming interrupted downloads;
# with a local path it presumably has no effect — confirm and consider dropping.
model = MT5ForConditionalGeneration.from_pretrained(checkpoint,resume_download = True)

In [13]:
# Validation subset used for the per-epoch evaluation during training: 10% of
# the validation half of the held-out data. Seeded like the other two splits so
# the same rows are selected on every run; otherwise metrics and best-model
# selection are not comparable between runs.
small_val = tokenized_hi_te_test['train'].train_test_split(test_size=0.1, seed=2)['test']

In [14]:
# Fine-tuning configuration: evaluate, log and checkpoint once per epoch.
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and TensorBoard logs are written.
    output_dir="./pretrained_models/mT5",
    logging_dir="./logs/t5logs_hite",
    report_to=['tensorboard'],
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    fp16=False,
    # Per-epoch schedule; evaluation and saving use the same strategy and the
    # best checkpoint is reloaded when training ends.
    evaluation_strategy="epoch",
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=15,
    load_best_model_at_end=True,
    # Evaluate on generated sequences so compute_metrics can decode them.
    predict_with_generate=True,
)

# Trainer used for fine-tuning; evaluation during training runs on `small_val`.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_hi_te_train,
    eval_dataset=small_val,
)

In [15]:
from accelerate import Accelerator
# The Trainer manages device placement and multi-GPU execution on its own.
# `Accelerator.prepare` is meant for torch models, optimizers, dataloaders and
# schedulers; it does not wrap a DatasetDict or a Trainer, so those objects are
# deliberately not passed through it.
accelerator = Accelerator()

In [16]:
# Run fine-tuning with the arguments configured above.
# NOTE(review): the weights are loaded from `checkpoint`, which lives inside
# output_dir, but train() is called without resume_from_checkpoint, so the step
# counter and optimizer state start fresh. Confirm this is intended and that
# checkpoint rotation (save_total_limit) cannot delete the source checkpoint.
trainer.train() 

In [17]:
# Rebuild the trainer with the same model and arguments, but pointing
# evaluation at the final test set instead of the small validation subset.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_hi_te_train,
    eval_dataset=tokenized_hi_te_test['test'],
)

In [18]:
# Evaluate on the trainer's eval_dataset (the final test set); the returned
# dict includes the loss plus the bleu / gen_len values from compute_metrics.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation, id. If translation, id are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 82411
  Batch size = 24


{'eval_loss': 2.215162754058838,
 'eval_bleu': 8.723,
 'eval_gen_len': 14.1217,
 'eval_runtime': 3401.5515,
 'eval_samples_per_second': 24.227,
 'eval_steps_per_second': 1.01}