In [1]:
from transformers import (
    MBartForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer, MBart50TokenizerFast
)
import torch
import os
from torch.utils.data import random_split
import datasets

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

In [3]:
checkpoint = 'facebook/mbart-large-50-many-to-many-mmt'
tokenizer = MBart50TokenizerFast.from_pretrained(checkpoint)   #("facebook/mbart-large-50-many-to-many-mmt")

In [4]:
path = './Datasets/tenlgu-hindi/fulldata'
hindi = []
telugu = []
for folder in os.listdir(path):
    subpath = os.path.join(path,folder)
    for file in os.listdir(subpath):
        if file.endswith('.hi'):
            # print(file)
            with open(os.path.join(subpath, file),'r') as hindifile:
                hindi.extend(hindifile.readlines())
        elif file.endswith('.te'):
            # print(file)
            with open(os.path.join(subpath, file),'r') as telugufile:
                telugu.extend(telugufile.readlines())
    assert len(hindi) == len(telugu)


In [5]:
def prepareData(hindi, telugu):
    size=  len(hindi)
    data = []
    for i in range(size):
        if(len(hindi[i].strip().split()) > 150 or len(telugu[i].strip().split())> 150):continue
        data.append({
            'id': i,
            "translation": {
                "hi": hindi[i].strip(),
                "te": telugu[i].strip()
            }
        })
    print(f'Total Data Size : {len(data)}')
    dataset = datasets.Dataset.from_list(data)
    return dataset
hi_te_books = prepareData(hindi, telugu)

Total Data Size : 549409


In [6]:
max_input_length = 256
max_target_length = 256

source_lang = "hi"

target_lang = "te"

prefix = "हिंदी से तेलुगू में अनुवाद करें:"

def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
tokenized_hi_te_books = hi_te_books.map(preprocess_function, batched=True)

Map:   0%|          | 0/549409 [00:00<?, ? examples/s]

In [8]:
tokenized_hi_te_split = tokenized_hi_te_books.train_test_split(train_size=0.7, shuffle=True, seed = 0)
tokenized_hi_te_train = tokenized_hi_te_split['train']
tokenized_hi_te_test = tokenized_hi_te_split['test'].train_test_split(train_size=0.5, seed = 1)

In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [10]:
import evaluate
metric = evaluate.load("sacrebleu")

In [11]:
import numpy as np


def postprocess_text(preds, labels):

    preds = [pred.strip() for pred in preds]

    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):

    preds, labels = eval_preds

    if isinstance(preds, tuple):

        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result["gen_len"] = np.mean(prediction_lens)

    result = {k: round(v, 4) for k, v in result.items()}

    return result

In [12]:
model = MBartForConditionalGeneration.from_pretrained(checkpoint,resume_download = True)

In [13]:
training_args = Seq2SeqTrainingArguments(

    output_dir="./pretrained_models/",

    evaluation_strategy="epoch",

    learning_rate=2e-5,

    per_device_train_batch_size=8,

    per_device_eval_batch_size=8,

    weight_decay=0.01,

    save_total_limit=15,

    num_train_epochs=5,

    predict_with_generate=True,

    fp16=True,

    logging_dir="./logs/",
    
    logging_steps=10000,

    save_steps=10000,
                        
    report_to=['tensorboard']
)

trainer = Seq2SeqTrainer(

    model=model,

    args=training_args,

    train_dataset=tokenized_hi_te_train,

    eval_dataset=tokenized_hi_te_test["train"],

    tokenizer=tokenizer,

    data_collator=data_collator,

    compute_metrics=compute_metrics,

)

Using cuda_amp half precision backend


In [14]:
## Accelerate is a library created by Hugging Face that enables the same PyTorch code to be run across any distributed configuration by adding just four lines of code 1. In short, it makes training and inference at scale simple, efficient, and adaptable 1.
## Accelerate abstracts exactly and only the boilerplate code related to multi-GPUs/TPU/fp16 and leaves the rest of your code unchanged 2. 
## By adding a few lines to any standard PyTorch training script, you can now run on any kind of single or distributed node setting (single CPU, 
## single GPU, multi-GPUs and TPUs) as well as with or without mixed precision (fp16) 2.
# from accelerate import Accelerator
# accelerator = Accelerator()
# tokenized_hi_te_split, trainer = accelerator.prepare(tokenized_hi_te_split, trainer)


In [15]:
# trainer.train() 

In [18]:
## Loading the model from checkpoint
model = MBartForConditionalGeneration.from_pretrained("./pretrained_models/checkpoint-30000")

loading configuration file ./pretrained_models/checkpoint-30000/config.json
Model config MBartConfig {
  "_name_or_path": "facebook/mbart-large-50-many-to-many-mmt",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "MBartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": 

In [30]:
a = tokenized_hi_te_test["test"].train_test_split(test_size= 0.04)['test']

In [31]:
training_args = Seq2SeqTrainingArguments(

    output_dir="./pretrained_models/",

    evaluation_strategy="epoch",

    learning_rate=2e-5,

    per_device_train_batch_size=8,

    per_device_eval_batch_size=8,

    weight_decay=0.01,

    save_total_limit=15,

    num_train_epochs=5,

    predict_with_generate=True,

    generation_max_length= 256,
    
    generation_num_beams= 2,
    
    fp16=True,

    logging_dir="./logs/",
    
    logging_steps=10000,

    save_steps=10000,
                        
    report_to=['tensorboard']
)

trainer = Seq2SeqTrainer(

    model=model,

    args=training_args,

    train_dataset=tokenized_hi_te_train,

    eval_dataset=a,

    tokenizer=tokenizer,

    data_collator=data_collator,

    compute_metrics=compute_metrics,

)

PyTorch: setting up devices
Using cuda_amp half precision backend


In [18]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: translation, id. If translation, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 82412
  Batch size = 32
You're using a MBart50TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.0571668148040771,
 'eval_bleu': 38.7506,
 'eval_gen_len': 28.4632,
 'eval_runtime': 10234.4166,
 'eval_samples_per_second': 8.052,
 'eval_steps_per_second': 0.252}

In [29]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: translation, id. If translation, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8242
  Batch size = 32


{'eval_loss': 1.0809255838394165,
 'eval_bleu': 37.9944,
 'eval_gen_len': 28.6859,
 'eval_runtime': 632.9504,
 'eval_samples_per_second': 13.022,
 'eval_steps_per_second': 0.408}