found here : https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb

In [1]:
from datasets import load_dataset
from transformers import (
    MBartForConditionalGeneration, MBartTokenizer, 
    Seq2SeqTrainingArguments, Seq2SeqTrainer
  )

import torch
from torch.utils.data import random_split

In [2]:
rootdata = "../data_test/parallel/"
data = []
with open(rootdata+"IITB.en-hi.en") as f2, open(rootdata+"IITB.en-hi.hi") as f1:
    for src, tgt in zip(f1, f2):
      data.append(
          {
              "translation": {
                  "hi": src.strip(),
                  "en": tgt.strip()
              }
          }
      )
print(f'total size of data is {len(data)}')

total size of data is 1609682


In [3]:
data[0]

{'translation': {'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें',
  'en': 'Give your application an accessibility workout'}}

In [4]:
data = data[:100]

In [5]:
len(data)

100

In [6]:
# splitting dataset into train, validation
split = 0.2
train_dataset, eval_dataset = random_split(data, lengths=[int((1-split)*len(data)), int(split*len(data))])

In [7]:
# defining collator functioon for preparing batches on the fly ..
def data_collator(features:list):
    labels = [f["translation"]["en"] for f in features]
    inputs = [f["translation"]["hi"] for f in features]

    batch = tokenizer.prepare_seq2seq_batch(src_texts=inputs, src_lang="hi_IN", tgt_lang="en_XX", tgt_texts=labels, max_length=32, max_target_length=32)

    for k in batch:
        batch[k] = torch.tensor(batch[k])

    return batch

In [8]:
from transformers import MarianMTModel, MarianTokenizer

In [15]:
# initiating model, tokenizer
# model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
# tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")

model_name = 'Helsinki-NLP/opus-mt-en-hi'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

In [18]:
# model = MarianMTModel.from_pretrained(model_name)
# tokenizer = MarianTokenizer.from_pretrained(model_name)
# sample_text = "अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें"
sample_text = "Give your application an accessibility workout"
batch = tokenizer([sample_text], return_tensors="pt")
gen = model.generate(**batch)
tokenizer.batch_decode(gen, skip_special_tokens=True)

['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें']

In [19]:
# defining training related arguments
args = Seq2SeqTrainingArguments(output_dir="indic-mbart",
                        do_train=True,
                        do_eval=True,
                        evaluation_strategy="epoch",
                        per_device_train_batch_size=16,
                        per_device_eval_batch_size=16,
                        learning_rate=5e-5,
                        num_train_epochs=2,
                        logging_dir="/home/abarthe/.tensorboard_files/logs")

In [20]:
# defining trainer using 🤗
trainer = Seq2SeqTrainer(model=model, 
                args=args, 
                data_collator=data_collator, 
                train_dataset=train_dataset, 
                eval_dataset=eval_dataset)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,No log,3.782735,0.0636,314.556
2,No log,3.293343,0.064,312.617


TrainOutput(global_step=10, training_loss=4.564515686035156, metrics={'train_runtime': 1.7641, 'train_samples_per_second': 5.669, 'total_flos': 2287769223168.0, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 46096, 'init_mem_gpu_alloc_delta': 305772544, 'init_mem_cpu_peaked_delta': 18258, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 209758, 'train_mem_gpu_alloc_delta': 910295040, 'train_mem_cpu_peaked_delta': 45516, 'train_mem_gpu_peaked_delta': 776058368})

In [22]:
sample_text = "Give your application an accessibility workout"
batch = tokenizer([sample_text], return_tensors="pt")
gen = model.generate(**batch)
tokenizer.batch_decode(gen, skip_special_tokens=True)

RuntimeError: Input, output and indices must be on the current device

In [None]:
inputs = "अंतिम प्रविष्ट घटना को हाइलाइट करो"
inputs_tokenized = tokenizer(inputs, return_tensors="pt", padding=True)

In [None]:
trainer.predict(inputs)

In [None]:
ft_model = "finetuned/test"
trainer.save_model(ft_model)
tokenizer.save_pretrained(ft_model)

In [None]:
# from transformers import pipeline
# model_id = "vasudevgupta/mbart-iitb-hin-eng"
# translator = pipeline("translation_hi_to_en", model=model_id, tokenizer=model_id)

In [None]:
model_trained = trainer.model
tokenizer_trained = trainer.tokenizer

In [None]:
tokenizer_trained.type

In [None]:
translated = model_trained.generate(**tokenizer_trained(inputs, return_tensors="pt", padding=True))

In [None]:
# lets see how our model performs
inputs = "अंतिम प्रविष्ट घटना को हाइलाइट करो"
inputs_tokenized = tokenizer(inputs, return_tensors="pt", padding=True)

# translation = translator(inputs, return_text=True)
# translation = [t["translation_text"] for t in translation]
# print(translation)

In [None]:
inputs_tokenized

In [None]:
inputs = 

In [None]:
from transformers import pipeline
translator = pipeline("translation_hi_to_en", model=trainer.model, tokenizer=tokenizer)

In [None]:
# lets see how our model performs
inputs = "अंतिम प्रविष्ट घटना को हाइलाइट करो"

translation = translator(inputs, return_text=True)
translation = [t["translation_text"] for t in translation]
print(translation)

In [None]:
input