In [1]:
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AdamWeightDecay




In [2]:
!wget http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/moses/en-hi.txt.zip -O opus_en_hi.zip
!unzip opus_en_hi.zip

data = pd.read_csv("en-hi.txt", delimiter="\t", names=["en", "hi"])

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


FileNotFoundError: [Errno 2] No such file or directory: 'en-hi.txt'

In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [5]:
raw_datasets['train'][0]

{'translation': {'en': 'Give your application an accessibility workout',
  'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'}}

In [2]:
model_checkpoint = 'Helsinki-NLP/opus-mt-en-hi'

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [7]:
tokenizer("Hello, world!")

{'input_ids': [12110, 2, 319, 61, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [8]:
with tokenizer.as_target_tokenizer():
    print(tokenizer(['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें']))

{'input_ids': [[63, 2025, 18, 16155, 346, 20311, 24, 2279, 679, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}




In [9]:
max_input_length = 512
max_target_length = 512

source_lang = "en"
target_lang = "hi"

def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples['translation']]
    targets = [ex[target_lang] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, max_length=max_input_length,truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

def preprocess_bidirectional(examples):
    inputs = [ex['en'] for ex in examples['translation']] + [ex['hi'] for ex in examples['translation']]
    targets = [ex['hi'] for ex in examples['translation']] + [ex['en'] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_bidirectional, batched=True)

In [10]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[3872, 85, 2501, 132, 15441, 36398, 0], [32643, 28541, 36253, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]], 'labels': [[63, 2025, 18, 16155, 346, 20311, 24, 2279, 679, 0], [26618, 16155, 346, 33383, 0]]}

In [11]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Map:   0%|          | 0/2507 [00:00<?, ? examples/s]

In [12]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)




All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [13]:
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 10

In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors='tf')

In [15]:
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors='tf', pad_to_multiple_of=128)

In [16]:
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

In [17]:
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

In [18]:
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [19]:
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

In [20]:
model.fit(train_dataset, validation_data=validation_dataset, epochs=1) 




<tf_keras.src.callbacks.History at 0x21f3cf4a7e0>

In [21]:
model.save_pretrained('tf_model/')

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]]}


# Model Testing

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained('tf_model/')

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at tf_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [27]:
input_text = "Why can't you just stay calm for once?"
tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

tf.Tensor(
[[61949   278   118    38   418     6    39  7607   693     6    39    29
     57   175    22     0 61949]], shape=(1, 17), dtype=int32)


In [28]:
with tokenizer.as_target_tokenizer():
    print(tokenizer.decode(out[0], skip_special_tokens=True))

क्यों आप एक बार के लिए शांत रहने के लिए नहीं कर सकते?
