# Step 2: Translation Model Creation

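Fine-tune a pretrained Spanish-to-English translation model (`Helsinki-NLP/opus-mt-es-en`) on the OPUS Books English–Spanish corpus and save it so sentences can be translated to English.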

In [2]:
import os
import sys
import transformers
import pandas as pd
import tensorflow as tf
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from ftfy import fix_text

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Name of the pretrained Spanish (es) -> English (en) translation checkpoint.
# Only the name is set here; the tokenizer and model are loaded from it later.
model_checkpoint = "Helsinki-NLP/opus-mt-es-en"

# Parallel English/Spanish sentence pairs ("en-es" configuration of OPUS Books) used for fine-tuning
raw_datasets = load_dataset("Helsinki-NLP/opus_books", "en-es")

In [4]:
# Hold out 0.5% of the original training data as the test set.
# A fixed seed keeps the three splits identical across kernel restarts.
train_test_split = raw_datasets['train'].train_test_split(test_size=0.005, seed=42)
# From the remaining 99.5%, hold out another 0.5% for validation.
validation_test_split = train_test_split['train'].train_test_split(test_size=0.005, seed=42)

# Organize the splits into a new DatasetDict
raw_datasets = DatasetDict({
    'train': validation_test_split['train'],
    'validation': validation_test_split['test'],
    'test': train_test_split['test']
})

In [5]:
import sentencepiece

# Record the installed SentencePiece version alongside the notebook output.
print(f"SentencePiece version: {sentencepiece.__version__}")

SentencePiece version: 0.2.0


In [6]:
# Load the tokenizer that belongs to the pretrained checkpoint named in `model_checkpoint`
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [7]:
# Truncation limits passed as `max_length` when tokenizing source and target sentences.
max_input_length = max_target_length = 128

# Language codes used as keys into each example's "translation" dict.
source_lang, target_lang = "es", "en"

def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq fine-tuning.

    Args:
        examples: A batch whose "translation" entry is a list of dicts keyed
            by language code (``source_lang`` and ``target_lang``).

    Returns:
        The tokenizer output for the source sentences, with the token ids of
        the target sentences added under the "labels" key.
    """
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets via `text_target=`; this replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager while keeping a
    # separate truncation limit for the target side.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Sanity check: run the preprocessing on the first two training examples and display the result
preprocess_function(raw_datasets["train"][:2])




{'input_ids': [[4879, 746, 3, 0], [9636, 31721, 12, 9259, 28, 46481, 8114, 17939, 4, 419, 27769, 1961, 3, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[52, 2697, 746, 395, 0], [1045, 8, 22765, 2957, 3393, 3110, 237, 115, 8361, 1008, 5, 7989, 3, 0]]}

In [8]:
# Tokenize every split; batched=True hands preprocess_function a batch of examples at a time
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# Pretrained translation model that will be fine-tuned
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Training hyperparameters
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1

# NOTE: `data_collator` is intentionally not built here. The following cell
# constructs it (together with `generation_data_collator`), so creating an
# identical collator in this cell was redundant.

Map: 100%|██████████| 92536/92536 [00:19<00:00, 4722.07 examples/s]
Map: 100%|██████████| 466/466 [00:00<00:00, 5188.96 examples/s]
Map: 100%|██████████| 468/468 [00:00<00:00, 5064.55 examples/s]
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-es-en.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [9]:
# Collator that assembles tokenized examples into batches of TensorFlow tensors
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
# Same collator, but with padding rounded up to a multiple of 128
# NOTE(review): presumably this keeps batch shapes uniform for generation — confirm
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [10]:
# Training batches must come from the "train" split. The original code used
# the held-out "test" split here, which trained on a few hundred examples
# and leaked the test data into training.
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation batches, kept in a fixed order
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

# Validation examples again, in smaller batches built with the generation collator
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [11]:
# Optimizer configured from the learning-rate and weight-decay hyperparameters
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
# No loss is passed to compile(), matching the original cell
model.compile(optimizer=optimizer)
# Use the configured epoch count instead of a hard-coded 1, and keep the
# History object so the per-epoch metrics can be inspected afterwards.
history = model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)



<tf_keras.src.callbacks.History at 0x1564b0690>

In [12]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
save_dir = "translation_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]]}


('translation_model/tokenizer_config.json',
 'translation_model/special_tokens_map.json',
 'translation_model/vocab.json',
 'translation_model/source.spm',
 'translation_model/target.spm',
 'translation_model/added_tokens.json')