In [33]:
#Checking if GPU is running or not

!nvidia-smi

Sun Nov 26 14:02:50 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    27W /  70W |   4601MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [34]:
!pip install datasets transformers[sentencepiece] sacrebleu -q

In [35]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

In [36]:
model_checkpoint = "Helsinki-NLP/opus-mt-mul-en"

## Helsinki-NLP/opus-mt-mul-en

source: https://huggingface.co/Helsinki-NLP/opus-mt-mul-en



# The Dataset

Dataset: snow_simplified_japanese_corpus
Source: https://huggingface.co/datasets/snow_simplified_japanese_corpus


In [37]:
raw_datasets = load_dataset("snow_simplified_japanese_corpus")

In [38]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['ID', 'original_ja', 'simplified_ja', 'original_en'],
        num_rows: 50000
    })
})

In [39]:
train_ratio = 0.9
val_ratio = 0.05
test_ratio = 0.05

In [40]:
num_samples = len(raw_datasets["train"])
num_train = int(num_samples * train_ratio)
num_val = int(num_samples * val_ratio)
num_test = num_samples - num_train - num_val

In [41]:
splits = {
    "train": raw_datasets["train"].select(list(range(num_train))),
    "validation": raw_datasets["train"].select(list(range(num_train, num_train + num_val))),
    "test": raw_datasets["train"].select(list(range(num_train + num_val, num_samples))),
}


In [42]:
from datasets import DatasetDict
split_dataset = DatasetDict(splits)

In [43]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'original_ja', 'simplified_ja', 'original_en'],
        num_rows: 45000
    })
    validation: Dataset({
        features: ['ID', 'original_ja', 'simplified_ja', 'original_en'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['ID', 'original_ja', 'simplified_ja', 'original_en'],
        num_rows: 2500
    })
})

In [44]:
split_dataset['train'][1]

{'ID': '2',
 'original_ja': '多くの動物が人間によって滅ぼされた。',
 'simplified_ja': '多くの動物が人間によって殺された。',
 'original_en': 'many animals have been destroyed by men .'}

#Preprocessing the data

In [45]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [46]:
tokenizer("Hello, this is a sentence!")

{'input_ids': [17115, 146, 3, 73, 17, 9, 4, 5210, 4084, 58, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [47]:
tokenizer(["Hello, this is a sentence!", "This is another sentence."])

{'input_ids': [[17115, 146, 3, 73, 17, 9, 4, 5210, 4084, 58, 0], [6583, 269, 17, 1905, 6205, 4, 5210, 4084, 2, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [48]:
with tokenizer.as_target_tokenizer():
    print(tokenizer(["私の名前はチラグです"]))

{'input_ids': [[4, 15541, 2504, 8231, 9389, 3643, 26994, 14214, 29846, 7317, 14387, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [49]:
max_input_length = 128
max_target_length = 128

source_lang = "original_en"
target_lang = "original_ja"


def preprocess_function(examples):
    inputs = examples[source_lang]
    targets = examples[target_lang]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [50]:
preprocess_function(split_dataset["train"][:2])

{'input_ids': [[32, 96, 341, 28, 37, 325, 557, 338, 1442, 325, 365, 182, 221, 5188, 256, 4, 2, 0], [95, 288, 6669, 6, 121, 221, 31, 128, 43, 3907, 237, 44, 70, 334, 4, 2, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[4, 30593, 5116, 2187, 41481, 3605, 15660, 7547, 4703, 15541, 3605, 3643, 5239, 4703, 4538, 8012, 20566, 7307, 445, 0], [4, 5706, 7547, 2504, 17891, 10557, 5116, 1858, 17443, 3605, 10905, 7720, 8983, 1, 12166, 5699, 5853, 445, 0]]}

In [51]:
tokenized_datasets = split_dataset.map(preprocess_function, batched=True)

In [52]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-mul-en.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [53]:
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 20

In [54]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [55]:
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [56]:
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)


In [57]:
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

In [58]:
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [59]:
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

In [60]:
model.fit(train_dataset, validation_data=validation_dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7e8e9f802110>

In [61]:
model.save_pretrained("tf_model/")

# Model Testing

In [62]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at tf_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [63]:
input_text  = "My name is chirag."

tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

tf.Tensor(
[[64171     4 15541  2504  8231  9389  3643   391 14214 15127  7317 14387
    445     0]], shape=(1, 14), dtype=int32)


In [74]:
with tokenizer.as_target_tokenizer():
    print(tokenizer.decode(out[0], skip_special_tokens=True))

私の名前は chiラクです。




In [82]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM


decoded_translations = tokenizer.batch_decode(out, skip_special_tokens=True)[0]

# Example reference translations in Japanese
reference_translations = [
    "私の名前はチラグです"
]

# Calculate BLEU score
overall_bleu_score = corpus_bleu([reference_translations], [decoded_translations], smoothing_function=None)

# Token-wise BLEU scores
token_bleu_scores = []
for n in range(1, 5):  # You can adjust the n-gram range
    token_bleu = sentence_bleu([reference_translations[0].split()], decoded_translations.split(), weights=(1/n,)*n)
    token_bleu_scores.append(token_bleu)

print(f"Overall BLEU Score: {overall_bleu_score * 100:.2f}")
#print(f"Token-wise BLEU Scores: {token_bleu_scores}")



Overall BLEU Score: 31.61
