# Language Translation Model using Hugging Face

### Import Dependencies

In [None]:
# !pip install datasets transformers[sentencepiece] sacrebleu -q

# !pip install --upgrade accelerate
# !pip uninstall -y transformers accelerate
# !pip install transformers accelerate

In [1]:
import os
import sys
import tensorflow as tf
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from transformers import AdamWeightDecay

### Loading Pre-trained model

Some well-known translation models and services:
- Google T5
- Facebook M2M100
- Google Cloud Translation API
- Tune AI
- Amazon Comprehend

In [2]:
# Hub id of the pre-trained checkpoint to fine-tune (nothing is downloaded on this line):
#   https://huggingface.co/Helsinki-NLP/opus-mt-en-hi
# Corpus used for fine-tuning; it loads as a DatasetDict with train / validation / test
# splits, each exposing a single 'translation' feature:
#   https://huggingface.co/datasets/cfilt/iitb-english-hindi

model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
dataset = load_dataset("cfilt/iitb-english-hindi")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [4]:
dataset["train"]

Dataset({
    features: ['translation'],
    num_rows: 1659083
})

In [5]:
dataset["validation"]


Dataset({
    features: ['translation'],
    num_rows: 520
})

In [6]:
dataset["test"]


Dataset({
    features: ['translation'],
    num_rows: 2507
})

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
tokenizer("There is Sunny who is teaching Data Science")

{'input_ids': [547, 23, 11647, 11453, 66, 23, 1762, 3671, 14932, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
# Tokenize a sentence as a *target* (label) sequence. Passing it through `text_target=`
# replaces the deprecated `as_target_tokenizer()` context manager.
print(tokenizer(text_target="It is a sunny day."))

{'input_ids': [56, 142, 23, 19, 44, 16, 2930, 813, 667, 44, 7364, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}




In [10]:
max_input_length = 128
max_target_length = 128

source_lang = "en"
target_lang = "hi"

def preprocess_function(sentence):
    """
    Tokenize a batch of translation pairs for seq2seq fine-tuning.

    Parameters
    ----------
    sentence : dict
        A batch as handed over by `dataset.map(..., batched=True)`. Its "translation"
        entry is a list of dicts, each indexed by `source_lang` and `target_lang`.

    Returns
    -------
    The tokenizer output for the source sentences (truncated to `max_input_length`),
    with an added "labels" entry holding the token ids of the target sentences
    (truncated to `max_target_length`).
    """
    inputs = [ex[source_lang] for ex in sentence["translation"]]
    targets = [ex[target_lang] for ex in sentence["translation"]]

    # Source side: truncate with the *input* limit, and tokenize only once.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Target side: `text_target=` tokenizes the sentences as labels and replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
tokenized_data = dataset.map(preprocess_function, batched=True)

In [17]:
tokenized_data["train"]["labels"][0]

[63, 2025, 18, 16155, 346, 20311, 24, 2279, 679, 0]

In [12]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

tf_model.h5:   0%|          | 0.00/306M [00:00<?, ?B/s]

2024-03-10 15:37:36.009011: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-03-10 15:37:36.009127: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-03-10 15:37:36.009157: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-03-10 15:37:36.009520: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-10 15:37:36.009552: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If yo

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [13]:
# Fine-tuning hyperparameters used by the dataset, optimizer and fit cells.
batch_size = 32  # examples per batch given to prepare_tf_dataset
# NOTE(review): 1e-3 looks high for fine-tuning an already pre-trained checkpoint;
# values around 2e-5 are more typical. Confirm this is intentional.
learning_rate = 0.001  # passed to AdamWeightDecay
weight_decay = 0.01  # passed to AdamWeightDecay as weight_decay_rate
num_train_epochs = 1  # passed to model.fit as epochs

In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")

In [19]:
# NOTE(review): the training batches are built from the "test" split, not "train".
# Presumably a deliberate shortcut to keep the run small (2,507 rows instead of
# 1,659,083); confirm. If so, the test split can no longer serve as held-out data
# for evaluating this model.
train_dataset = model.prepare_tf_dataset(
    tokenized_data["test"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation is not shuffled: with shuffle=True, prepare_tf_dataset also defaults
# drop_remainder to True, which would discard the final partial batch
# (520 rows is not a multiple of the batch size of 32).
validation_dataset = model.prepare_tf_dataset(
    tokenized_data["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

In [21]:
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)

In [23]:
model.compile(optimizer = optimizer)

In [24]:
model.summary()

Model: "tf_marian_mt_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  76381184  
                                                                 
 final_logits_bias (BiasLay  multiple                  61950     
 er)                                                             
                                                                 
Total params: 76443134 (291.61 MB)
Trainable params: 76381184 (291.37 MB)
Non-trainable params: 61950 (241.99 KB)
_________________________________________________________________


In [25]:
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)

Cause: for/else statement not yet supported
Cause: for/else statement not yet supported


2024-03-10 15:42:13.523820: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.




: 

In [None]:
model.save_pretrained("tf_model/")

### Inference with the Fine-tuned Model

In [4]:
# Setup for inference. The imports and names are declared again here, presumably so
# this section can be run in a fresh kernel without the training cells.
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
# The tokenizer comes from the original hub checkpoint: the notebook only writes the
# model to "tf_model/" (model.save_pretrained), not the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Model restored from the directory written by model.save_pretrained("tf_model/").
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

2024-03-10 16:46:51.557487: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-03-10 16:46:51.557508: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-03-10 16:46:51.557517: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-03-10 16:46:51.557667: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-10 16:46:51.557678: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at tf_model/.
If your task is simila

In [5]:
# English sentence to translate.
input_text  = "I am learning Coding. How are you"

# Tokenize a one-element batch as NumPy arrays; the result is unpacked into generate().
tokenized = tokenizer([input_text], return_tensors='np')
# max_length=128 caps the length of the generated sequence.
out = model.generate(**tokenized, max_length=128)
# `out` contains token ids, not text; it is decoded in the following cell.
print(out)

tf.Tensor(
[[61949   104 10328   260 13055     6   207    11  2713   153   254     2
    118   280    28    40     0]], shape=(1, 17), dtype=int32)


In [6]:
# Turn the generated token ids back into text. decode() already uses the target-language
# tokenizer by default, so the deprecated `as_target_tokenizer()` wrapper is not needed.
print(tokenizer.decode(out[0], skip_special_tokens=True))

मैं पटाखा के बारे में पढ़ रहा हूँ, आप कैसे हैं।


