In [1]:
#Checking if GPU is running or not

!nvidia-smi

Sat Dec  9 08:19:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
!pip install --upgrade huggingface-hub



In [3]:
!pip install sacrebleu



In [5]:
#to use hugging face library need to install transformer(hv h.face functionality)
!pip install datasets transformers[sentencepiece] sacrebleu -q

In [6]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay                                    #adamwi8decay is an optimizer
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

In [7]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi" #to load model

### Helsinki-NLP/opus-mt-en-hi model
source: https://huggingface.co/Helsinki-NLP/opus-mt-en-hi

### The Dataset
Source: https://huggingface.co/datasets/cfilt/iitb-english-hindi

In [8]:
raw_datasets = load_dataset("cfilt/iitb-english-hindi")   #using h.face API(load_dataset) can load dataset



In [9]:
raw_datasets   #it gives metadata info related to data(dont give full data)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [10]:
raw_datasets['train'][1]

{'translation': {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'}}

## Preprocessing the data

In [11]:
#  to convert eng-hin to nums
# generate embedding, to genrate it need tokenizer
# when raw data uploaded it convert to numeric reprstn cze transformer have prebuild tokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [12]:
tokenizer("Hello, this is a sentence!")

{'input_ids': [12110, 2, 90, 23, 19, 8800, 61, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:
tokenizer(["Hello, this is a sentence!", "This is another sentence."])

{'input_ids': [[12110, 2, 90, 23, 19, 8800, 61, 0], [239, 23, 414, 8800, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [14]:
#target_tokenizer is even target can embed to num
with tokenizer.as_target_tokenizer():
    print(tokenizer(["एक्सेर्साइसर पहुंचनीयता अन्वेषक"]))

{'input_ids': [[26618, 16155, 346, 33383, 0]], 'attention_mask': [[1, 1, 1, 1, 1]]}




In [15]:
#preprocess functn

max_input_length = 128                                                  # hypertunners
max_target_length = 128

source_lang = "en"
target_lang = "hi"


def preprocess_function(examples):                                       #preproceesor functn to cnvrt eng to hin
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
preprocess_function(raw_datasets["train"][:2])

{'input_ids': [[3872, 85, 2501, 132, 15441, 36398, 0], [32643, 28541, 36253, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]], 'labels': [[63, 2025, 18, 16155, 346, 20311, 24, 2279, 679, 0], [26618, 16155, 346, 33383, 0]]}

In [17]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)     #to do tokeniztn on all the data using map funtn

In [18]:
#to load model  #download data and loads
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [19]:
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1  #its a pretrained model so 1 epoch is eng to learn, while realwrld  data incrse epoch

In [20]:
#collator pss data in batchwise to avoid crash
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [21]:
#same adding extra paramtr
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [22]:
#loadup training data
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"],                          #taking testng data from dataset to trn dataset cze training data is huge so tks time
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator
)

In [23]:
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator
)

In [24]:
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [25]:
#compile model : 1.define optimizer
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

In [26]:
#fit data to model
model.fit(train_dataset, validation_data=validation_dataset, epochs=1)



<keras.src.callbacks.History at 0x795d21d08e50>

In [27]:
model.save_pretrained("tf_model/")

### Model Testing

In [28]:
#loading toknzr, model to test on new data
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at tf_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [29]:
input_text  = "I am an Indian Data Scientist"

tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

tf.Tensor(
[[61949   104    38 15111  2403     6  6162   254     0 61949 61949 61949
  61949 61949 61949]], shape=(1, 15), dtype=int32)


In [30]:
with tokenizer.as_target_tokenizer():
    print(tokenizer.decode(out[0], skip_special_tokens=True))

मैं एक भारतीय डाटा के वैज्ञानिक हूँ




In [31]:
#calculating sacreblue score
import sacrebleu
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset

# Loading  model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load your validation data
test_dataset = load_dataset("cfilt/iitb-english-hindi", split="validation")

# Load the reference translations for the validation set
reference_translations = [ex["translation"]["hi"] for ex in test_dataset]

# Generate hypotheses for the validation set using your translation model
hypotheses = []
for example in test_dataset:
    input_text = example["translation"]["en"]
    tokenized = tokenizer([input_text], return_tensors="pt")
    out = model.generate(**tokenized, max_length=128)
    generated_translation = tokenizer.decode(out[0], skip_special_tokens=True)
    hypotheses.append(generated_translation)

# Calculate BLEU scores
bleu = sacrebleu.corpus_bleu(hypotheses, [reference_translations])

# Print BLEU score
print(f"BLEU score for the test set: {bleu.score}")



BLEU score for the test set: 0.015583524379614529
