In [None]:
!pip install datasets
!pip install transformers
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
# Load the English-Hindi pair of the KDE4 corpus; the result only has a 'train' split.
from datasets import load_dataset
raw_datasets = load_dataset('kde4', lang1='en', lang2='hi')
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 97227
    })
})

In [None]:
# Keep 90% of the examples for training; the fixed seed keeps the split reproducible.
# train_test_split names the held-out part 'test'.
split_datasets = raw_datasets['train'].train_test_split(train_size = 0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 87504
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 9723
    })
})

In [None]:
# Rename the held-out 'test' split to 'validation'.
# The membership check makes this cell safe to re-run: once 'test' has been
# popped, a second unguarded pop would raise KeyError.

if 'test' in split_datasets:
    split_datasets['validation'] = split_datasets.pop('test')

In [None]:
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 87504
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 9723
    })
})

In [None]:
raw_datasets['train'][1]['translation']

{'en': 'Add Feeds to Akregator', 'hi': 'फ़ीडों को एकेरेगेटर में जोड़ें'}

In [None]:
from transformers import pipeline

# Build a translation pipeline from the pretrained English-to-Hindi checkpoint.
# NOTE(review): the recorded output of this cell shows a translation result, but no
# translator(...) call is visible here — the output looks stale; confirm and re-run.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
translator = pipeline("translation", model=model_checkpoint)


source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]



[{'translation_text': 'सभी लड़ी फैलाएँ (A)'}]

In [None]:
translator("I am doing a particular work")

[{'translation_text': 'मैं एक विशेष काम कर रहा हूँ'}]

In [None]:
from transformers import AutoTokenizer
model_checkpoint ='Helsinki-NLP/opus-mt-en-hi'
# NOTE(review): return_tensors='pt' is passed at load time, yet the tokenizer call in the
# next cell returns plain Python lists and the rest of the notebook uses TensorFlow —
# this argument looks unnecessary; confirm before removing.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors = 'pt')


In [None]:
# Note: the target sentence must be tokenized as the output language (i.e. Hindi),
# so it is passed through text_target rather than as a second input.

en_sentence = split_datasets['train'][1]['translation']['en']
hi_sentence = split_datasets['train'][1]['translation']['hi']
inputs = tokenizer(en_sentence, text_target=hi_sentence)
inputs


# 'input_ids' holds the IDs of the English sentence,
# while the IDs of the Hindi sentence are stored in the 'labels' field.

{'input_ids': [2866, 16910, 0], 'attention_mask': [1, 1, 1], 'labels': [8161, 10238, 0]}

In [None]:
# Map the IDs back to tokens to check how the source and the target were split.
print(tokenizer.convert_ids_to_tokens(inputs['input_ids']))
print(tokenizer.convert_ids_to_tokens(inputs['labels']))

['▁First', '▁Slide', '</s>']
['▁स्लाइड', '▁मिटाएं', '</s>']


In [None]:
print(split_datasets['train'][1]['translation']['en'])
print(split_datasets['train'][1]['translation']['hi'])

First Slide
स्लाइड मिटाएं


In [None]:
# Batch preprocessing: tokenize English sources and Hindi targets with a length cap of 128.

max_length = 128

def preprocess_function(examples):
  """Tokenize one batch of translation pairs.

  Args:
    examples: batch dict whose 'translation' entry is a list of
      {'en': ..., 'hi': ...} dicts.

  Returns:
    Whatever the module-level ``tokenizer`` returns when called with the
    English sentences as inputs and the Hindi sentences as ``text_target``,
    with ``max_length`` and ``truncation=True`` passed along.
  """
  sources, references = [], []
  for pair in examples['translation']:
    sources.append(pair['en'])
    references.append(pair['hi'])

  return tokenizer(
      sources,
      text_target=references,
      max_length=max_length,
      truncation=True,
  )

In [None]:
# Tokenize every split in batches; the original columns are removed so that only
# the tokenizer outputs remain in the resulting datasets.
tokenized_datasets = split_datasets.map(preprocess_function, batched = True,
                                        remove_columns=split_datasets["train"].column_names,)

Map:   0%|          | 0/87504 [00:00<?, ? examples/s]

Map:   0%|          | 0/9723 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 87504
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9723
    })
})

In [None]:
from transformers import TFAutoModelForSeq2SeqLM
model_checkpoint = 'Helsinki-NLP/opus-mt-en-hi'
# from_pt=True: the TensorFlow model is initialized from the checkpoint's PyTorch weights
# (the recorded log confirms all PyTorch weights were used).
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt = True)

All PyTorch model weights were used when initializing TFMarianMTModel.

All the weights of TFMarianMTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [None]:
from transformers import DataCollatorForSeq2Seq
# Collator that batches tokenized examples into TensorFlow tensors; the batches it
# produces also contain decoder_input_ids (see the example batch in the next cell).
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = model, return_tensors = 'tf')

In [None]:
# Example: collate training examples 1 and 2 into one batch and list the resulting fields.

batch = data_collator([tokenized_datasets['train'][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [None]:
batch['labels']

<tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[ 8161, 10238,     0,  -100,  -100,  -100],
       [10453,    20,    17,   345,    13,     0]], dtype=int32)>

In [None]:
batch['decoder_input_ids']

<tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[61949,  8161, 10238,     0, 61949, 61949],
       [61949, 10453,    20,    17,   345,    13]], dtype=int32)>

As we can see, the decoder input IDs are a shifted version of the labels.

In [None]:
# Use data_collator to convert each of our datasets to a tf.data.Dataset.
# Training batches are shuffled; validation batches keep their original order.

tf_train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)
tf_eval_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [None]:
!pip install sacrebleu
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [None]:
import evaluate

# Load the SacreBLEU metric used to score generated translations.
metric = evaluate.load('sacrebleu')

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
# Example: score one prediction against one reference.
# `references` is nested: each prediction gets its own list of reference sentences.

predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

# Rough interpretation of the score (it ranges from 0 to 100; higher is better):
#   40 - 50   high quality translations
#   50 - 60   very high quality, adequate, and fluent translations
#   > 60      quality often better than human

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [None]:
# A prediction much shorter than the reference: there are no 3-gram or 4-gram matches
# and the brevity penalty is tiny, so the score collapses to 0 (see the output below).
predictions = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 0.0,
 'counts': [2, 1, 0, 0],
 'totals': [2, 1, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'bp': 0.004086771438464067,
 'sys_len': 2,
 'ref_len': 13}

In [None]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Separate collator for generation that pads every batch to a multiple of 128.
# NOTE(review): presumably this keeps batch shapes constant so the XLA-compiled
# generate function defined below is not recompiled for each new shape — confirm.
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128
)

# Unshuffled validation batches, used only for generation-based evaluation.
tf_generate_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=generation_data_collator,
    shuffle=False,
    batch_size=8,
)


@tf.function(jit_compile=True)
def generate_with_xla(batch):
    """Run ``model.generate`` on one batch inside a JIT-compiled tf.function.

    Args:
        batch: mapping that provides "input_ids" and "attention_mask".

    Returns:
        The output of ``model.generate``, limited to 128 new tokens.
    """
    generation_kwargs = {
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"],
        "max_new_tokens": 128,
    }
    return model.generate(**generation_kwargs)


def compute_metrics():
    """Generate translations for ``tf_generate_dataset`` and score them with ``metric``.

    Every batch is passed through ``generate_with_xla``; predictions and labels
    are decoded to stripped text and accumulated, then scored in a single
    ``metric.compute`` call.

    Returns:
        dict with one key, "bleu", holding the "score" entry of the metric result.
    """
    hypotheses = []
    reference_lists = []

    for features, label_ids in tqdm(tf_generate_dataset):
        generated = generate_with_xla(features)
        pred_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)

        # -100 is the padding value used for labels; swap it for the pad token
        # id so the rows can be decoded.
        label_array = label_ids.numpy()
        label_array = np.where(label_array != -100, label_array, tokenizer.pad_token_id)
        label_texts = tokenizer.batch_decode(label_array, skip_special_tokens=True)

        hypotheses += [text.strip() for text in pred_texts]
        # Each prediction gets a one-element list of references.
        reference_lists += [[text.strip()] for text in label_texts]

    scores = metric.compute(predictions=hypotheses, references=reference_lists)
    return {"bleu": scores["score"]}

In [None]:
!pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import create_optimizer
import tensorflow as tf

num_epochs = 3
# tf_train_dataset is already batched, so len() is presumably the number of batches
# per epoch and this is the total number of optimizer steps — TODO confirm.
num_train_steps = len(tf_train_dataset) * num_epochs
# Optimizer with weight decay and no warmup steps. `schedule` is returned as well
# but is not used in this cell.
optimizer, schedule = create_optimizer(
    init_lr = 5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate = 0.01,
)

# No loss is passed to compile().
# NOTE(review): presumably the model falls back to its built-in loss — confirm.
model.compile(optimizer = optimizer)

In [None]:
# Fine-tune for num_epochs epochs, evaluating on the validation batches after each epoch.
# NOTE(review): compute_metrics() is defined earlier but never called in the visible
# cells — consider calling it before and after training to report the BLEU change.
model.fit(
    tf_train_dataset,
    validation_data = tf_eval_dataset,
    epochs=num_epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7a3926542f80>

In [None]:
# Upload the fine-tuned model and its tokenizer to the same Hub repository name.
model.push_to_hub('En-Hi_Translation_Model')
tokenizer.push_to_hub('En-Hi_Translation_Model')

tf_model.h5:   0%|          | 0.00/306M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Areeb123/En-Hi_Translation_Model/commit/25c17703219e7921cb91b0ec74a9fe339f158c32', commit_message='Upload tokenizer', commit_description='', oid='25c17703219e7921cb91b0ec74a9fe339f158c32', pr_url=None, pr_revision=None, pr_num=None)