In [46]:
!pip install transformers --upgrade



In [47]:
!pip install datasets transformers[sentencepiece] sacrebleu -q

In [59]:
from transformers import pipeline
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset,DatasetDict
from transformers import AutoTokenizer

from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

In [51]:
model_checkpoints="sayanmandal/t5-small_6_3-hi_en-to-en"

In [59]:
raw_datasets = load_dataset("findnitai/english-to-hinglish")

In [None]:
raw_datasets

In [None]:
from datasets import load_dataset, DatasetDict
train_temp_split = raw_datasets['train'].train_test_split(test_size=0.01)
val_test_split = train_temp_split['test'].train_test_split(test_size=0.5)

final_datasets = DatasetDict({
    'train': train_temp_split['train'].remove_columns(
        [col for col in train_temp_split['train'].column_names if col != 'translation']
    ),
    'validation': val_test_split['train'].remove_columns(
        [col for col in val_test_split['train'].column_names if col != 'translation']
    ),
    'test': val_test_split['test'].remove_columns(
        [col for col in val_test_split['test'].column_names if col != 'translation']
    )
})

print(final_datasets)


DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 94551
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 28365
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 66186
    })
})


In [None]:
final_datasets['train'][1]

{'translation': {'en': 'Is there road construction happening on James River Freeway',
  'hi_ng': 'kya James River Freeway par road construction ho raha hai',
  'source': 0}}

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoints)

In [None]:
tokenizer("Hello, this is a sentence!")

{'input_ids': [1514, 129, 144, 118, 103, 115, 8128, 447, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [52]:
max_input_length = 128
max_target_length = 128

source_lang = "en"
target_lang = "hi_ng"
mode="translation"

def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)


    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
preprocess_function(final_datasets["train"][:2])

{'input_ids': [[659, 103, 115, 261, 706, 1], [1061, 387, 130, 5021, 3863, 8485, 690, 2578, 139, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[160, 439, 706, 321, 122, 1], [1498, 336, 134, 104, 633, 3863, 8485, 1029, 6264, 2200, 2140, 1]]}

In [None]:
tokenized_datasets = final_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/94551 [00:00<?, ? examples/s]

Map:   0%|          | 0/28365 [00:00<?, ? examples/s]

Map:   0%|          | 0/66186 [00:00<?, ? examples/s]

In [None]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoints, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFT5ForConditionalGeneration: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight', 'lm_head.weight']
- This IS expected if you are initializing TFT5ForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFT5ForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
batch_size = 32
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [None]:
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

In [None]:
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

In [None]:
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [None]:
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:

history = model.fit(train_dataset,
                    validation_data=validation_dataset,
                    epochs=1)
val_loss, val_accuracy = model.evaluate(validation_dataset)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")


156/156 [==============================] - 55s 163ms/step - loss: 3.7620 - val_loss: 3.9403

In [53]:
model = pipeline(mode, model=model_checkpoints)




In [54]:
def translate():
  text = input("Enter a text: ")
  results = model(text)
  return results[0]['translation_text']

In [61]:
translate()#run this cell for translation

Enter a text: me project pe kaam kr raha hun


"I'm working on the project"