## Installing Necessary Libraries

In [3]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install sacrebleu
!pip install accelerate
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.1 MB/s[0m eta [36m0:00:0

## Importing necessary libraries

In [4]:
import accelerate

# Importing model and tokenizer
from transformers import AutoModel, AutoTokenizer

# For data preprocessing
import pandas as pd
from datasets import load_dataset

# For training args
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# For reloading model for use
import warnings
warnings.filterwarnings('ignore')
import torch
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM

### Mounting google drive

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Initializing model and tokenizer

In [None]:
model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/799k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

## Creating Dataset

### Downloading from URL

In [None]:
url1 = 'https://raw.githubusercontent.com/google-research-datasets/Hinglish-TOP-Dataset/main/Dataset/Human%20Annotated%20Data/test.tsv'
url2 = 'https://raw.githubusercontent.com/google-research-datasets/Hinglish-TOP-Dataset/main/Dataset/Synthetically%20Generated%20Data/train.tsv'
url2_1 = 'https://raw.githubusercontent.com/google-research-datasets/Hinglish-TOP-Dataset/main/Dataset/Human%20Annotated%20Data/train.tsv'
data = pd.read_csv(url2,sep='\t')
data2 = pd.read_csv(url2_1,sep='\t')
test_data = pd.read_csv(url1,sep='\t')
data.drop([column for column in data if column not in ['en_query','cs_query']],axis=1,inplace=True)
data2.drop([column for column in data2 if column not in ['en_query','cs_query']],axis=1,inplace=True)
test_data.drop([column for column in test_data if column not in ['en_query','cs_query']],axis=1,inplace=True)
new_ind = [i for i in range(170083,173076)]
data2.set_index(pd.Index(new_ind),inplace=True)
data = pd.concat([data,data2],axis=0)
data

In [None]:
test_data

Unnamed: 0,en_query,cs_query
0,Pause my timer .,mere timer ko roko
1,Is it raining in Hawaii ?,Kya hawaii me raining ho rahi hai?
2,Set a reminder for me to wake up at 630 am tom...,Mere liye reminder set karo to wake up at 6:30...
3,Please pause timer,Please timer ko rokey
4,play the first album for killswitch engaged,Killswitch engaged ka pehla album play kariye
...,...,...
6508,Stop alarm now,alarm ko abhi stop kare
6509,set alarm every hour,Har ghante ke liye alarm set kare
6510,text Bobby,Bobby ko text kare
6511,remind me to pick up laundry at 6 pm,Muje shaam 6 baje laundry ko pick up karne ke ...


### Store and Load train_data and test_data to drive

In [None]:
data.to_csv('/content/drive/MyDrive/Hinglish_translation/data.csv',index=False)
test_data.to_csv('/content/drive/MyDrive/Hinglish_translation/test_data.csv',index=False)

In [None]:
translation_data = load_dataset('csv',data_files = {'train':'/content/drive/MyDrive/Hinglish_translation/data.csv','test':'/content/drive/MyDrive/Hinglish_translation/test_data.csv'})

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
translation_data

DatasetDict({
    train: Dataset({
        features: ['en_query', 'cs_query'],
        num_rows: 173076
    })
    test: Dataset({
        features: ['en_query', 'cs_query'],
        num_rows: 6513
    })
})

## Data Preprocessing

In [None]:
source_lang = "en_query"
target_lang = "cs_query"
prefix = "translate English to Hinglish: "


def preprocess_function(examples):
    inputs = [prefix + example for example in examples['en_query']]
    targets = [example for example in examples['cs_query']]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True,)
    return model_inputs

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

### Tokenizing data

In [None]:
tokenized_data = translation_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/173076 [00:00<?, ? examples/s]

Map:   0%|          | 0/6513 [00:00<?, ? examples/s]

In [None]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['en_query', 'cs_query', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 173076
    })
    test: Dataset({
        features: ['en_query', 'cs_query', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6513
    })
})

## Finetuning Model

### Defining custom training arguments

In [None]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)
metric = evaluate.load("sacrebleu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

### Training the model

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/Hinglish_translation",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    metric_for_best_model="bleu",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.4512,0.932242,38.0962,18.3805


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.4512,0.932242,38.0962,18.3805
2,0.3421,0.931612,38.1869,18.4482
3,0.2871,0.939605,38.7419,18.5655
4,0.26,0.949627,39.1479,18.6049
5,0.2448,0.957092,38.9606,18.6023


TrainOutput(global_step=13525, training_loss=0.37608933975868436, metrics={'train_runtime': 4969.9184, 'train_samples_per_second': 174.124, 'train_steps_per_second': 2.721, 'total_flos': 7218095369748480.0, 'train_loss': 0.37608933975868436, 'epoch': 5.0})

## Saving the model and reloading it

In [None]:
trainer.save_model("/content/drive/MyDrive/Hinglish_translation/finetuned-t5-model-23f")
tokenizer.save_pretrained("/content/drive/MyDrive/Hinglish_translation/finetuned-t5-model-23f")

('/content/drive/MyDrive/Hinglish_translation/finetuned-t5-model-23f/tokenizer_config.json',
 '/content/drive/MyDrive/Hinglish_translation/finetuned-t5-model-23f/special_tokens_map.json',
 '/content/drive/MyDrive/Hinglish_translation/finetuned-t5-model-23f/vocab.json',
 '/content/drive/MyDrive/Hinglish_translation/finetuned-t5-model-23f/source.spm',
 '/content/drive/MyDrive/Hinglish_translation/finetuned-t5-model-23f/target.spm',
 '/content/drive/MyDrive/Hinglish_translation/finetuned-t5-model-23f/added_tokens.json')

In [8]:
# Load the model architecture
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Hinglish_translation/finetuned-t5-model-23f")

# Load the tokenizer if needed
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Hinglish_translation/finetuned-t5-model-23f/")

## Use of Finetuned Model with BLEU : 39.14%

In [19]:
def translate(text):
  #input_ids = tokenizer.encode("translate English to Hinglish: "+text,return_tensors='pt')
  input_ids = tokenizer.encode("translate English to Hinglish: " + text, return_tensors="pt", max_length=512, truncation=True)
  #print(input_ids)
  output = model.generate(input_ids=input_ids)
  decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
  return decoded_output

### Making predictions of the 3 statements given

In [10]:
text =  'Definitely share your feedback  in the comment section.'
text2 = 'I was waiting for my bag.'
text3 = "So even if it's a big video, I will clearly mention all the products"

In [20]:
translation1 = translate(text)
translation2 = translate(text2)
translation3 = translate(text3)

In [21]:
output = pd.DataFrame(data={'English':[text,text2,text3],'Hinglish':[translation1,translation2,translation3]})
output.to_csv('/content/drive/MyDrive/Hinglish_translation/output.csv')

In [22]:
output

Unnamed: 0,English,Hinglish
0,Definitely share your feedback in the comment...,comment section me apne feedback ko share kare...
1,I was waiting for my bag.,mai apni bag ko waiting tha
2,"So even if it's a big video, I will clearly me...",Agar agar ye ek bada video hai toh mai sabhi p...
