### Installations

In [None]:
!pip install datasets
!pip install langchain-huggingface
!pip install langchain_community
!pip install googletrans
!pip install torch
!pip install transformers
!pip install evaluate
!pip install deep-translator
!pip install langchain-groq

### Helper functions

In [None]:
from datasets import load_dataset
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import datasets
import evaluate
import torch

# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the train, validation and test splits of "cfilt/iitb-english-hindi", in that order.
train_dataset, valid_dataset, test_dataset = (
    load_dataset("cfilt/iitb-english-hindi", split=split_name)
    for split_name in ("train", "validation", "test")
)


def prepare_dataset(dataset_loaded, number_of_examples, prompt):
  """Build prompted inputs plus source/target sentences from a translation dataset.

  Args:
    dataset_loaded: Object whose ``['translation']`` entry is a sliceable
      sequence of dicts carrying an ``"en"`` and a ``"hi"`` string.
    number_of_examples: Upper bound on how many leading examples to keep. It is
      applied as a slice end, so ``None`` keeps every example.
    prompt: Template with one positional ``{}`` placeholder, filled with each
      English sentence through ``str.format``.

  Returns:
    Tuple ``(prompted_sentences, source_sentences, target_sentences)`` holding
    three lists of equal length.
  """
  # Slice before extracting so that only the requested examples are read,
  # instead of walking the whole split twice and discarding most of it.
  selected = dataset_loaded['translation'][:number_of_examples]

  source_sentences = [entry["en"] for entry in selected]
  target_sentences = [entry["hi"] for entry in selected]
  prompted_sentences = [prompt.format(text) for text in source_sentences]

  return prompted_sentences, source_sentences, target_sentences


def evaluation_score(metric, predictions, references):
  """Score predictions against references with a metric from ``evaluate``.

  Args:
    metric: Name handed to ``evaluate.load``; the same name is used as the key
      read from the computed result.
    predictions: Model outputs to be scored.
    references: Reference translations aligned with ``predictions``.

  Returns:
    The value stored under ``metric`` in the dictionary returned by ``compute``.
  """
  scores = evaluate.load(metric).compute(
      predictions=predictions,
      references=references,
  )
  return scores[metric]


In [None]:
from deep_translator import GoogleTranslator
from google.colab import userdata
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# The Groq credential is read from Colab's userdata store rather than written into the notebook.
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

# Chat model used by translate_llama for the Llama baseline.
groq_model = ChatGroq(
    groq_api_key=GROQ_API_KEY,
    model_name="llama3-8b-8192",
)

def translate_google(inputs, single_sentence=False):
  """Translate English text to Hindi with deep-translator's ``GoogleTranslator``.

  Args:
    inputs: Either an iterable of English sentences or one sentence.
    single_sentence: When True, ``inputs`` is handed to the translator as one
      unit. A bare ``str`` is always treated this way, because iterating over
      it would translate one character at a time.

  Returns:
    List of translations, one per sentence (a one-element list for a single
    sentence).
  """
  sentences = [inputs] if single_sentence or isinstance(inputs, str) else inputs

  # The translator is configured only by the language pair, so one instance
  # serves every sentence in the batch.
  translator = GoogleTranslator(source="en", target="hi")
  return [translator.translate(sentence) for sentence in sentences]


def translate_llama(inputs, single_sentence=False):
  """Translate text to Hindi by prompting the module-level ``groq_model``.

  Args:
    inputs: Either an iterable of sentences or one sentence.
    single_sentence: When True, ``inputs`` is sent to the chain as one unit.
      A bare ``str`` is always treated this way, because iterating over it
      would issue one request per character.

  Returns:
    List with the chain's response for each sentence (a one-element list for a
    single sentence).
  """
  prompt_template = PromptTemplate(
    input_variables=["text"],  # Variable to be replaced dynamically
    template="Translate {text} to Hindi only"
  )

  chain = LLMChain(llm=groq_model, prompt=prompt_template)

  sentences = [inputs] if single_sentence or isinstance(inputs, str) else inputs

  # `sentence` rather than `input`, so the builtin is not shadowed.
  return [chain.run(text=sentence) for sentence in sentences]


### Baseline Model Translations - Google Translate, Llama

In [None]:
prompt = "Translate to Hindi: {}. Translation:"
prompt_inputs, source_sentences, target_sentences = prepare_dataset(valid_dataset, 30, prompt)

#print("Actual Translation: ", target_sentences)

# Collect both baseline outputs first; the scores are reported afterwards.
output_predictions_google = translate_google(source_sentences)
#print("Google translations: ",output_predictions_google)
output_predictions_llama = translate_llama(source_sentences)
#print("Llama translations: ",output_predictions_llama)

# Each baseline gets the same report: a banner followed by METEOR and BLEU.
for _banner, _system_output in (
    ("--------- Google Translate Results --------", output_predictions_google),
    ("--------- Llama3 Results --------", output_predictions_llama),
):
  print(_banner)
  print("Meteor Score: ", evaluation_score(metric="meteor", predictions=_system_output, references=target_sentences))
  print("BLEU Score: ", evaluation_score(metric="bleu", predictions=_system_output, references=target_sentences))

###  BLOOMZ Model Translations

In [None]:
# Prompt template for BLOOMZ; "{}" is replaced by each English sentence.
prompt = "Translate to Hindi: {}. Translation:"
prompt_inputs, source_sentences, target_sentences = prepare_dataset(
    dataset_loaded=valid_dataset,
    number_of_examples=30,
    prompt=prompt,
)

def model_prediction(inputs, pipe, batch_size, max_length=10000, truncation="do_not_truncate"):
  """Run a generation pipeline over prompts and keep the text after the marker.

  Args:
    inputs: Prompts forwarded to ``pipe``.
    pipe: Callable yielding, for each prompt, a sequence whose first element
      is a dict with a ``'generated_text'`` entry.
    batch_size: Forwarded to ``pipe`` unchanged.
    max_length: Forwarded to ``pipe`` unchanged.
    truncation: Forwarded to ``pipe`` unchanged.

  Returns:
    List with one string per prompt: whatever follows the last
    ``"Translation:"`` in the generated text, stripped of surrounding
    whitespace (the whole text when the marker is absent).
  """
  generations = pipe(inputs, batch_size=batch_size, max_length=max_length, truncation=truncation)
  return [
      candidates[0]['generated_text'].split("Translation:")[-1].strip()
      for candidates in generations
  ]

#print("Actual Translation: ", target_sentences)
# Text-generation pipeline for the "bigscience/bloomz-560m" checkpoint, fed one prompt per batch.
pipe = pipeline("text-generation", model="bigscience/bloomz-560m", device=device)
output_predictions = model_prediction(inputs=prompt_inputs, pipe=pipe, batch_size=1)
#print("Model Translations: ", output_predictions)

print("--------- Bloomz Results --------")
for _label, _metric_name in (("Meteor Score: ", "meteor"), ("BLEU Score: ", "bleu")):
  print(_label, evaluation_score(metric=_metric_name, predictions=output_predictions, references=target_sentences))



### MBart-50 Model Translations

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

prompt = "Translate to Hindi: {}. Translation:"
prompt_inputs, source_sentences, target_sentences = prepare_dataset(
    dataset_loaded=valid_dataset,
    number_of_examples=30,
    prompt=prompt,
)

# Model and tokenizer are loaded from the same checkpoint; only the model is moved to `device`.
_mbart_checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
mbart_model = MBartForConditionalGeneration.from_pretrained(_mbart_checkpoint).to(device)
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(_mbart_checkpoint)

def mbart_translations(model, tokenizer, source_sentences, single_sentence=False):
  """Translate English text to Hindi with an mBART-50 model and tokenizer.

  Args:
    model: Model exposing ``generate``; expected to live on the module-level
      ``device``, because the encoded inputs are moved there.
    tokenizer: Matching tokenizer. Its ``src_lang`` attribute is set to
      ``"en_XX"`` as a side effect.
    source_sentences: Either an iterable of sentences or one sentence.
    single_sentence: When True, ``source_sentences`` is translated as one
      unit. A bare ``str`` is always treated this way, because iterating over
      it would translate one character at a time.

  Returns:
    List with one decoded translation per sentence.
  """
  if single_sentence or isinstance(source_sentences, str):
    source_sentences = [source_sentences]

  # Source language and target-language id are the same for every sentence,
  # so they are resolved once instead of per iteration.
  tokenizer.src_lang = "en_XX"
  hindi_token_id = tokenizer.lang_code_to_id["hi_IN"]

  model_predictions = []
  for sentence in source_sentences:
    encoded = tokenizer(sentence, return_tensors="pt").to(device)
    generated_tokens = model.generate(**encoded, forced_bos_token_id=hindi_token_id)
    # One sentence is encoded per call, so the first decoded string is the translation.
    model_predictions.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

  return model_predictions

model_predictions = mbart_translations(
    model=mbart_model,
    tokenizer=mbart_tokenizer,
    source_sentences=source_sentences,
)
#print("Actual Translation: ", target_sentences)

#print("Model Translations: ", model_predictions)

print("--------- MBart Results --------")
for _label, _metric_name in (("Meteor Score: ", "meteor"), ("BLEU Score: ", "bleu")):
  print(_label, evaluation_score(metric=_metric_name, predictions=model_predictions, references=target_sentences))


### NLLB Model Translations

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
prompt = "Translate to Hindi: {}. Translation:"
prompt_inputs, source_sentences, target_sentences = prepare_dataset(
    dataset_loaded=valid_dataset,
    number_of_examples=30,
    prompt=prompt,
)

# Tokenizer and model are loaded from the same checkpoint; only the model is moved to `device`.
_nllb_checkpoint = "facebook/nllb-200-distilled-600M"
nllb_tokenizer = AutoTokenizer.from_pretrained(_nllb_checkpoint)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(_nllb_checkpoint).to(device)

def nllb_translations(model, tokenizer, source_sentences, single_sentence=False, max_length=50):
  """Translate text to Hindi (``hin_Deva``) with an NLLB model and tokenizer.

  Args:
    model: Model exposing ``generate``; expected to live on the module-level
      ``device``, because the encoded inputs are moved there.
    tokenizer: Matching tokenizer, used to encode, to look up the id of the
      ``"hin_Deva"`` token, and to decode.
    source_sentences: Either an iterable of sentences or one sentence.
    single_sentence: When True, ``source_sentences`` is translated as one
      unit. A bare ``str`` is always treated this way, because iterating over
      it would translate one character at a time.
    max_length: Forwarded to ``model.generate``. It defaults to 50, the value
      that used to be hard-coded; raise it for longer sentences.

  Returns:
    List with one decoded translation per sentence.
  """
  if single_sentence or isinstance(source_sentences, str):
    source_sentences = [source_sentences]

  # The target-language token id does not depend on the sentence; look it up once.
  hindi_token_id = tokenizer.convert_tokens_to_ids("hin_Deva")

  model_predictions = []
  for sentence in source_sentences:
    encoded = tokenizer(sentence, return_tensors="pt").to(device)
    translated_tokens = model.generate(
        **encoded, forced_bos_token_id=hindi_token_id, max_length=max_length
    )
    # One sentence is encoded per call, so the first decoded string is the translation.
    model_predictions.append(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])

  return model_predictions

model_predictions = nllb_translations(
    model=nllb_model,
    tokenizer=nllb_tokenizer,
    source_sentences=source_sentences,
)

#print("Actual Translation: ", target_sentences)

#print("Model Translations: ", model_predictions)

print("---------NLLB Results --------")
for _label, _metric_name in (("Meteor Score: ", "meteor"), ("BLEU Score: ", "bleu")):
  print(_label, evaluation_score(metric=_metric_name, predictions=model_predictions, references=target_sentences))



### M2M100 Model Translations

In [None]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Model and tokenizer are loaded from the same checkpoint; only the model is moved to `device`.
_m2m100_checkpoint = "facebook/m2m100_418M"
m2m100_model = M2M100ForConditionalGeneration.from_pretrained(_m2m100_checkpoint).to(device)
m2m100_tokenizer = M2M100Tokenizer.from_pretrained(_m2m100_checkpoint)

def m2m100_translations(model, tokenizer, source_sentences, single_sentence=False):
  """Translate English text to Hindi with an M2M100 model and tokenizer.

  Args:
    model: Model exposing ``generate``; expected to live on the module-level
      ``device``, because the encoded inputs are moved there.
    tokenizer: Matching tokenizer. Its ``src_lang`` attribute is set to
      ``"en"`` as a side effect.
    source_sentences: Either an iterable of sentences or one sentence.
    single_sentence: When True, ``source_sentences`` is translated as one
      unit. A bare ``str`` is always treated this way, because iterating over
      it would translate one character at a time.

  Returns:
    List with one decoded translation per sentence.
  """
  if single_sentence or isinstance(source_sentences, str):
    source_sentences = [source_sentences]

  # Source language and target-language id are the same for every sentence,
  # so they are resolved once instead of per iteration.
  tokenizer.src_lang = "en"
  hindi_token_id = tokenizer.get_lang_id("hi")

  model_predictions = []
  for sentence in source_sentences:
    encoded = tokenizer(sentence, return_tensors="pt").to(device)
    generated_tokens = model.generate(**encoded, forced_bos_token_id=hindi_token_id)
    # One sentence is encoded per call, so the first decoded string is the translation.
    model_predictions.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

  return model_predictions

# Use the M2M100 helper for the M2M100 model. The NLLB helper asks the tokenizer
# for the "hin_Deva" token and never sets the "en" source language, whereas the
# M2M100 helper sets src_lang and uses get_lang_id("hi").
# source_sentences / target_sentences are the ones produced by the latest
# prepare_dataset call in an earlier cell.
model_predictions = m2m100_translations(m2m100_model, m2m100_tokenizer, source_sentences)

#print("Actual Translation: ", target_sentences)

#print("Model Translations: ", model_predictions)

print("---------M2M100 Results --------")
print("Meteor Score: ", evaluation_score(metric="meteor", predictions=model_predictions, references=target_sentences))
print("BLEU Score: ", evaluation_score(metric="bleu", predictions=model_predictions, references=target_sentences))



### Translation Comparison on a User-Provided Test Sentence

In [None]:
import time

user_input = input("Enter text to translate to Hindi: ")

# The local models run first and their outputs are shown before the hosted baselines are called.
#bloomz_pred = model_prediction(prompt.format(input), pipe, 1)
nllb_pred = nllb_translations(nllb_model, nllb_tokenizer, user_input, single_sentence=True)
mbart_pred = mbart_translations(mbart_model, mbart_tokenizer, user_input, single_sentence=True)
m2m100_pred = m2m100_translations(m2m100_model, m2m100_tokenizer, user_input, single_sentence=True)

#print(bloomz_pred)
for _label, _prediction in (
    ("NLLB ", nllb_pred),
    ("MBart ", mbart_pred),
    ("M2M100 ", m2m100_pred),
):
  print(_label, _prediction)


google_pred = translate_google(user_input, single_sentence=True)
llama_pred = translate_llama(user_input, single_sentence=True)

for _label, _prediction in (
    ("Google Translate ", google_pred),
    ("Llama ", llama_pred),
):
  print(_label, _prediction)
