In [None]:
# Install the pinned googletrans release; the `googletrans.Translator` class
# from this package is imported by the Google Translate cell further down.
!pip install googletrans==4.0.0-rc1

In [None]:
# Clone the TransQuest repository into the current working directory; its
# requirements.txt is installed by a later cell.
!git clone https://github.com/TharinduDR/TransQuest.git

In [None]:
import os

# Enter the cloned repository so the relative `requirements.txt` path used by
# the install cell resolves.
# NOTE(review): the absolute '/content' prefix presumably assumes a Colab
# runtime where the clone above landed in /content — confirm before running elsewhere.
os.chdir('/content/TransQuest')

The `transformers` entry in `requirements.txt` must be changed to `transformers==4.40.2` before the requirements are installed; this pin avoids errors when running the rest of the notebook.
Solution credit: https://huggingface.co/TransQuest/monotransquest-da-any_en/discussions/3

In [None]:
!pip install -r requirements.txt

In [None]:
import os

# Go back to /content; the relative paths used later in the notebook
# ("data.csv" for input, "transquest_scores.csv" for output) resolve from here.
os.chdir('/content')

In [None]:
import pandas as pd

# Load the input table from the working directory; a later cell reads its
# 'source_txt' column as the text to translate.
df=pd.read_csv("data.csv")

In [None]:
# (rows, columns) of the loaded table.
df.shape

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that repeat an earlier row across all columns.
df.duplicated().sum()

In [None]:
# Shuffle every source sentence with a fixed seed (frac=1 keeps all rows).
# Missing values are dropped first: a NaN would otherwise be handed to the
# tokenizers/translator as a float and fail in the middle of a long run.
# When the column has no missing values the result is unchanged.
random_samples = (
    df["source_txt"]
    .dropna()
    .sample(frac=1, random_state=42)
    .to_list()
)

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Checkpoint id passed to both from_pretrained calls below.
model_name = 'Helsinki-NLP/opus-mt-en-mr'
# NOTE(review): `tokenizer` and `model` are module-level names that later
# cells rebind (the mBART cell and the TransQuest cell). The function below
# reads them at call time, so it must be used before those cells run.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def method_1_translate_en_to_mr(text):
    """Translate ``text`` using the module-level ``tokenizer`` and ``model``.

    Both names are looked up when the function is called; in this cell they
    hold the Marian tokenizer and model loaded just above.

    Args:
        text: Source text to translate.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        removed.
    """
    # truncation=True shortens an over-long input to the tokenizer's maximum
    # length instead of letting it fail inside the model.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    generated = model.generate(**encoded)
    # Only the first sequence is returned, so decode just that one.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Translate every source sentence, one at a time and in order, with method 1.
model_1_mr_samples = list(map(method_1_translate_en_to_mr, random_samples))

In [None]:
from googletrans import Translator
import time


def method_2_translate_en_to_mr(text, max_retries=5, retry_delay=10, translator=None):
    """Translate English ``text`` to Marathi with googletrans, retrying on failure.

    Args:
        text: English source text.
        max_retries: Number of additional attempts allowed after the first
            one fails. Negative values are treated as 0.
        retry_delay: Seconds to sleep between two attempts.
        translator: Optional object exposing
            ``translate(text, src=..., dest=...)`` whose result has a ``.text``
            attribute. When omitted, a fresh ``Translator()`` is created for
            every attempt.

    Returns:
        The translated text.

    Raises:
        RuntimeError: If every attempt fails. The last underlying error is
            chained as the cause, so a permanent failure surfaces instead of
            retrying forever.
    """
    total_attempts = max(0, max_retries) + 1
    last_error = None
    for attempt in range(total_attempts):
        try:
            client = translator if translator is not None else Translator()
            return client.translate(text, src="en", dest="mr").text
        except Exception as error:
            last_error = error
            # Do not sleep after the final attempt; there is nothing left to wait for.
            if attempt < total_attempts - 1:
                time.sleep(retry_delay)
    raise RuntimeError(
        f"Translation failed after {total_attempts} attempt(s)"
    ) from last_error


# Translate every source sentence, one at a time and in order, with method 2.
model_2_mr_samples = list(map(method_2_translate_en_to_mr, random_samples))

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
model_checkpoint = "aryaumesh/english-to-marathi"

# Load the mBART-50 tokenizer and model for `model_checkpoint`.
# NOTE(review): this rebinds the module-level `tokenizer` / `model` names that
# the Marian cell assigned earlier, so the Marian objects are no longer
# reachable through those names after this cell runs.
tokenizer = MBart50TokenizerFast.from_pretrained(model_checkpoint)
model = MBartForConditionalGeneration.from_pretrained(model_checkpoint)

def method_3_translate_en_to_mr(text):
    """Translate ``text`` using the module-level ``tokenizer`` and ``model``.

    Both names are looked up when the function is called; in this cell they
    hold the mBART tokenizer and model loaded just above.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        removed.
    """
    encoded = tokenizer(text, return_tensors="pt")
    first_sequence = model.generate(**encoded)[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Translate every source sentence, one at a time and in order, with method 3.
model_3_mr_samples = list(map(method_3_translate_en_to_mr, random_samples))

In [None]:
import torch
from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel

# Build the TransQuest quality-estimation model with a single output label,
# using CUDA only when torch reports it as available.
# NOTE(review): this rebinds the module-level `model` name used by the earlier
# translation cells; `get_qe_score` below relies on `model` being this object.
model = MonoTransQuestModel("xlmroberta", "TransQuest/monotransquest-da-en_any", num_labels=1, use_cuda=torch.cuda.is_available())

def get_qe_score(text_1: str, text_2: str) -> float:
    """Score one ``(text_1, text_2)`` pair with the module-level ``model``.

    The pair is passed to ``model.predict`` as a single-item batch, and the
    second value it returns (the raw outputs) is converted to a Python float.
    """
    sentence_pair = [text_1, text_2]
    # predict returns (predictions, raw_outputs); only the raw outputs are used.
    _, raw_outputs = model.predict([sentence_pair])
    score = float(raw_outputs)
    return score

In [None]:
# Score each source sentence against its method-1 translation, in order.
model_1_score = [
    get_qe_score(source_text, translated_text)
    for source_text, translated_text in zip(random_samples, model_1_mr_samples)
]

In [None]:
# Score each source sentence against its method-3 translation, in order.
model_3_score = [
    get_qe_score(source_text, translated_text)
    for source_text, translated_text in zip(random_samples, model_3_mr_samples)
]

In [None]:
import pandas as pd

# One row per source sentence: the English text, each method's translation,
# and the QE scores computed for methods 1 and 3.
final_df = pd.DataFrame(
    dict(
        en_text=random_samples,
        model_1_mr_text=model_1_mr_samples,
        model_1_score=model_1_score,
        model_2_mr_text=model_2_mr_samples,
        model_3_mr_text=model_3_mr_samples,
        model_3_score=model_3_score,
    )
)


In [None]:
# Preview the first rows of the combined results table.
final_df.head()

In [None]:
import matplotlib.pyplot as plt

# Mean QE score for method 1, followed by a box plot of its distribution.
model_1_score_column = final_df["model_1_score"]
print(model_1_score_column.mean())

model_1_score_column.plot(kind="box")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Mean QE score for method 3, followed by a box plot of its distribution.
model_3_score_column = final_df["model_3_score"]
print(model_3_score_column.mean())

model_3_score_column.plot(kind="box")
plt.show()

In [None]:
# Write the results table to the working directory without the index column.
final_df.to_csv("transquest_scores.csv",index=False)