In [None]:
!pip install transformers

In [None]:
import pandas as pd

# Read the dataset from data.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="data.csv")

# Cell output: (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
import pandas as pd

# Flatten the two text columns into a single 1-D array and keep every distinct
# value once, so that each text only has to be translated a single time.
unique_sentences = pd.unique(
    df.loc[:, ["source_txt", "plagiarism_txt"]].values.ravel()
)

# Cell output: how many distinct values were found across both columns.
len(unique_sentences)

In [None]:
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from torch.utils.data import DataLoader
from tqdm import tqdm

# Checkpoint identifier handed to both from_pretrained() calls below.
model_checkpoint = "aryaumesh/english-to-marathi"

# Prefer CUDA when PyTorch reports it as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = MBart50TokenizerFast.from_pretrained(model_checkpoint)
model = MBartForConditionalGeneration.from_pretrained(model_checkpoint)

# Place the model weights on the selected device.
model = model.to(device)

def translate_en_to_mr_batch(sentences, batch_size):
    """
    Translate English sentences to Marathi in batches with the module-level MBart model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device`` objects;
    each tokenized batch is moved to ``device`` before generation.

    Args:
        sentences (iterable of str): English sentences to translate. Lists,
            NumPy arrays and other iterables are accepted; the input is
            materialised into a list once.
        batch_size (int): Number of sentences to process per batch (must be >= 1).

    Returns:
        dict: A dictionary with original sentences as keys and translated
        sentences as values. Duplicate input sentences collapse to one key.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
        TypeError: If any entry of ``sentences`` is not a ``str`` (for example
            a NaN coming from an empty CSV cell). This is checked before any
            translation work starts.
    """
    if not isinstance(batch_size, int) or isinstance(batch_size, bool) or batch_size < 1:
        raise ValueError(
            f"batch_size should be a positive integer value, but got batch_size={batch_size}"
        )

    # Materialise once so len(), slicing and validation behave the same for
    # lists, NumPy arrays and generic iterables.
    sentences = list(sentences)

    # Fail fast: the tokenizer rejects non-string entries anyway, but only when
    # their batch is reached -- possibly after a long run whose partial results
    # would then be lost.
    non_strings = [item for item in sentences if not isinstance(item, str)]
    if non_strings:
        raise TypeError(
            f"All sentences must be str; found {len(non_strings)} non-string "
            f"value(s), e.g. {non_strings[0]!r}. Drop or fill missing values first."
        )

    result = {}

    with tqdm(total=len(sentences), desc="Translating", unit="sentence") as pbar:
        # Plain slicing is sufficient for a list of strings; it avoids the
        # DataLoader default collation, which converts numeric items to tensors.
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]

            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)

            outputs = model.generate(**inputs)
            translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # batch and translations are aligned element-by-element.
            result.update(zip(batch, translations))

            pbar.update(len(batch))

    return result

# Translate each distinct sentence once; the result maps original text -> translated text.
translations = translate_en_to_mr_batch(sentences=unique_sentences, batch_size=32)

In [None]:
def fun(text: str) -> str:
    """Return the entry stored for ``text`` in the module-level ``translations`` dict.

    Raises:
        KeyError: If ``text`` is not a key of ``translations``.
    """
    translated_text = translations[text]
    return translated_text


# Overwrite both text columns in place with their translations.
# NOTE: because df is mutated here, re-running this cell after it has succeeded
# will most likely raise KeyError -- the columns then hold translated text,
# which is not a key of `translations`.
for column in ("source_txt", "plagiarism_txt"):
    df[column] = df[column].apply(fun)

In [None]:
# Write the frame to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="marathi_plagiarism_data.csv", index=False)