In [1]:
import pandas as pd

df = pd.read_csv('task2.csv')
df.head(3)

Unnamed: 0,text,native_text,translation
0,indulo anno assami masaladinusulu kuda vesi ch...,ఇందులో అన్నో అస్సామి మసలాదినుసులు కుడ వేసి చేస...,A lot of Assamese spices also go into its making.
1,"ondu all, one tadupusin erandu doskayp botukko...","ஒண்டு ஆல், ஒனே தடுப்பூசின் இரண்டு டாஸ்கேப் போட...","Not one, but two doses of the same vaccine."
2,tumhen mal main jane yaa train yaa flite main ...,तुम्हें मल मैन जाने या ट्रेन या फ्लाइट मैन ट्र...,You will be asked to show your vaccine certifi...


In [2]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.1, random_state=0)

train.reset_index(inplace=True)
test.reset_index(inplace=True)

In [3]:
print(len(test))

850


In [None]:
from transformers import NllbTokenizer

In [5]:
tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')

In [6]:
import torch

class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_len):
        super(TranslationDataset, self).__init__()

        self.data=data['native_text']
        self.target = data['translation']
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        translation = self.target[idx]

        source = self.tokenizer.encode_plus(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_len,
            padding='max_length'
        )
        target = self.tokenizer.encode_plus(
            translation,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_len,
            padding='max_length'
        )

        return {
            'input_ids': source['input_ids'].flatten(),
            'attention_mask': source['attention_mask'].flatten(),
            'labels': target['input_ids'].flatten(),
        }

train_ds = TranslationDataset(train, tokenizer, 512)
test_ds = TranslationDataset(test, tokenizer, 512)

In [11]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained('facebook/nllb-200-distilled-600M')

In [12]:
model.cuda()

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): Embedding(256206, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): Embedding(256206, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm

In [13]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training arguments for the model
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    report_to='none'
)

In [None]:
# Initializing the Trainer object
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)

# Starting the training process
trainer.train()

In [11]:
def translate(text):
    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=512
    )
    result = model.generate(
        **inputs.to(model.device),
        num_beams=4,
        forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"]
    )
    tr = tokenizer.batch_decode(result, skip_special_tokens=True)
    return tr[0]

In [None]:
out = pd.read_csv('translation.csv')

out['output'] = out['native_text'].apply(lambda x: translate(x))

In [13]:
out.head(3)

Unnamed: 0,index,text,native_text,translation,output
0,8101,"arsiaiku,hyderabadloni udyogul state insurence...","ఆర్సియైకు,హైదరాబాద్లోని ఉద్యోగుల్ స్టేట్ ఇన్షు...",Employees' State Insurance Corporation Medical...,"The RCI, Employees' State Insurance Corporatio..."
1,5597,olive bachchi sifan putwai udutirunth batma pa...,ஒலிவ் பச்சி சிஃபான் புட்வை உடுதிருந்த் பாத்மா ...,"Clad in an olive-green chiffon saree, Fatma lo...",I felt like I was talking about an olive grove...
2,394,ih bohut vadhia hai|,ਇਹ ਬੋਹੁਤ ਵਧਿਆ ਹਾਈ|,That's great.,It's very hectic.


In [14]:
out.to_csv('task2_output.csv', index=False)

In [15]:
%pip install -q sacrebleu
%pip install -q rouge_score
%pip install -q datasets

In [None]:
import sacrebleu
from datasets import load_metric

rouge = load_metric('rouge')
bleu_calc = sacrebleu.BLEU()

In [None]:
test['output'] = test['native_text'].apply(lambda x: translate(x))

In [16]:
print(bleu_calc.corpus_score(test['output'].tolist(), [test['translation'].tolist()]))

BLEU = 35.09 62.6/39.7/28.6/21.5 (BP = 0.998 ratio = 0.998 hyp_len = 31568 ref_len = 31629)


In [17]:
rouge_score = rouge.compute(predictions=test['output'].tolist(), references=test['translation'].tolist())
print(rouge_score)

{'rouge1': AggregateScore(low=Score(precision=0.5982260015044203, recall=0.586687728720013, fmeasure=0.588395071327604), mid=Score(precision=0.6090383968353283, recall=0.5979467457017437, fmeasure=0.5991336826345741), high=Score(precision=0.6205648086911699, recall=0.6101997731202324, fmeasure=0.610703677963987)), 'rouge2': AggregateScore(low=Score(precision=0.38572363995227416, recall=0.37979733211233024, fmeasure=0.380110453125751), mid=Score(precision=0.3993735371702426, recall=0.3930406813805263, fmeasure=0.3934213602422587), high=Score(precision=0.4125526452542492, recall=0.4062831283614229, fmeasure=0.4067131804773679)), 'rougeL': AggregateScore(low=Score(precision=0.5582783074140499, recall=0.5484048818106864, fmeasure=0.5494473633445973), mid=Score(precision=0.5699244345604186, recall=0.5603384416674695, fmeasure=0.5612858677556972), high=Score(precision=0.5819841724745507, recall=0.5723916645499619, fmeasure=0.573080584554977)), 'rougeLsum': AggregateScore(low=Score(precision=