In [None]:
!pip install transformers
!pip install sentencepiece
!pip install sacremoses
!pip install git+https://github.com/nltk/nltk_contrib.git#egg=nltk_contrib
!pip install jiwer nltk sacrebleu rouge-score
!pip install --upgrade jiwer
!pip install torchmetrics
!pip install evaluate

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Collecting nltk_contrib
  Cloning https://github.com/nltk/nltk_contrib.git to /tmp/pip-install-psm_1w3t/nltk-contrib_67205cc6ee7047729a6d3b6d64aa0ac6
  Running command git clone --filter=blob:none --quiet https://github.com/nltk/nltk_contrib.git /tmp/pip-install-psm_1w3t/nltk-contrib_67205cc6ee7047729a6d3b6d64aa0ac6
  Resolved https://github.com/nltk/nltk_contrib.git to co

In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import MarianMTModel, MarianTokenizer, AdamW,AutoTokenizer
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import sacrebleu
from nltk.translate.meteor_score import meteor_score
import jiwer
from jiwer import wer
from rouge_score import rouge_scorer
from torchmetrics.text import TranslationEditRate
from tqdm import tqdm
import evaluate
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoModelForSeq2SeqLM


### MarianMT eval - synthetic data

In [None]:
# Define the translation dataset with references
class TranslationDatasetWithRefs(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-cs")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(input_text.split()[:100]), return_tensors='pt',truncation=True,padding='max_length', max_length=100)['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(target_text.split()[:100]), return_tensors='pt',truncation=True,padding='max_length', max_length=100)['input_ids'].squeeze()

        # Convert labels to text without special tokens
        references = [self.tokenizer.decode(labels, skip_special_tokens=True)]

        return {"input_ids": input_ids, "labels": labels, "references": references}

In [None]:
# Load the test data
test_df = pd.read_csv("/content/drive/MyDrive/SEM 3/NLP/nlp_project/Dataset/test.tsv", delimiter='\t')

In [None]:
# Create a translation dataset and dataloader for testing
test_dataset = TranslationDatasetWithRefs(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Set up the model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the best model
model = MarianMTModel.from_pretrained("/content/drive/MyDrive/SEM 3/NLP/nlp_project/results/MarianMT/best_model").to(device)

model.eval()
bleu_scores = []
wer_scores = []
ter_scores = []
meteor_scores = []
rouge1_scores = []
rouge2_scores = []
#ter = TranslationEditRate(normalize=True)


# Load the TER metric
ter = evaluate.load("ter")
wer = evaluate.load("wer")
rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        references = batch['references']


        # Generate predictions
        generated_ids = model.generate(input_ids, max_length=100)  # Adjust max_length as needed

        # Convert generated IDs to text
        predictions = [test_dataset.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)]


        # Convert generated IDs to text
        #predictions = [val_dataset.tokenizer.decode(generated_ids[0], skip_special_tokens=True)]

        bleu_score = bleu.compute(predictions=predictions[0], references=references[0])
        bleu_scores.append(bleu_score['bleu'])


        # Calculate WER score
        wer_score = wer.compute(predictions=predictions[0],references=references[0])
        wer_scores.append(wer_score)

        # Calculate TER score
        ter_score = ter.compute(predictions=predictions[0],references=references[0],
                       case_sensitive=False)
        #ter_score = TER(references[0][0], predictions[0])
        ter_scores.append(ter_score['score'])

        # Calculate ROUGE score
        rouge_score = rouge.compute(predictions=predictions[0],references=references[0])
        rouge1_scores.append(rouge_score['rouge1'])
        rouge2_scores.append(rouge_score['rouge2'])

Testing: 100%|██████████| 319/319 [04:57<00:00,  1.07it/s]


In [None]:
# Calculate average scores
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_wer = sum(wer_scores) / len(wer_scores)
avg_ter = sum(ter_scores) / len(ter_scores)
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)

# Print metric values
print("Testing Scores")
print("Metric\t\t\tWER\t\t\tTER\t\t\tBLEU\t\t\tROUGE-1\t\t\tROUGE-2")
print(f"Averages\t\t{avg_wer:.4f}\t\t\t{avg_ter/100:.4f}\t\t\t{avg_bleu:.4f}\t\t\t{avg_rouge1:.4f}\t\t\t{avg_rouge2:.4f}")

Testing Scores
Metric			WER			TER			BLEU			ROUGE-1			ROUGE-2
Averages		0.3072			0.2392			0.6082			0.8761			0.7171


Testing Scores
Metric			WER			TER			BLEU			ROUGE-1			ROUGE-2
Averages		0.3072			0.2392			0.6082			0.8761			0.7171

### MBART eval - synthetic data

In [None]:
# Define the translation dataset with references
class TranslationDatasetWithRefs(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(input_text.split()[:50]), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(target_text.split()[:50]), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()

        # Convert labels to text without special tokens
        references = [self.tokenizer.decode(labels, skip_special_tokens=True)]

        return {"input_ids": input_ids, "labels": labels, "references": references}

In [None]:
# Create a translation dataset and dataloader for testing
test_dataset = TranslationDatasetWithRefs(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Load the best model
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/SEM 3/NLP/nlp_project/results/MBART_large/best_model").to(device)
model.eval()
bleu_scores = []
wer_scores = []
ter_scores = []
meteor_scores = []
rouge1_scores = []
rouge2_scores = []
#ter = TranslationEditRate(normalize=True)


# Load the TER metric
ter = evaluate.load("ter")
wer = evaluate.load("wer")
rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        references = batch['references']


        # Generate predictions
        generated_ids = model.generate(input_ids, max_length=100)  # Adjust max_length as needed

        # Convert generated IDs to text
        predictions = [test_dataset.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)]


        # Convert generated IDs to text
        #predictions = [val_dataset.tokenizer.decode(generated_ids[0], skip_special_tokens=True)]

        bleu_score = bleu.compute(predictions=predictions[0], references=references[0])
        bleu_scores.append(bleu_score['bleu'])


        # Calculate WER score
        wer_score = wer.compute(predictions=predictions[0],references=references[0])
        wer_scores.append(wer_score)

        # Calculate TER score
        ter_score = ter.compute(predictions=predictions[0],references=references[0],
                       case_sensitive=False)
        #ter_score = TER(references[0][0], predictions[0])
        ter_scores.append(ter_score['score'])

        # Calculate ROUGE score
        rouge_score = rouge.compute(predictions=predictions[0],references=references[0])
        rouge1_scores.append(rouge_score['rouge1'])
        rouge2_scores.append(rouge_score['rouge2'])

Testing: 100%|██████████| 319/319 [12:53<00:00,  2.43s/it]


In [None]:
# Calculate average scores
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_wer = sum(wer_scores) / len(wer_scores)
avg_ter = sum(ter_scores) / len(ter_scores)
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)

# Print metric values
print("Testing Scores")
print("Metric\t\t\tWER\t\t\tTER\t\t\tBLEU\t\t\tROUGE-1\t\t\tROUGE-2")
print(f"Averages\t\t{avg_wer:.4f}\t\t\t{avg_ter/100:.4f}\t\t\t{avg_bleu:.4f}\t\t\t{avg_rouge1:.4f}\t\t\t{avg_rouge2:.4f}")

Testing Scores
Metric			WER			TER			BLEU			ROUGE-1			ROUGE-2
Averages		0.2647			0.2014			0.6562			0.8990			0.7573


Testing Scores
Metric			WER			TER			BLEU			ROUGE-1			ROUGE-2
Averages		0.2647			0.2014			0.6562			0.8990			0.7573

### MT5 eval -synthetic data

In [None]:
# Define the translation dataset with references
class TranslationDatasetWithRefs(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = T5Tokenizer.from_pretrained("t5-small")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(input_text.split()[:50]), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(target_text.split()[:50]), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()

        # Convert labels to text without special tokens
        references = [self.tokenizer.decode(labels, skip_special_tokens=True)]

        return {"input_ids": input_ids, "labels": labels, "references": references}

In [None]:
# Create a translation dataset and dataloader for testing
test_dataset = TranslationDatasetWithRefs(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Load the best model
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/SEM 3/NLP/nlp_project/results/MT5_small/best_model").to(device)
model.eval()
bleu_scores = []
wer_scores = []
ter_scores = []
meteor_scores = []
rouge1_scores = []
rouge2_scores = []
#ter = TranslationEditRate(normalize=True)


# Load the TER metric
ter = evaluate.load("ter")
wer = evaluate.load("wer")
rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        references = batch['references']


        # Generate predictions
        generated_ids = model.generate(input_ids, max_length=50)  # Adjust max_length as needed

        # Convert generated IDs to text
        predictions = [test_dataset.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)]


        # Convert generated IDs to text
        #predictions = [val_dataset.tokenizer.decode(generated_ids[0], skip_special_tokens=True)]

        bleu_score = bleu.compute(predictions=predictions[0], references=references[0])
        bleu_scores.append(bleu_score['bleu'])


        # Calculate WER score
        wer_score = wer.compute(predictions=predictions[0],references=references[0])
        wer_scores.append(wer_score)

        # Calculate TER score
        ter_score = ter.compute(predictions=predictions[0],references=references[0],
                       case_sensitive=False)
        #ter_score = TER(references[0][0], predictions[0])
        ter_scores.append(ter_score['score'])

        # Calculate ROUGE score
        rouge_score = rouge.compute(predictions=predictions[0],references=references[0])
        rouge1_scores.append(rouge_score['rouge1'])
        rouge2_scores.append(rouge_score['rouge2'])

Testing: 100%|██████████| 319/319 [05:16<00:00,  1.01it/s]


In [None]:
# Calculate average scores
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_wer = sum(wer_scores) / len(wer_scores)
avg_ter = sum(ter_scores) / len(ter_scores)
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)

# Print metric values
print("Testing Scores")
print("Metric\t\t\tWER\t\t\tTER\t\t\tBLEU\t\t\tROUGE-1\t\t\tROUGE-2")
print(f"Averages\t\t{avg_wer:.4f}\t\t\t{avg_ter/100:.4f}\t\t\t{avg_bleu:.4f}\t\t\t{avg_rouge1:.4f}\t\t\t{avg_rouge2:.4f}")

Testing Scores
Metric			WER			TER			BLEU			ROUGE-1			ROUGE-2
Averages		0.9738			0.9162			0.2369			0.5685			0.3587


Testing Scores
Metric			WER			TER			BLEU			ROUGE-1			ROUGE-2
Averages		0.9738			0.9162			0.2369			0.5685			0.3587

### Fine tuned MBART evaluation on phinc

In [None]:
# Load the test data
test_df = pd.read_csv("/content/drive/MyDrive/SEM 3/NLP/nlp_project/Dataset/phinc/test.csv")

In [None]:
# Define the translation dataset with references
class TranslationDatasetWithRefs(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(str(input_text).split()), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(str(target_text).split()), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()

        # Convert labels to text without special tokens
        references = [self.tokenizer.decode(labels, skip_special_tokens=True)]

        return {"input_ids": input_ids, "labels": labels, "references": references}

In [None]:
# Create a translation dataset and dataloader for testing
test_dataset = TranslationDatasetWithRefs(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Load the best model
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/SEM 3/NLP/nlp_project/results/MBART_finetune/best_model_fine").to(device)
model.eval()
bleu_scores = []
wer_scores = []
ter_scores = []
meteor_scores = []
rouge1_scores = []
rouge2_scores = []
#ter = TranslationEditRate(normalize=True)


# Load the TER metric
ter = evaluate.load("ter")
wer = evaluate.load("wer")
rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        references = batch['references']


        # Generate predictions
        generated_ids = model.generate(input_ids, max_length=100)  # Adjust max_length as needed

        # Convert generated IDs to text
        predictions = [test_dataset.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)]


        # Convert generated IDs to text
        #predictions = [val_dataset.tokenizer.decode(generated_ids[0], skip_special_tokens=True)]

        bleu_score = bleu.compute(predictions=predictions[0], references=references[0])
        bleu_scores.append(bleu_score['bleu'])


        # Calculate WER score
        wer_score = wer.compute(predictions=predictions[0],references=references[0])
        wer_scores.append(wer_score)

        # Calculate TER score
        ter_score = ter.compute(predictions=predictions[0],references=references[0],
                       case_sensitive=False)
        #ter_score = TER(references[0][0], predictions[0])
        ter_scores.append(ter_score['score'])

        # Calculate ROUGE score
        rouge_score = rouge.compute(predictions=predictions[0],references=references[0])
        rouge1_scores.append(rouge_score['rouge1'])
        rouge2_scores.append(rouge_score['rouge2'])

Testing: 100%|██████████| 39/39 [02:39<00:00,  4.10s/it]


In [None]:
# Calculate average scores
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_wer = sum(wer_scores) / len(wer_scores)
avg_ter = sum(ter_scores) / len(ter_scores)
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)

# Print metric values
print("Testing Scores")
print("Metric\t\t\tWER\t\t\tTER\t\t\tBLEU\t\t\tROUGE-1\t\t\tROUGE-2")
print(f"Averages\t\t{avg_wer:.4f}\t\t\t{avg_ter/100:.4f}\t\t\t{avg_bleu:.4f}\t\t\t{avg_rouge1:.4f}\t\t\t{avg_rouge2:.4f}")

Testing Scores
Metric			WER			TER			BLEU			ROUGE-1			ROUGE-2
Averages		0.6925			0.6304			0.2194			0.5351			0.3012


Testing Scores
Metric			WER			TER			BLEU			ROUGE-1			ROUGE-2
Averages		0.6925			0.6304			0.2194			0.5351			0.3012

### MT5 fine tuned evaluation on phinc data

In [None]:
# Load the test data
test_df = pd.read_csv("/content/drive/MyDrive/SEM 3/NLP/nlp_project/Dataset/phinc/test.csv")

In [None]:
# Define the translation dataset with references
class TranslationDatasetWithRefs(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = T5Tokenizer.from_pretrained("t5-small")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(str(input_text).split()), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(str(target_text).split()), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()

        # Convert labels to text without special tokens
        references = [self.tokenizer.decode(labels, skip_special_tokens=True)]

        return {"input_ids": input_ids, "labels": labels, "references": references}

In [None]:
# Create a translation dataset and dataloader for testing
test_dataset = TranslationDatasetWithRefs(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Load the best model
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/SEM 3/NLP/nlp_project/results/MT5_finetune/best_model_fine").to(device)
model.eval()
bleu_scores = []
wer_scores = []
ter_scores = []
meteor_scores = []
rouge1_scores = []
rouge2_scores = []
#ter = TranslationEditRate(normalize=True)


# Load the TER metric
ter = evaluate.load("ter")
wer = evaluate.load("wer")
rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        references = batch['references']


        # Generate predictions
        generated_ids = model.generate(input_ids, max_length=100)  # Adjust max_length as needed

        # Convert generated IDs to text
        predictions = [test_dataset.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)]


        # Convert generated IDs to text
        #predictions = [val_dataset.tokenizer.decode(generated_ids[0], skip_special_tokens=True)]

        bleu_score = bleu.compute(predictions=predictions[0], references=references[0])
        bleu_scores.append(bleu_score['bleu'])


        # Calculate WER score
        wer_score = wer.compute(predictions=predictions[0],references=references[0])
        wer_scores.append(wer_score)

        # Calculate TER score
        ter_score = ter.compute(predictions=predictions[0],references=references[0],
                       case_sensitive=False)
        #ter_score = TER(references[0][0], predictions[0])
        ter_scores.append(ter_score['score'])

        # Calculate ROUGE score
        rouge_score = rouge.compute(predictions=predictions[0],references=references[0])
        rouge1_scores.append(rouge_score['rouge1'])
        rouge2_scores.append(rouge_score['rouge2'])

Testing: 100%|██████████| 155/155 [02:54<00:00,  1.12s/it]


In [None]:
# Calculate average scores
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_wer = sum(wer_scores) / len(wer_scores)
avg_ter = sum(ter_scores) / len(ter_scores)
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)

# Print metric values
print("Testing Scores")
print("Metric\t\t\tWER\t\t\tTER\t\t\tBLEU\t\t\tROUGE-1\t\t\tROUGE-2")
print(f"Averages\t\t{avg_wer:.4f}\t\t\t{avg_ter/100:.4f}\t\t\t{avg_bleu:.4f}\t\t\t{avg_rouge1:.4f}\t\t\t{avg_rouge2:.4f}")

Testing Scores
Metric			WER			TER			BLEU			ROUGE-1			ROUGE-2
Averages		1.1808			1.1411			0.1011			0.2897			0.1488
