In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install sentencepiece
!pip install sacremoses
!pip install git+https://github.com/nltk/nltk_contrib.git#egg=nltk_contrib
!pip install jiwer nltk sacrebleu rouge-score
!pip install --upgrade jiwer
!pip install torchmetrics
!pip install evaluate

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Collecting nltk_contrib
  Cloning https://github.com/nltk/nltk_contrib.git to /tmp/pip-install-d78l8red/nltk-contrib_f382bc198e6740f4b951879c2de93899
  Running command git clone --filter=blob:none --quiet https://github.com/nltk/nltk_contrib.git /tmp/pip-install-d78l8red/nltk-contrib_f382bc198e6740f4b951879c2de93899
  Resolved https://github.com/nltk/nltk_contrib.git to co

In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import MBartForConditionalGeneration, MBartTokenizer, AdamW, AutoTokenizer
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import sacrebleu
from nltk.translate.meteor_score import meteor_score
import jiwer
from jiwer import wer
from rouge_score import rouge_scorer
from torchmetrics.text import TranslationEditRate
from tqdm import tqdm

train

In [None]:
train_df = pd.read_csv("/content/drive/MyDrive/nlp_project/Dataset/phinc/train.csv")
val_df = pd.read_csv("/content/drive/MyDrive/nlp_project/Dataset/phinc/val.csv")

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(str(input_text).split()[:50]), return_tensors='pt', truncation=True,max_length=50,padding='max_length')['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(str(target_text).split()[:50]), return_tensors='pt', truncation=True,max_length=50,padding='max_length')['input_ids'].squeeze()

        return {"input_ids": input_ids, "labels": labels}

In [None]:
# Define the translation dataset with references
class TranslationDatasetWithRefs(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(str(input_text).split()), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(str(target_text).split()), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()

        # Convert labels to text without special tokens
        references = [self.tokenizer.decode(labels, skip_special_tokens=True)]

        return {"input_ids": input_ids, "labels": labels, "references": references}

In [None]:
# Create translation datasets and dataloaders for training and validation
train_dataset = TranslationDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataset = TranslationDatasetWithRefs(val_df)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)  # No need to shuffle validation data



```
# This is formatted as code
```

### mBERT

In [None]:
# Set up the model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [None]:
# Fine-tuning loop with validation and model checkpointing
num_epochs = 5
best_bleu = 0.0

for epoch in range(num_epochs):
    total_loss = 0.0
    model.train()

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} (Training)"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Training Loss - {average_loss}")

    # Validation loop
    model.eval()
    bleu_scores = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} (Validation)"):
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            references = batch['references']

            # Generate predictions
            generated_ids = model.generate(input_ids, max_length=128)  # Adjust max_length as needed

            # Convert generated IDs to text
            predictions = [val_dataset.tokenizer.decode(generated_ids[0], skip_special_tokens=True)]

            # Calculate BLEU score
            bleu_score = corpus_bleu(references, predictions)
            bleu_scores.append(bleu_score)

    # Calculate average BLEU score
    avg_bleu = sum(bleu_scores) / len(bleu_scores)

    print(f"Epoch {epoch + 1}/{num_epochs}: Average BLEU - {avg_bleu}\n")

    # Save the model if the average BLEU score is the best so far
    if avg_bleu > best_bleu:
        best_bleu = avg_bleu
        model.save_pretrained(f"/content/drive/MyDrive/nlp_project/results/mBART-PHINC-WO-Pre/best_model_{epoch}_{avg_bleu}")
        print("best model saved!\n\n")

print("Training completed.")

Epoch 1/5 (Training): 100%|██████████| 2404/2404 [18:46<00:00,  2.13it/s]


Epoch 1/5: Training Loss - 1.3041881373192428


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Epoch 1/5 (Validation): 100%|██████████| 722/722 [09:29<00:00,  1.27it/s]


Epoch 1/5: Average BLEU - 0.640257875178401

best model saved!




Epoch 2/5 (Training): 100%|██████████| 2404/2404 [18:48<00:00,  2.13it/s]


Epoch 2/5: Training Loss - 0.5416241454459516


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Epoch 2/5 (Validation): 100%|██████████| 722/722 [08:57<00:00,  1.34it/s]


Epoch 2/5: Average BLEU - 0.6626252564753994

best model saved!




Epoch 3/5 (Training): 100%|██████████| 2404/2404 [18:47<00:00,  2.13it/s]


Epoch 3/5: Training Loss - 0.31049990554041107


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Epoch 3/5 (Validation): 100%|██████████| 722/722 [09:00<00:00,  1.33it/s]


Epoch 3/5: Average BLEU - 0.6624019173666208



Epoch 4/5 (Training): 100%|██████████| 2404/2404 [18:47<00:00,  2.13it/s]


Epoch 4/5: Training Loss - 0.16181285615495952


Epoch 4/5 (Validation): 100%|██████████| 722/722 [09:06<00:00,  1.32it/s]


Epoch 4/5: Average BLEU - 0.6478491719236364



Epoch 5/5 (Training): 100%|██████████| 2404/2404 [18:46<00:00,  2.13it/s]


Epoch 5/5: Training Loss - 0.09520684312753994


Epoch 5/5 (Validation): 100%|██████████| 722/722 [09:16<00:00,  1.30it/s]

Epoch 5/5: Average BLEU - 0.6403348835962402

Training completed.





test

In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import MarianMTModel, MarianTokenizer, AdamW
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import sacrebleu
from nltk.translate.meteor_score import meteor_score
import jiwer
from jiwer import wer
from rouge_score import rouge_scorer
from torchmetrics.text import TranslationEditRate
from tqdm import tqdm
import evaluate

In [None]:
# Load the test data
test_df = pd.read_csv("/content/drive/MyDrive/nlp_project/Dataset/phinc/test.csv")

In [None]:
# Create a translation dataset and dataloader for testing
test_dataset = TranslationDatasetWithRefs(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
# Load the best model
model = MBartForConditionalGeneration.from_pretrained("/content/drive/MyDrive/nlp_project/results/mBART-PHINC-WO-Pre/best_model_1_0.6626252564753994").to(device)

model.eval()
bleu_scores = []
wer_scores = []
ter_scores = []
meteor_scores = []
rouge1_scores = []
rouge2_scores = []
#ter = TranslationEditRate(normalize=True)


# Load the TER metric
ter = evaluate.load("ter")
wer = evaluate.load("wer")
rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        references = batch['references']


        # Generate predictions
        generated_ids = model.generate(input_ids, max_length=100)  # Adjust max_length as needed

        # Convert generated IDs to text
        predictions = [test_dataset.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)]


        # Convert generated IDs to text
        #predictions = [val_dataset.tokenizer.decode(generated_ids[0], skip_special_tokens=True)]

        bleu_score = bleu.compute(predictions=predictions[0], references=references[0])
        bleu_scores.append(bleu_score['bleu'])


        # Calculate WER score
        wer_score = wer.compute(predictions=predictions[0],references=references[0])
        wer_scores.append(wer_score)

        # Calculate TER score
        ter_score = ter.compute(predictions=predictions[0],references=references[0],
                       case_sensitive=False)
        #ter_score = TER(references[0][0], predictions[0])
        ter_scores.append(ter_score['score'])

        # Calculate ROUGE score
        rouge_score = rouge.compute(predictions=predictions[0],references=references[0])
        rouge1_scores.append(rouge_score['rouge1'])
        rouge2_scores.append(rouge_score['rouge2'])

Downloading builder script:   0%|          | 0.00/9.99k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Testing: 100%|██████████| 1237/1237 [14:34<00:00,  1.41it/s]


In [None]:
# Calculate average scores
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_wer = sum(wer_scores) / len(wer_scores)
avg_ter = sum(ter_scores) / len(ter_scores)
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)

# Print metric values
print("Testing Scores")
print("Metric\t\t\tWER\t\t\tTER\t\t\tBLEU\t\t\tROUGE-1\t\t\tROUGE-2")
print(f"Averages\t\t{avg_wer:.4f}\t\t\t{avg_ter/100:.4f}\t\t\t{avg_bleu:.4f}\t\t\t{avg_rouge1:.4f}\t\t\t{avg_rouge2:.4f}")

Testing Scores
Metric			WER			TER			BLEU			ROUGE-1			ROUGE-2
Averages		0.9092			0.8539			0.1450			0.5458			0.3139


#MarianMT_finetune

In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import MarianMTModel, MarianTokenizer, AdamW,AutoTokenizer
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import sacrebleu
from nltk.translate.meteor_score import meteor_score
import jiwer
from jiwer import wer
from rouge_score import rouge_scorer
from torchmetrics.text import TranslationEditRate
from tqdm import tqdm
import evaluate
from transformers import AutoModelForSeq2SeqLM

In [None]:
train_df = pd.read_csv("/content/drive/MyDrive/nlp_project/Dataset/phinc/train.csv")
val_df = pd.read_csv("/content/drive/MyDrive/nlp_project/Dataset/phinc/val.csv")

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-cs")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(str(input_text), return_tensors='pt', truncation=True,max_length=100,padding='max_length')['input_ids'].squeeze()
        labels = self.tokenizer(str(target_text), return_tensors='pt', truncation=True,max_length=100,padding='max_length')['input_ids'].squeeze()

        return {"input_ids": input_ids, "labels": labels}

In [None]:
# Define the translation dataset with references
class TranslationDatasetWithRefs(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-cs")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(str(input_text), return_tensors='pt',truncation=True,padding='max_length', max_length=100)['input_ids'].squeeze()
        labels = self.tokenizer(str(target_text), return_tensors='pt',truncation=True,padding='max_length', max_length=100)['input_ids'].squeeze()

        # Convert labels to text without special tokens
        references = [self.tokenizer.decode(labels, skip_special_tokens=True)]

        return {"input_ids": input_ids, "labels": labels, "references": references}

In [None]:
# Create translation datasets and dataloaders for training and validation
train_dataset = TranslationDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=50, shuffle=True)
val_dataset = TranslationDatasetWithRefs(val_df)
val_dataloader = DataLoader(val_dataset, batch_size=50, shuffle=False)  # No need to shuffle validation data

In [None]:
# Set up the model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MarianMTModel.from_pretrained("/content/drive/MyDrive/nlp_project/results/MarianMT/best_model").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
# Fine-tuning loop with validation and model checkpointing
num_epochs = 3
best_loss = 9999

for epoch in range(num_epochs):
    total_loss = 0.0
    model.train()

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} (Training)"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Training Loss - {average_loss}")

    if average_loss < best_loss:
        best_loss = average_loss
        model.save_pretrained("/content/drive/MyDrive/nlp_project/results/MarianMT_finetune/best_model_finetune")
        print("best model saved!\n\n")

print("Training completed.")

Epoch 1/3 (Training): 100%|██████████| 193/193 [02:38<00:00,  1.22it/s]


Epoch 1/3: Training Loss - 0.3803178893469776
best model saved!




Epoch 2/3 (Training): 100%|██████████| 193/193 [02:36<00:00,  1.23it/s]


Epoch 2/3: Training Loss - 0.3810721668245879


Epoch 3/3 (Training): 100%|██████████| 193/193 [02:35<00:00,  1.24it/s]

Epoch 3/3: Training Loss - 0.3808857140763436
Training completed.





#test

In [None]:
# Load the test data
test_df = pd.read_csv("/content/drive/MyDrive/nlp_project/Dataset/phinc/test.csv")

In [None]:
# Create a translation dataset and dataloader for testing
test_dataset = TranslationDatasetWithRefs(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
# Load the best model
model = MarianMTModel.from_pretrained("/content/drive/MyDrive/nlp_project/results/MarianMT_finetune/best_model_finetune").to(device)

model.eval()
bleu_scores = []
wer_scores = []
ter_scores = []
meteor_scores = []
rouge1_scores = []
rouge2_scores = []
#ter = TranslationEditRate(normalize=True)


# Load the TER metric
ter = evaluate.load("ter")
wer = evaluate.load("wer")
rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        references = batch['references']


        # Generate predictions
        generated_ids = model.generate(input_ids, max_length=100)  # Adjust max_length as needed

        # Convert generated IDs to text
        predictions = [test_dataset.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)]


        # Convert generated IDs to text
        #predictions = [val_dataset.tokenizer.decode(generated_ids[0], skip_special_tokens=True)]

        bleu_score = bleu.compute(predictions=predictions[0], references=references[0])
        bleu_scores.append(bleu_score['bleu'])


        # Calculate WER score
        wer_score = wer.compute(predictions=predictions[0],references=references[0])
        wer_scores.append(wer_score)

        # Calculate TER score
        ter_score = ter.compute(predictions=predictions[0],references=references[0],
                       case_sensitive=False)
        #ter_score = TER(references[0][0], predictions[0])
        ter_scores.append(ter_score['score'])

        # Calculate ROUGE score
        rouge_score = rouge.compute(predictions=predictions[0],references=references[0])
        rouge1_scores.append(rouge_score['rouge1'])
        rouge2_scores.append(rouge_score['rouge2'])

Testing: 100%|██████████| 1237/1237 [08:38<00:00,  2.38it/s]


In [None]:
# Calculate average scores
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_wer = sum(wer_scores) / len(wer_scores)
avg_ter = sum(ter_scores) / len(ter_scores)
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)

# Print metric values
print("Testing Scores")
print("Metric\t\t\tWER\t\t\tTER\t\t\tBLEU\t\t\tROUGE-1\t\t\tROUGE-2")
print(f"Averages\t\t{avg_wer:.4f}\t\t\t{avg_ter/100:.4f}\t\t\t{avg_bleu:.4f}\t\t\t{avg_rouge1:.4f}\t\t\t{avg_rouge2:.4f}")

Testing Scores
Metric			WER			TER			BLEU			ROUGE-1			ROUGE-2
Averages		1.0047			0.9481			0.1017			0.4822			0.2501
