In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install sentencepiece
!pip install sacremoses
!pip install git+https://github.com/nltk/nltk_contrib.git#egg=nltk_contrib
!pip install jiwer nltk sacrebleu rouge-score
!pip install --upgrade jiwer
!pip install torchmetrics
!pip install evaluate

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Collecting nltk_contrib
  Cloning https://github.com/nltk/nltk_contrib.git to /tmp/pip-install-3zs0bj7e/nltk-contrib_9b32091f6ed54b81b25034b1d12c31b3
  Running command git clone --filter=blob:none --quiet https://github.com/nltk/nltk_contrib.git /tmp/pip-install-3zs0bj7e/nltk-contrib_9b32091f6ed54b81b25034b1d12c31b3
  Resolved https://github.com/nltk/nltk_contrib.git to co

In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import MarianMTModel, MarianTokenizer, AdamW,AutoTokenizer
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import sacrebleu
from nltk.translate.meteor_score import meteor_score
import jiwer
from jiwer import wer
from rouge_score import rouge_scorer
from torchmetrics.text import TranslationEditRate
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM
from transformers import T5ForConditionalGeneration, T5Tokenizer

train

In [None]:
train_df = pd.read_csv("/content/drive/MyDrive/nlp_project/Dataset/train.tsv",delimiter='\t')
val_df = pd.read_csv("/content/drive/MyDrive/nlp_project/Dataset/val.tsv",delimiter='\t')

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-cs")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(input_text.split()[:100]), return_tensors='pt', truncation=True,max_length=100,padding='max_length')['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(target_text.split()[:100]), return_tensors='pt', truncation=True,max_length=100,padding='max_length')['input_ids'].squeeze()

        return {"input_ids": input_ids, "labels": labels}

In [None]:
# Define the translation dataset with references
class TranslationDatasetWithRefs(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-cs")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(input_text.split()[:100]), return_tensors='pt',truncation=True,padding='max_length', max_length=100)['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(target_text.split()[:100]), return_tensors='pt',truncation=True,padding='max_length', max_length=100)['input_ids'].squeeze()

        # Convert labels to text without special tokens
        references = [self.tokenizer.decode(labels, skip_special_tokens=True)]

        return {"input_ids": input_ids, "labels": labels, "references": references}

In [None]:
# Create translation datasets and dataloaders for training and validation
train_dataset = TranslationDataset(train_df[:100000])
train_dataloader = DataLoader(train_dataset, batch_size=50, shuffle=True)
val_dataset = TranslationDatasetWithRefs(val_df)
val_dataloader = DataLoader(val_dataset, batch_size=50, shuffle=False)  # No need to shuffle validation data

### MarianMT

In [None]:
# Set up the model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-cs").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
# Fine-tuning loop with validation and model checkpointing
num_epochs = 3
best_bleu = 0.0

for epoch in range(num_epochs):
    total_loss = 0.0
    model.train()

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} (Training)"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Training Loss - {average_loss}")

    # Validation loop
    model.eval()
    bleu_scores = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} (Validation)"):
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            references = batch['references']

            # Generate predictions
            generated_ids = model.generate(input_ids, max_length=128)  # Adjust max_length as needed

            # Convert generated IDs to text
            predictions = [val_dataset.tokenizer.decode(generated_ids[0], skip_special_tokens=True)]

            # Calculate BLEU score
            bleu_score = corpus_bleu(references, predictions)
            bleu_scores.append(bleu_score)

    # Calculate average BLEU score
    avg_bleu = sum(bleu_scores) / len(bleu_scores)

    print(f"Epoch {epoch + 1}/{num_epochs}: Average BLEU - {avg_bleu}\n")

    # Save the model if the average BLEU score is the best so far
    if avg_bleu > best_bleu:
        best_bleu = avg_bleu
        model.save_pretrained("/content/drive/MyDrive/nlp_project/results/MarianMT/best_model")
        print("best model saved!\n\n")

print("Training completed.")

Epoch 1/3 (Training): 100%|██████████| 2000/2000 [08:47<00:00,  3.79it/s]


Epoch 1/3: Training Loss - 0.09966550359874964


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Epoch 1/3 (Validation): 100%|██████████| 477/477 [06:20<00:00,  1.25it/s]


Epoch 1/3: Average BLEU - 0.9522358392499927

best model saved!




Epoch 2/3 (Training): 100%|██████████| 2000/2000 [08:45<00:00,  3.80it/s]


Epoch 2/3: Training Loss - 0.07117461261712014


Epoch 2/3 (Validation): 100%|██████████| 477/477 [05:41<00:00,  1.40it/s]


Epoch 2/3: Average BLEU - 0.9630123585905087

best model saved!




Epoch 3/3 (Training): 100%|██████████| 2000/2000 [08:40<00:00,  3.84it/s]


Epoch 3/3: Training Loss - 0.05967495138756931


Epoch 3/3 (Validation): 100%|██████████| 477/477 [05:38<00:00,  1.41it/s]

Epoch 3/3: Average BLEU - 0.9629942724934702

Training completed.





### MBART


In [None]:
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(input_text, return_tensors='pt', truncation=True,
                                    max_length=50, padding='max_length')['input_ids'].squeeze()
        labels = self.tokenizer(target_text, return_tensors='pt', truncation=True,
                                 max_length=50, padding='max_length')['input_ids'].squeeze()

        return {"input_ids": input_ids, "labels": labels}

In [None]:
# Define the translation dataset with references
class TranslationDatasetWithRefs(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(input_text.split()[:50]), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(target_text.split()[:50]), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()

        # Convert labels to text without special tokens
        references = [self.tokenizer.decode(labels, skip_special_tokens=True)]

        return {"input_ids": input_ids, "labels": labels, "references": references}

In [None]:
# Create translation datasets and dataloaders for training and validation
train_dataset = TranslationDataset(train_df[:50000])
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataset = TranslationDatasetWithRefs(val_df)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)  # No need to shuffle validation data

In [None]:
# Set up the model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Choose your MBART model
model_name = "facebook/mbart-large-50-many-to-many-mmt"
# Load the MBART model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
# Fine-tuning loop with validation and model checkpointing
num_epochs = 3
best_bleu = 0.0

for epoch in range(num_epochs):
    total_loss = 0.0
    model.train()

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} (Training)"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Training Loss - {average_loss}")

    # Validation loop
    model.eval()
    bleu_scores = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} (Validation)"):
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            references = batch['references']

            # Generate predictions
            generated_ids = model.generate(input_ids, max_length=128)  # Adjust max_length as needed

            # Convert generated IDs to text
            predictions = [val_dataset.tokenizer.decode(generated_ids[0], skip_special_tokens=True)]

            # Calculate BLEU score
            bleu_score = corpus_bleu(references, predictions)
            bleu_scores.append(bleu_score)

    # Calculate average BLEU score
    avg_bleu = sum(bleu_scores) / len(bleu_scores)

    print(f"Epoch {epoch + 1}/{num_epochs}: Average BLEU - {avg_bleu}\n")

    # Save the model if the average BLEU score is the best so far
    if avg_bleu > best_bleu:
        best_bleu = avg_bleu
        model.save_pretrained("/content/drive/MyDrive/nlp_project/results/MBART_large/best_model")
        print("best model saved!\n\n")

print("Training completed.")

Epoch 1/3 (Training): 100%|██████████| 6250/6250 [22:54<00:00,  4.55it/s]


Epoch 1/3: Training Loss - 0.15761706722021102


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Epoch 1/3 (Validation): 100%|██████████| 2977/2977 [23:02<00:00,  2.15it/s]


Epoch 1/3: Average BLEU - 0.9150602215896705

best model saved!




Epoch 2/3 (Training): 100%|██████████| 6250/6250 [22:55<00:00,  4.54it/s]


Epoch 2/3: Training Loss - 0.08827320582807065


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Epoch 2/3 (Validation): 100%|██████████| 2977/2977 [22:57<00:00,  2.16it/s]


Epoch 2/3: Average BLEU - 0.9189482670400909

best model saved!




Epoch 3/3 (Training): 100%|██████████| 6250/6250 [22:57<00:00,  4.54it/s]


Epoch 3/3: Training Loss - 0.06902846349552273


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Epoch 3/3 (Validation): 100%|██████████| 2977/2977 [23:11<00:00,  2.14it/s]


Epoch 3/3: Average BLEU - 0.9226437472187894

best model saved!


Training completed.


### MT5

In [None]:
# Set up the model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Load the T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = T5Tokenizer.from_pretrained("t5-small")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(input_text, return_tensors='pt', truncation=True,
                                    max_length=50, padding='max_length')['input_ids'].squeeze()
        labels = self.tokenizer(target_text, return_tensors='pt', truncation=True,
                                 max_length=50, padding='max_length')['input_ids'].squeeze()

        return {"input_ids": input_ids, "labels": labels}

In [None]:
# Define the translation dataset with references
class TranslationDatasetWithRefs(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = T5Tokenizer.from_pretrained("t5-small")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(input_text.split()[:50]), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(target_text.split()[:50]), return_tensors='pt',truncation=True,padding='max_length', max_length=50)['input_ids'].squeeze()

        # Convert labels to text without special tokens
        references = [self.tokenizer.decode(labels, skip_special_tokens=True)]

        return {"input_ids": input_ids, "labels": labels, "references": references}

In [None]:
# Create translation datasets and dataloaders for training and validation
train_dataset = TranslationDataset(train_df[:50000])
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = TranslationDatasetWithRefs(val_df)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)  # No need to shuffle validation data

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Fine-tuning loop with validation and model checkpointing
num_epochs = 3
best_bleu = 0.0

for epoch in range(num_epochs):
    total_loss = 0.0
    model.train()

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} (Training)"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Training Loss - {average_loss}")

    # Validation loop
    model.eval()
    bleu_scores = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} (Validation)"):
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            references = batch['references']

            # Generate predictions
            generated_ids = model.generate(input_ids, max_length=50)  # Adjust max_length as needed

            # Convert generated IDs to text
            predictions = [val_dataset.tokenizer.decode(generated_ids[0], skip_special_tokens=True)]

            # Calculate BLEU score
            bleu_score = corpus_bleu(references, predictions)
            bleu_scores.append(bleu_score)

    # Calculate average BLEU score
    avg_bleu = sum(bleu_scores) / len(bleu_scores)

    print(f"Epoch {epoch + 1}/{num_epochs}: Average BLEU - {avg_bleu}\n")

    # Save the model if the average BLEU score is the best so far
    if avg_bleu > best_bleu:
        best_bleu = avg_bleu
        model.save_pretrained("/content/drive/MyDrive/nlp_project/results/MT5_small/best_model")
        print("best model saved!\n\n")

print("Training completed.")

Epoch 1/3 (Training): 100%|██████████| 1563/1563 [02:02<00:00, 12.75it/s]


Epoch 1/3: Training Loss - 0.5528093915068027


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Epoch 1/3 (Validation): 100%|██████████| 745/745 [07:39<00:00,  1.62it/s]


Epoch 1/3: Average BLEU - 0.7256025620721318

best model saved!




Epoch 2/3 (Training): 100%|██████████| 1563/1563 [02:03<00:00, 12.66it/s]


Epoch 2/3: Training Loss - 0.3247207480215218


Epoch 2/3 (Validation): 100%|██████████| 745/745 [06:20<00:00,  1.96it/s]


Epoch 2/3: Average BLEU - 0.7771388735043575

best model saved!




Epoch 3/3 (Training): 100%|██████████| 1563/1563 [02:03<00:00, 12.67it/s]


Epoch 3/3: Training Loss - 0.26968117788557966


Epoch 3/3 (Validation): 100%|██████████| 745/745 [05:49<00:00,  2.13it/s]


Epoch 3/3: Average BLEU - 0.8018909409174928

best model saved!


Training completed.


### Fine tune MBART on phinc dataset

In [None]:
train_df = pd.read_csv("/content/drive/MyDrive/nlp_project/Dataset/phinc/train.csv")

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(str(input_text).split()[:50]), return_tensors='pt', truncation=True,max_length=50,padding='max_length')['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(str(target_text).split()[:50]), return_tensors='pt', truncation=True,max_length=50,padding='max_length')['input_ids'].squeeze()

        return {"input_ids": input_ids, "labels": labels}

In [None]:
# Create translation datasets and dataloaders for training and validation
train_dataset = TranslationDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [None]:
# Set up the model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/nlp_project/results/MBART_large/best_model").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
# Fine-tuning loop with validation and model checkpointing
num_epochs = 3
best_loss = 9999

for epoch in range(num_epochs):
    total_loss = 0.0
    model.train()

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} (Training)"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Training Loss - {average_loss}")

    if average_loss > best_loss:
        best_loss = average_loss
        model.save_pretrained("/content/drive/MyDrive/nlp_project/results/MBART_finetune/best_model_fine")
        print("best model saved!\n\n")

print("Training completed.")

Epoch 1/3 (Training): 100%|██████████| 1202/1202 [04:25<00:00,  4.52it/s]


Epoch 1/3: Training Loss - 0.48302928760598185


Epoch 2/3 (Training): 100%|██████████| 1202/1202 [04:25<00:00,  4.53it/s]


Epoch 2/3: Training Loss - 0.277462662121718


Epoch 3/3 (Training): 100%|██████████| 1202/1202 [04:25<00:00,  4.52it/s]

Epoch 3/3: Training Loss - 0.13785264624620536
Training completed.





In [None]:
model.save_pretrained("/content/drive/MyDrive/nlp_project/results/MBART_finetune/best_model_fine")

### Fine tune MT5 on phinc

In [None]:
train_df = pd.read_csv("/content/drive/MyDrive/nlp_project/Dataset/phinc/train.csv")

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = T5Tokenizer.from_pretrained("t5-small")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data['cs_query'][idx]
        target_text = self.data['en_query'][idx]

        input_ids = self.tokenizer(' '.join(str(input_text).split()[:50]), return_tensors='pt', truncation=True,max_length=50,padding='max_length')['input_ids'].squeeze()
        labels = self.tokenizer(' '.join(str(target_text).split()[:50]), return_tensors='pt', truncation=True,max_length=50,padding='max_length')['input_ids'].squeeze()

        return {"input_ids": input_ids, "labels": labels}

In [None]:
# Create translation datasets and dataloaders for training and validation
train_dataset = TranslationDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Set up the model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Load the T5 model
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/nlp_project/results/MT5_small/best_model").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
# Fine-tuning loop with validation and model checkpointing
num_epochs = 3
best_loss = 9999

for epoch in range(num_epochs):
    total_loss = 0.0
    model.train()

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} (Training)"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Training Loss - {average_loss}")

    if average_loss < best_loss:
        best_loss = average_loss
        model.save_pretrained("/content/drive/MyDrive/nlp_project/results/MT5_finetune/best_model_fine")
        print("best model saved!\n\n")

print("Training completed.")

Epoch 1/3 (Training): 100%|██████████| 301/301 [00:24<00:00, 12.14it/s]


Epoch 1/3: Training Loss - 1.2617863386968442
best model saved!




Epoch 2/3 (Training): 100%|██████████| 301/301 [00:24<00:00, 12.14it/s]


Epoch 2/3: Training Loss - 1.1592591006890485
best model saved!




Epoch 3/3 (Training): 100%|██████████| 301/301 [00:24<00:00, 12.13it/s]


Epoch 3/3: Training Loss - 1.122371028626084
best model saved!


Training completed.
