In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the project folder under MyDrive is reachable from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
ls  # sanity check: drive/ should be listed once the mount has succeeded

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [None]:
cd drive/MyDrive/Translation2/

/content/drive/MyDrive/Translation2


In [None]:
ls  # project folder: expect Data/, models.py, utilities.py, dictionary.py, train.py

[0m[01;34mData[0m/          metrics_history.json  [01;34m__pycache__[0m/   train.py            utilities.py
dictionary.py  models.py             [01;34msaved_models[0m/  Translation2.ipynb


In [None]:
!pip install torch transformers datasets sacrebleu nltk wandb sentencepiece

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading datasets-3.2.0-py3-none-any.whl 

In [None]:
# Fetch the NLTK resources used by the evaluation code into /root/nltk_data.
import nltk
nltk.download('punkt')    # tokenizer models
nltk.download('wordnet')  # WordNet corpus; train.py scores METEOR via nltk.translate.meteor_score
nltk.download('omw-1.4')  # optional: Open Multilingual WordNet data


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
%%writefile train.py

import torch
import torch.nn as nn
import torch.optim as optim
import argparse
import time
import os
import json
import pickle
import numpy as np
from random import shuffle
from models import Encoder, Decoder, Transformer
from utilities import load_files, load_batches, tokenize

# Additional imports for metrics
from sacrebleu.metrics import BLEU, CHRF
import nltk
from nltk.translate.meteor_score import meteor_score
nltk.download('punkt')
# meteor_score() matches synonyms through the WordNet corpus; without this download a
# stand-alone run of train.py fails with a LookupError at the first METEOR computation.
nltk.download('wordnet')

# Reserved vocabulary ids shared by tokenisation, loss masking and decoding.
PAD_TOKEN = 0  # padding; ignored by the loss (see CrossEntropyLoss(ignore_index=PAD_TOKEN))
SOS_TOKEN = 1  # start-of-sequence; first decoder input during greedy decoding
EOS_TOKEN = 2  # end-of-sequence

class MetricsLogger:
    """Collects one metrics record per epoch in memory and writes them out as JSON on request."""

    def __init__(self, output_path="metrics_history.json"):
        # Nothing touches the disk until save() is called.
        self.metrics_history = []
        self.output_path = output_path

    def log_epoch_metrics(self, epoch, metrics_dict):
        """Append a record for `epoch`; entries of `metrics_dict` are merged on top of {"epoch": epoch}."""
        record = dict(epoch=epoch)
        record.update(metrics_dict)
        self.metrics_history += [record]

    def save(self):
        """Overwrite `output_path` with the whole history as 2-space-indented JSON."""
        with open(self.output_path, "w") as handle:
            handle.write(json.dumps(self.metrics_history, indent=2))

def compute_metrics(predictions, references):
    """
    Score system outputs against single references.

    Args:
        predictions: list of hypothesis strings.
        references: list of reference strings, aligned index-by-index with `predictions`.

    Returns:
        dict with keys "bleu" and "chrf" (sacrebleu corpus-level scores) and "meteor"
        (mean sentence-level NLTK METEOR, multiplied by 100).
        When `predictions` is empty all three scores are 0.0.
    """
    # An empty evaluation split would otherwise produce NaN from np.mean([]), which
    # json.dump then writes as the non-standard token NaN into metrics_history.json.
    if len(predictions) == 0:
        return {"bleu": 0.0, "chrf": 0.0, "meteor": 0.0}

    # sacrebleu expects a list of reference streams; there is exactly one stream here.
    bleu_result = BLEU().corpus_score(predictions, [references])
    chrf_result = CHRF().corpus_score(predictions, [references])

    # METEOR is sentence-level in NLTK and takes pre-tokenised input; whitespace
    # splitting mirrors how ids_to_string joins tokens.
    meteor_vals = [
        meteor_score([ref.split()], hyp.split())
        for hyp, ref in zip(predictions, references)
    ]
    meteor_avg = 100 * np.mean(meteor_vals)

    return {
        "bleu": bleu_result.score,
        "chrf": chrf_result.score,
        "meteor": meteor_avg
    }

def ids_to_string(ids_list, dictionary):
    """
    Render token ids as a space-separated string.

    PAD, SOS and EOS ids are dropped wherever they occur; every other id is looked up
    in `dictionary.index2word` (index -> token).
    """
    special_ids = (PAD_TOKEN, SOS_TOKEN, EOS_TOKEN)
    return " ".join(
        dictionary.index2word[token_id]
        for token_id in ids_list
        if token_id not in special_ids
    )

def greedy_decode(model, src, src_mask, max_len=60, start_symbol=SOS_TOKEN):
    """
    Greedy (argmax) autoregressive decoding for a batch.

    Args:
        model: object exposing `encoder`, `decoder`, `make_target_mask` and `device`,
            used exactly as in the calls below.
        src: source batch; only `src.size(0)` (the batch size) is read here.
        src_mask: mask forwarded unchanged to the encoder and decoder.
        max_len: maximum output length including the start symbol.
        start_symbol: id placed in the first column of every output row.

    Returns:
        LongTensor with one row per batch element and at most `max_len` columns.
        Once a row has produced EOS_TOKEN, all of its later positions are PAD_TOKEN,
        and decoding stops as soon as every row has finished.
    """
    model.eval()  # set model to evaluation mode
    # Inference only: never build an autograd graph, even if the caller forgot no_grad().
    with torch.no_grad():
        memory = model.encoder(src, src_mask)
        ys = torch.full((src.size(0), 1), start_symbol, dtype=torch.long, device=model.device)
        # Tracks which rows have already emitted EOS.
        finished = torch.zeros_like(ys[:, 0], dtype=torch.bool)

        for _step in range(max_len - 1):
            target_mask = model.make_target_mask(ys)
            out, _ = model.decoder(ys, memory, target_mask, src_mask)
            next_word = torch.argmax(out[:, -1, :], dim=1)
            # Whatever the model predicts after EOS is noise; replace it with padding so it
            # cannot leak into the hypothesis strings used for BLEU/CHRF/METEOR.
            next_word = next_word.masked_fill(finished, PAD_TOKEN)
            ys = torch.cat([ys, next_word.unsqueeze(1)], dim=1)
            finished = finished | (next_word == EOS_TOKEN)
            if bool(finished.all()):
                break
    return ys

class Trainer:
    """
    End-to-end training driver for the word-level Transformer translator.

    Construction loads the parallel corpus, builds and pickles both vocabularies,
    tokenises the data, splits it 70/15/15 into train/validation/test batches and
    builds the model, loss and Adam optimiser. `train` then runs the epoch loop with
    per-epoch checkpointing, metric logging and early stopping on validation loss.
    """

    def initialize_weights(self, model):
        """Xavier-uniform init for any module whose `weight` has more than one dimension.

        Intended for `nn.Module.apply`, which calls it once per sub-module.
        """
        weight = getattr(model, 'weight', None)
        # Some modules expose `weight = None` (e.g. LayerNorm without affine parameters).
        if weight is not None and weight.dim() > 1:
            nn.init.xavier_uniform_(weight.data)

    def save_dictionary(self, dictionary, input=True):
        """Pickle `dictionary` to saved_models/<input>2<output>/{input_dic,output_dic}.pkl.

        The directory is relative to the current working directory and is created if missing.
        """
        directory = f'saved_models/{self.input_lang_dic.name}2{self.output_lang_dic.name}/'
        os.makedirs(directory, exist_ok=True)
        file_path = directory + ('input_dic.pkl' if input else 'output_dic.pkl')
        with open(file_path, 'wb') as f:
            pickle.dump(dictionary, f, pickle.HIGHEST_PROTOCOL)

    def __init__(self, lang1, lang2, data_directory, reverse, MAX_LENGTH, MAX_FILE_SIZE, batch_size, lr=0.0005,
                 hidden_size=256, encoder_layers=3, decoder_layers=3, encoder_heads=8, decoder_heads=8,
                 encoder_ff_size=512, decoder_ff_size=512, encoder_dropout=0.1, decoder_dropout=0.1,
                 device='cpu', lr_scheduler_type='linear', warmup_steps=500, early_stopping_patience=2):
        """
        Extended constructor to accept the same hyperparams as in mbart_train.py, e.g.:
         - lr_scheduler_type
         - warmup_steps
         - early_stopping_patience

        Note: `lr_scheduler_type` and `warmup_steps` are only stored on the instance;
        no learning-rate scheduler is created, so they do not affect training.
        `early_stopping_patience` is the number of consecutive epochs without a
        validation-loss improvement after which `train` stops.
        """
        self.MAX_LENGTH = MAX_LENGTH
        self.MAX_FILE_SIZE = MAX_FILE_SIZE
        self.device = device
        self.lr_scheduler_type = lr_scheduler_type
        self.warmup_steps = warmup_steps
        self.early_stopping_patience = early_stopping_patience

        # NOTE(review): the languages are swapped here AND `reverse` is forwarded to
        # load_files below — confirm load_files does not swap them a second time.
        if reverse:
            lang1, lang2 = lang2, lang1

        # ====================
        # Load raw sentences
        # ====================
        self.input_lang_dic, self.output_lang_dic, self.input_lang_list, self.output_lang_list = load_files(
            lang1, lang2, data_directory, reverse, self.MAX_FILE_SIZE, self.MAX_LENGTH)

        # Add them to dictionary
        for sentence in self.input_lang_list:
            self.input_lang_dic.add_sentence(sentence)
        for sentence in self.output_lang_list:
            self.output_lang_dic.add_sentence(sentence)

        # Save dictionary
        self.save_dictionary(self.input_lang_dic, input=True)
        self.save_dictionary(self.output_lang_dic, input=False)

        # ====================
        # Tokenize entire data
        # ====================
        self.tokenized_input_lang = [
            tokenize(sentence, self.input_lang_dic, self.MAX_LENGTH)
            for sentence in self.input_lang_list
        ]
        self.tokenized_output_lang = [
            tokenize(sentence, self.output_lang_dic, self.MAX_LENGTH)
            for sentence in self.output_lang_list
        ]

        # This was your original single data loader, we keep it (do not remove):
        self.batch_size = batch_size
        self.data_loader = load_batches(
            self.tokenized_input_lang,
            self.tokenized_output_lang,
            self.batch_size,
            self.device
        )

        # ====================
        # NEW: 70/15/15 Split
        # ====================
        # Combine inputs/outputs into pairs so we can shuffle them together
        combined = list(zip(self.tokenized_input_lang, self.tokenized_output_lang))
        shuffle(combined)  # Shuffle pairs in-place (unseeded: the split differs between runs)

        total_count = len(combined)
        train_end = int(0.70 * total_count)  # 70%
        valid_end = int(0.85 * total_count)  # 15% after train
        # test_end = total_count (implicitly 15%)

        train_pairs = combined[:train_end]
        valid_pairs = combined[train_end:valid_end]
        test_pairs  = combined[valid_end:]

        # Separate them back into inputs / outputs
        train_input, train_output = zip(*train_pairs) if train_pairs else ([], [])
        valid_input, valid_output = zip(*valid_pairs) if valid_pairs else ([], [])
        test_input,  test_output  = zip(*test_pairs)  if test_pairs  else ([], [])

        # Now create separate data loaders
        self.train_loader = load_batches(train_input, train_output, self.batch_size, self.device)
        self.valid_loader = load_batches(valid_input, valid_output, self.batch_size, self.device)
        self.test_loader  = load_batches(test_input,  test_output,  self.batch_size, self.device)

        # ====================
        # Build the model
        # ====================
        input_size = self.input_lang_dic.n_count
        output_size = self.output_lang_dic.n_count

        encoder_part = Encoder(
            input_size, hidden_size, encoder_layers, encoder_heads,
            encoder_ff_size, encoder_dropout, self.device
        )
        decoder_part = Decoder(
            output_size, hidden_size, decoder_layers, decoder_heads,
            decoder_ff_size, decoder_dropout, self.device
        )

        self.transformer = Transformer(encoder_part, decoder_part, self.device, PAD_TOKEN).to(self.device)
        self.transformer.apply(self.initialize_weights)

        self.loss_func = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
        self.optimizer = optim.Adam(self.transformer.parameters(), lr=lr)
        # self.scheduler = ...  (if needed)

        # Hyperparameters actually used to build the model; written next to every checkpoint
        # so a saved model can be rebuilt with matching sizes. The first seven keys keep the
        # original config.json names (populated from the encoder settings); the rest are
        # additional keys describing the decoder and feed-forward sizes.
        self.model_config = {
            "model_type": "transformer",
            "hidden_size": hidden_size,
            "num_attention_heads": encoder_heads,
            "num_hidden_layers": encoder_layers,
            "vocab_size": input_size,
            "max_position_embeddings": self.MAX_LENGTH,
            "hidden_dropout_prob": encoder_dropout,
            "output_vocab_size": output_size,
            "decoder_attention_heads": decoder_heads,
            "decoder_hidden_layers": decoder_layers,
            "decoder_dropout_prob": decoder_dropout,
            "encoder_ff_size": encoder_ff_size,
            "decoder_ff_size": decoder_ff_size
        }

    def _evaluate(self, loader):
        """Return (mean batch loss, metrics dict) for `loader` using teacher-forced loss and greedy decoding."""
        preds_text, refs_text = [], []
        total_loss = 0.0
        self.transformer.eval()
        with torch.no_grad():
            for input_batch, target_batch in loader:
                # Teacher-forced forward pass: feed target[:-1], predict target[1:].
                output, _ = self.transformer(input_batch, target_batch[:, :-1])
                output = output.contiguous().view(-1, output.shape[-1])
                target_flat = target_batch[:, 1:].contiguous().view(-1)
                total_loss += self.loss_func(output, target_flat).item()

                # Free-running decode for the text metrics.
                input_mask = self.transformer.make_input_mask(input_batch)
                decoded_output = greedy_decode(
                    self.transformer, input_batch, input_mask, max_len=self.MAX_LENGTH
                )
                for pred_ids, ref_ids in zip(decoded_output.tolist(), target_batch.tolist()):
                    preds_text.append(ids_to_string(pred_ids, self.output_lang_dic))
                    refs_text.append(ids_to_string(ref_ids, self.output_lang_dic))

        mean_loss = total_loss / max(len(loader), 1)
        if preds_text:
            metrics = compute_metrics(preds_text, refs_text)
        else:
            # Empty split (tiny corpus): report zeros instead of scoring nothing.
            metrics = {"bleu": 0.0, "chrf": 0.0, "meteor": 0.0}
        return mean_loss, metrics

    def _save_checkpoint(self, epoch, saved_model_directory):
        """Write this epoch's state_dict and config.json under <saved_model_directory>/<input>2<output>/."""
        model_directory = f"{saved_model_directory}/{self.input_lang_dic.name}2{self.output_lang_dic.name}/"
        os.makedirs(model_directory, exist_ok=True)

        epoch_model_path = os.path.join(model_directory, f"transformer_model_epoch_{epoch}.pt")
        torch.save(self.transformer.state_dict(), epoch_model_path)

        with open(os.path.join(model_directory, 'config.json'), 'w') as f:
            json.dump(self.model_config, f)

    def train(self, epochs, saved_model_directory):
        """
        Run up to `epochs` epochs of training.

        Each epoch: train on the train split, evaluate on the validation split, save a
        checkpoint, print and log metrics (metrics_history.json is rewritten every epoch
        so an interrupted run keeps its history), then apply early stopping on validation
        loss. After the loop the test split, if non-empty, is evaluated once and printed.
        """
        start_time = time.time()
        best_loss = float('inf')
        epochs_no_improve = 0

        # Initialize the metrics logger
        logger = MetricsLogger(output_path="metrics_history.json")

        for epoch in range(epochs):
            # =====================================
            # 1) Training loop on TRAIN LOADER
            # =====================================
            shuffle(self.train_loader)  # shuffle the training batches each epoch (if you want)
            train_loss = 0.0
            self.transformer.train()

            for input_batch, target_batch in self.train_loader:
                self.optimizer.zero_grad()

                output, _ = self.transformer(input_batch, target_batch[:, :-1])
                output_dim = output.shape[-1]
                output = output.contiguous().view(-1, output_dim)
                target_flat = target_batch[:, 1:].contiguous().view(-1)

                loss = self.loss_func(output, target_flat)
                loss.backward()
                nn.utils.clip_grad_norm_(self.transformer.parameters(), 1)
                self.optimizer.step()

                train_loss += loss.item()

            train_loss /= len(self.train_loader) if len(self.train_loader) > 0 else 1

            # =====================================
            # 2) Evaluation on VALID LOADER
            # =====================================
            val_loss, metric_results = self._evaluate(self.valid_loader)

            # =====================================
            # 3) Save model checkpoint
            # =====================================
            self._save_checkpoint(epoch, saved_model_directory)

            # =====================================
            # 4) Logging
            # =====================================
            elapsed = time.time() - start_time
            epoch_time = int(elapsed)  # total seconds since training started
            # ETA = mean seconds per completed epoch * epochs still to run.
            remaining_time = int(elapsed / (epoch + 1) * (epochs - epoch - 1))
            print(f"Epoch: {epoch}, Time: {epoch_time}s, Estimated {remaining_time} sec remaining.")
            print(f"\tTraining Loss: {train_loss:.4f} | Validation Loss: {val_loss:.4f}")
            print(f"\tBLEU: {metric_results['bleu']:.2f}, CHRF: {metric_results['chrf']:.2f}, METEOR: {metric_results['meteor']:.2f}\n")

            # Log to JSON
            logger.log_epoch_metrics(epoch, {
                "train_loss": train_loss,
                "val_loss": val_loss,
                "bleu": metric_results["bleu"],
                "chrf": metric_results["chrf"],
                "meteor": metric_results["meteor"]
            })
            # Persist after every epoch so a crash or session timeout does not lose the history.
            logger.save()

            # =====================================
            # 5) Early stopping on val_loss
            # =====================================
            if val_loss < best_loss:
                best_loss = val_loss
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= self.early_stopping_patience:
                    print("Early stopping triggered.")
                    break

        logger.save()
        print('Training finished!')

        # ======================================
        # (Optional) Final Test Evaluation
        # ======================================
        if len(self.test_loader) > 0:
            print("Evaluating on TEST set...")
            test_loss, test_metrics = self._evaluate(self.test_loader)
            print(f"TEST Loss: {test_loss:.4f}")
            print(f"TEST BLEU: {test_metrics['bleu']:.2f}, TEST CHRF: {test_metrics['chrf']:.2f}, TEST METEOR: {test_metrics['meteor']:.2f}")


def main():
    """Command-line entry point: parse hyperparameters, construct a Trainer and run training."""
    parser = argparse.ArgumentParser(description='Hyperparameters for training Transformer')

    # (name, type, default, help). Registered in this order, so --help is laid out as before.
    cli_options = [
        ('lang1', str, 'french', 'first language in language text file'),
        ('lang2', str, 'english', 'second language in language text file'),
        ('data_directory', str, 'data', 'data directory'),
        ('reverse', int, 0, 'whether to switch roles of lang1 and lang2 as input/output'),
        ('MAX_LENGTH', int, 60, 'max number of tokens in input'),
        ('MAX_FILE_SIZE', int, 100000, 'max lines to read from files'),
        ('batch_size', int, 128, 'batch size'),
        ('lr', float, 0.0005, 'learning rate'),
        ('hidden_size', int, 256, 'transformer hidden size'),
        ('encoder_layers', int, 3, 'number of encoder layers'),
        ('decoder_layers', int, 3, 'number of decoder layers'),
        ('encoder_heads', int, 8, 'encoder attention heads'),
        ('decoder_heads', int, 8, 'decoder attention heads'),
        ('encoder_ff_size', int, 512, 'encoder FF size'),
        ('decoder_ff_size', int, 512, 'decoder FF size'),
        ('encoder_dropout', float, 0.1, 'encoder dropout'),
        ('decoder_dropout', float, 0.1, 'decoder dropout'),
        ('device', str, 'cpu', 'device: cpu or cuda'),
        ('epochs', int, 50, 'training epochs'),
        ('saved_model_directory', str, 'saved_models/', 'directory for saving models'),
        # Additional arguments for hyperparameter tuning, consistent with mbart_train.py
        ('lr_scheduler_type', str, 'linear', 'LR scheduler type'),
        ('warmup_steps', int, 500, 'Warmup steps for LR scheduler'),
        ('early_stopping_patience', int, 2, 'Epochs with no improvement for early stopping'),
    ]
    for option_name, option_type, option_default, option_help in cli_options:
        parser.add_argument(f'--{option_name}', type=option_type, default=option_default, help=option_help)

    args = parser.parse_args()

    # Every option except the two consumed by train() is a Trainer constructor keyword.
    trainer_kwargs = dict(vars(args))
    epochs = trainer_kwargs.pop('epochs')
    saved_model_directory = trainer_kwargs.pop('saved_model_directory')

    trainer = Trainer(**trainer_kwargs)
    trainer.train(epochs, saved_model_directory)

if __name__ == "__main__":
    main()


Writing train.py


In [None]:
# Train English -> Ju|'hoansi on the GPU; checkpoints go to Drive under saved_models/transformer.
# Notes on the flags as train.py currently handles them:
#  - early_stopping_patience 45 with 50 epochs means early stopping can only fire after 45
#    consecutive epochs without a validation-loss improvement.
#  - lr_scheduler_type / warmup_steps are parsed and stored, but train.py creates no scheduler.
!python train.py \
  --lang1 "english" \
  --lang2 "juhoansi" \
  --data_directory "/content/drive/MyDrive/Translation2/Data/english-juhoansi" \
  --reverse 0 \
  --MAX_LENGTH 60 \
  --MAX_FILE_SIZE 100000 \
  --batch_size 8 \
  --lr 0.0005 \
  --hidden_size 256 \
  --encoder_layers 3 \
  --decoder_layers 3 \
  --encoder_heads 8 \
  --decoder_heads 8 \
  --encoder_ff_size 512 \
  --decoder_ff_size 512 \
  --encoder_dropout 0.1 \
  --decoder_dropout 0.1 \
  --device "cuda" \
  --epochs 50 \
  --lr_scheduler_type "linear" \
  --warmup_steps 500 \
  --early_stopping_patience 45 \
  --saved_model_directory "/content/drive/MyDrive/Translation2/saved_models/transformer"



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Epoch: 0, Time: 8s, Estimated 392 sec remaining.
	Training Loss: 5.1024 | Validation Loss: 4.7867
	BLEU: 0.67, CHRF: 5.48, METEOR: 9.81

Epoch: 1, Time: 12s, Estimated 576 sec remaining.
	Training Loss: 4.4093 | Validation Loss: 4.4542
	BLEU: 1.06, CHRF: 5.19, METEOR: 10.91

Epoch: 2, Time: 16s, Estimated 752 sec remaining.
	Training Loss: 4.0317 | Validation Loss: 4.2944
	BLEU: 2.11, CHRF: 12.05, METEOR: 13.95

Epoch: 3, Time: 20s, Estimated 920 sec remaining.
	Training Loss: 3.6825 | Validation Loss: 4.2563
	BLEU: 0.82, CHRF: 15.39, METEOR: 14.80

Epoch: 4, Time: 23s, Estimated 1035 sec remaining.
	Training Loss: 3.3068 | Validation Loss: 4.2355
	BLEU: 1.25, CHRF: 16.94, METEOR: 15.32

Epoch: 5, Time: 25s, Estimated 1100 sec remaining.
	Training Loss: 2.9742 | Validation Loss: 4.1652
	BLEU: 1.77, CHRF: 19.03, METEOR: 17.51

Epoch: 6, Time: 28s, Estimated 1204 sec remaining.


In [None]:
%%writefile plot_metrics.py

import json
import matplotlib.pyplot as plt

def _plot_metric(records, key, label, color, ylabel, title, out_file):
    """Plot `key` against epoch for every record that has both, save to `out_file`, then show."""
    # x and y come from the same records, so they always have the same length even when
    # some epochs are missing this metric.
    points = [(item["epoch"], item[key]) for item in records if "epoch" in item and key in item]
    epochs = [epoch for epoch, _ in points]
    values = [value for _, value in points]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(epochs, values, marker="o", label=label, color=color)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)
    fig.savefig(out_file)
    plt.show()
    # Release the figure; otherwise every plot stays alive until the process exits.
    plt.close(fig)


def main(json_path):
    """
    Read the per-epoch metrics written by train.py and save one PNG per metric.

    Args:
        json_path: path to a JSON file containing a list of per-epoch dicts with an
            "epoch" key and any of "train_loss", "bleu", "chrf", "meteor".

    Writes train_loss_over_epochs.png, bleu_over_epochs.png and chrf_over_epochs.png to
    the current directory, plus meteor_over_epochs.png when any record has "meteor".
    """
    with open(json_path, "r") as f:
        data = json.load(f)

    # 1) Plot train_loss
    _plot_metric(data, "train_loss", "Train Loss", "red", "Loss",
                 "Training Loss Over Epochs", "train_loss_over_epochs.png")

    # 2) Plot BLEU
    _plot_metric(data, "bleu", "BLEU", "blue", "BLEU Score",
                 "BLEU Score Over Epochs", "bleu_over_epochs.png")

    # 3) Plot CHRF
    _plot_metric(data, "chrf", "CHRF", "green", "CHRF Score",
                 "CHRF Score Over Epochs", "chrf_over_epochs.png")

    # 4) Plot METEOR (optional)
    if any("meteor" in item for item in data):
        _plot_metric(data, "meteor", "METEOR", "purple", "METEOR (%)",
                     "METEOR Score Over Epochs", "meteor_over_epochs.png")

# Script entry point: expects the path of the metrics JSON file as the first argument.
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        # No path supplied: print the usage line and exit with status 1.
        print("Usage: python plot_metrics.py metrics_history.json")
        sys.exit(1)
    json_file = sys.argv[1]
    main(json_file)


Writing plot_metrics.py


In [None]:
!python plot_metrics.py metrics_history.json


Figure(800x500)
Figure(800x500)
Figure(800x500)
Figure(800x500)
