In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
dir = "/content/drive/MyDrive/Colab Notebooks/NLP"
%cd $dir

/content/drive/MyDrive/Colab Notebooks/NLP


In [3]:
!pip install -r requirements.txt

Collecting datasets (from -r requirements.txt (line 4))
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate (from -r requirements.txt (line 5))
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu (from -r requirements.txt (line 6))
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting rouge_score (from -r requirements.txt (line 8))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting meteor (from -r requirements.txt (line 9))
  Downloading meteor-2.0.17-py3-none-any.whl.metadata (8.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->-r requirements.txt (line 4))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->-r requirements.txt (line 4))
  Downloading xxhash-3.5.0-cp310-cp310-manyli

In [4]:
import os
import pandas as pd
import torch
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset, DatasetDict
from evaluate import load
from sacrebleu.metrics import TER
from nltk.translate.bleu_score import corpus_bleu
from torch.utils.data import DataLoader

In [5]:
# Report whether PyTorch can see a CUDA device.
print("GPU is available" if torch.cuda.is_available() else "GPU is not available")

GPU is available


In [None]:
# Run training.py in a shell subprocess; its console log is captured below.
# NOTE(review): presumably this produces ./fine_tuned_model loaded further down — confirm.
!python training.py

2025-01-02 10:07:21.048627: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-02 10:07:21.081476: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-02 10:07:21.091304: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
tokenizer_config.json: 100% 2.43k/2.43k [00:00<00:00, 12.2MB/s]
spiece.model: 100% 736k/736k [00:00<00:00, 14.9MB/s]
special_tokens_map.json: 100% 2.22k/2.22k [00:00<00:00, 11.7MB/s]
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavi

In [6]:
def predict_batch(batch, model, tokenizer, max_length=160):
    """Generate decoded model outputs for the ``"source"`` texts of one batch.

    Args:
        batch: Mapping whose ``"source"`` entry holds the input texts.
        model: Object exposing ``device`` and ``generate`` (the seq2seq model).
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        max_length: Limit applied both when truncating the tokenized input and
            when generating.

    Returns:
        The decoded strings produced by ``tokenizer.batch_decode`` with
        special tokens skipped, one per input text.
    """
    encoded = tokenizer(
        batch["source"], truncation=True, padding=True, max_length=max_length, return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        # The batch is padded, so hand the tokenizer's attention mask to
        # generate() explicitly instead of relying on it being inferred
        # from pad tokens.
        generated = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=max_length,
        )

    return tokenizer.batch_decode(generated, skip_special_tokens=True)


In [7]:
def generate_predictions(batch):
    """Attach a ``"predictions"`` entry to ``batch`` and return the same object.

    The predictions come from ``predict_batch`` using the module-level
    ``model`` and ``tokenizer``, which must therefore be defined before this
    function is called.
    """
    decoded = predict_batch(batch, model, tokenizer)
    batch["predictions"] = decoded
    return batch

In [9]:
def evaluation_with_sacrebleu(model, tokenizer, test_dataset):
    """Score the model on ``test_dataset`` with the ``sacrebleu`` metric.

    Args:
        model: Model forwarded to ``predict_batch`` for generation.
        tokenizer: Tokenizer forwarded to ``predict_batch``.
        test_dataset: Dataset supporting ``map`` and exposing a ``"source"``
            column (read by ``predict_batch``) and a ``"target"`` column.

    Returns:
        Whatever ``metric.compute`` returns for the ``sacrebleu`` metric.
    """
    metric = load("sacrebleu")

    def _add_predictions(batch):
        # Use the model/tokenizer given to this function, not notebook globals.
        batch["predictions"] = predict_batch(batch, model, tokenizer)
        return batch

    print("Generating predictions...")
    scored = test_dataset.map(_add_predictions, batched=True, batch_size=8)

    predictions = scored["predictions"]
    # One single-item reference list per prediction.
    references = [[ref] for ref in scored["target"]]

    print("Computing SacreBLEU...")
    result = metric.compute(predictions=predictions, references=references)
    print("Evaluation complete. Results:")
    print(result)

    return result


In [10]:
def evaluate_with_rouge(model, tokenizer, test_dataset):
    """Score the model on ``test_dataset`` with the ``rouge`` metric.

    Args:
        model: Model forwarded to ``predict_batch`` for generation.
        tokenizer: Tokenizer forwarded to ``predict_batch``.
        test_dataset: Dataset supporting ``map`` and exposing a ``"source"``
            column (read by ``predict_batch``) and a ``"target"`` column.

    Returns:
        Whatever ``rouge.compute`` returns.
    """
    rouge = load("rouge")

    def _add_predictions(batch):
        # Use the model/tokenizer given to this function, not notebook globals.
        batch["predictions"] = predict_batch(batch, model, tokenizer)
        return batch

    print("Generating predictions...")
    scored = test_dataset.map(_add_predictions, batched=True, batch_size=8)

    predictions = scored["predictions"]
    # One single-item reference list per prediction.
    references = [[ref] for ref in scored["target"]]

    print("Computing Rouge...")
    result = rouge.compute(predictions=predictions, references=references)
    print("Evaluation complete. Results:")
    print(result)

    return result

In [11]:
def evaluate_with_meteor(model, tokenizer, test_dataset):
    """Score the model on ``test_dataset`` with the ``meteor`` metric.

    Args:
        model: Model forwarded to ``predict_batch`` for generation.
        tokenizer: Tokenizer forwarded to ``predict_batch``.
        test_dataset: Dataset supporting ``map`` and exposing a ``"source"``
            column (read by ``predict_batch``) and a ``"target"`` column.

    Returns:
        Whatever ``meteor.compute`` returns.
    """
    meteor = load("meteor")

    def _add_predictions(batch):
        # Use the model/tokenizer given to this function, not notebook globals.
        batch["predictions"] = predict_batch(batch, model, tokenizer)
        return batch

    print("Generating predictions...")
    scored = test_dataset.map(_add_predictions, batched=True, batch_size=8)

    predictions = scored["predictions"]
    # One single-item reference list per prediction.
    references = [[ref] for ref in scored["target"]]

    print("Computing METEOR...")
    result = meteor.compute(predictions=predictions, references=references)
    print("Evaluation complete. Results:")
    print(result)

    return result

In [12]:
def evaluate_with_bleu(model, tokenizer, test_dataset, tokenize=str.split):
    """Score the model on ``test_dataset`` with NLTK's corpus-level BLEU.

    ``corpus_bleu`` works on token sequences. Passing raw strings makes it
    iterate over characters and report character n-gram BLEU, so every
    prediction and reference is tokenized first.

    Args:
        model: Model forwarded to ``predict_batch`` for generation.
        tokenizer: Tokenizer forwarded to ``predict_batch``.
        test_dataset: Dataset supporting ``map`` and exposing a ``"source"``
            column (read by ``predict_batch``) and a ``"target"`` column.
        tokenize: Callable turning one sentence string into a token sequence.
            Defaults to whitespace splitting; pass ``list`` to score on
            characters (e.g. for text without whitespace word boundaries).

    Returns:
        The value returned by ``corpus_bleu``.
    """

    def _add_predictions(batch):
        # Use the model/tokenizer given to this function, not notebook globals.
        batch["predictions"] = predict_batch(batch, model, tokenizer)
        return batch

    print("Generating predictions...")
    scored = test_dataset.map(_add_predictions, batched=True, batch_size=8)

    hypotheses = [tokenize(pred) for pred in scored["predictions"]]
    # corpus_bleu takes, per hypothesis, a list of tokenized references.
    references = [[tokenize(ref)] for ref in scored["target"]]

    print("Computing BLEU...")
    result = corpus_bleu(references, hypotheses)
    print("Evaluation complete. Results:")
    print(result)

    return result

In [13]:
def evaluate_with_ter(model, tokenizer, test_dataset):
    """Score the model on ``test_dataset`` with sacreBLEU's TER.

    Args:
        model: Model forwarded to ``predict_batch`` for generation.
        tokenizer: Tokenizer forwarded to ``predict_batch``.
        test_dataset: Dataset supporting ``map`` and exposing a ``"source"``
            column (read by ``predict_batch``) and a ``"target"`` column.

    Returns:
        The score object returned by ``TER.corpus_score``; its ``score``
        attribute is printed.
    """
    metric = TER()

    def _add_predictions(batch):
        # Use the model/tokenizer given to this function, not notebook globals.
        batch["predictions"] = predict_batch(batch, model, tokenizer)
        return batch

    print("Generating predictions...")
    scored = test_dataset.map(_add_predictions, batched=True, batch_size=8)

    predictions = list(scored["predictions"])
    # sacreBLEU's corpus_score takes a list of reference *streams*, each one
    # aligned sentence-by-sentence with the hypotheses. There is a single
    # reference per sentence here, hence one stream holding all targets
    # (not one single-item list per sentence).
    references = [list(scored["target"])]

    print("Computing TER...")
    result = metric.corpus_score(predictions, references)
    print("Evaluation complete. Results:")
    print(result.score)

    return result

In [14]:
def evaluate_with_chrf(model, tokenizer, test_dataset):
    """Score the model on ``test_dataset`` with the ``chrf`` metric.

    Args:
        model: Model forwarded to ``predict_batch`` for generation.
        tokenizer: Tokenizer forwarded to ``predict_batch``.
        test_dataset: Dataset supporting ``map`` and exposing a ``"source"``
            column (read by ``predict_batch``) and a ``"target"`` column.

    Returns:
        Whatever ``chrf.compute`` returns.
    """
    chrf = load("chrf")

    def _add_predictions(batch):
        # Use the model/tokenizer given to this function, not notebook globals.
        batch["predictions"] = predict_batch(batch, model, tokenizer)
        return batch

    print("Generating predictions...")
    scored = test_dataset.map(_add_predictions, batched=True, batch_size=8)

    predictions = scored["predictions"]
    # One single-item reference list per prediction.
    references = [[ref] for ref in scored["target"]]

    print("Computing Chrf...")
    result = chrf.compute(predictions=predictions, references=references)
    print("Evaluation complete. Results:")
    print(result)

    return result

In [15]:
# Locations (relative to the current working directory) of the saved model
# and of the data files; both are read as globals by later cells.
model_path = "./fine_tuned_model"
data_path = "./data/"
# Sanity check: list both directories so missing artifacts show up early.
print(os.listdir(model_path))
print(os.listdir(data_path))

['generation_config.json', 'config.json', 'model.safetensors', 'added_tokens.json', 'tokenizer_config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer.json']
['tqdn1_ch_vn.xlsx', 'tqdn3_ch_vn.xlsx', 'corpus.zh', 'corpus.vi', 'tqdn2_ch_vn.xlsx', 'tqdn1_ch_vn.csv', 'tqdn3_ch_vn.csv', 'tqdn2_ch_vn.csv', 'dataset.csv', 'train_data.csv', 'val_data.csv', 'test_data.csv']


In [16]:
# Load the fine-tuned model and move it to the GPU when one is available.
# predict_batch sends its inputs to model.device, so they follow automatically.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
# Tokenizer saved alongside the model in the same directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [17]:
def evaluation_metric(mode=1):
    """Load ``test_data.csv`` and evaluate the global model with one metric.

    Args:
        mode: Metric selector — 1 SacreBLEU, 2 ROUGE, 3 METEOR, 4 BLEU,
            5 TER, 6 chrF. Any other value falls back to SacreBLEU.

    Returns:
        The result returned by the selected evaluation function.
    """
    frame = pd.read_csv(data_path + "test_data.csv")
    test_dataset = Dataset.from_pandas(frame)

    # Equality checks mirror literal pattern matching; SacreBLEU is the fallback.
    if mode == 2:
        return evaluate_with_rouge(model, tokenizer, test_dataset)
    if mode == 3:
        return evaluate_with_meteor(model, tokenizer, test_dataset)
    if mode == 4:
        return evaluate_with_bleu(model, tokenizer, test_dataset)
    if mode == 5:
        return evaluate_with_ter(model, tokenizer, test_dataset)
    if mode == 6:
        return evaluate_with_chrf(model, tokenizer, test_dataset)
    return evaluation_with_sacrebleu(model, tokenizer, test_dataset)

In [None]:
# SacreBLEU (mode 1 dispatches to evaluation_with_sacrebleu)
mode = 1
result = evaluation_metric(mode)

Map:   0%|          | 0/22637 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Generating predictions...


Map:   0%|          | 0/22637 [00:00<?, ? examples/s]

In [None]:
# ROUGE (mode 2 dispatches to evaluate_with_rouge)
mode = 2
result = evaluation_metric(mode)

In [None]:
# METEOR (mode 3 dispatches to evaluate_with_meteor)
mode = 3
result = evaluation_metric(mode)

In [None]:
# BLEU via NLTK corpus_bleu (mode 4 dispatches to evaluate_with_bleu)
mode = 4
result = evaluation_metric(mode)

In [None]:
# TER via sacreBLEU (mode 5 dispatches to evaluate_with_ter)
mode = 5
result = evaluation_metric(mode)

In [None]:
# chrF (mode 6 dispatches to evaluate_with_chrf)
mode = 6
result = evaluation_metric(mode)