In [22]:
! pip install transformers[torch] datasets==3.6.0 evaluate ctranslate2 --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.8/38.8 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [23]:
import datasets
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Trainer, TrainingArguments
import torch
import evaluate
import ctranslate2
from tqdm.notebook import tqdm
import pandas as pd
import copy


In [3]:
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

In [4]:
ds = datasets.load_dataset("GEM/e2e_nlg")

In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 33525
    })
    validation: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 1484
    })
    test: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 1847
    })
    challenge_train_sample: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 500
    })
    challenge_validation_sample: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 500
    })
    challenge_test_scramble: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 500
    })
})

In [6]:
ds['train'][0]

{'gem_id': 'e2e_nlg-train-0',
 'gem_parent_id': 'e2e_nlg-train-0',
 'meaning_representation': 'name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]',
 'target': 'The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food.',
 'references': []}

In [7]:
ds['test'][0]

{'gem_id': 'e2e_nlg-test-0',
 'gem_parent_id': 'e2e_nlg-test-0',
 'meaning_representation': 'eatType[pub], food[Fast food], customer rating[high], area[riverside], familyFriendly[no], near[Café Rouge]',
 'target': 'The Mills is not kid friendly as it is a riverside pub near Café Rouge.  Its mid priced fast food is highly rated.',
 'references': ['The Mills is not kid friendly as it is a riverside pub near Café Rouge.  Its mid priced fast food is highly rated.']}

In [8]:
# Keep the columns need fro fine-tuning
columns_to_keep_train_val = ['meaning_representation', 'target']

# Prepare train set
train_dataset = ds['train'].remove_columns(
    [col for col in ds['train'].column_names if col not in columns_to_keep_train_val]
)

# Prepare validation set
validation_dataset = ds['validation'].remove_columns(
    [col for col in ds['validation'].column_names if col not in columns_to_keep_train_val]
)

# Keep the columns need for Evaluation
columns_to_keep_test = ['meaning_representation', 'target', 'references']
# Prepare test set
test_dataset = ds['test'].remove_columns(
    [col for col in ds['test'].column_names if col not in columns_to_keep_test]
)


In [9]:
processed_dataset = datasets.DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

In [10]:
# Rename columns
processed_dataset = processed_dataset.rename_column("meaning_representation", "input_text")
processed_dataset = processed_dataset.rename_column("target", "labels")

In [11]:
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['input_text', 'labels'],
        num_rows: 33525
    })
    validation: Dataset({
        features: ['input_text', 'labels'],
        num_rows: 1484
    })
    test: Dataset({
        features: ['input_text', 'labels', 'references'],
        num_rows: 1847
    })
})

In [12]:
# Tokenize the dataset
def tokenize_function(batch):
  model_inputs = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=512)
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(batch["labels"], padding="max_length", truncation=True, max_length=512)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

tokenized_dataset = processed_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/1484 [00:00<?, ? examples/s]



In [14]:
# Define the training arguments
training_arguments = transformers.Seq2SeqTrainingArguments(
    output_dir = 't5-small-e2e_nlg',
    num_train_epochs = 3,
    eval_strategy = 'epoch',
    save_strategy='epoch',
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    save_total_limit = 3,
    load_best_model_at_end = True,
    fp16=True,
    optim='adafactor',
    report_to = []
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.074,0.07837
2,0.0684,0.074207
3,0.0661,0.073335


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=6288, training_loss=0.10142879844015185, metrics={'train_runtime': 4377.3727, 'train_samples_per_second': 22.976, 'train_steps_per_second': 1.436, 'total_flos': 1.36120016830464e+16, 'train_loss': 0.10142879844015185, 'epoch': 3.0})

In [16]:
# Save the fine-tuned model
model.save_pretrained('t5-small-e2e_nlg')

# Save the tokenizer
tokenizer.save_pretrained('t5-small-e2e_nlg')

('t5-small-e2e_nlg/tokenizer_config.json',
 't5-small-e2e_nlg/special_tokens_map.json',
 't5-small-e2e_nlg/spiece.model',
 't5-small-e2e_nlg/added_tokens.json',
 't5-small-e2e_nlg/tokenizer.json')

In [17]:
!zip -r t5-small-e2e_nlg.zip t5-small-e2e_nlg


  adding: t5-small-e2e_nlg/ (stored 0%)
  adding: t5-small-e2e_nlg/tokenizer.json (deflated 74%)
  adding: t5-small-e2e_nlg/tokenizer_config.json (deflated 95%)
  adding: t5-small-e2e_nlg/model.safetensors (deflated 7%)
  adding: t5-small-e2e_nlg/checkpoint-4192/ (stored 0%)
  adding: t5-small-e2e_nlg/checkpoint-4192/scaler.pt (deflated 64%)
  adding: t5-small-e2e_nlg/checkpoint-4192/rng_state.pth (deflated 26%)
  adding: t5-small-e2e_nlg/checkpoint-4192/tokenizer.json (deflated 74%)
  adding: t5-small-e2e_nlg/checkpoint-4192/tokenizer_config.json (deflated 95%)
  adding: t5-small-e2e_nlg/checkpoint-4192/training_args.bin (deflated 53%)
  adding: t5-small-e2e_nlg/checkpoint-4192/scheduler.pt (deflated 61%)
  adding: t5-small-e2e_nlg/checkpoint-4192/model.safetensors (deflated 7%)
  adding: t5-small-e2e_nlg/checkpoint-4192/special_tokens_map.json (deflated 85%)
  adding: t5-small-e2e_nlg/checkpoint-4192/generation_config.json (deflated 27%)
  adding: t5-small-e2e_nlg/checkpoint-4192/spi

In [24]:
# Convert model to ctranslate format
! ct2-transformers-converter --model ./t5-small-e2e_nlg --output_dir t5-small-e2e_nlg-ct2

2025-09-23 16:53:29.209775: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758646409.230150   74079 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758646409.236469   74079 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1758646409.253999   74079 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1758646409.254024   74079 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1758646409.254030   74079 computation_placer.cc:177] computation placer alr

## Generation (Need Correction)

In [25]:
# Instantiate model as ctranslate Translator and instantiate Tokenizer
translator = ctranslate2.Translator(
    't5-small-e2e_nlg-ct2',
    device='cuda',
)
tokenizer = transformers.AutoTokenizer.from_pretrained('./t5-small-e2e_nlg')

In [26]:
test_dataset = processed_dataset["test"]
inputs = test_dataset["input_text"]
labels = test_dataset["labels"]


In [28]:
inputs_tokenized = [text.split() for text in inputs]

In [29]:
batch_size = 8
all_outputs = []

for i in tqdm(range(0, len(inputs_tokenized), batch_size)):
    batch = inputs_tokenized[i:i+batch_size]
    results = translator.translate_batch(
        batch,
        max_batch_size=batch_size,
        max_input_length=512,
        max_decoding_length=512,
        beam_size=4
    )

    # Each result is a list of tokens, convert to string
    batch_outputs = [" ".join(r[0].tokens) for r in results]
    all_outputs.extend(batch_outputs)


  0%|          | 0/231 [00:00<?, ?it/s]

  batch_outputs = [" ".join(r[0].tokens) for r in results]


AttributeError: 'dict' object has no attribute 'tokens'

In [None]:
# Post-process CTranslate2 outputs
clean_outputs = [output.replace("▁", " ").strip() for output in all_outputs]


In [None]:
df = pd.DataFrame({
    "input_text": inputs,
    "target": labels,
    "generated_text": clean_outputs
})

In [None]:
df.head()

In [None]:
df.to_csv("t5-small-e2e_nlg_test_results_3epoch.csv", index=False)
print("Saved test predictions to 't5-small-e2e_nlg_test_results_3epoch.csv' ")