In [None]:
import os, json, zipfile
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
from transformers import pipeline
import random
import pandas as pd
!pip install rouge-score --quiet
from rouge_score import rouge_scorer
!pip install sacrebleu --quiet
import sacrebleu

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import (
    DataCollatorForSeq2Seq, TrainingArguments, Trainer,
    EarlyStoppingCallback
)

In [None]:
# Extract the training archive. Extraction runs on every execution and simply
# overwrites files that are already present in extract_dir.
zip_path    = "/content/furniture_dataset.zip"
extract_dir = "/content/furniture_full"
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(extract_dir)

# Locate the JSONL file at the top level of the extracted folder.
# The listing is sorted because os.listdir returns entries in arbitrary order,
# and a missing file is reported explicitly instead of as a bare StopIteration.
jsonl_candidates = sorted(f for f in os.listdir(extract_dir) if f.endswith(".jsonl"))
if not jsonl_candidates:
    raise FileNotFoundError(
        f"No .jsonl file found in {extract_dir!r} after extracting {zip_path!r}"
    )
jsonl = jsonl_candidates[0]
jsonl_path = os.path.join(extract_dir, jsonl)

# Read one JSON object per line. The encoding is pinned so the result does not
# depend on the platform default, and blank lines (e.g. a trailing newline at
# the end of the file) are skipped rather than passed to json.loads.
data = []
with open(jsonl_path, encoding="utf-8") as f:
    for line in f:
        if line.strip():
            data.append(json.loads(line))

print(f"Loaded {len(data)} samples.")


Loaded 100000 samples.


In [None]:


# Pull the model input and the reference text out of every loaded record.
# The instruction steps of a record are joined into one space-separated string.
inputs, targets = [], []
for d in data:
    inputs.append(d["linearized"])
    targets.append(" ".join(d["instructions"]))

# First carve off 20% as a hold-out pool, then halve that pool into
# validation and test, giving an 80/10/10 split overall.
train_in, temp_in, train_tg, temp_tg = train_test_split(
    inputs, targets, test_size=0.2, random_state=42
)
val_in, test_in, val_tg, test_tg = train_test_split(
    temp_in, temp_tg, test_size=0.5, random_state=42
)


def _as_dataset(sources, references):
    """Wrap parallel lists of inputs and references in a Hugging Face Dataset."""
    return Dataset.from_dict({"input_text": sources, "target_text": references})


# One Dataset per split, collected under the conventional split names.
train_ds = _as_dataset(train_in, train_tg)
val_ds   = _as_dataset(val_in,   val_tg)
test_ds  = _as_dataset(test_in,  test_tg)

dataset = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 80000
    })
    validation: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 10000
    })
})


In [None]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model     = T5ForConditionalGeneration.from_pretrained("t5-small").cuda()

# Maximum token lengths for the encoder input and the decoder target.
max_in, max_tg = 256, 256

# Task prefix prepended to every encoder input. The generation cells in this
# notebook build their prompts as "assemble: " + input, so training has to see
# the same prefix; otherwise the model is queried with a format it never saw.
task_prefix = "assemble: "

def preprocess(ex):
    """Tokenize inputs and targets for seq2seq training.

    Args:
        ex: A mapping with "input_text" and "target_text". Each value is either
            a single string or a list of strings (the batched form used by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the prefixed inputs, with the target token ids
        stored under "labels".

    No padding is applied here. Padding the labels to ``max_length`` would fill
    them with the tokenizer's pad id, which the loss does not ignore, so the
    model would be trained (and its loss averaged) on padding positions.
    Sequences are left unpadded so that DataCollatorForSeq2Seq can pad each
    batch dynamically and fill label padding with -100.
    """
    texts = ex["input_text"]
    if isinstance(texts, str):
        prompts = task_prefix + texts
    else:
        prompts = [task_prefix + text for text in texts]

    mi = tokenizer(prompts, max_length=max_in, truncation=True)
    la = tokenizer(text_target=ex["target_text"], max_length=max_tg, truncation=True)
    mi["labels"] = la["input_ids"]
    return mi

tokenized = dataset.map(preprocess, batched=True, remove_columns=["input_text","target_text"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Collator for seq2seq batches: pads inputs with the tokenizer and pads any
# labels that still need padding with -100 (its default label pad id).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="/content/furniture_full_t5",
    num_train_epochs=3,
    per_device_train_batch_size=3,
    gradient_accumulation_steps=2,     # effective batch size = 3 * 2 = 6 per device
    per_device_eval_batch_size=8,
    eval_strategy="steps",             # without this, eval_steps is ignored and validation never runs
    save_steps=1000,                   # checkpoint every 1 000 steps
    eval_steps=1000,                   # evaluate on the validation split every 1 000 steps
    save_total_limit=3,                # keep only the three most recent checkpoints
    logging_steps=200,
    weight_decay=0.01,
    learning_rate=3e-4,
    fp16=True,
    report_to="none"
)

# Plain Trainer: no early stopping and no best-model reloading, so the weights
# saved below are those from the final training step.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    processing_class=tokenizer         # replaces the deprecated `tokenizer=` argument
)

trainer.train()

# Save model and tokenizer side by side so the folder can be reloaded on its own.
trainer.save_model("/content/furniture_full_t5_final")
tokenizer.save_pretrained("/content/furniture_full_t5_final")


  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
200,0.2772
400,0.0072
600,0.002
800,0.0019
1000,0.0016
1200,0.0006
1400,0.001
1600,0.0005
1800,0.0008
2000,0.0005


('/content/furniture_full_t5_final/tokenizer_config.json',
 '/content/furniture_full_t5_final/special_tokens_map.json',
 '/content/furniture_full_t5_final/spiece.model',
 '/content/furniture_full_t5_final/added_tokens.json')

In [None]:

# Generation pipeline backed by the fine-tuned checkpoint saved after training.
generator = pipeline("text2text-generation",
                     model="/content/furniture_full_t5_final",
                     tokenizer=tokenizer,
                     device=0)

# Prepare test inputs & targets
test_inputs  = test_ds["input_text"]
test_targets = test_ds["target_text"]

# Build the prompts for the evaluated subset (first 1 000 test samples).
eval_prompts = ["assemble: " + inp for inp in test_inputs[:1000]]

# Generate in batches instead of one call per prompt, which keeps the GPU busy.
# The generation budget is max_tg (the target length limit set in the
# tokenization cell): a smaller cap cuts long outputs short and lowers
# BLEU/ROUGE for reasons unrelated to model quality.
eval_outputs = generator(eval_prompts, batch_size=32, max_new_tokens=max_tg, num_beams=4)
preds = [out["generated_text"] for out in eval_outputs]

# References aligned one-to-one with the predictions.
refs = list(test_targets[:len(preds)])

# Compute BLEU (corpus level, single reference per prediction)
bleu = sacrebleu.corpus_bleu(preds, [refs]).score
print("BLEU:", bleu)

# Compute ROUGE: score each (reference, prediction) pair, then average the F-measures
scorer = rouge_scorer.RougeScorer(["rouge1","rouge2","rougeL"], use_stemmer=True)
scores = [scorer.score(r, p) for r,p in zip(refs, preds)]
avg = {k: sum(d[k].fmeasure for d in scores)/len(scores) for k in scores[0]}
print("ROUGE:", avg)


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


BLEU: 80.38054119856658
ROUGE: {'rouge1': 0.927062151617313, 'rouge2': 0.925215166830592, 'rougeL': 0.927062151617313}


# **TESTING ON UNSEEN DATA**

In [None]:
import os, zipfile, json
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline

# Step 1a: Extract the unseen-data archive into its own folder
zip_path    = "/content/furniture_10000_enhanced.zip"
extract_dir = "/content/furniture_testset"
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_dir)

# Step 1b: Pick the first .jsonl entry found at the top level of that folder
jsonl_files = [name for name in os.listdir(extract_dir) if name.endswith(".jsonl")]
assert jsonl_files, "No .jsonl file found in the zip!"
test_jsonl = os.path.join(extract_dir, jsonl_files[0])

# Step 1c: Parse one JSON object per line, then split the records into
# model inputs and reference texts (instruction steps joined by spaces)
with open(test_jsonl) as handle:
    records = [json.loads(row) for row in handle]

inputs  = [rec["linearized"] for rec in records]
targets = [" ".join(rec["instructions"]) for rec in records]

print(f"Loaded {len(inputs)} unseen test samples.")


Loaded 10000 unseen test samples.


In [None]:
# Folder holding the fine-tuned model and tokenizer written by the training cell
model_dir = "/content/furniture_full_t5_final"

# Reload both artifacts from disk and move the model to the GPU
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
model = model.cuda()

# Generation settings applied to every call made through the pipeline
generation_options = dict(
    max_new_tokens=256,
    num_beams=4,
    early_stopping=True,
)

# Text-to-text pipeline bound to GPU 0
generator = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,
    **generation_options,
)


Device set to use cuda:0


In [None]:
# Number of prompts sent through the pipeline per forward pass
batch_size = 32

# Prefix every unseen input with the task tag
prompts = []
for tree in inputs:
    prompts.append("assemble: " + tree)

# Run generation over the whole prompt list in batches
all_outputs = generator(prompts, batch_size=batch_size)

# Keep only the generated string from each pipeline result
predictions = []
for result in all_outputs:
    predictions.append(result["generated_text"])



In [None]:
# Corpus-level BLEU with a single reference stream
bleu_result = sacrebleu.corpus_bleu(predictions, [targets])
bleu = bleu_result.score
print(f"Unseen Test BLEU: {bleu:.2f}")

# Per-sample ROUGE: each reference is scored against its prediction
scorer = rouge_scorer.RougeScorer(["rouge1","rouge2","rougeL"], use_stemmer=True)
scores = []
for tgt, pred in zip(targets, predictions):
    scores.append(scorer.score(tgt, pred))

# Mean F-measure for each ROUGE variant
avg_rouge = {
    metric: sum(s[metric].fmeasure for s in scores) / len(scores)
    for metric in ("rouge1", "rouge2", "rougeL")
}
print("Unseen Test ROUGE:", {k: f"{v:.3f}" for k,v in avg_rouge.items()})
