<a href="https://colab.research.google.com/github/Ayaan577/IITK-Internship/blob/main/Furniture_Assembly_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab Cell 1: Install / Upgrade Dependencies
# Colab Cell 1: Install HF libs without messing up core Colab packages
!pip install --upgrade --quiet --no-deps transformers datasets tqdm


# Colab Cell 2: Imports & GPU Check
import torch
if not torch.cuda.is_available():
    raise RuntimeError("No GPU found! Select T4 in Runtime > Change runtime type > GPU.")
print("Using GPU:", torch.cuda.get_device_name(0))

Using GPU: Tesla T4


In [9]:
import os, json, zipfile
from datasets import Dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Trainer, TrainingArguments,
    DataCollatorForSeq2Seq,  # Changed from DataCollatorWithPadding
    EarlyStoppingCallback
)

In [11]:
# Colab Cell 3: Unzip the dataset
zip_path = "/content/furniture_10000_enhanced.zip"
extract_dir = "/content/furniture_dataset"
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(extract_dir)

# Find the JSONL
files = [f for f in os.listdir(extract_dir) if f.endswith(".jsonl")]
if not files:
    raise FileNotFoundError("No .jsonl file found in the zip!")
jsonl_path = os.path.join(extract_dir, files[0])
print("Loading from:", jsonl_path)

# Colab Cell 4: Read first 10k samples from JSONL
max_samples = 10_000
data = []
with open(jsonl_path, 'r') as f:
    for i, line in enumerate(f):
        if i >= max_samples:
            break
        data.append(json.loads(line))
print(f"Loaded {len(data)} samples (out of 100k) for speed")

# Prepare inputs/targets
inputs  = [item['linearized'] for item in data]
targets = [" ".join(item['instructions']) for item in data]

# Build HF Dataset
raw_ds = Dataset.from_dict({"input_text": inputs, "target_text": targets})

# Colab Cell 5: Tokenizer & Model
model_name = "t5-small"
tokenizer  = T5Tokenizer.from_pretrained(model_name)
model      = T5ForConditionalGeneration.from_pretrained(model_name).cuda()

# Colab Cell 6: Preprocessing + Dataset Split
max_input_length  = 256
max_target_length = 256

def preprocess_fn(examples):
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=max_input_length,
        truncation=True
    )
    # Tokenize targets with the 'text_target' argument
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=max_target_length,
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = raw_ds.map(preprocess_fn, batched=True, remove_columns=["input_text","target_text"])

# Proper train-test split
split_dataset = tokenized.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Correct data collator for seq2seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Colab Cell 7: TrainingArguments (compatible with your transformers)
training_args = TrainingArguments(
    output_dir="/content/furniture_t5_model",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=8,
    learning_rate=3e-4,
    weight_decay=0.01,
    fp16=True,
    logging_steps=50,
    save_steps=500,           # checkpoint every 500 steps
    eval_steps=500,           # evaluate every 500 steps
    save_total_limit=2,
    logging_dir="/content/furniture_t5_model/logs",
    report_to="none"
)

# Colab Cell 8: Trainer without early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # no EarlyStoppingCallback
)


# Colab Cell 9: Train!
trainer.train()

# Colab Cell 10: Save & Zip the Final Model
trainer.save_model("/content/furniture_t5_model_final")
tokenizer.save_pretrained("/content/furniture_t5_model_final")

!zip -r furniture_t5_model_final.zip /content/furniture_t5_model_final

Loading from: /content/furniture_dataset/furniture_10000_enhanced.jsonl
Loaded 10000 samples (out of 100k) for speed


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,1.0014
100,0.2969
150,0.2053
200,0.1736
250,0.1574
300,0.1381
350,0.1277
400,0.1197
450,0.1088
500,0.1049


  adding: content/furniture_t5_model_final/ (stored 0%)
  adding: content/furniture_t5_model_final/model.safetensors (deflated 8%)
  adding: content/furniture_t5_model_final/spiece.model (deflated 48%)
  adding: content/furniture_t5_model_final/generation_config.json (deflated 29%)
  adding: content/furniture_t5_model_final/added_tokens.json (deflated 83%)
  adding: content/furniture_t5_model_final/tokenizer_config.json (deflated 94%)
  adding: content/furniture_t5_model_final/config.json (deflated 62%)
  adding: content/furniture_t5_model_final/special_tokens_map.json (deflated 85%)
  adding: content/furniture_t5_model_final/training_args.bin (deflated 52%)


In [13]:
from transformers import pipeline

# Load the trained model and tokenizer
model_path = "/content/furniture_t5_model_final"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).cuda()

# Create a pipeline for text generation
text_gen = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0)

# Use the hold-out evaluation dataset
unseen_samples = eval_dataset[:10]  # Adjust this as needed
# Decode inputs and targets from the tokenized dataset
inputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in unseen_samples["input_ids"]]
targets = [tokenizer.decode(ids, skip_special_tokens=True) for ids in unseen_samples["labels"]]

# Generate predictions
predictions = []
for inp in inputs:
    generated = text_gen(inp, max_length=256, num_beams=4, early_stopping=True)
    predictions.append(generated[0]["generated_text"])

# Display some examples
for i in range(3):  # Show first 3 examples
    print(f"Input: {inputs[i]}")
    print(f"Target: {targets[i]}")
    print(f"Prediction: {predictions[i]}")
    print("-" * 50)



Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_

Input: (Bunk_bed (Frame) (Slats) (Mattress) (GuardRails) (Fasteners))
Target: Insert GuardRails + Fasteners -> GuardRails_Fasteners_assembly Insert GuardRails_Fasteners_assembly + Mattress -> GuardRails_Fasteners_assembly_Mattress_assembly Combine GuardRails_Fasteners_assembly_Mattress_assembly + Slats -> GuardRails_Fasteners_assembly_Mattress_assembly_Slats_assembly Insert GuardRails_Fasteners_assembly_Mattress_assembly_Slats_assembly + Frame -> Finished_GuardRails_Fasteners_assembly_Mattress_assembly_Slats_assembly_Frame_assembly
Prediction: Attach GuardRails + Slats -> GuardRails_Slats_assembly Combine GuardRails_Slats_assembly + Mattress -> GuardRails_Slats_assembly_Mattress_assembly Combine GuardRails_Slats_assembly_Mattress_assembly + Frame -> GuardRails_Slats_assembly_Mattress_assembly_Frame_assembly Combine GuardRails_Slats_assembly_Mattress_assembly_Frame_assembly + Fasteners -> Finished_GuardRails_Slats_assembly_Mattress_assembly_Frame_assembly_Fasteners_assembly
------------

In [14]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


In [15]:
import sacrebleu

# Calculate BLEU
bleu_score = sacrebleu.corpus_bleu(predictions, [targets])
print("BLEU Score:", bleu_score.score)

BLEU Score: 56.36083868922011


In [16]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=10bbbc2043e290fd7620f895ae319b924ab682a3e50629e786fb3cc03aca2ad8
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [17]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# Calculate ROUGE for each prediction
rouge_scores = [scorer.score(target, pred) for target, pred in zip(targets, predictions)]

# Compute average scores
avg_rouge = {
    "rouge1": sum([score["rouge1"].fmeasure for score in rouge_scores]) / len(rouge_scores),
    "rouge2": sum([score["rouge2"].fmeasure for score in rouge_scores]) / len(rouge_scores),
    "rougeL": sum([score["rougeL"].fmeasure for score in rouge_scores]) / len(rouge_scores),
}

print("ROUGE Scores:", avg_rouge)

ROUGE Scores: {'rouge1': 0.7760967003965311, 'rouge2': 0.3206089743589744, 'rougeL': 0.5860468033835567}
