In [None]:
from datasets import load_dataset

# Local JSON files backing each split.
data_files = {'train': 'trn.json', 'test': 'tst.json'}
dataset = load_dataset('json', data_files=data_files)

# Upper bounds on the number of rows drawn from each split, and the shuffle seed.
TRAIN_SAMPLE_SIZE = 240_000
TEST_SAMPLE_SIZE = 24_000
SAMPLE_SEED = 42

# Shuffle each split deterministically and keep at most the requested number
# of rows. Clamping to the split length avoids the out-of-range index error
# that `select` raises when a split holds fewer rows than requested.
train_split = dataset["train"]
test_split = dataset["test"]
train_sample = train_split.shuffle(seed=SAMPLE_SEED).select(
    range(min(TRAIN_SAMPLE_SIZE, len(train_split)))
)
test_sample = test_split.shuffle(seed=SAMPLE_SEED).select(
    range(min(TEST_SAMPLE_SIZE, len(test_split)))
)

train_sample, test_sample

In [None]:
# Convert JSON into T5-friendly format
def format_prompt(example):
    """Build the text-to-text pair for a single record.

    Args:
        example: Mapping that provides the 'title' and 'content' fields.

    Returns:
        dict with two string entries:
          * "input_text": "Product: <title> Description: <content>"
          * "target_text": the record's title, unchanged.

    NOTE(review): the target is the title, which also appears verbatim in
    the input text — confirm this is the intended training objective.
    """
    title = example['title']
    content = example['content']
    input_text = f"Product: {title} Description: {content}"
    return {"input_text": input_text, "target_text": title}

# Add the `input_text` / `target_text` columns to both sampled splits
# (train first, then test).
train_sample, test_sample = (
    split.map(format_prompt) for split in (train_sample, test_sample)
)

In [None]:
# Drop the raw source columns so only the formatted text columns remain.
# Despite the `tokenized_` prefix, nothing has been tokenized at this point;
# a later cell maps the tokenizer over these datasets.
raw_columns = ['uid', 'title', 'content']
tokenized_dataset_train = train_sample.remove_columns(raw_columns)
tokenized_dataset_test = test_sample.remove_columns(raw_columns)
tokenized_dataset_train

In [None]:
import torch

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint shared by the tokenizer and the seq2seq model.
model_name = "google/t5-efficient-tiny"

# The tokenizer and the model are loaded independently from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter configuration.
# The base model is loaded with AutoModelForSeq2SeqLM (T5, encoder-decoder),
# so the PEFT task type must be "SEQ_2_SEQ_LM". "SEQ_CLS" selects the
# sequence-classification wrapper and records the wrong task in the saved
# adapter config.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,               # rank of the low-rank update matrices
    lora_alpha=16,     # LoRA scaling factor
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
    bias="none"        # bias terms are not trained
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



In [None]:
def tokenize_function(example):
    """Tokenize the input/target text and build loss-ready labels.

    Works on a single example (string fields) or on a batch (list-of-string
    fields), as produced by `Dataset.map(..., batched=True)`.

    Args:
        example: Mapping with "input_text" and "target_text".

    Returns:
        dict with "input_ids" and "attention_mask" for the encoder input
        (padded/truncated to 128 tokens) and "labels" for the target
        (padded/truncated to 16 tokens). Padding positions in "labels" are
        set to -100 so the loss ignores them.
    """
    encoding = tokenizer(
        example["input_text"], padding="max_length", truncation=True, max_length=128
    )
    target_encoding = tokenizer(
        example["target_text"], padding="max_length", truncation=True, max_length=16
    )

    pad_id = tokenizer.pad_token_id
    label_ids = target_encoding["input_ids"]

    def _mask_padding(sequence):
        # -100 is the index the cross-entropy loss skips; without this the
        # model would be trained to predict padding tokens.
        return [-100 if token == pad_id else token for token in sequence]

    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one label sequence per example.
        labels = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # Single-example call: a flat list of token ids.
        labels = _mask_padding(label_ids)

    return {
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"],
        "labels": labels,
    }

# Tokenize both splits in batched mode (train first, then test).
tokenized_dataset_train, tokenized_dataset_test = (
    split.map(tokenize_function, batched=True)
    for split in (tokenized_dataset_train, tokenized_dataset_test)
)

# Switch both datasets to the "torch" output format in place.
for split in (tokenized_dataset_train, tokenized_dataset_test):
    split.set_format("torch")

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hub.
    output_dir="./t5-small-finetuned",
    push_to_hub=False,
    # Batching (original note: adjust the batch size based on available RAM).
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Mixed precision stays off (original note: Apple MPS does not support FP16).
    fp16=False,
    # Step-based evaluation, checkpointing and logging cadence.
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=500,
    logging_steps=400,
)

# Wire the model, the training arguments, both tokenized splits and the
# tokenizer into a Trainer, then launch fine-tuning.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    eval_dataset=tokenized_dataset_test,
    train_dataset=tokenized_dataset_train,
)

# Last expression of the cell, so the training result is displayed.
trainer.train()


In [None]:
# Persist the model first, then the tokenizer, into the same directory.
# NOTE(review): `model` was wrapped by get_peft_model earlier in the notebook,
# so this presumably writes only the adapter weights — confirm.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./t5-finetuned")

In [None]:
from transformers import pipeline

# T5 is an encoder-decoder (seq2seq) model, so it has to be served through the
# "text2text-generation" pipeline; "text-generation" is the causal-LM pipeline
# and does not support seq2seq architectures.
# NOTE(review): "./t5-finetuned" is saved from the PEFT-wrapped model, so it
# presumably holds only adapter weights — confirm that the installed
# transformers/peft versions resolve the base model from the adapter config.
text_generator = pipeline("text2text-generation", model="./t5-finetuned", tokenizer=tokenizer)

# Prompt to generate from; `max_length` caps the generated sequence length.
prompt = "Product: Headphones\nDescription:"
output = text_generator(prompt, max_length=50)
print(output)