In [None]:
from datasets import DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import get_peft_model, LoraConfig
from huggingface_hub import login
import torch
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv

In [None]:
# Log in to the Hugging Face Hub automatically: pull variables from the local
# .env file into the process environment, then hand HF_TOKEN to `login`.
load_dotenv()
token = os.environ.get("HF_TOKEN")
login(token=token)

In [None]:
from datasets import load_dataset

data_files = {'train': 'trn.json', 'test': 'tst.json'}
# Load both JSON files as the "train" and "test" splits of one dataset.
dataset = load_dataset('json', data_files=data_files)

# Draw a reproducible (seed=42) random subset of each split: up to 10,000
# training rows and up to 1,000 test rows. The size is capped at the split
# length so a smaller file does not make `select` fail on an out-of-range index.
train_sample = dataset["train"].shuffle(seed=42).select(
    range(min(10000, len(dataset["train"])))
)
test_sample = dataset["test"].shuffle(seed=42).select(
    range(min(1000, len(dataset["test"])))
)

train_sample, test_sample

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "google/t5-efficient-tiny"
# Both the tokenizer and the seq2seq model are loaded from the same checkpoint
# so that their vocabularies match.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Tokenization callback intended for `Dataset.map(..., batched=True)`.
def tokenize_function(examples):
    """Tokenize a batch of titles (inputs) and contents (targets).

    Args:
        examples: Mapping with a "title" list and a "content" list of strings.

    Returns:
        The tokenizer output for the titles, with an extra "labels" entry that
        holds the token ids of the contents. Every label row is padded to
        exactly 512 positions and each padding position is set to -100.

    Relies on the module-level ``tokenizer``.
    """
    model_inputs = tokenizer(examples["title"],
                             max_length=512,
                             truncation=True,
                             padding=True)

    # Pad targets to a fixed width rather than "longest in this call":
    # rows tokenized in different `map` batches then share one label length
    # and can be stacked into a single tensor at training time.
    labels = tokenizer(examples["content"],
                       max_length=512,
                       truncation=True,
                       padding="max_length")

    # -100 is the index the loss ignores; without this replacement the model
    # would be trained to predict padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in row]
        for row in labels["input_ids"]
    ]

    return model_inputs

In [None]:
# Run the batched tokenizer over the training sample first, then the test
# sample, and bind the two results in that order.
tokenized_dataset_train_sample, tokenized_dataset_test_sample = (
    split.map(tokenize_function, batched=True)
    for split in (train_sample, test_sample)
)
tokenized_dataset_train_sample

In [None]:
# NOTE: despite their names, these two aliases point at the *raw* samples
# (no tokenizer output). They are kept as-is so existing references keep working.
tokenized_dataset_train = train_sample
tokenized_dataset_test = test_sample

# Drop the original text/id columns from the tokenized datasets. Only columns
# that are still present are removed, so re-running this cell does not fail
# after the columns are already gone.
_raw_columns = ['uid', 'title', 'content']
tokenized_dataset_train_sample = tokenized_dataset_train_sample.remove_columns(
    [col for col in _raw_columns if col in tokenized_dataset_train_sample.column_names]
)
tokenized_dataset_test_sample = tokenized_dataset_test_sample.remove_columns(
    [col for col in _raw_columns if col in tokenized_dataset_test_sample.column_names]
)

# Show the dataset that was actually processed above (the raw alias would
# still list the removed columns and no token ids).
tokenized_dataset_train_sample

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings. The base model is loaded with AutoModelForSeq2SeqLM,
# so the task type must be the sequence-to-sequence LM one; "SEQ_CLS" would
# wrap it as a sequence classifier instead.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,             # rank of the low-rank update matrices
    lora_alpha=16,   # scaling factor applied to the LoRA update
    lora_dropout=0.1,
    bias="none"
)

# Wrap the base model with the adapters and report how many parameters
# remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments

# Training hyperparameters. `use_mps_device` is intentionally not passed: it is
# deprecated in the transformers releases that accept `eval_strategy` /
# `processing_class`, and device selection happens without it.
training_args = TrainingArguments(
    output_dir="./t5-small-finetuned",
    per_device_train_batch_size=4,  # Adjust based on RAM
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step per device
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=500,
    logging_steps=400,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=False,  # mixed precision disabled (original note: Apple MPS does not support FP16)
    push_to_hub=False
)

# Seq2seq-aware collator: unlike the collator the Trainer would otherwise pick
# for a tokenizer, it also pads the `labels` field (with -100, which the loss
# ignores), so label rows of different lengths can be batched.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train_sample,
    eval_dataset=tokenized_dataset_test_sample,
    data_collator=data_collator,
    processing_class=tokenizer
)

trainer.train()


In [None]:
# Write the model first and then the tokenizer into the same directory so the
# two can be reloaded together from one path.
model.save_pretrained(save_directory="./t5-finetuned_v2")
tokenizer.save_pretrained(save_directory="./t5-finetuned_v2")

In [None]:
def predict_description(model, tokenizer, title, max_length=512):
    """Generate a description for a product title.

    Args:
        model: Model exposing ``to``, ``eval``, ``train``, ``training`` and
            ``generate``.
        tokenizer: Callable tokenizer that returns PyTorch tensors and
            provides ``decode``.
        title: Text used as the model input.
        max_length: Truncation limit for the tokenized title; the same value
            is passed to ``generate`` as ``max_length``.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.

    Side effects:
        Moves ``model`` to the MPS device when available, otherwise to CPU.
        The model is put in eval mode for the duration of the call and its
        previous train/eval mode is restored afterwards.
    """
    # Tokenize the title.
    inputs = tokenizer(title, return_tensors="pt", truncation=True, max_length=max_length)

    # Put the model and the inputs on the same device.
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generation must run in eval mode: a model that has just been trained is
    # still in train mode, where dropout stays active and perturbs the output.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output_tokens = model.generate(**inputs, max_length=max_length)
    finally:
        # Leave the caller's model in the mode it arrived in.
        model.train(was_training)

    # Decode the first generated sequence into text.
    description = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    return description

# Usage example: generate a description for one sample title and print both.
title = "Girls Ballet Tutu Neon Pink"
description = predict_description(model=model, tokenizer=tokenizer, title=title)

print("Título:", title)
print("Descrição prevista:", description)
