In [None]:
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import DatasetDict, Dataset
from dotenv import load_dotenv
from huggingface_hub import login
from peft import get_peft_model, LoraConfig
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

In [None]:
# Log in to Hugging Face automatically with the token read from the environment
# (load_dotenv() first pulls variables from a local .env file, if any).
load_dotenv()
token = os.getenv("HF_TOKEN")
if token:
    login(token=token)
else:
    # login(token=None) falls back to an interactive prompt, which would stall
    # an unattended "Run All"; skip the login and report why instead.
    print("HF_TOKEN não definido; login no Hugging Face ignorado.")

In [None]:
# dataset
def load_data(file):
    """Read a JSON Lines file and keep only its ``title`` and ``content`` columns.

    Args:
        file: Path (or file-like object) of a JSON Lines document, one record
            per line.

    Returns:
        A DataFrame with the ``title`` and ``content`` columns, where missing
        values have been replaced by empty strings. The first rows are printed
        as a quick preview before returning.
    """
    frame = pd.read_json(file, lines=True)
    selected = frame.loc[:, ["title", "content"]].fillna(value="")
    print(selected.head())
    return selected
    
# Load the train and test splits from their JSON Lines files
dataset_train = load_data(file="trn.json")
dataset_test = load_data(file="tst.json")


In [None]:
# Pull the title (model input) and content (generation target) columns of each split
dataset_train_title, dataset_train_content = (
    dataset_train["title"],
    dataset_train["content"],
)

dataset_test_title, dataset_test_content = (
    dataset_test["title"],
    dataset_test["content"],
)

In [None]:
# Read the space-separated, header-less label files for each split.
# NOTE(review): these frames are not referenced by any later cell visible here
# — confirm whether they are still needed.
labels_train = pd.read_csv(
    "filter_labels_train.txt",
    header=None,
    sep=" ",
)
labels_test = pd.read_csv(
    "filter_labels_test.txt",
    header=None,
    sep=" ",
)

In [None]:
# Build a DatasetDict with a train and a test split, each holding the
# title and content columns
final_dataset = DatasetDict(
    {
        "train": Dataset.from_dict(
            {"title": dataset_train_title, "content": dataset_train_content}
        ),
        "test": Dataset.from_dict(
            {"title": dataset_test_title, "content": dataset_test_content}
        ),
    }
)

In [None]:
# Display the column names of the train split
final_dataset["train"].column_names

In [None]:
# Name of the pretrained checkpoint, and the seq2seq model loaded from it
model_checkpoint = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Load the tokenizer that belongs to the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# If the tokenizer has no padding token, register one and resize the model's
# token embeddings to the new tokenizer length
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token="[PAD]"))
    model.resize_token_embeddings(len(tokenizer))

In [None]:
# create tokenize function
def tokenize_function(examples):
    """Tokenize a batch of titles (inputs) and contents (targets).

    Args:
        examples: Batch mapping with a ``"title"`` and a ``"content"`` entry,
            each a list of strings (the function is applied with
            ``batched=True``).

    Returns:
        The tokenized titles (as returned by the tokenizer) with an extra
        ``"labels"`` entry holding the tokenized contents. Padded label
        positions are set to -100 so that the loss ignores them instead of
        training the model to emit padding tokens.
    """
    model_inputs = tokenizer(examples["title"],
                             max_length=512,
                             truncation=True,
                             padding=True)

    labels = tokenizer(examples["content"], max_length=512, truncation=True, padding=True)

    # The attention mask is 0 exactly on the padded positions; replace those
    # label ids by -100, the index ignored by the cross-entropy loss.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(labels["input_ids"], labels["attention_mask"])
    ]

    return model_inputs

In [None]:
# tokenize training and validation datasets
tokenized_dataset = final_dataset.map(tokenize_function, batched=True)
tokenized_dataset

In [None]:
# Drop the raw text columns so only the tokenized features remain.
# Only columns that are still present are removed, so re-running this cell
# after the columns are gone does not raise.
columns_to_drop = [
    name
    for name in final_dataset["train"].column_names
    if name in tokenized_dataset["train"].column_names
]
if columns_to_drop:
    tokenized_dataset = tokenized_dataset.remove_columns(columns_to_drop)
tokenized_dataset

In [None]:
# create data collator
# A seq2seq collator is needed here: besides the inputs it also pads `labels`
# to a common length per batch, using -100 so the padded positions are ignored
# by the loss. DataCollatorWithPadding leaves `labels` untouched, which fails
# as soon as a batch contains label sequences of different lengths.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100,
)

In [None]:
# load the accuracy evaluation metric
# NOTE(review): `accuracy` is not referenced by any later cell visible here
# (the Trainer is built without compute_metrics) — confirm whether it is needed.
accuracy = evaluate.load("accuracy")

In [None]:
# LoRA configuration
# The base model is a sequence-to-sequence LM trained to generate text, so the
# adapter must be created with the seq2seq task type; "SEQ_CLS" would wrap the
# model for sequence classification instead.
peft_config = LoraConfig(task_type="SEQ_2_SEQ_LM", r=4, lora_alpha=32, lora_dropout=0.01)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# Hyperparameters
training_args = TrainingArguments(
    # Directory where checkpoints are written
    output_dir=f"{model_checkpoint}-lora-text-classification",
    # Optimisation settings
    num_train_epochs=10,
    learning_rate=1e-3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and save once per epoch; reload the best checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Build the Trainer from the model, its tokenizer/collator and both splits
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Train the model
trainer.train()

In [None]:
def predict_description(model, tokenizer, title, max_length=512):
    """Generate a description for a single title.

    Args:
        model: Model exposing ``parameters()``, ``eval()``/``train()`` and
            ``generate(**inputs, max_length=...)``.
        tokenizer: Tokenizer used both to encode ``title`` and to decode the
            generated token ids.
        title: Input text to generate a description for.
        max_length: Upper bound applied both to the tokenized input
            (truncation) and to the generated sequence.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    # Tokenize the title
    inputs = tokenizer(title, return_tensors="pt", truncation=True, max_length=max_length)

    # Move the inputs to the device the model already lives on, instead of
    # relocating the model (which would drag a CUDA-trained model to the CPU).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate in eval mode so dropout is disabled, then restore whatever
    # mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output_tokens = model.generate(**inputs, max_length=max_length)
    finally:
        model.train(was_training)

    # Decode the output tokens into the description text
    description = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    return description

# Usage example: generate a description for one sample title and show both
title = "Smartphone com câmera de 108MP e bateria de longa duração"
description = predict_description(model=model, tokenizer=tokenizer, title=title)

print("Título:", title)
print("Descrição prevista:", description)
