In [None]:
import pandas as pd

# Per-author labels: a "username" column plus the five trait columns (per the original note).
labels = pd.read_csv("authors_train.csv")

# Posts table; the original note lists its columns as: author, trait, post, similarity.
# NOTE(review): the join below requires a "username" column in this file as well — confirm.
df = pd.merge(
    pd.read_csv("top50_posts_per_user_reduced.csv"),
    labels,
    on="username",
)

df.head()


In [None]:
def get_target(row):
    """Return the training label for one row, as a string.

    The row's ``"trait"`` value is lower-cased and used as the name of the
    column to read from that same row, so each row is labelled with the value
    stored in the column named after its trait.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a ``"trait"`` entry
            and an entry keyed by the lower-cased trait name.

    Returns:
        ``str()`` of the value found under the lower-cased trait name.
    """
    label_column = row["trait"].lower()
    label_value = row[label_column]
    return str(label_value)

def _build_prompt(row):
    """Format one row as the text-to-text prompt: the trait to predict, then the post."""
    return f"predict {row['trait']}: {row['post']}"


# Model input: the prompt built from each row's trait and post.
df["input_text"] = df.apply(_build_prompt, axis=1)
# Model target: the row's value in the column named after its trait, as text.
df["target_text"] = df.apply(get_target, axis=1)

In [None]:
from datasets import Dataset

# Keep only the two model-facing columns and convert the frame to a Hugging Face Dataset.
dataset = Dataset.from_pandas(df[["input_text", "target_text"]])
# Hold out 10% of the rows as the "test" split. The fixed seed makes the shuffle, and
# therefore the train/test membership, identical on every fresh run of the notebook.
dataset = dataset.train_test_split(test_size=0.1, seed=42)


In [None]:
from transformers import T5Tokenizer

# Load the tokenizer that belongs to the pretrained "t5-small" checkpoint.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess(example):
    """Tokenize prompts and targets and attach loss-ready labels.

    Inputs are padded/truncated to 256 tokens and targets to 10 tokens.
    Padding positions in the targets are replaced with -100 so the
    cross-entropy loss ignores them; without this the model is trained
    mostly to emit padding.

    Args:
        example: Mapping with ``"input_text"`` and ``"target_text"``. Each is
            either a list of strings (a batch, as passed by
            ``Dataset.map(..., batched=True)``) or a single string.

    Returns:
        The input encoding with an added ``"labels"`` entry: a list of
        label-id lists for a batch, or one label-id list for a single example.
    """
    enc = tokenizer(
        example["input_text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    # T5 uses one vocabulary for inputs and targets, so a plain tokenizer call
    # replaces the deprecated `as_target_tokenizer()` context manager.
    targets = tokenizer(
        example["target_text"],
        padding="max_length",
        truncation=True,
        max_length=10,
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the label value the loss skips.
        return [tok if tok != pad_id else -100 for tok in token_ids]

    target_ids = targets["input_ids"]
    if isinstance(example["target_text"], str):
        # Single example: input_ids is one flat list.
        enc["labels"] = _mask_padding(target_ids)
    else:
        # Batch: input_ids is a list of per-example lists.
        enc["labels"] = [_mask_padding(seq) for seq in target_ids]
    return enc

# Tokenize every split; with batched=True, `preprocess` is called on batches of rows
# (a dict of lists) rather than on one row at a time.
tokenized = dataset.map(preprocess, batched=True)


In [None]:
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer

# Load the pretrained "t5-small" checkpoint as a conditional-generation (seq2seq) model.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [None]:
# Training hyperparameters, gathered in one mapping and unpacked into TrainingArguments.
# NOTE(review): no evaluation strategy is set here, so the eval split passed to the
# Trainer is presumably only used by explicit evaluate() calls — confirm this is intended.
training_config = {
    "output_dir": "./t5_personality",   # where checkpoints are written
    "report_to": "none",                # no external experiment tracker
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 5,
    "learning_rate": 3e-4,
    "weight_decay": 0.01,
    "logging_steps": 100,               # log every 100 steps
    "save_total_limit": 2,              # cap on the number of checkpoints kept
}
training_args = TrainingArguments(**training_config)

In [None]:
# Wire the model, the training arguments and the two tokenized splits into a Trainer.
train_split = tokenized["train"]
eval_split = tokenized["test"]

trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_split,
    train_dataset=train_split,
)

In [None]:
# Run fine-tuning with the Trainer built above.
trainer.train()

In [None]:
def predict_trait(trait, text):
    """Generate the model's prediction for one trait given one piece of text.

    Builds the same ``"predict <trait>: <text>"`` prompt used for training,
    runs generation with the global ``model`` and decodes the first output
    sequence.

    Args:
        trait: Trait name inserted into the prompt.
        text: Text to score.

    Returns:
        The decoded generation as a string, with special tokens removed.
    """
    input_text = f"predict {trait}: {text}"
    # Truncate exactly as the training inputs were (256 tokens) so inference
    # sees the same input length the model was fine-tuned on.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=256,
    )
    # After training the model may live on a GPU; the tokenizer output is on
    # CPU, so move every tensor to the model's device before generating.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Generate in eval mode (dropout off) for deterministic output, then put
    # the model back in its previous mode so callers see no state change.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(**inputs)
    finally:
        if was_training:
            model.train()

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke test: print the generated prediction for one example sentence.
print(predict_trait("extraversion", "I enjoy meeting new people"))
