In [11]:
def build_input(trait, post):
    return (
        f"Estimate the Big Five personality trait '{trait}'. "
        f"Return ONLY a number between 0 and 100.\n\n"
        f"Text:\n{post}"
    )

import pandas as pd

df = pd.read_csv("top50_posts_per_user_reduced.csv")
labels = pd.read_csv("authors_train.csv")

df = df.merge(labels, on="username")

df["trait"] = df["trait"].str.lower().str.strip()

df["input_text"] = df.apply(lambda r: build_input(r["trait"], r["post"]), axis=1)
df["target_text"] = df.apply(lambda r: str(r[r["trait"]]), axis=1)


In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")


In [13]:
from datasets import Dataset

dataset = Dataset.from_pandas(df[["input_text", "target_text"]])
dataset = dataset.train_test_split(test_size=0.1)

train_ds = dataset["train"]
test_ds = dataset["test"]


In [14]:
def preprocess(batch):
    model_inputs = tokenizer(
        batch["input_text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

    labels = tokenizer(
        batch["target_text"],
        padding="max_length",
        truncation=True,
        max_length=8,
    )["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs

train_tokenized = train_ds.map(preprocess, batched=True)
test_tokenized = test_ds.map(preprocess, batched=True)

train_tokenized.set_format(type="torch")
test_tokenized.set_format(type="torch")


Map: 100%|█████████████████████████████████████████████████████████████| 139886/139886 [00:54<00:00, 2558.04 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████| 15543/15543 [00:06<00:00, 2535.87 examples/s]


In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./t5_personality",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=3e-4,
    logging_steps=200,
    save_steps=2000,
)


In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized
)

trainer.train()


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
200,2.2022
400,0.5902
600,0.5716
800,0.5743
1000,0.5725
1200,0.5683
1400,0.5671
1600,0.5688
1800,0.5669
2000,0.5681


TrainOutput(global_step=52458, training_loss=0.547183321972054, metrics={'train_runtime': 8107.0967, 'train_samples_per_second': 51.764, 'train_steps_per_second': 6.471, 'total_flos': 3.900521871153562e+16, 'train_loss': 0.547183321972054, 'epoch': 3.0})

In [17]:
import torch, re, numpy as np

def extract_number(text):
    nums = re.findall(r"[-+]?\d*\.\d+|\d+", text)
    if not nums:
        return np.nan
    return float(nums[0])

def predict_trait_post(model, tokenizer, trait, post):
    input_text = build_input(trait, post)

    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding=True
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(**inputs, max_length=10)

    decoded = tokenizer.decode(output[0], skip_special_tokens=True).strip()

    return extract_number(decoded)


In [18]:
from tqdm import tqdm

posts = df["post"].tolist()
traits = df["trait"].tolist()

preds = []
batch_size = 32

for i in tqdm(range(0, len(posts), batch_size)):
    batch_inputs = [
        build_input(t, p)
        for t, p in zip(traits[i:i+batch_size], posts[i:i+batch_size])
    ]

    tokens = tokenizer(
        batch_inputs,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(**tokens, max_length=10)

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded = [extract_number(x) for x in decoded]

    preds.extend(decoded)

df["predicted"] = preds


100%|███████████████████████████████████████████████████████████████████████████████| 4858/4858 [17:15<00:00,  4.69it/s]


In [24]:
df[df["predicted"].isna()].head()

Unnamed: 0,username,trait,post,similarity,agreeableness,openness,conscientiousness,extraversion,neuroticism,input_text,target_text,predicted
10210,Bliss86,openness,Did it occur to you that you people may just t...,0.336731,70.0,34.0,18.0,58.0,9.0,Estimate the Big Five personality trait 'openn...,34.0,
15611,ClutchHunter,openness,You are so intellectual and original.,0.391093,62.0,9.0,22.0,7.0,35.0,Estimate the Big Five personality trait 'openn...,9.0,
17791,CrimsonSmear,openness,"When you're searching for understanding, one o...",0.423945,77.0,80.0,8.0,8.0,44.0,Estimate the Big Five personality trait 'openn...,80.0,
19788,Danny1994m,openness,"For me, Programming languages.",0.411056,20.0,30.0,15.0,38.0,25.0,Estimate the Big Five personality trait 'openn...,30.0,
25339,ErraticPragmatic,openness,You need to read more about cognitive function...,0.460956,2.0,87.0,2.0,90.0,73.0,Estimate the Big Five personality trait 'openn...,87.0,


In [25]:
# --- LIMPIEZA DE PREDICCIONES ---
df_clean = df.copy()

# 1. Quitar predicciones NaN
df_clean = df_clean.dropna(subset=["predicted"])

# 2. Quitar predicciones fuera de rango 0–100
df_clean = df_clean[(df_clean["predicted"] >= 0) & (df_clean["predicted"] <= 100)]

# 3. Quitar targets NaN (por si algún trait venía vacío)
cols_traits = ["openness", "conscientiousness", "extraversion",
               "agreeableness", "neuroticism"]

df_clean = df_clean.dropna(subset=cols_traits)

print("Filas originales:", len(df))
print("Filas después de limpieza:", len(df_clean))


Filas originales: 155429
Filas después de limpieza: 151175


In [26]:
from sklearn.metrics import mean_squared_error
import numpy as np

traits = ["openness", "conscientiousness", "extraversion",
          "agreeableness", "neuroticism"]

rmse_by_trait = {}

for t in traits:
    subset = df_clean[df_clean["trait"] == t]
    if len(subset) == 0:
        rmse_by_trait[t] = None
        continue

    mse = mean_squared_error(subset[t], subset["predicted"])
    rmse_by_trait[t] = np.sqrt(mse)

rmse_by_trait



{'openness': np.float64(37.160451696874475),
 'conscientiousness': np.float64(46.88801438917675),
 'extraversion': np.float64(45.77425332999526),
 'agreeableness': np.float64(50.86623791021859),
 'neuroticism': np.float64(54.80542366189192)}