# Big Five Training Notebook
This notebook automates fine-tuning (multi-trait or single-trait) BERT/RoBERTa models on the Reddit personality dataset (`Fatima0923/Automated-Personality-Prediction`). Run cells in order.

## Kaggle environment setup
Run this section to prepare the `/kaggle/working` directory and mirror the Python source files under `/source` for imports.

In [1]:
!mkdir -p /kaggle/working/notebook_models
!cp /kaggle/input/checkpoint/roberta-base_best.pth /kaggle/working/notebook_models/

In [2]:
import os
import shutil
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd()
IN_KAGGLE = Path("/kaggle").exists()
WORK_DIR = Path("/kaggle/working") if IN_KAGGLE else PROJECT_ROOT
SOURCE_DIR = WORK_DIR / "source"
MODELS_ROOT = WORK_DIR / "notebook_models"
LOGS_ROOT = WORK_DIR / "notebook_logs"

SOURCE_DIR.mkdir(parents=True, exist_ok=True)
MODELS_ROOT.mkdir(parents=True, exist_ok=True)
LOGS_ROOT.mkdir(parents=True, exist_ok=True)

if IN_KAGGLE:
    INPUT_SOURCE_ROOT = Path("/kaggle/input/source")
    if INPUT_SOURCE_ROOT.exists():
        shutil.copytree(INPUT_SOURCE_ROOT, SOURCE_DIR, dirs_exist_ok=True)
    else:
        for py_file in PROJECT_ROOT.glob("*.py"):
            destination = SOURCE_DIR / py_file.name
            if py_file.resolve() != destination.resolve():
                shutil.copy2(py_file, destination)

    INPUT_DATASET_PATH = Path("/kaggle/input/pandora-big5-train/pandora_train.csv")
    if INPUT_DATASET_PATH.exists():
        os.environ.setdefault("DATASET_CSV_PATH", str(INPUT_DATASET_PATH))
else:
    for py_file in PROJECT_ROOT.glob("*.py"):
        destination = SOURCE_DIR / py_file.name
        if py_file.resolve() != destination.resolve():
            shutil.copy2(py_file, destination)

if str(SOURCE_DIR) not in sys.path:
    sys.path.insert(0, str(SOURCE_DIR))

print(f"Running inside Kaggle: {IN_KAGGLE}")
print(f"Working directory: {WORK_DIR}")
print(f"Python sources available under: {SOURCE_DIR}")
if IN_KAGGLE:
    print(f"Dataset path: {os.environ.get('DATASET_CSV_PATH')}")

Running inside Kaggle: True
Working directory: /kaggle/working
Python sources available under: /kaggle/working/source
Dataset path: /kaggle/input/pandora-big5-train/pandora_train.csv


In [3]:
# Optional: install requirements inside the notebook environment
!pip install -q -r /kaggle/input/source/requirements.txt

In [4]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm.notebook import tqdm
from transformers import BertTokenizer, RobertaTokenizer

MODEL_CONFIGS = {
    "bert-base": {
        "tokenizer_cls": BertTokenizer,
        "tokenizer_name": "bert-base-cased",
        "model_name": "bert-base-cased",
        "hidden_size": 768
    },
    "bert-large": {
        "tokenizer_cls": BertTokenizer,
        "tokenizer_name": "bert-large-cased",
        "model_name": "bert-large-cased",
        "hidden_size": 1024
    },
    "roberta-base": {
        "tokenizer_cls": RobertaTokenizer,
        "tokenizer_name": "roberta-base",
        "model_name": "roberta-base",
        "hidden_size": 768
    },
    "roberta-large": {
        "tokenizer_cls": RobertaTokenizer,
        "tokenizer_name": "roberta-large",
        "model_name": "roberta-large",
        "hidden_size": 1024
    }
}

# === Dataset configuration ===
DATASET_PATH = Path(os.environ.get("DATASET_CSV_PATH", "dataset.csv"))
SAMPLE_FRACTION = float(os.environ.get("DATASET_SAMPLE_FRACTION", "0.1"))
RANDOM_STATE = 42
CSV_COLUMN_MAPPING = {
    "text": "body",
    "A": "agr",
    "O": "ope",
    "C": "con",
    "E": "ext",
    "N": "neu",
}

if not 0 < SAMPLE_FRACTION <= 1:
    raise ValueError("SAMPLE_FRACTION must be within (0, 1].")

# === User-configurable options ===
selected_model = "roberta-base"            # choose key from MODEL_CONFIGS
training_mode = "multi_trait"              # "multi_trait" or "single_trait"
target_trait = "ope"                       # used only when training_mode == "single_trait"

TRAIT_COLUMNS = ["ext", "neu", "agr", "con", "ope"]
assert target_trait in TRAIT_COLUMNS, "Invalid target trait"
model_tag = selected_model.replace('-', '_')

if training_mode == "single_trait":
    target_columns = [target_trait]
    run_suffix = f"{model_tag}_single_{target_trait}"
else:
    target_columns = TRAIT_COLUMNS
    run_suffix = f"{model_tag}_multi"

print(f"Selected config: {selected_model}")
print(f"Training mode: {training_mode} | targets: {target_columns}")
print(f"Run suffix (log folder): {run_suffix}")

Selected config: roberta-base
Training mode: multi_trait | targets: ['ext', 'neu', 'agr', 'con', 'ope']
Run suffix (log folder): roberta_base_multi


In [5]:
def load_local_dataset(csv_path: Path, sample_fraction: float, random_state: int = 42) -> pd.DataFrame:
    if not csv_path.exists():
        raise FileNotFoundError(f"Dataset file not found: {csv_path.resolve()}")

    df = pd.read_csv(csv_path)
    missing_columns = [col for col in CSV_COLUMN_MAPPING if col not in df.columns]
    if missing_columns:
        raise ValueError(
            f"Dataset is missing required columns: {missing_columns}. "
            f"Expected columns: {list(CSV_COLUMN_MAPPING.keys())}"
        )

    df = df.rename(columns=CSV_COLUMN_MAPPING)
    df = df[["body", "ext", "neu", "agr", "con", "ope"]]
    df[TRAIT_COLUMNS] = df[TRAIT_COLUMNS] / 100.0
    df["body"] = df["body"].fillna("")

    if sample_fraction < 1.0:
        df = df.sample(frac=sample_fraction, random_state=random_state)
    df = df.reset_index(drop=True)
    print(f"Loaded {len(df)} rows from {csv_path} (sample fraction={sample_fraction})")
    return df


def make_local_splits(df: pd.DataFrame, random_state: int = 42):
    df_train, df_test = train_test_split(df, test_size=0.3, random_state=random_state, shuffle=True)
    df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=random_state, shuffle=True)
    print(f"Split sizes -> train: {len(df_train)}, val: {len(df_val)}, test: {len(df_test)}")
    return df_train, df_val, df_test, df


def load_personality_splits() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Local override of dataset_utils.load_personality_splits for Colab runs."""
    return make_local_splits(load_local_dataset(DATASET_PATH, SAMPLE_FRACTION, RANDOM_STATE), RANDOM_STATE)

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

print("Preparing local DataLoaders...")
df_train, df_val, df_test, df_all = load_personality_splits()

cfg = MODEL_CONFIGS[selected_model]
tokenizer = cfg["tokenizer_cls"].from_pretrained(cfg["tokenizer_name"])
max_length = 64

def encode_text(text: str):
    return tokenizer.encode_plus(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors=None
    )

for split_df in (df_train, df_val, df_test):
    split_df['body'] = split_df['body'].apply(lambda txt: encode_text(txt))

class LocalPersonalityDataset(Dataset):
    def __init__(self, tokenized_texts, targets):
        self.tokenized_texts = tokenized_texts
        self.targets = targets

    def __len__(self):
        return len(self.tokenized_texts)

    def __getitem__(self, idx):
        encoded = self.tokenized_texts[idx]
        return {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float)
        }

def make_loader(df_split, batch_size=16, shuffle=False):
    dataset = LocalPersonalityDataset(
        df_split['body'].tolist(),
        df_split[target_columns].values
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=os.cpu_count())

train_loader = make_loader(df_train, shuffle=True)
val_loader = make_loader(df_val)
test_loader = make_loader(df_test)
print("DataLoaders ready.")

Preparing local DataLoaders...
Loaded 192420 rows from /kaggle/input/pandora-big5-train/pandora_train.csv (sample fraction=0.1)
Split sizes -> train: 107755, val: 26939, test: 57726


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

DataLoaders ready.


In [7]:
import torch

batch_size = 16
learning_rate = 1e-5
accumulation_steps = 16
num_epochs = 10
patience = 2
model_save_dir = MODELS_ROOT
model_save_dir.mkdir(parents=True, exist_ok=True)
best_model_path = model_save_dir / f"{selected_model}_best.pth"
final_model_path = model_save_dir / f"{selected_model}_final.pth"

# Resume configuration
resume_from_checkpoint = True
checkpoint_path = best_model_path  # change to another path if desired

if resume_from_checkpoint:
    print(f"Checkpoint configured: {checkpoint_path}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

Checkpoint configured: /kaggle/working/notebook_models/roberta-base_best.pth


device(type='cuda')

In [8]:
from transformers import AutoModel
from torch.optim import AdamW
import torch.nn as nn

output_dim = len(target_columns)

class PersonalityRegressor(nn.Module):
    def __init__(self, base_model_name: str, hidden_size: int, output_dim: int):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        self.dropout = nn.Dropout(0.3)
        self.regressor = nn.Linear(hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None else outputs.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)
        return self.regressor(pooled)

model = PersonalityRegressor(
    base_model_name=MODEL_CONFIGS[selected_model]["model_name"],
    hidden_size=MODEL_CONFIGS[selected_model]["hidden_size"],
    output_dim=output_dim
).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

start_epoch = 1
best_val_loss = float('inf')
patience_counter = 0

if resume_from_checkpoint and os.path.exists(checkpoint_path):
    state = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(state.get("model_state", state))
    if "optimizer_state" in state:
        optimizer.load_state_dict(state["optimizer_state"])
    best_val_loss = state.get("best_val_loss", best_val_loss)
    patience_counter = state.get("patience_counter", patience_counter)
    start_epoch = state.get("epoch", 0) + 1
    print(f"Loaded checkpoint from {checkpoint_path} (resuming at epoch {start_epoch})")
else:
    if resume_from_checkpoint:
        print(f"Checkpoint {checkpoint_path} not found; starting from scratch.")

model

2026-01-19 09:40:07.488798: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768815607.681115      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768815607.739842      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768815608.225160      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768815608.225220      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768815608.225224      24 computation_placer.cc:177] computation placer alr

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded checkpoint from /kaggle/working/notebook_models/roberta-base_best.pth (resuming at epoch 7)


PersonalityRegressor(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [9]:
for epoch in range(start_epoch, start_epoch + num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    train_bar = tqdm(train_loader, desc=f"Epoch {epoch} - Train")
    for step, batch in enumerate(train_bar, 1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets) / accumulation_steps
        loss.backward()

        if step % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()
        train_bar.set_postfix(loss=total_loss / step)

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch}: Train Loss {avg_train_loss:.4f}")

    model.eval()
    val_loss = 0
    with torch.no_grad():
        val_bar = tqdm(val_loader, desc=f"Epoch {epoch} - Val")
        for step, batch in enumerate(val_bar, 1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, targets)
            val_loss += loss.item()
            val_bar.set_postfix(loss=val_loss / step)

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch}: Val Loss {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save({
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "epoch": epoch,
            "best_val_loss": best_val_loss,
            "patience_counter": patience_counter
        }, best_model_path)
        print(f"Saved best checkpoint to {best_model_path}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Save final state
torch.save({
    "model_state": model.state_dict(),
    "optimizer_state": optimizer.state_dict(),
    "epoch": epoch,
    "best_val_loss": best_val_loss,
    "patience_counter": patience_counter
}, final_model_path)
print(f"Saved final checkpoint to {final_model_path}")

Epoch 7 - Train:   0%|          | 0/6735 [00:00<?, ?it/s]

Epoch 7: Train Loss 0.0049


Epoch 7 - Val:   0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 7: Val Loss 0.0823
Saved best checkpoint to /kaggle/working/notebook_models/roberta-base_best.pth


Epoch 8 - Train:   0%|          | 0/6735 [00:00<?, ?it/s]

Epoch 8: Train Loss 0.0048


Epoch 8 - Val:   0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 8: Val Loss 0.0838


Epoch 9 - Train:   0%|          | 0/6735 [00:00<?, ?it/s]

Epoch 9: Train Loss 0.0046


Epoch 9 - Val:   0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 9: Val Loss 0.0843
Early stopping triggered
Saved final checkpoint to /kaggle/working/notebook_models/roberta-base_final.pth


In [10]:
from math import sqrt
import pandas as pd
from datetime import datetime

model.eval()
trait_names = [col.upper() for col in target_columns]
num_traits = len(trait_names)
all_preds = [[] for _ in range(num_traits)]
all_targets = [[] for _ in range(num_traits)]
with torch.no_grad():
    test_bar = tqdm(test_loader, desc="Testing")
    for batch in test_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        for i in range(num_traits):
            all_preds[i].extend(outputs[:, i].cpu().numpy())
            all_targets[i].extend(targets[:, i].cpu().numpy())

metric_rows = []
for i, trait in enumerate(trait_names):
    preds = np.array(all_preds[i])
    targs = np.array(all_targets[i])
    mse = mean_squared_error(targs, preds)
    rmse = sqrt(mse)
    mae = mean_absolute_error(targs, preds)
    r2 = r2_score(targs, preds)
    metric_rows.append({"trait": trait, "MSE": mse, "RMSE": rmse, "MAE": mae, "R2": r2})

metrics_df = pd.DataFrame(metric_rows)

pred_matrix = pd.DataFrame(np.array(all_preds).T, columns=trait_names)
target_matrix = pd.DataFrame(np.array(all_targets).T, columns=trait_names)

correlation_predictions = pred_matrix.corr() if num_traits > 1 else None
correlation_targets = target_matrix.corr() if num_traits > 1 else None

# === Persist results ===
log_dir = (LOGS_ROOT / run_suffix)
log_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
metrics_path = log_dir / f"metrics_{timestamp}.csv"
metrics_df.to_csv(metrics_path, index=False)
print(f"Saved metrics to {metrics_path}")

if correlation_predictions is not None:
    corr_pred_path = log_dir / f"corr_predictions_{timestamp}.csv"
    corr_target_path = log_dir / f"corr_targets_{timestamp}.csv"
    correlation_predictions.to_csv(corr_pred_path)
    correlation_targets.to_csv(corr_target_path)
    print(f"Saved correlation matrices to {corr_pred_path} / {corr_target_path}")
else:
    print("Single trait run: skipping correlation matrix export.")

metrics_df

Testing:   0%|          | 0/3608 [00:00<?, ?it/s]

Saved metrics to /kaggle/working/notebook_logs/roberta_base_multi/metrics_20260119_101901.csv
Saved correlation matrices to /kaggle/working/notebook_logs/roberta_base_multi/corr_predictions_20260119_101901.csv / /kaggle/working/notebook_logs/roberta_base_multi/corr_targets_20260119_101901.csv


Unnamed: 0,trait,MSE,RMSE,MAE,R2
0,EXT,0.086394,0.293928,0.244619,0.020775
1,NEU,0.090975,0.30162,0.250483,0.057143
2,AGR,0.08123,0.285009,0.23449,0.101148
3,CON,0.065903,0.256717,0.209344,0.066682
4,OPE,0.094921,0.308093,0.26162,0.025941
