# Big Five Training Notebook
This notebook automates fine-tuning (multi-trait or single-trait) BERT/RoBERTa models on the Reddit personality dataset (`Fatima0923/Automated-Personality-Prediction`). Run cells in order.

## Optional: Google Colab setup
Run this section only on Colab to mount Google Drive and switch to your project directory.

In [31]:
try:
    from google.colab import drive  # type: ignore
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    drive.mount('/content/drive')
    %cd /content/drive/MyDrive
    %cd BigFive
else:
    print("Not running in Google Colab; skipping Drive mount.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive
/content/drive/MyDrive/BigFive


In [32]:
# Optional: install requirements inside the notebook environment
!pip install -q -r requirements.txt

In [33]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm.notebook import tqdm
from transformers import BertTokenizer, RobertaTokenizer

MODEL_CONFIGS = {
    "bert-base": {
        "tokenizer_cls": BertTokenizer,
        "tokenizer_name": "bert-base-cased",
        "model_name": "bert-base-cased",
        "hidden_size": 768
    },
    "bert-large": {
        "tokenizer_cls": BertTokenizer,
        "tokenizer_name": "bert-large-cased",
        "model_name": "bert-large-cased",
        "hidden_size": 1024
    },
    "roberta-base": {
        "tokenizer_cls": RobertaTokenizer,
        "tokenizer_name": "roberta-base",
        "model_name": "roberta-base",
        "hidden_size": 768
    },
    "roberta-large": {
        "tokenizer_cls": RobertaTokenizer,
        "tokenizer_name": "roberta-large",
        "model_name": "roberta-large",
        "hidden_size": 1024
    }
}

# === Dataset configuration ===
DATASET_PATH = Path(os.environ.get("DATASET_CSV_PATH", "dataset.csv"))
SAMPLE_FRACTION = float(os.environ.get("DATASET_SAMPLE_FRACTION", "0.1"))
RANDOM_STATE = 42
CSV_COLUMN_MAPPING = {
    "text": "body",
    "A": "agr",
    "O": "ope",
    "C": "con",
    "E": "ext",
    "N": "neu",
}

if not 0 < SAMPLE_FRACTION <= 1:
    raise ValueError("SAMPLE_FRACTION must be within (0, 1].")

# === User-configurable options ===
selected_model = "roberta-base"            # choose key from MODEL_CONFIGS
training_mode = "multi_trait"              # "multi_trait" or "single_trait"
target_trait = "ope"                       # used only when training_mode == "single_trait"

TRAIT_COLUMNS = ["ext", "neu", "agr", "con", "ope"]
assert target_trait in TRAIT_COLUMNS, "Invalid target trait"
model_tag = selected_model.replace('-', '_')

if training_mode == "single_trait":
    target_columns = [target_trait]
    run_suffix = f"{model_tag}_single_{target_trait}"
else:
    target_columns = TRAIT_COLUMNS
    run_suffix = f"{model_tag}_multi"

print(f"Selected config: {selected_model}")
print(f"Training mode: {training_mode} | targets: {target_columns}")
print(f"Run suffix (log folder): {run_suffix}")

Selected config: roberta-base
Training mode: multi_trait | targets: ['ext', 'neu', 'agr', 'con', 'ope']
Run suffix (log folder): roberta_base_multi


In [34]:
def load_local_dataset(csv_path: Path, sample_fraction: float, random_state: int = 42) -> pd.DataFrame:
    if not csv_path.exists():
        raise FileNotFoundError(f"Dataset file not found: {csv_path.resolve()}")

    df = pd.read_csv(csv_path)
    missing_columns = [col for col in CSV_COLUMN_MAPPING if col not in df.columns]
    if missing_columns:
        raise ValueError(
            f"Dataset is missing required columns: {missing_columns}. "
            f"Expected columns: {list(CSV_COLUMN_MAPPING.keys())}"
        )

    df = df.rename(columns=CSV_COLUMN_MAPPING)
    df = df[["body", "ext", "neu", "agr", "con", "ope"]]
    df[TRAIT_COLUMNS] = df[TRAIT_COLUMNS] / 100.0
    df["body"] = df["body"].fillna("")

    if sample_fraction < 1.0:
        df = df.sample(frac=sample_fraction, random_state=random_state)
    df = df.reset_index(drop=True)
    print(f"Loaded {len(df)} rows from {csv_path} (sample fraction={sample_fraction})")
    return df


def make_local_splits(df: pd.DataFrame, random_state: int = 42):
    df_train, df_test = train_test_split(df, test_size=0.3, random_state=random_state, shuffle=True)
    df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=random_state, shuffle=True)
    print(f"Split sizes -> train: {len(df_train)}, val: {len(df_val)}, test: {len(df_test)}")
    return df_train, df_val, df_test, df


def load_personality_splits() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Local override of dataset_utils.load_personality_splits for Colab runs."""
    return make_local_splits(load_local_dataset(DATASET_PATH, SAMPLE_FRACTION, RANDOM_STATE), RANDOM_STATE)

In [35]:
import torch
from torch.utils.data import Dataset, DataLoader

print("Preparing local DataLoaders...")
df_train, df_val, df_test, df_all = load_personality_splits()

cfg = MODEL_CONFIGS[selected_model]
tokenizer = cfg["tokenizer_cls"].from_pretrained(cfg["tokenizer_name"])
max_length = 64

def encode_text(text: str):
    return tokenizer.encode_plus(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors=None
    )

for split_df in (df_train, df_val, df_test):
    split_df['body'] = split_df['body'].apply(lambda txt: encode_text(txt))

class LocalPersonalityDataset(Dataset):
    def __init__(self, tokenized_texts, targets):
        self.tokenized_texts = tokenized_texts
        self.targets = targets

    def __len__(self):
        return len(self.tokenized_texts)

    def __getitem__(self, idx):
        encoded = self.tokenized_texts[idx]
        return {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float)
        }

def make_loader(df_split, batch_size=16, shuffle=False):
    dataset = LocalPersonalityDataset(
        df_split['body'].tolist(),
        df_split[target_columns].values
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=os.cpu_count())

train_loader = make_loader(df_train, shuffle=True)
val_loader = make_loader(df_val)
test_loader = make_loader(df_test)
print("DataLoaders ready.")

Preparing local DataLoaders...
Loaded 192420 rows from dataset.csv (sample fraction=0.1)
Split sizes -> train: 107755, val: 26939, test: 57726


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

DataLoaders ready.


In [36]:
import torch

batch_size = 16
learning_rate = 1e-5
accumulation_steps = 16
num_epochs = 10
patience = 2
model_save_dir = "notebook_models"
os.makedirs(model_save_dir, exist_ok=True)
best_model_path = os.path.join(model_save_dir, f"{selected_model}_best.pth")
final_model_path = os.path.join(model_save_dir, f"{selected_model}_final.pth")

# Resume configuration
resume_from_checkpoint = True
checkpoint_path = best_model_path  # change to another path if desired

if resume_from_checkpoint:
    print(f"Checkpoint configured: {checkpoint_path}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [37]:
from transformers import AutoModel
from torch.optim import AdamW
import torch.nn as nn

output_dim = len(target_columns)

class PersonalityRegressor(nn.Module):
    def __init__(self, base_model_name: str, hidden_size: int, output_dim: int):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        self.dropout = nn.Dropout(0.3)
        self.regressor = nn.Linear(hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None else outputs.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)
        return self.regressor(pooled)

model = PersonalityRegressor(
    base_model_name=MODEL_CONFIGS[selected_model]["model_name"],
    hidden_size=MODEL_CONFIGS[selected_model]["hidden_size"],
    output_dim=output_dim
).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

start_epoch = 1
best_val_loss = float('inf')
patience_counter = 0

if resume_from_checkpoint and os.path.exists(checkpoint_path):
    state = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(state.get("model_state", state))
    if "optimizer_state" in state:
        optimizer.load_state_dict(state["optimizer_state"])
    best_val_loss = state.get("best_val_loss", best_val_loss)
    patience_counter = state.get("patience_counter", patience_counter)
    start_epoch = state.get("epoch", 0) + 1
    print(f"Loaded checkpoint from {checkpoint_path} (resuming at epoch {start_epoch})")
else:
    if resume_from_checkpoint:
        print(f"Checkpoint {checkpoint_path} not found; starting from scratch.")

model

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PersonalityRegressor(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [None]:
for epoch in range(start_epoch, start_epoch + num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    train_bar = tqdm(train_loader, desc=f"Epoch {epoch} - Train")
    for step, batch in enumerate(train_bar, 1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets) / accumulation_steps
        loss.backward()

        if step % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()
        train_bar.set_postfix(loss=total_loss / step)

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch}: Train Loss {avg_train_loss:.4f}")

    model.eval()
    val_loss = 0
    with torch.no_grad():
        val_bar = tqdm(val_loader, desc=f"Epoch {epoch} - Val")
        for step, batch in enumerate(val_bar, 1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, targets)
            val_loss += loss.item()
            val_bar.set_postfix(loss=val_loss / step)

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch}: Val Loss {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save({
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "epoch": epoch,
            "best_val_loss": best_val_loss,
            "patience_counter": patience_counter
        }, best_model_path)
        print(f"Saved best checkpoint to {best_model_path}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Save final state
torch.save({
    "model_state": model.state_dict(),
    "optimizer_state": optimizer.state_dict(),
    "epoch": epoch,
    "best_val_loss": best_val_loss,
    "patience_counter": patience_counter
}, final_model_path)
print(f"Saved final checkpoint to {final_model_path}")

Epoch 1 - Train:   0%|          | 0/6735 [00:00<?, ?it/s]

Epoch 1: Train Loss 0.0063


Epoch 1 - Val:   0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 1: Val Loss 0.0863
Saved best checkpoint to notebook_models/roberta-base_best.pth


Epoch 2 - Train:   0%|          | 0/6735 [00:00<?, ?it/s]

Epoch 2: Train Loss 0.0056


Epoch 2 - Val:   0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 2: Val Loss 0.0847
Saved best checkpoint to notebook_models/roberta-base_best.pth


Epoch 3 - Train:   0%|          | 0/6735 [00:00<?, ?it/s]

Epoch 3: Train Loss 0.0054


Epoch 3 - Val:   0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 3: Val Loss 0.0838
Saved best checkpoint to notebook_models/roberta-base_best.pth


Epoch 4 - Train:   0%|          | 0/6735 [00:00<?, ?it/s]

Epoch 4: Train Loss 0.0053


Epoch 4 - Val:   0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 4: Val Loss 0.0829
Saved best checkpoint to notebook_models/roberta-base_best.pth


Epoch 5 - Train:   0%|          | 0/6735 [00:00<?, ?it/s]

Epoch 5: Train Loss 0.0051


Epoch 5 - Val:   0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 5: Val Loss 0.0827
Saved best checkpoint to notebook_models/roberta-base_best.pth


Epoch 6 - Train:   0%|          | 0/6735 [00:00<?, ?it/s]

Epoch 6: Train Loss 0.0050


Epoch 6 - Val:   0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 6: Val Loss 0.0825
Saved best checkpoint to notebook_models/roberta-base_best.pth


Epoch 7 - Train:   0%|          | 0/6735 [00:00<?, ?it/s]

In [None]:
from math import sqrt
import pandas as pd
from datetime import datetime

model.eval()
trait_names = [col.upper() for col in target_columns]
num_traits = len(trait_names)
all_preds = [[] for _ in range(num_traits)]
all_targets = [[] for _ in range(num_traits)]
with torch.no_grad():
    test_bar = tqdm(test_loader, desc="Testing")
    for batch in test_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        for i in range(num_traits):
            all_preds[i].extend(outputs[:, i].cpu().numpy())
            all_targets[i].extend(targets[:, i].cpu().numpy())

metric_rows = []
for i, trait in enumerate(trait_names):
    preds = np.array(all_preds[i])
    targs = np.array(all_targets[i])
    mse = mean_squared_error(targs, preds)
    rmse = sqrt(mse)
    mae = mean_absolute_error(targs, preds)
    r2 = r2_score(targs, preds)
    metric_rows.append({"trait": trait, "MSE": mse, "RMSE": rmse, "MAE": mae, "R2": r2})

metrics_df = pd.DataFrame(metric_rows)

pred_matrix = pd.DataFrame(np.array(all_preds).T, columns=trait_names)
target_matrix = pd.DataFrame(np.array(all_targets).T, columns=trait_names)

correlation_predictions = pred_matrix.corr() if num_traits > 1 else None
correlation_targets = target_matrix.corr() if num_traits > 1 else None

# === Persist results ===
log_dir = os.path.join("notebook_logs", run_suffix)
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
metrics_path = os.path.join(log_dir, f"metrics_{timestamp}.csv")
metrics_df.to_csv(metrics_path, index=False)
print(f"Saved metrics to {metrics_path}")

if correlation_predictions is not None:
    corr_pred_path = os.path.join(log_dir, f"corr_predictions_{timestamp}.csv")
    corr_target_path = os.path.join(log_dir, f"corr_targets_{timestamp}.csv")
    correlation_predictions.to_csv(corr_pred_path)
    correlation_targets.to_csv(corr_target_path)
    print(f"Saved correlation matrices to {corr_pred_path} / {corr_target_path}")
else:
    print("Single trait run: skipping correlation matrix export.")

metrics_df