In [14]:
import optuna
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm

import optuna
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [22]:
train = pd.read_excel("data/idiap/dataset.xlsx")
train = train.dropna()
train = train.reset_index(drop=True)
train = train.drop(columns=["Unnamed: 0"])

texts = train["final_text"]
target_vars_names = [
    "hones16",
    "emoti16",
    "extra16",
    "agree16",
    "consc16",
    "openn16",
    "icar_hat0",
    "icar_hat1",
    "icar_hat2",
]
target_vars = train[target_vars_names]
target_vars = (target_vars - target_vars.min()) / (target_vars.max() - target_vars.min())
target_vars = target_vars.reset_index(drop=True).to_numpy()

In [6]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name = "bhadresh-savani/bert-base-go-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
emotion_model = AutoModelForSequenceClassification.from_pretrained(model_name)


def extract_features(texts):
    features = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = emotion_model(**inputs)
        probabilities = torch.softmax(outputs.logits, dim=-1)
        features.append(probabilities[0].numpy())
    return np.array(features)


features = extract_features(texts)

100%|██████████| 1983/1983 [05:32<00:00,  5.97it/s]


In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    features, target_vars, test_size=0.2, random_state=42
)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)


class EmotionRegressor(nn.Module):
    def __init__(self, input_size, output_size, hidden_units, num_layers, dropout_rate):
        super(EmotionRegressor, self).__init__()
        layers = []
        for _ in range(num_layers):
            layers.append(
                nn.Linear(
                    input_size if len(layers) == 0 else hidden_units, hidden_units
                )
            )
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))
        layers.append(nn.Linear(hidden_units, output_size))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)


# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    hidden_units = trial.suggest_int("hidden_units", 32, 256, step=32)
    num_layers = trial.suggest_int("num_layers", 1, 3)
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)

    # Initialize the model
    model = EmotionRegressor(
        input_size=X_train.shape[1],
        output_size=y_train.shape[1],
        hidden_units=hidden_units,
        num_layers=num_layers,
        dropout_rate=dropout_rate,
    )
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    epochs = 50
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

    # Evaluate the model
    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor).numpy()

    # Calculate mean RMSE for the objective function
    rmse_per_output = np.sqrt(
        mean_squared_error(y_test, predictions, multioutput="raw_values")
    )
    mean_rmse = np.mean(rmse_per_output)

    r2_per_output = r2_score(y_test, predictions, multioutput="raw_values")
    mean_r2 = np.mean(r2_per_output)
    return mean_r2


# Optimize the hyperparameters with Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=200)

# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)

# Train and evaluate the final model with the best hyperparameters
best_params = study.best_params
final_model = EmotionRegressor(
    input_size=X_train.shape[1],
    output_size=y_train.shape[1],
    hidden_units=best_params["hidden_units"],
    num_layers=best_params["num_layers"],
    dropout_rate=best_params["dropout_rate"],
)
criterion = nn.MSELoss()
optimizer = optim.Adam(final_model.parameters(), lr=best_params["learning_rate"])

# Train the final model
epochs = 100
for epoch in range(epochs):
    final_model.train()
    optimizer.zero_grad()
    outputs = final_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

# Evaluate the final model
final_model.eval()
with torch.no_grad():
    predictions = final_model(X_test_tensor).numpy()

mae_per_output = mean_absolute_error(y_test, predictions, multioutput="raw_values")
rmse_per_output = np.sqrt(
    mean_squared_error(y_test, predictions, multioutput="raw_values")
)
r2_per_output = r2_score(y_test, predictions, multioutput="raw_values")
mean_mae = np.mean(mae_per_output)
mean_rmse = np.mean(rmse_per_output)
mean_r2 = np.mean(r2_per_output)

# Print results
print("MAE per output:", mae_per_output)
print("RMSE per output:", rmse_per_output)
print("R2 per output:", r2_per_output)
print("Mean MAE:", mean_mae)
print("Mean RMSE:", mean_rmse)
print("Mean R2:", mean_r2)

[I 2024-12-18 11:49:10,170] A new study created in memory with name: no-name-7abd0534-04e3-4d57-b728-d51fed91fd95
[I 2024-12-18 11:49:10,361] Trial 0 finished with value: -0.04306295900271347 and parameters: {'hidden_units': 128, 'num_layers': 2, 'dropout_rate': 0.13258246046155997, 'learning_rate': 0.0035382542651959145}. Best is trial 0 with value: -0.04306295900271347.
[I 2024-12-18 11:49:10,641] Trial 1 finished with value: -28.76821069090125 and parameters: {'hidden_units': 192, 'num_layers': 2, 'dropout_rate': 0.11570093443505686, 'learning_rate': 0.00014463578991026716}. Best is trial 0 with value: -0.04306295900271347.
[I 2024-12-18 11:49:10,826] Trial 2 finished with value: -0.04122410494492858 and parameters: {'hidden_units': 128, 'num_layers': 2, 'dropout_rate': 0.30313461542155923, 'learning_rate': 0.0017279242577491762}. Best is trial 2 with value: -0.04122410494492858.
[I 2024-12-18 11:49:11,013] Trial 3 finished with value: -0.07641612864529845 and parameters: {'hidden_u

Best hyperparameters: {'hidden_units': 256, 'num_layers': 1, 'dropout_rate': 0.15059637214789562, 'learning_rate': 0.0061999050851647185}
Epoch [10/100], Loss: 0.0780
Epoch [20/100], Loss: 0.0396
Epoch [30/100], Loss: 0.0284
Epoch [40/100], Loss: 0.0251
Epoch [50/100], Loss: 0.0242
Epoch [60/100], Loss: 0.0237
Epoch [70/100], Loss: 0.0237
Epoch [80/100], Loss: 0.0233
Epoch [90/100], Loss: 0.0234
Epoch [100/100], Loss: 0.0232
MAE per output: [0.13412758 0.1385297  0.13788654 0.14045485 0.14759735 0.13571472
 0.03290575 0.03928731 0.04646209]
RMSE per output: [0.1672936  0.17404351 0.16892969 0.17406826 0.18916002 0.16915717
 0.05337999 0.0566868  0.0648716 ]
R2 per output: [-0.00375136  0.00385267 -0.00546433 -0.00600259 -0.00184933 -0.00382385
 -0.0096159  -0.00829628 -0.01181128]
Mean MAE: 0.1058850981680787
Mean RMSE: 0.13528784862395893
Mean R2: -0.005195806682625623


In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from peft import LoraConfig, get_peft_model, TaskType

# Load and preprocess the dataset
train = pd.read_excel("data/idiap/dataset.xlsx")
train = train.dropna()
train = train.reset_index(drop=True)
train = train.drop(columns=["Unnamed: 0"])

texts = train["final_text"]
target_vars_names = [
    "hones16",
    "emoti16",
    "extra16",
    "agree16",
    "consc16",
    "openn16",
    "icar_hat0",
    "icar_hat1",
    "icar_hat2",
]
target_vars = train[target_vars_names]
target_vars = (target_vars - target_vars.min()) / (target_vars.max() - target_vars.min())
target_vars = target_vars.reset_index(drop=True).to_numpy()

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, target_vars, test_size=0.2, random_state=42
)

# Debug train-test split
print(f"Train samples: {len(train_texts)}, Labels: {len(train_labels)}")
print(f"Validation samples: {len(val_texts)}, Labels: {len(val_labels)}")

# Load pretrained model and tokenizer
model_name = "bert-base-uncased"  # Replace with your emotion model if available
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,  # Feature extraction for regression
    inference_mode=False,
    r=8,  # Low-rank dimension
    lora_alpha=16,  # Scaling factor
    lora_dropout=0.1  # Dropout
)
lora_model = get_peft_model(base_model, lora_config)

# Define the regression model
class EmotionRegressor(nn.Module):
    def __init__(self, base_model, output_size):
        super(EmotionRegressor, self).__init__()
        self.base_model = base_model
        self.regression_head = nn.Linear(base_model.config.hidden_size, output_size)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        cls_output = outputs.last_hidden_state[:, 0, :]  # CLS token representation
        regression_output = self.regression_head(cls_output)
        return regression_output

# Initialize the model
output_size = target_vars.shape[1]
full_model = EmotionRegressor(base_model=lora_model, output_size=output_size)

# Dataset class
class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            str(self.texts.iloc[idx]),  # Ensure text is a string
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float32)
        }

# Create datasets
train_dataset = EmotionDataset(pd.Series(train_texts), train_labels, tokenizer)
val_dataset = EmotionDataset(pd.Series(val_texts), val_labels, tokenizer)

# Debug dataset
print("First training sample:", train_dataset[0])

# Data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False, drop_last=False)

# Debug DataLoader
for i, batch in enumerate(train_loader):
    print(f"Batch {i}: input_ids shape {batch['input_ids'].shape}, labels shape {batch['labels'].shape}")
    break

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
full_model.to(device)
criterion = nn.MSELoss()
optimizer = optim.AdamW(full_model.parameters(), lr=1e-3)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    # Training phase
    full_model.train()
    train_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = full_model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # Validation phase
    full_model.eval()
    val_loss = 0.0
    val_predictions = []
    val_targets = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = full_model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            val_predictions.append(outputs.cpu().numpy())
            val_targets.append(labels.cpu().numpy())
    
    # Calculate metrics
    val_predictions = np.concatenate(val_predictions, axis=0)
    val_targets = np.concatenate(val_targets, axis=0)
    mae = mean_absolute_error(val_targets, val_predictions)
    rmse = np.sqrt(mean_squared_error(val_targets, val_predictions))
    r2 = r2_score(val_targets, val_predictions)
    
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"  Train Loss: {train_loss / len(train_loader):.4f}")
    print(f"  Val Loss: {val_loss / len(val_loader):.4f}")
    print(f"  Val MAE: {mae:.4f}")
    print(f"  Val RMSE: {rmse:.4f}")
    print(f"  Val R2: {r2:.4f}")

# Save the model
torch.save(full_model.state_dict(), "emotion_regressor.pth")


Train samples: 1586, Labels: 1586
Validation samples: 397, Labels: 397
First training sample: {'input_ids': tensor([  101,  7592,  1012,  8529,  1045,  2228,  2026,  2783,  2597,  2004,
         2529,  4219,  2306,  2529,  4219,  1012,  1045,  2228,  1045,  2052,
         2022,  2200,  4591,  2000,  2022,  3755,  2000,  1996,  3208,  1997,
         1996, 17850,  2533,  1012,  1998,  1045,  2228,  2023,  2005,  2536,
         4436,  1012,  1045,  2903,  2008,  2026,  4105,  2911,  2026,  4105,
         4813,  2024,  2031,  5301, 24821,  2083,  2551,  2007,  2111,  2306,
         2023,  3105,  1012,  1045,  1005,  2310,  4342,  1037,  2843,  2055,
         1996, 16165,  1010,  2054,  1996,  5372,  8778,  2003,  2306,  1996,
        16165,  1998,  2129,  2000,  6133,  2008,  6464,  2061,  2008,  5126,
         2031,  1037,  2307,  3325,  2012,  2147,  2090,  2026,  3003,  5896,
        11137,  2026,  4105,  4813,  1012,  1998,  2036,  1045,  2903,  2008,
         2026,  3754,  2000,  6133