In [1]:
import json
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import os
from util import get_dataframe_from_json
from sklearn.metrics import classification_report
from bert_util import bert_tokenize_data, tensor_train_test_split, train_bert_model, model_predict, get_data_loader, \
    train_pipeline, randomized_cv_search
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from bert_util import scoring_fn
os.environ["USE_TF"] = "0"

  from .autonotebook import tqdm as notebook_tqdm
2025-05-25 20:11:50.287062: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-25 20:11:50.294392: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748196710.303578   13688 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748196710.306315   13688 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748196710.313539   13688 computation_placer.cc:177] computation placer already r

# Normal Hand-Labeled Dataset

In [2]:
normal_train_df = pd.read_json('../normal_train_dataset.json', lines=False)
normal_test_df = pd.read_json('../normal_test_dataset.json', lines=False)
normal_train_df

Unnamed: 0,turn,utterance,emotion,act,hat
0,3,"I'll take one, too.",happiness,inform,0
1,8,"You know, we are superior to other clothes com...",no_emotion,inform,3
2,5,"Her new boyfriend, right?",no_emotion,commissive,1
3,9,How about recommending him to use the storage ...,no_emotion,directive,4
4,1,"Oh, a bouquet of flowers. It's very kind of you.",surprise,commissive,1
...,...,...,...,...,...
808,0,I prefer potatoes to eggplants.,no_emotion,inform,0
809,0,"Mr. Smith, I would like to get right to the po...",no_emotion,question,1
810,4,Yeah?,no_emotion,question,1
811,0,I am so bored all day.,no_emotion,inform,0


# Baseline BERT over the Normal Hand-Labeled Dataset

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
token_ids, attention_masks = bert_tokenize_data(tokenizer, normal_train_df['utterance'].values)
train_dataloader, val_dataloader = tensor_train_test_split(torch.tensor(normal_train_df['hat'].values), token_ids, attention_masks, test_size=0.1)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
optimizer = AdamW(model.parameters(), lr=2e-5)


configs = {
        "epochs": 10,
        "clip_grad_norm": 1.0,
        "early_stopping_patience": 3,
        "early_stopping_delta": 0.1,
    }

num_training_steps = 10 * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

model = train_bert_model(model, optimizer, scheduler, train_dataloader, val_dataloader, configs)

In [4]:
test_texts = normal_test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)
labels_flat = normal_test_df['hat'].values.flatten()
accuracy = np.sum(preds == labels_flat) / len(labels_flat)

In [5]:
hat_map = {
    0: "red",
    1: "white",
    2: "black",
    3: "yellow",
    4: "green",
}
preds_array = np.array(preds)
print(classification_report(labels_flat, preds_array, target_names=list(hat_map.values())))

              precision    recall  f1-score   support

         red       0.53      0.50      0.51        40
       white       0.75      0.75      0.75       110
       black       0.33      0.26      0.29        23
      yellow       0.48      0.63      0.55        19
       green       0.17      0.17      0.17        12

    accuracy                           0.60       204
   macro avg       0.45      0.46      0.45       204
weighted avg       0.60      0.60      0.60       204



# HyperParameter Tuning over the Normal Hand-Labeled Dataset

In [None]:
def build_model():
    return BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
best_model, best_config, best_score = randomized_cv_search(build_model, tokenizer, normal_train_df['utterance'], normal_train_df['hat'], num_folds=5, num_samples=20, use_lora=True)


In [5]:
tids, amids = bert_tokenize_data(tokenizer, pd.Series(normal_test_df['utterance'].values), max_length=best_config["tokenizer_max_length"])
val_dataloader = get_data_loader(tids, amids, batch_size=best_config["dataloader_batch_size"], shuffle=False)
print(f"Best config: {best_config}")
score = scoring_fn(best_model, val_dataloader, normal_test_df['hat'].values)
#Best config: {'epochs': 15, 'model_dropout': 0.3, 'optimizer_lr': 0.0001, 'scheduler_warmup_steps': 200, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.1, 'tokenizer_max_length': 128, 'dataloader_batch_size': 16, 'clip_grad_norm': 2.0, 'early_stopping_patience': 10, 'early_stopping_delta': 0.01, 'scheduler_type': 'constant', 'optimizer_type': 'AdamW', 'weight_decay': 0.0001}

Best config: {'epochs': 15, 'model_dropout': 0.3, 'optimizer_lr': 0.0001, 'scheduler_warmup_steps': 200, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.1, 'tokenizer_max_length': 128, 'dataloader_batch_size': 16, 'clip_grad_norm': 2.0, 'early_stopping_patience': 10, 'early_stopping_delta': 0.01, 'scheduler_type': 'constant', 'optimizer_type': 'AdamW', 'weight_decay': 0.0001}
              precision    recall  f1-score   support

         red       0.40      0.40      0.40        40
       white       0.69      0.80      0.74       110
       black       0.33      0.17      0.23        23
      yellow       0.29      0.26      0.28        19
       green       0.25      0.17      0.20        12

    accuracy                           0.56       204
   macro avg       0.39      0.36      0.37       204
weighted avg       0.53      0.56      0.54       204



# EDA Augmented Hand-Labeled Dataset

In [6]:
augmented_train_df = pd.read_json('../eda_train_dataset.json', lines=False)

In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
token_ids, attention_masks = bert_tokenize_data(tokenizer, augmented_train_df['utterance'].values)
train_dataloader, val_dataloader = tensor_train_test_split(torch.tensor(augmented_train_df['hat'].values), token_ids, attention_masks, test_size=0.1)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
optimizer = AdamW(model.parameters(), lr=2e-5)


configs = {
        "epochs": 10,
        "clip_grad_norm": 1.0,
        "early_stopping_patience": 3,
        "early_stopping_delta": 0.1,
    }

num_training_steps = 10 * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

model = train_bert_model(model, optimizer, scheduler, train_dataloader, val_dataloader, configs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



-------------------- Epoch 1 --------------------
Training:
---------
Start Time:       2025-05-25 19:31:31.164460
Average Training Loss: 0.8062743402999824
Time Taken:            0:00:34.284827

Validation:
-----------
Start Time:       2025-05-25 19:32:05.449674
Average Validation Loss:     1.163320651079746
Average Validation Accuracy: 0.5946048632218845
Time Taken:                  0:00:01.075927

-------------------- Epoch 2 --------------------
Training:
---------
Start Time:       2025-05-25 19:32:06.527131
Average Training Loss: 0.19978863342870867
Time Taken:            0:00:34.782225

Validation:
-----------
Start Time:       2025-05-25 19:32:41.309958
Average Validation Loss:     0.9578507408658241
Average Validation Accuracy: 0.7572188449848024
Time Taken:                  0:00:01.073582

-------------------- Epoch 3 --------------------
Training:
---------
Start Time:       2025-05-25 19:32:42.384912
Average Training Loss: 0.0871732396174846
Time Taken:            0:00:34

In [8]:
test_texts = normal_test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)
labels_flat = normal_test_df['hat'].values.flatten()
accuracy = np.sum(preds == labels_flat) / len(labels_flat)
hat_map = {
    0: "red",
    1: "white",
    2: "black",
    3: "yellow",
    4: "green",
}
preds_array = np.array(preds)
print(classification_report(labels_flat, preds_array, target_names=list(hat_map.values())))

              precision    recall  f1-score   support

         red       0.63      0.47      0.54        40
       white       0.66      0.89      0.76       110
       black       0.29      0.09      0.13        23
      yellow       0.78      0.37      0.50        19
       green       0.56      0.42      0.48        12

    accuracy                           0.64       204
   macro avg       0.58      0.45      0.48       204
weighted avg       0.62      0.64      0.60       204



### HyperParameter Tuning over the EDA Augmented Hand-Labeled Dataset

In [None]:
def build_model():
    return BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
best_model, best_config, best_score = randomized_cv_search(build_model, tokenizer, augmented_train_df['utterance'], augmented_train_df['hat'], num_folds=2, num_samples=10, use_lora=True)

In [11]:
tids, amids = bert_tokenize_data(tokenizer, pd.Series(normal_test_df['utterance'].values), max_length=best_config["tokenizer_max_length"])
val_dataloader = get_data_loader(tids, amids, batch_size=best_config["dataloader_batch_size"], shuffle=False)
print(f"Best config: {best_config}")
score = scoring_fn(best_model, val_dataloader, normal_test_df['hat'].values)

Best config: {'epochs': 15, 'model_dropout': 0.3, 'optimizer_lr': 0.0001, 'scheduler_warmup_steps': 200, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.1, 'tokenizer_max_length': 128, 'dataloader_batch_size': 16, 'clip_grad_norm': 2.0, 'early_stopping_patience': 10, 'early_stopping_delta': 0.01, 'scheduler_type': 'constant', 'optimizer_type': 'AdamW', 'weight_decay': 0.0001}
              precision    recall  f1-score   support

         red       0.55      0.30      0.39        40
       white       0.64      0.88      0.74       110
       black       0.40      0.26      0.32        23
      yellow       0.55      0.32      0.40        19
       green       0.50      0.17      0.25        12

    accuracy                           0.60       204
   macro avg       0.53      0.39      0.42       204
weighted avg       0.58      0.60      0.56       204



# Randomized Search for Hyperparameter Tuning usage example

# BERT over the Automated Labeled Dataset

In [13]:
ald_train_df = pd.read_json('../ald_train_dataset.json', lines=False)
ald_test_df = pd.read_json('../ald_test_dataset.json', lines=False)

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
token_ids, attention_masks = bert_tokenize_data(tokenizer, ald_train_df['utterance'].values)
train_dataloader, val_dataloader = tensor_train_test_split(torch.tensor(ald_train_df['hat'].values), token_ids, attention_masks, test_size=0.1)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
optimizer = AdamW(model.parameters(), lr=2e-5)


configs = {
        "epochs": 10,
        "clip_grad_norm": 1.0,
        "early_stopping_patience": 3,
        "early_stopping_delta": 0.1,
    }

num_training_steps = 10 * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

model = train_bert_model(model, optimizer, scheduler, train_dataloader, val_dataloader, configs)

In [15]:
from bert_util import scoring_fn
tids, amids = bert_tokenize_data(tokenizer, pd.Series(ald_test_df['utterance'].values), max_length=best_config["tokenizer_max_length"])
val_dataloader = get_data_loader(tids, amids, batch_size=best_config["dataloader_batch_size"], shuffle=False)

score = scoring_fn(best_model, val_dataloader, ald_test_df['hat'].values)

              precision    recall  f1-score   support

         red       0.55      0.30      0.39        40
       white       0.64      0.88      0.74       110
       black       0.40      0.26      0.32        23
      yellow       0.55      0.32      0.40        19
       green       0.50      0.17      0.25        12

    accuracy                           0.60       204
   macro avg       0.53      0.39      0.42       204
weighted avg       0.58      0.60      0.56       204



In [None]:
def build_model():
    return BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
best_model, best_config, best_score = randomized_cv_search(build_model, tokenizer, ald_train_df['utterance'], ald_train_df['hat'], num_folds=2, num_samples=10, use_lora=True)


In [18]:
from bert_util import scoring_fn
tids, amids = bert_tokenize_data(tokenizer, pd.Series(ald_test_df['utterance'].values), max_length=best_config["tokenizer_max_length"])
val_dataloader = get_data_loader(tids, amids, batch_size=best_config["dataloader_batch_size"], shuffle=False)
print(f"Best config: {best_config}")
score = scoring_fn(best_model, val_dataloader, ald_test_df['hat'].values)

Best config: {'epochs': 15, 'model_dropout': 0.3, 'optimizer_lr': 0.0001, 'scheduler_warmup_steps': 200, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.1, 'tokenizer_max_length': 128, 'dataloader_batch_size': 16, 'clip_grad_norm': 2.0, 'early_stopping_patience': 10, 'early_stopping_delta': 0.01, 'scheduler_type': 'constant', 'optimizer_type': 'AdamW', 'weight_decay': 0.0001}
              precision    recall  f1-score   support

         red       0.40      0.40      0.40        40
       white       0.69      0.80      0.74       110
       black       0.33      0.17      0.23        23
      yellow       0.29      0.26      0.28        19
       green       0.25      0.17      0.20        12

    accuracy                           0.56       204
   macro avg       0.39      0.36      0.37       204
weighted avg       0.53      0.56      0.54       204

