In [1]:
import os

# USE_TF has to be in the environment *before* `transformers` is imported:
# the library reads it at import time, so setting it afterwards (as this cell
# used to do on its last line) has no effect on backend selection.
os.environ["USE_TF"] = "0"

import json
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from util import get_dataframe_from_json
from sklearn.metrics import classification_report
from bert_util import bert_tokenize_data, tensor_train_test_split, train_bert_model, model_predict, get_data_loader, \
    train_pipeline, randomized_cv_search
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from bert_util import scoring_fn

2025-05-26 16:16:02,433	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-05-26 16:16:02,669	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


# Normal Hand-Labeled Dataset

In [2]:
# Load the hand-labeled train and test splits (regular JSON, not JSON-lines),
# then display the training frame as the cell output.
normal_train_df, normal_test_df = (
    pd.read_json(path, lines=False)
    for path in ('../normal_train_dataset.json', '../normal_test_dataset.json')
)
normal_train_df

Unnamed: 0,turn,utterance,emotion,act,hat
0,3,"I'll take one, too.",happiness,inform,0
1,8,"You know, we are superior to other clothes com...",no_emotion,inform,3
2,5,"Her new boyfriend, right?",no_emotion,commissive,1
3,9,How about recommending him to use the storage ...,no_emotion,directive,4
4,1,"Oh, a bouquet of flowers. It's very kind of you.",surprise,commissive,1
...,...,...,...,...,...
808,0,I prefer potatoes to eggplants.,no_emotion,inform,0
809,0,"Mr. Smith, I would like to get right to the po...",no_emotion,question,1
810,4,Yeah?,no_emotion,question,1
811,0,I am so bored all day.,no_emotion,inform,0


# Baseline BERT over the Normal Hand-Labeled Dataset

In [None]:
# Fine-tune a baseline `bert-base-uncased` classifier with a 5-label head on
# the hand-labeled training utterances.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
token_ids, attention_masks = bert_tokenize_data(tokenizer, normal_train_df['utterance'].values)
# test_size=0.1 is forwarded to the project helper — presumably a 10% validation
# hold-out; confirm against bert_util.tensor_train_test_split.
train_dataloader, val_dataloader = tensor_train_test_split(
    torch.tensor(normal_train_df['hat'].values), token_ids, attention_masks, test_size=0.1
)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training-loop settings passed to train_bert_model.
configs = {
    "epochs": 10,
    "clip_grad_norm": 1.0,
    "early_stopping_patience": 3,
    "early_stopping_delta": 0.1,
}

# The linear schedule decays the learning rate to zero over
# `num_training_steps`. Derive that length from configs["epochs"] instead of a
# separate literal so the schedule cannot drift out of sync with the epoch
# count when the config is edited.
num_training_steps = configs["epochs"] * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

model = train_bert_model(model, optimizer, scheduler, train_dataloader, val_dataloader, configs)

In [4]:
# Ground-truth hat labels of the hand-labeled test split, as a 1-D array.
labels_flat = normal_test_df['hat'].values.flatten()

# Tokenize the test utterances and wrap them in an unshuffled loader so the
# prediction order stays aligned with `labels_flat`.
test_texts = normal_test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
dl = get_data_loader(tids, amids, shuffle=False, batch_size=5)

# Predict with the fine-tuned baseline and compute plain accuracy.
preds, confidences = model_predict(model, dl)
accuracy = np.sum(preds == labels_flat) / labels_flat.shape[0]

In [5]:
# Integer class id -> hat colour name used in the report.
hat_map = {
    0: "red",
    1: "white",
    2: "black",
    3: "yellow",
    4: "green",
}
preds_array = np.array(preds)
# Pass `labels` explicitly so each name in `target_names` is tied to its class
# id. Without it, sklearn infers the label set from the data and raises when a
# class is missing from both arrays (count no longer matches `target_names`).
print(classification_report(
    labels_flat,
    preds_array,
    labels=list(hat_map.keys()),
    target_names=list(hat_map.values()),
))

              precision    recall  f1-score   support

         red       0.53      0.50      0.51        40
       white       0.75      0.75      0.75       110
       black       0.33      0.26      0.29        23
      yellow       0.48      0.63      0.55        19
       green       0.17      0.17      0.17        12

    accuracy                           0.60       204
   macro avg       0.45      0.46      0.45       204
weighted avg       0.60      0.60      0.60       204



# Hyperparameter Tuning over the Normal Hand-Labeled Dataset

In [None]:
def build_model():
    """Return a freshly loaded pretrained `bert-base-uncased` classifier with a 5-label head."""
    return BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Hyperparameter search over the hand-labeled training set. The meaning of
# num_folds / num_samples / use_lora is defined by the project helper —
# presumably 5 CV folds, 20 sampled configurations, LoRA adapters enabled;
# confirm against bert_util.randomized_cv_search.
best_model, best_config, best_score = randomized_cv_search(
    build_model,
    tokenizer,
    normal_train_df['utterance'],
    normal_train_df['hat'],
    num_folds=5,
    num_samples=20,
    use_lora=True,
)


In [8]:
# Tokenize the hand-labeled test utterances with the sequence length chosen by
# the search, and batch them with the selected batch size.
tids, amids = bert_tokenize_data(
    tokenizer,
    pd.Series(normal_test_df['utterance'].values),
    max_length=best_config["tokenizer_max_length"],
)
# NOTE: despite its name, this loader holds the *test* split.
val_dataloader = get_data_loader(
    tids,
    amids,
    batch_size=best_config["dataloader_batch_size"],
    shuffle=False,
)
print(f"Best config: {best_config}")
score = scoring_fn(best_model, val_dataloader, normal_test_df['hat'].values)
# Best config recorded from an earlier run of the search (kept for reference; a re-run may select a different one):
# {'epochs': 15, 'model_dropout': 0.3, 'optimizer_lr': 0.0001, 'scheduler_warmup_steps': 200, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.1, 'tokenizer_max_length': 128, 'dataloader_batch_size': 16, 'clip_grad_norm': 2.0, 'early_stopping_patience': 10, 'early_stopping_delta': 0.01, 'scheduler_type': 'constant', 'optimizer_type': 'AdamW', 'weight_decay': 0.0001}

Best config: {'epochs': 30, 'model_dropout': 0.1, 'optimizer_lr': 0.001, 'scheduler_warmup_steps': 0, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.05, 'tokenizer_max_length': 128, 'dataloader_batch_size': 32, 'clip_grad_norm': 1.0, 'early_stopping_patience': 10, 'early_stopping_delta': 0.01, 'scheduler_type': 'linear', 'optimizer_type': 'AdamW', 'weight_decay': 0.0001}
              precision    recall  f1-score   support

         red       0.57      0.42      0.49        40
       white       0.70      0.82      0.76       110
       black       0.67      0.09      0.15        23
      yellow       0.28      0.53      0.36        19
       green       0.29      0.17      0.21        12

    accuracy                           0.59       204
   macro avg       0.50      0.40      0.39       204
weighted avg       0.61      0.59      0.57       204



# EDA Augmented Hand-Labeled Dataset

In [4]:
# EDA-augmented version of the training data (regular JSON, not JSON-lines).
augmented_train_df = pd.read_json(
    '../eda_train_dataset.json',
    lines=False,
)

In [None]:
# Fine-tune a baseline `bert-base-uncased` classifier with a 5-label head on
# the EDA-augmented training utterances.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
token_ids, attention_masks = bert_tokenize_data(tokenizer, augmented_train_df['utterance'].values)
# test_size=0.1 is forwarded to the project helper — presumably a 10% validation
# hold-out; confirm against bert_util.tensor_train_test_split.
train_dataloader, val_dataloader = tensor_train_test_split(
    torch.tensor(augmented_train_df['hat'].values), token_ids, attention_masks, test_size=0.1
)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training-loop settings passed to train_bert_model.
configs = {
    "epochs": 10,
    "clip_grad_norm": 1.0,
    "early_stopping_patience": 3,
    "early_stopping_delta": 0.1,
}

# The linear schedule decays the learning rate to zero over
# `num_training_steps`. Derive that length from configs["epochs"] instead of a
# separate literal so the schedule cannot drift out of sync with the epoch
# count when the config is edited.
num_training_steps = configs["epochs"] * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

model = train_bert_model(model, optimizer, scheduler, train_dataloader, val_dataloader, configs)

In [8]:
# Evaluate the model trained on the EDA-augmented data against the
# hand-labeled test split.
test_texts = normal_test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
# shuffle=False keeps predictions aligned with the label order below.
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)

labels_flat = normal_test_df['hat'].values.flatten()
# Convert once, up front, so accuracy and the report use the same array.
preds_array = np.array(preds).flatten()
# Fail loudly on a length mismatch rather than letting the comparison broadcast
# or collapse into a silently wrong accuracy.
assert len(preds_array) == len(labels_flat), "prediction/label count mismatch"
accuracy = np.sum(preds_array == labels_flat) / len(labels_flat)

# Integer class id -> hat colour name used in the report.
hat_map = {
    0: "red",
    1: "white",
    2: "black",
    3: "yellow",
    4: "green",
}
# Pass `labels` explicitly so each name in `target_names` is tied to its class
# id. Without it, sklearn infers the label set from the data and raises when a
# class is missing from both arrays.
print(classification_report(
    labels_flat,
    preds_array,
    labels=list(hat_map.keys()),
    target_names=list(hat_map.values()),
))

              precision    recall  f1-score   support

         red       0.63      0.47      0.54        40
       white       0.66      0.89      0.76       110
       black       0.29      0.09      0.13        23
      yellow       0.78      0.37      0.50        19
       green       0.56      0.42      0.48        12

    accuracy                           0.64       204
   macro avg       0.58      0.45      0.48       204
weighted avg       0.62      0.64      0.60       204



### Hyperparameter Tuning over the EDA Augmented Hand-Labeled Dataset

In [None]:
def build_model():
    """Return a freshly loaded pretrained `bert-base-uncased` classifier with a 5-label head."""
    return BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Hyperparameter search over the EDA-augmented training set. The meaning of
# num_folds / num_samples / use_lora is defined by the project helper —
# presumably 5 CV folds, 20 sampled configurations, LoRA adapters enabled;
# confirm against bert_util.randomized_cv_search.
best_model, best_config, best_score = randomized_cv_search(
    build_model,
    tokenizer,
    augmented_train_df['utterance'],
    augmented_train_df['hat'],
    num_folds=5,
    num_samples=20,
    use_lora=True,
)

In [11]:
# Tokenize the hand-labeled test utterances with the sequence length chosen by
# the search, and batch them with the selected batch size.
tids, amids = bert_tokenize_data(
    tokenizer,
    pd.Series(normal_test_df['utterance'].values),
    max_length=best_config["tokenizer_max_length"],
)
# NOTE: despite its name, this loader holds the *test* split.
val_dataloader = get_data_loader(
    tids,
    amids,
    batch_size=best_config["dataloader_batch_size"],
    shuffle=False,
)
print(f"Best config: {best_config}")
score = scoring_fn(best_model, val_dataloader, normal_test_df['hat'].values)
# Best config recorded for reference:
# {'epochs': 15, 'model_dropout': 0.3, 'optimizer_lr': 0.0001, 'scheduler_warmup_steps': 200, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.1, 'tokenizer_max_length': 128, 'dataloader_batch_size': 16, 'clip_grad_norm': 2.0, 'early_stopping_patience': 10, 'early_stopping_delta': 0.01, 'scheduler_type': 'constant', 'optimizer_type': 'AdamW', 'weight_decay': 0.0001}

Best config: {'epochs': 15, 'model_dropout': 0.3, 'optimizer_lr': 0.0001, 'scheduler_warmup_steps': 200, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.1, 'tokenizer_max_length': 128, 'dataloader_batch_size': 16, 'clip_grad_norm': 2.0, 'early_stopping_patience': 10, 'early_stopping_delta': 0.01, 'scheduler_type': 'constant', 'optimizer_type': 'AdamW', 'weight_decay': 0.0001}
              precision    recall  f1-score   support

         red       0.55      0.30      0.39        40
       white       0.64      0.88      0.74       110
       black       0.40      0.26      0.32        23
      yellow       0.55      0.32      0.40        19
       green       0.50      0.17      0.25        12

    accuracy                           0.60       204
   macro avg       0.53      0.39      0.42       204
weighted avg       0.58      0.60      0.56       204



# Randomized Search for Hyperparameter Tuning usage example

# BERT over the Automated Labeled Dataset

In [7]:
# Load the automatically labeled (ALD) train and test splits
# (regular JSON, not JSON-lines).
ald_train_df, ald_test_df = (
    pd.read_json(path, lines=False)
    for path in ('../ald_train_dataset.json', '../ald_test_dataset.json')
)

In [None]:
# Fine-tune a baseline `bert-base-uncased` classifier with a 5-label head on
# the automatically labeled (ALD) training utterances.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
token_ids, attention_masks = bert_tokenize_data(tokenizer, ald_train_df['utterance'].values)
# test_size=0.1 is forwarded to the project helper — presumably a 10% validation
# hold-out; confirm against bert_util.tensor_train_test_split.
train_dataloader, val_dataloader = tensor_train_test_split(
    torch.tensor(ald_train_df['hat'].values), token_ids, attention_masks, test_size=0.1
)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training-loop settings passed to train_bert_model.
configs = {
    "epochs": 10,
    "clip_grad_norm": 1.0,
    "early_stopping_patience": 3,
    "early_stopping_delta": 0.1,
}

# The linear schedule decays the learning rate to zero over
# `num_training_steps`. Derive that length from configs["epochs"] instead of a
# separate literal so the schedule cannot drift out of sync with the epoch
# count when the config is edited.
num_training_steps = configs["epochs"] * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

model = train_bert_model(model, optimizer, scheduler, train_dataloader, val_dataloader, configs)

In [10]:
# Evaluate the ALD-trained baseline against the ALD test split.
test_texts = ald_test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
# shuffle=False keeps predictions aligned with the label order below.
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)

labels_flat = ald_test_df['hat'].values.flatten()
# Convert once, up front, so accuracy and the report use the same array.
preds_array = np.array(preds).flatten()
# Fail loudly on a length mismatch rather than letting the comparison broadcast
# or collapse into a silently wrong accuracy.
assert len(preds_array) == len(labels_flat), "prediction/label count mismatch"
accuracy = np.sum(preds_array == labels_flat) / len(labels_flat)

# Integer class id -> hat colour name used in the report.
hat_map = {
    0: "red",
    1: "white",
    2: "black",
    3: "yellow",
    4: "green",
}
# Pass `labels` explicitly so each name in `target_names` is tied to its class
# id. Without it, sklearn infers the label set from the data and raises when a
# class is missing from both arrays.
print(classification_report(
    labels_flat,
    preds_array,
    labels=list(hat_map.keys()),
    target_names=list(hat_map.values()),
))

              precision    recall  f1-score   support

         red       0.78      0.73      0.75       200
       white       0.45      0.33      0.38       200
       black       0.60      0.48      0.54       200
      yellow       0.51      0.70      0.59       200
       green       0.59      0.66      0.62       200

    accuracy                           0.58      1000
   macro avg       0.58      0.58      0.58      1000
weighted avg       0.58      0.58      0.58      1000



In [None]:
def build_model():
    """Return a freshly loaded pretrained `bert-base-uncased` classifier with a 5-label head."""
    return BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Hyperparameter search over the ALD training set. Note num_folds=2 here,
# unlike the 5 used for the hand-labeled searches. The meaning of
# num_folds / num_samples / use_lora is defined by the project helper —
# confirm against bert_util.randomized_cv_search.
best_model, best_config, best_score = randomized_cv_search(
    build_model,
    tokenizer,
    ald_train_df['utterance'],
    ald_train_df['hat'],
    num_folds=2,
    num_samples=20,
    use_lora=True,
)


In [18]:
# `scoring_fn` is already imported in the notebook's first cell; the import is
# repeated here so this cell also runs on its own.
from bert_util import scoring_fn

# NOTE(review): the output saved under this cell reports support=204, which
# matches the hand-labeled test set, not the 1000-row ALD test set scored by
# this code — the saved output looks stale; re-run the cell to confirm.
# Tokenize the ALD test utterances with the sequence length chosen by the
# search, and batch them with the selected batch size.
tids, amids = bert_tokenize_data(
    tokenizer,
    pd.Series(ald_test_df['utterance'].values),
    max_length=best_config["tokenizer_max_length"],
)
# NOTE: despite its name, this loader holds the *test* split.
val_dataloader = get_data_loader(
    tids,
    amids,
    batch_size=best_config["dataloader_batch_size"],
    shuffle=False,
)
print(f"Best config: {best_config}")
score = scoring_fn(best_model, val_dataloader, ald_test_df['hat'].values)

Best config: {'epochs': 15, 'model_dropout': 0.3, 'optimizer_lr': 0.0001, 'scheduler_warmup_steps': 200, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.1, 'tokenizer_max_length': 128, 'dataloader_batch_size': 16, 'clip_grad_norm': 2.0, 'early_stopping_patience': 10, 'early_stopping_delta': 0.01, 'scheduler_type': 'constant', 'optimizer_type': 'AdamW', 'weight_decay': 0.0001}
              precision    recall  f1-score   support

         red       0.40      0.40      0.40        40
       white       0.69      0.80      0.74       110
       black       0.33      0.17      0.23        23
      yellow       0.29      0.26      0.28        19
       green       0.25      0.17      0.20        12

    accuracy                           0.56       204
   macro avg       0.39      0.36      0.37       204
weighted avg       0.53      0.56      0.54       204



# Testing the ALD-trained baseline BERT on the Normal Hand-Labeled test set

In [12]:
# Evaluate whatever the kernel currently has bound to `model` against the
# hand-labeled test split.
# NOTE(review): in notebook order `model` is the baseline trained on the ALD
# data; this depends on kernel state, so run the ALD training cell first.
test_texts = normal_test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
# shuffle=False keeps predictions aligned with the label order below.
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)

labels_flat = normal_test_df['hat'].values.flatten()
# Convert once, up front, so accuracy and the report use the same array.
preds_array = np.array(preds).flatten()
# Fail loudly on a length mismatch rather than letting the comparison broadcast
# or collapse into a silently wrong accuracy.
assert len(preds_array) == len(labels_flat), "prediction/label count mismatch"
accuracy = np.sum(preds_array == labels_flat) / len(labels_flat)

# Integer class id -> hat colour name used in the report.
hat_map = {
    0: "red",
    1: "white",
    2: "black",
    3: "yellow",
    4: "green",
}
# Pass `labels` explicitly so each name in `target_names` is tied to its class
# id. Without it, sklearn infers the label set from the data and raises when a
# class is missing from both arrays.
print(classification_report(
    labels_flat,
    preds_array,
    labels=list(hat_map.keys()),
    target_names=list(hat_map.values()),
))

              precision    recall  f1-score   support

         red       0.36      0.30      0.33        40
       white       0.65      0.45      0.53       110
       black       0.15      0.13      0.14        23
      yellow       0.22      0.63      0.32        19
       green       0.11      0.17      0.13        12

    accuracy                           0.39       204
   macro avg       0.30      0.34      0.29       204
weighted avg       0.46      0.39      0.41       204

