In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from torch.optim import AdamW
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
)
from transformers import get_linear_schedule_with_warmup

from bert_util import bert_tokenize_data, tensor_train_test_split, train_bert_model, model_predict, get_data_loader

  from .autonotebook import tqdm as notebook_tqdm
2025-05-28 16:49:27.947049: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-28 16:49:28.208330: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748443768.309007   27370 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748443768.332787   27370 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748443768.546260   27370 computation_placer.cc:177] computation placer already r

# Normal Hand-Labeled Dataset

In [2]:
# Read the hand-labeled train and test splits (train first, then test).
normal_train_df, normal_test_df = (
    pd.read_json(json_path, lines=False)
    for json_path in ('../normal_train_dataset.json', '../normal_test_dataset.json')
)
# Bare last expression: render the training frame as the cell output.
normal_train_df

Unnamed: 0,turn,utterance,emotion,act,hat
0,3,"I'll take one, too.",happiness,inform,0
1,8,"You know, we are superior to other clothes com...",no_emotion,inform,3
2,5,"Her new boyfriend, right?",no_emotion,commissive,1
3,9,How about recommending him to use the storage ...,no_emotion,directive,4
4,1,"Oh, a bouquet of flowers. It's very kind of you.",surprise,commissive,1
...,...,...,...,...,...
808,0,I prefer potatoes to eggplants.,no_emotion,inform,0
809,0,"Mr. Smith, I would like to get right to the po...",no_emotion,question,1
810,4,Yeah?,no_emotion,question,1
811,0,I am so bored all day.,no_emotion,inform,0


# Baseline RoBERTa over the Normal Hand-Labeled Dataset

In [3]:
# Fine-tune a fresh roberta-base sequence classifier on the hand-labeled training split.

# Seed torch and NumPy so the newly initialised classification head is
# repeatable across kernel restarts. Any RNG-driven split/shuffle inside the
# bert_util helpers is only covered if they draw from these global
# generators — TODO confirm.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

configs = {
    "epochs": 10,
    "clip_grad_norm": 1.0,
    "early_stopping_patience": 3,
    "early_stopping_delta": 0.1,
}

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
# NOTE(review): the evaluation cells tokenize with max_length=64, whereas this
# call relies on bert_tokenize_data's default — confirm the two agree.
token_ids, attention_masks = bert_tokenize_data(tokenizer, normal_train_df['utterance'].values)
train_dataloader, val_dataloader = tensor_train_test_split(
    torch.tensor(normal_train_df['hat'].values),
    token_ids,
    attention_masks,
    test_size=0.1,
)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Derive the schedule length from the configured epoch count instead of a
# second hard-coded 10, so the two values cannot drift apart.
num_training_steps = configs["epochs"] * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

model = train_bert_model(model, optimizer, scheduler, train_dataloader, val_dataloader, configs)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



-------------------- Epoch 1 --------------------
Training:
---------
Start Time:       2025-05-28 16:49:43.278552
Average Training Loss: 1.3443113597838774
Time Taken:            0:00:07.915859

Validation:
-----------
Start Time:       2025-05-28 16:49:51.194784
Average Validation Loss:     1.3561261729760603
Average Validation Accuracy: 0.45454545454545453
Time Taken:                  0:00:00.235475

-------------------- Epoch 2 --------------------
Training:
---------
Start Time:       2025-05-28 16:49:51.431286
Average Training Loss: 1.1360726962270944
Time Taken:            0:00:07.568970

Validation:
-----------
Start Time:       2025-05-28 16:49:59.000608
Average Validation Loss:     1.4277930476448752
Average Validation Accuracy: 0.45454545454545453
Time Taken:                  0:00:00.231653
No improvement for 1 epoch(s).

-------------------- Epoch 3 --------------------
Training:
---------
Start Time:       2025-05-28 16:49:59.232652
Average Training Loss: 0.94333565672454

In [6]:
def cohen_kappa(y_true, y_pred, weights='quadratic'):
    """Return Cohen's kappa between reference labels and model predictions.

    Thin wrapper around ``sklearn.metrics.cohen_kappa_score`` (imported in the
    notebook's top import cell).

    Args:
        y_true: Reference labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        weights: Weighting scheme forwarded unchanged to
            ``cohen_kappa_score``. Defaults to ``'quadratic'``, the value this
            notebook previously hard-coded, so existing calls are unaffected.

    Returns:
        Whatever ``cohen_kappa_score`` returns for the given inputs.

    NOTE(review): quadratic weighting penalises a disagreement by the squared
    distance between label ids, which suits ordinal labels. The hat ids in this
    notebook map to colours (red/white/black/yellow/green) and look nominal —
    consider ``weights=None`` when reporting; confirm against the labelling
    scheme.
    """
    return cohen_kappa_score(y_true, y_pred, weights=weights)



In [4]:
# Predict hat labels for the hand-labeled test split with the current `model`.
test_texts = normal_test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
# presumably shuffle=False keeps predictions in the same order as the label rows — verify in bert_util
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)
labels_flat = normal_test_df['hat'].values.flatten()

# Fail loudly on a length mismatch instead of silently reporting a bogus score.
assert len(preds) == len(labels_flat), "model_predict must return one prediction per test row"
# Compare as arrays explicitly rather than relying on reflected list/ndarray equality.
accuracy = np.mean(np.asarray(preds) == labels_flat)
# Bare last expression so the computed accuracy is actually shown.
accuracy

In [5]:
# Label id -> hat colour, in the order used for the report rows.
hat_map = {
    0: "red",
    1: "white",
    2: "black",
    3: "yellow",
    4: "green",
}
preds_array = np.array(preds)
# Pass `labels` explicitly so every id is paired with its own name even when a
# class is absent from both the gold labels and the predictions (without it,
# classification_report raises on a labels/target_names length mismatch).
print(classification_report(
    labels_flat,
    preds_array,
    labels=list(hat_map.keys()),
    target_names=list(hat_map.values()),
))

              precision    recall  f1-score   support

         red       0.46      0.57      0.51        40
       white       0.81      0.65      0.72       110
       black       0.33      0.57      0.42        23
      yellow       0.57      0.42      0.48        19
       green       0.23      0.25      0.24        12

    accuracy                           0.58       204
   macro avg       0.48      0.49      0.47       204
weighted avg       0.63      0.58      0.59       204



In [7]:
# Kappa agreement between the gold hat labels of the hand-labeled test split
# and the current predictions; the bare expression is the cell output.
cohen_kappa(
    y_true=normal_test_df['hat'].values,
    y_pred=preds,
)

np.float64(0.3573451545376186)

# Baseline RoBERTa over the EDA-Augmented Hand-Labeled Dataset

In [6]:
# Training split from the EDA dataset file (whole-document JSON, not JSON Lines).
augmented_train_df = pd.read_json(
    path_or_buf='../eda_train_dataset.json',
    lines=False,
)

In [None]:
# Fine-tune a fresh roberta-base sequence classifier on the EDA-augmented training split.

# Seed torch and NumPy so the newly initialised classification head is
# repeatable across kernel restarts. Any RNG-driven split/shuffle inside the
# bert_util helpers is only covered if they draw from these global
# generators — TODO confirm.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

configs = {
    "epochs": 10,
    "clip_grad_norm": 1.0,
    "early_stopping_patience": 3,
    "early_stopping_delta": 0.1,
}

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
# NOTE(review): the evaluation cells tokenize with max_length=64, whereas this
# call relies on bert_tokenize_data's default — confirm the two agree.
token_ids, attention_masks = bert_tokenize_data(tokenizer, augmented_train_df['utterance'].values)
train_dataloader, val_dataloader = tensor_train_test_split(
    torch.tensor(augmented_train_df['hat'].values),
    token_ids,
    attention_masks,
    test_size=0.1,
)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Derive the schedule length from the configured epoch count instead of a
# second hard-coded 10, so the two values cannot drift apart.
num_training_steps = configs["epochs"] * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

model = train_bert_model(model, optimizer, scheduler, train_dataloader, val_dataloader, configs)

In [8]:
# Evaluate the current `model` (from the preceding training cell) on the
# hand-labeled test split.
test_texts = normal_test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
# presumably shuffle=False keeps predictions in the same order as the label rows — verify in bert_util
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)
labels_flat = normal_test_df['hat'].values.flatten()

preds_array = np.array(preds)
# Fail loudly on a length mismatch instead of silently reporting a bogus score.
assert len(preds_array) == len(labels_flat), "model_predict must return one prediction per test row"
accuracy = np.mean(preds_array == labels_flat)

# Label id -> hat colour, in the order used for the report rows.
hat_map = {
    0: "red",
    1: "white",
    2: "black",
    3: "yellow",
    4: "green",
}
# Pass `labels` explicitly so every id is paired with its own name even when a
# class is absent from both the gold labels and the predictions.
print(classification_report(
    labels_flat,
    preds_array,
    labels=list(hat_map.keys()),
    target_names=list(hat_map.values()),
))

              precision    recall  f1-score   support

         red       0.49      0.47      0.48        40
       white       0.70      0.78      0.74       110
       black       0.50      0.26      0.34        23
      yellow       0.53      0.53      0.53        19
       green       0.25      0.25      0.25        12

    accuracy                           0.61       204
   macro avg       0.49      0.46      0.47       204
weighted avg       0.60      0.61      0.60       204



# Baseline RoBERTa over the Automated Labeled Dataset (ALD)

In [15]:
# Read the ALD train and test splits (train first, then test).
ald_train_df, ald_test_df = (
    pd.read_json(json_path, lines=False)
    for json_path in ('../ald_train_dataset.json', '../ald_test_dataset.json')
)

In [None]:
# Fine-tune a fresh roberta-base sequence classifier on the ALD training split.

# Seed torch and NumPy so the newly initialised classification head is
# repeatable across kernel restarts. Any RNG-driven split/shuffle inside the
# bert_util helpers is only covered if they draw from these global
# generators — TODO confirm.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

configs = {
    "epochs": 10,
    "clip_grad_norm": 1.0,
    "early_stopping_patience": 3,
    "early_stopping_delta": 0.1,
}

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
# NOTE(review): the evaluation cells tokenize with max_length=64, whereas this
# call relies on bert_tokenize_data's default — confirm the two agree.
token_ids, attention_masks = bert_tokenize_data(tokenizer, ald_train_df['utterance'].values)
train_dataloader, val_dataloader = tensor_train_test_split(
    torch.tensor(ald_train_df['hat'].values),
    token_ids,
    attention_masks,
    test_size=0.1,
)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Derive the schedule length from the configured epoch count instead of a
# second hard-coded 10, so the two values cannot drift apart.
num_training_steps = configs["epochs"] * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

model = train_bert_model(model, optimizer, scheduler, train_dataloader, val_dataloader, configs)

In [17]:
# Evaluate the current `model` (from the preceding training cell) on the ALD
# test split.
test_texts = ald_test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
# presumably shuffle=False keeps predictions in the same order as the label rows — verify in bert_util
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)
labels_flat = ald_test_df['hat'].values.flatten()

preds_array = np.array(preds)
# Fail loudly on a length mismatch instead of silently reporting a bogus score.
assert len(preds_array) == len(labels_flat), "model_predict must return one prediction per test row"
accuracy = np.mean(preds_array == labels_flat)

# Label id -> hat colour, in the order used for the report rows.
hat_map = {
    0: "red",
    1: "white",
    2: "black",
    3: "yellow",
    4: "green",
}
# Pass `labels` explicitly so every id is paired with its own name even when a
# class is absent from both the gold labels and the predictions.
print(classification_report(
    labels_flat,
    preds_array,
    labels=list(hat_map.keys()),
    target_names=list(hat_map.values()),
))

              precision    recall  f1-score   support

         red       0.75      0.77      0.76       200
       white       0.42      0.39      0.41       200
       black       0.51      0.59      0.55       200
      yellow       0.50      0.67      0.57       200
       green       0.65      0.37      0.47       200

    accuracy                           0.56      1000
   macro avg       0.57      0.56      0.55      1000
weighted avg       0.57      0.56      0.55      1000



# Testing the ALD-trained model on the normal hand-labeled test dataset

In [18]:
# Evaluate the current `model` (from the most recent training cell) on the
# hand-labeled test split.
test_texts = normal_test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
# presumably shuffle=False keeps predictions in the same order as the label rows — verify in bert_util
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)
labels_flat = normal_test_df['hat'].values.flatten()

preds_array = np.array(preds)
# Fail loudly on a length mismatch instead of silently reporting a bogus score.
assert len(preds_array) == len(labels_flat), "model_predict must return one prediction per test row"
accuracy = np.mean(preds_array == labels_flat)

# Label id -> hat colour, in the order used for the report rows.
hat_map = {
    0: "red",
    1: "white",
    2: "black",
    3: "yellow",
    4: "green",
}
# Pass `labels` explicitly so every id is paired with its own name even when a
# class is absent from both the gold labels and the predictions.
print(classification_report(
    labels_flat,
    preds_array,
    labels=list(hat_map.keys()),
    target_names=list(hat_map.values()),
))

              precision    recall  f1-score   support

         red       0.32      0.30      0.31        40
       white       0.69      0.47      0.56       110
       black       0.19      0.22      0.20        23
      yellow       0.21      0.58      0.31        19
       green       0.25      0.25      0.25        12

    accuracy                           0.41       204
   macro avg       0.33      0.36      0.33       204
weighted avg       0.49      0.41      0.43       204

