In [20]:
import json

import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import os

import sys
sys.path.append(os.path.abspath("../.."))  # vai alla cartella superiore

from bert_util import bert_tokenize_data, tensor_train_test_split, train_bert_model, model_predict, get_data_loader, \
    calculate_accuracy
from util import get_dataframe_from_json
from sklearn.metrics import classification_report

os.environ["USE_TF"] = "0"

In [21]:
dataset = get_dataframe_from_json('../../dailydialog/hand_labeled/hand_labelled_dataset.json')

In [12]:
dataset_balanced = pd.read_csv('../../dailydialog/hand_labeled/balanced_dataset_100_each.csv')

In [22]:
hld = dataset

In [23]:
hld['utterance'] = hld['utterance'].str.replace('__eou__', '', regex=False)

In [24]:
hat_map = {
    0: "red",
    1: "white",
    2: "black",
    3: "yellow",
    4: "green",
}

reverse_hat_map = {v: k for k, v in hat_map.items()}
hld['hat'] = hld['hat'].apply(lambda x: reverse_hat_map[x])

In [16]:
# import train test split
from sklearn.model_selection import train_test_split
# split the data into train and test
train_df, test_df = train_test_split(hld, test_size=0.2, random_state=42, stratify=hld['hat'])

In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [18]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
token_ids, attention_masks = bert_tokenize_data(tokenizer, train_df['utterance'].values)
train_dataloader, val_dataloader = tensor_train_test_split(torch.tensor(train_df['hat'].values), token_ids, attention_masks, test_size=0.1)

In [19]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base', num_labels=5)
epochs = 20
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()
num_training_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

model = train_bert_model(model, optimizer, scheduler, train_dataloader, val_dataloader, epochs)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


-------------------- Epoch 1 --------------------

Training:
---------
Start Time:       2025-05-21 12:30:59.531596
Average Loss:     1.6016576726262162
Time Taken:       0:00:03.571303

Validation:
---------
Start Time:       2025-05-21 12:31:03.103550
Average Loss:     1.5944644451141357
Average Accuracy: 0.18
Time Taken:       0:00:00.133373

-------------------- Epoch 2 --------------------

Training:
---------
Start Time:       2025-05-21 12:31:03.237491
Average Loss:     1.4554031156912082
Time Taken:       0:00:03.190001

Validation:
---------
Start Time:       2025-05-21 12:31:06.428301
Average Loss:     1.334792685508728
Average Accuracy: 0.32999999999999996
Time Taken:       0:00:00.137863

-------------------- Epoch 3 --------------------

Training:
---------
Start Time:       2025-05-21 12:31:06.566776
Average Loss:     1.1189992442363645
Time Taken:       0:00:03.179498

Validation:
---------
Start Time:       2025-05-21 12:31:09.746901
Average Loss:     1.2864388465881347

In [8]:
test_texts = test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)
labels_flat = test_df['hat'].values.flatten()
accuracy = np.sum(preds == labels_flat) / len(labels_flat)
accuracy

np.float64(0.5441176470588235)

In [9]:
from sklearn.metrics import classification_report

preds_array = np.array(preds)
print(classification_report(labels_flat, preds_array, target_names=list(hat_map.values())))

              precision    recall  f1-score   support

         red       0.45      0.33      0.38        40
       white       0.65      0.74      0.69       110
       black       0.31      0.22      0.26        23
      yellow       0.50      0.47      0.49        19
       green       0.19      0.25      0.21        12

    accuracy                           0.54       204
   macro avg       0.42      0.40      0.40       204
weighted avg       0.53      0.54      0.53       204



In [10]:
import nltk

# Scarica WordNet e i dati multilingue (se serve)
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/atlas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/atlas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [11]:
from util import eda_augment_dataset

train_df, test_df = train_test_split(hld, test_size=0.2, random_state=42, stratify=hld['hat'])

green_augmented = eda_augment_dataset(train_df[train_df['hat'] == 4], num_aug=10, alpha_sr=0.05, alpha_ri=0.05, alpha_rs=0.05, p_rd=0.1)
yellow_augmented = eda_augment_dataset(train_df[train_df['hat'] == 3], num_aug=10, alpha_sr=0.05, alpha_ri=0.05, alpha_rs=0.05, p_rd=0.1)
black_augmented = eda_augment_dataset(train_df[train_df['hat'] == 2], num_aug=10, alpha_sr=0.05, alpha_ri=0.05, alpha_rs=0.05, p_rd=0.1)

augmented_train_df = pd.concat([train_df, green_augmented, yellow_augmented, black_augmented], ignore_index=True)
augmented_train_df

Unnamed: 0,turn,utterance,emotion,act,hat
0,3,"I'll take one , too .",happiness,inform,0
1,8,"You know , we are superior to other clothes co...",no_emotion,inform,3
2,5,"Her new boyfriend , right ?",no_emotion,commissive,1
3,9,How about recommending him to use the storage ...,no_emotion,directive,4
4,1,"Oh , a bouquet of flowers . It's very kind of ...",surprise,commissive,1
...,...,...,...,...,...
2938,2,problem seems to be the what,no_emotion,question,2
2939,2,what seems to be the problem,no_emotion,question,2
2940,2,be seems to what the problem,no_emotion,question,2
2941,2,look what seems to be the problem,no_emotion,question,2


In [12]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
token_ids, attention_masks = bert_tokenize_data(tokenizer, augmented_train_df['utterance'].values)
train_dataloader, val_dataloader = tensor_train_test_split(torch.tensor(augmented_train_df['hat'].values), token_ids, attention_masks, test_size=0.1)

In [13]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base', num_labels=5)
epochs = 10
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()
num_training_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

model = train_bert_model(model, optimizer, scheduler, train_dataloader, val_dataloader, epochs)
test_texts = test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)
labels_flat = test_df['hat'].values.flatten()

preds_array = np.array(preds)
print(classification_report(labels_flat, preds_array, target_names=list(hat_map.values())))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


-------------------- Epoch 1 --------------------

Training:
---------
Start Time:       2025-05-21 11:39:38.248794
Average Loss:     0.7542178766559978
Time Taken:       0:00:26.368342

Validation:
---------
Start Time:       2025-05-21 11:40:04.617810
Average Loss:     0.8680468499660492
Average Accuracy: 0.7601351351351351
Time Taken:       0:00:00.764201

-------------------- Epoch 2 --------------------

Training:
---------
Start Time:       2025-05-21 11:40:05.382658
Average Loss:     0.2911073781797533
Time Taken:       0:00:29.856861

Validation:
---------
Start Time:       2025-05-21 11:40:35.240188
Average Loss:     0.7945205372430988
Average Accuracy: 0.8161196911196912
Time Taken:       0:00:00.739565

-------------------- Epoch 3 --------------------

Training:
---------
Start Time:       2025-05-21 11:40:35.980411
Average Loss:     0.19424756761750042
Time Taken:       0:00:29.326564

Validation:
---------
Start Time:       2025-05-21 11:41:05.307753
Average Loss:     0.6

In [14]:
test_texts = test_df['utterance'].values
serie = pd.Series(test_texts)
tids, amids = bert_tokenize_data(tokenizer, serie, max_length=64)
dl = get_data_loader(tids, amids, batch_size=5, shuffle=False)
preds, confidences = model_predict(model, dl)
labels_flat = test_df['hat'].values.flatten()
accuracy = np.sum(preds == labels_flat) / len(labels_flat)
accuracy

np.float64(0.5980392156862745)

In [15]:
from sklearn.metrics import classification_report

preds_array = np.array(preds)
print(classification_report(labels_flat, preds_array, target_names=list(hat_map.values())))

              precision    recall  f1-score   support

         red       0.45      0.53      0.48        40
       white       0.70      0.80      0.75       110
       black       0.31      0.17      0.22        23
      yellow       0.58      0.37      0.45        19
       green       0.29      0.17      0.21        12

    accuracy                           0.60       204
   macro avg       0.47      0.41      0.42       204
weighted avg       0.57      0.60      0.58       204

