# Training RoBERTa for text-segment analysis

### 0. Initial setup

In [11]:
# Data Processing Libraries
import json
import glob
import pandas as pd
from transformers import RobertaTokenizer as BertTokenizer
from datasets import Dataset

# General Libraries
import random

# ML Libraries
import numpy as np
import torch

# Model
from transformers import RobertaForSequenceClassification as BertForSequenceClassification

# Training
from transformers import TrainingArguments, Trainer

# Evaluation
from transformers import EvalPrediction
from sklearn.metrics import accuracy_score

# My files
from src.data_mod import TeachData
from src.model import TeachModel

# One seed shared by every RNG the notebook relies on, so that a
# Restart Kernel -> Run All reproduces the same numbers.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# The classification heads are newly initialized when the models are created
# (see the from_pretrained warning further down), so torch has to be seeded
# before that cell runs.
torch.manual_seed(SEED)
DTYPE = np.float32
MODEL = "FacebookAI/roberta-base"

In [13]:
test_data = TeachData(ST=False, DH=False, DA_E=True)

# Preview only the first few utterances; displaying the whole list floods the
# notebook output.
# NOTE(review): assumes train_utterances is a list (or otherwise sliceable) — confirm.
test_data.train_utterances[:20]

['directions please <<RequestForInstruction>>',
 'today we need to make a plate of toast <<Instruction>>',
 'the bread is on the counter <<InformationOnObjectDetails>>',
 'by the lettuce <<InformationOnObjectDetails>>',
 'there should be a knife <<InformationOnObjectDetails>>',
 'in the drawer under the toaster <<InformationOnObjectDetails>>',
 'the toaster is by the fridge <<InformationOnObjectDetails>>',
 'open the drawer under the toaster <<Instruction,InformationOnObjectDetails>>',
 'the knife should be in there <<InformationOnObjectDetails>>',
 'do you see it? <<Confirm>>',
 'no knife <<Deny>>',
 'there should also be a knife in the fridge <<InformationOnObjectDetails>>',
 "let's check there <<InformationOther>>",
 'in the freezer <<InformationOnObjectDetails>>',
 'there are 3 loaves of bread <<InformationOnObjectDetails>>',
 'choose any and slice 1 please <<Instruction>>',
 'then take a slice of bread <<Instruction>>',
 'and toast it <<Instruction>>',
 'you will need to put the k

### 1. Compiling the Dataset
##### i. Separating the data

In [36]:
# Directory holding the parsed splits, relative to the notebook.
PARSED_DATA_DIR = "teach-dataset-parsed"

# One JSON file per split.
train_path = f"{PARSED_DATA_DIR}/train.json"
valid_seen_path = f"{PARSED_DATA_DIR}/valid_seen.json"
valid_unseen_path = f"{PARSED_DATA_DIR}/valid_unseen.json"

def split_utterances(filename) -> tuple[list, list, list, list]:
    """Load one parsed split and separate it into utterance-level and segment-level data.

    The file must contain a JSON list of events, each with the keys
    "utterance", "das" and "text_segments". An empty string in "das" marks a
    segment without a dialogue act.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A 4-tuple ``(utterances, utterance_labels, text_segments,
        text_segment_labels)``:

        * utterances: the "utterance" value of every event.
        * utterance_labels: per event, its non-empty dialogue acts
          (multi-label targets).
        * text_segments: every segment that has a non-empty dialogue act.
        * text_segment_labels: the dialogue act of each kept segment
          (single-label targets), aligned with ``text_segments``.
    """
    # Context manager so the file handle is closed as soon as parsing is done.
    with open(filename, "r") as events_file:
        data = json.load(events_file)

    utterances = []
    utterance_labels = []

    # Used for single label classification
    text_segments = []
    text_segment_labels = []

    for event in data:
        utterances.append(event["utterance"])
        utterance_labels.append(list(filter(bool, event["das"])))
        for text_segment, dialogue_act in zip(event["text_segments"], event["das"]):
            # Segments without a dialogue act carry no training signal.
            if dialogue_act:
                text_segments.append(text_segment)
                text_segment_labels.append(dialogue_act)

    return utterances, utterance_labels, text_segments, text_segment_labels

# Training split.
(
    train_utterances,
    train_utterance_labels,
    train_text_segments,
    train_text_segment_labels,
) = split_utterances(train_path)

# "Seen" validation split.
(
    valid_seen_utterances,
    valid_seen_utterance_labels,
    valid_seen_text_segments,
    valid_seen_text_segment_labels,
) = split_utterances(valid_seen_path)

# "Unseen" validation split.
(
    valid_unseen_utterances,
    valid_unseen_utterance_labels,
    valid_unseen_text_segments,
    valid_unseen_text_segment_labels,
) = split_utterances(valid_unseen_path)

##### i(a). Analyzing

In [53]:
# Note: the two "(%)" columns hold fractions in [0, 1], not values scaled to 100.
Columns = ["Dialogue Act", "Example", "Count", "Commander(%)", "Driver(%)"]
# Dialogue acts to report, in display order.
labels = [
    "Instruction",
    "RequestForInstruction",
    "RequestOtherInfo",
    "RequestMore",
    "InformationOnObjectDetails",
    "RequestForObjectLocationAndOtherDetails",
    "InformationOther",
    "AlternateQuestions",
    "Acknowledge",
    "Greetings/Salutations",
    "Confirm",
    "MiscOther",
    "Affirm",
    "Deny",
    "FeedbackPositive",
    "FeedbackNegative",
    "OtherInterfaceComment",
    "NotifyFailure"
]

# Dialogue act -> utterances in which the Driver / Commander used it
# (one entry per labelled segment).
driver_das = {}
commander_das = {}

# Read all three splits, closing each file as soon as it has been parsed.
events = []
for path in (train_path, valid_seen_path, valid_unseen_path):
    with open(path, "r") as split_file:
        events.extend(json.load(split_file))

for event in events:
    for da, _segment in zip(event["das"], event["text_segments"]):
        if not da:
            continue
        # Capitalize the first letter so the key matches the spelling in `labels`.
        dialogue_act = da[0].upper() + da[1:]
        driver_das.setdefault(dialogue_act, [])
        commander_das.setdefault(dialogue_act, [])
        if event["agent"] == "Driver":
            driver_das[dialogue_act].append(event["utterance"])
        else:
            commander_das[dialogue_act].append(event["utterance"])

data = []
for label in labels:
    # .get(): a listed act that never occurs in the data yields an empty row
    # instead of a KeyError.
    driver = driver_das.get(label, [])
    commander = commander_das.get(label, [])
    total = len(driver) + len(commander)
    if total == 0:
        data.append([label, None, 0, 0.0, 0.0])
        continue
    data.append([label,
                 driver[0] if driver else commander[0],
                 total,
                 len(commander) / total,
                 len(driver) / total])

df = pd.DataFrame(data, columns=Columns)
df

Unnamed: 0,Dialogue Act,Example,Count,Commander(%),Driver(%)
0,Instruction,To place the mug in,8350,0.993892,0.006108
1,RequestForInstruction,directions please,2986,0.006028,0.993972
2,RequestOtherInfo,Is that it next to the apple,439,0.006834,0.993166
3,RequestMore,The lettuce has been sliced. Anything else?,369,0.00271,0.99729
4,InformationOnObjectDetails,Remotecontrol is on chair,5077,0.993106,0.006894
5,RequestForObjectLocationAndOtherDetails,where are they,1488,0.00336,0.99664
6,InformationOther,I'll just use this one on the counter ),750,0.852,0.148
7,AlternateQuestions,"mug is clean, should i make coffee?",93,0.301075,0.698925
8,Acknowledge,thank you,5285,0.191675,0.808325
9,Greetings/Salutations,hello how can I help?,1873,0.432995,0.567005


##### ii. Making test and validation sets

In [37]:
def _split_in_half(seen: list, unseen: list) -> tuple[list, list]:
    """Return (first halves, second halves) of the seen and unseen lists.

    Each list is cut at ``len(...) // 2``; the first halves of both lists are
    concatenated (seen first), and likewise the second halves.
    """
    seen_mid = len(seen) // 2
    unseen_mid = len(unseen) // 2
    first_halves = seen[:seen_mid] + unseen[:unseen_mid]
    second_halves = seen[seen_mid:] + unseen[unseen_mid:]
    return first_halves, second_halves


# The first halves become the validation set, the second halves the test set.
valid_utterances, test_utterances = _split_in_half(
    valid_seen_utterances, valid_unseen_utterances)
valid_utterance_labels, test_utterance_labels = _split_in_half(
    valid_seen_utterance_labels, valid_unseen_utterance_labels)
valid_text_segments, test_text_segments = _split_in_half(
    valid_seen_text_segments, valid_unseen_text_segments)
valid_text_segment_labels, test_text_segment_labels = _split_in_half(
    valid_seen_text_segment_labels, valid_unseen_text_segment_labels)

##### iii. Tokenizing

In [38]:
# NOTE(review): BertTokenizer is an alias for RobertaTokenizer in the import
# cell — confirm that do_lower_case has any effect for this tokenizer.
tokenizer = BertTokenizer.from_pretrained(MODEL, do_lower_case=True)


def _encode(texts):
    """Tokenize a list of strings with truncation and padding enabled."""
    return tokenizer(texts, truncation=True, padding=True)


# Segment-level inputs (single-label task).
train_text_segment_encodings = _encode(train_text_segments)
valid_text_segment_encodings = _encode(valid_text_segments)
test_text_segment_encodings = _encode(test_text_segments)

# Utterance-level inputs (multi-label task).
train_utterance_encodings = _encode(train_utterances)
valid_utterance_encodings = _encode(valid_utterances)
test_utterance_encodings = _encode(test_utterances)

##### iv. Labeling

In [39]:
filenames = [train_path, valid_seen_path, valid_unseen_path]

# Collect every dialogue act that occurs in any split.
labels = set()
for filename in filenames:
    # Context manager so each file handle is closed once parsed.
    with open(filename, "r") as split_file:
        data = json.load(split_file)
    for event in data:
        labels.update(event["das"])
# "" marks a segment without a dialogue act; discard() does not raise if it
# happens to be absent.
labels.discard("")

# Sort so that the label -> index mapping is the same on every kernel start.
# Iteration order of a set of strings can change between interpreter runs
# (string hash randomization), which would silently mismatch the output
# columns of a model restored from a saved checkpoint.
labels = sorted(labels)

def remap_multilabels(label_list: list[str]) -> list[int]:
    """Encode a list of dialogue acts as a multi-hot vector over the global ``labels``.

    Position i is 1 when ``labels[i]`` occurs in ``label_list`` and 0 otherwise.
    """
    multi_hot = []
    for known_label in labels:
        multi_hot.append(1 if known_label in label_list else 0)
    return multi_hot

def remap_singlelabels(label: str) -> list[int]:
    """Encode one dialogue act as a one-hot vector over the global ``labels``.

    Position i is 1 when ``labels[i]`` equals ``label`` and 0 otherwise; a
    label that is not in ``labels`` yields an all-zero vector.
    """
    one_hot = []
    for known_label in labels:
        one_hot.append(1 if label == known_label else 0)
    return one_hot

# One-hot targets for the segment-level (single-label) task.
train_text_segment_labels_encoded = [remap_singlelabels(segment_label) for segment_label in train_text_segment_labels]
valid_text_segment_labels_encoded = [remap_singlelabels(segment_label) for segment_label in valid_text_segment_labels]
test_text_segment_labels_encoded = [remap_singlelabels(segment_label) for segment_label in test_text_segment_labels]

# Multi-hot targets for the utterance-level (multi-label) task.
train_utterance_labels_encoded = [remap_multilabels(act_list) for act_list in train_utterance_labels]
valid_utterance_labels_encoded = [remap_multilabels(act_list) for act_list in valid_utterance_labels]
test_utterance_labels_encoded = [remap_multilabels(act_list) for act_list in test_utterance_labels]

##### v. Dataset Creation

In [40]:
def get_dataset(encodings, labels) -> dict:
    """Bundle tokenizer output and encoded labels into a dict of tensors.

    Args:
        encodings: Mapping with "input_ids" and "attention_mask" entries.
        labels: Encoded label vectors, one per example.

    Returns:
        A dict with the keys "input_ids" and "attention_mask" (int32 tensors)
        and "labels" (float32 tensor), in that order.
    """
    dataset = {
        column: torch.tensor(encodings[column], dtype=torch.int32)
        for column in ("input_ids", "attention_mask")
    }
    dataset["labels"] = torch.tensor(labels, dtype=torch.float32)
    return dataset

# Segment-level datasets (one-hot labels), built in train / valid / test order.
train_text_segments_dataset, valid_text_segments_dataset, test_text_segments_dataset = (
    Dataset.from_dict(get_dataset(split_encodings, split_labels))
    for split_encodings, split_labels in (
        (train_text_segment_encodings, train_text_segment_labels_encoded),
        (valid_text_segment_encodings, valid_text_segment_labels_encoded),
        (test_text_segment_encodings, test_text_segment_labels_encoded),
    )
)

# Utterance-level datasets (multi-hot labels), built in train / valid / test order.
train_utterances_dataset, valid_utterances_dataset, test_utterances_dataset = (
    Dataset.from_dict(get_dataset(split_encodings, split_labels))
    for split_encodings, split_labels in (
        (train_utterance_encodings, train_utterance_labels_encoded),
        (valid_utterance_encodings, valid_utterance_labels_encoded),
        (test_utterance_encodings, test_utterance_labels_encoded),
    )
)

### 2. Model Setup
##### i. Model Initialization

In [41]:
# Settings shared by both classifiers: one output per known dialogue act,
# no attention maps or hidden states returned.
shared_model_kwargs = dict(
    num_labels=len(labels),
    output_attentions=False,
    output_hidden_states=False,
)

# Initializing the Single-Label Classification
single_model = BertForSequenceClassification.from_pretrained(
    MODEL,
    problem_type="single_label_classification",
    **shared_model_kwargs,
)
# Initializing the Multi-Label Classification Model
multi_model = BertForSequenceClassification.from_pretrained(
    MODEL,
    problem_type="multi_label_classification",
    **shared_model_kwargs,
)

# Pick the backend: Apple Metal (MPS) first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    torch_device = torch.device("mps")
    print("Using Metal Renderer")
elif torch.cuda.is_available():
    torch_device = torch.device("cuda")
    print("Using CUDA")
else:
    torch_device = torch.device("cpu")
    print("Using CPU")

# Both models live on the selected device.
single_model.to(torch_device)
multi_model.to(torch_device)

# Put both models in training mode.
single_model.train()
multi_model.train(); # ; To not print the model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using Metal Renderer


##### ii. Train Parameters

In [42]:
# Number of passes over the training data ("apparently recommended") and the
# per-device batch size used for both training and evaluation.
EPOCHS, BATCH_SIZE = 4, 16

# NOTE(review): this is batch size times epochs (64), not the number of
# optimizer steps (batches per epoch * epochs), and nothing in the visible
# cells reads it — confirm whether it should be fixed or removed.
total_steps = EPOCHS * BATCH_SIZE

### 3. Training

##### i. Multi-Label Classification

In [43]:
def checkpoint_step(path: str) -> int:
    """Return the numeric suffix of a "checkpoint-<step>" path, or -1 if it has none."""
    suffix = path.rsplit("-", 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# glob returns matches in arbitrary order, and a plain string sort would put
# "checkpoint-1000" before "checkpoint-999"; sort by step number so that
# checkpoints[-1] really is the most recent checkpoint.
checkpoints = sorted(glob.glob("results/checkpoint-*"), key=checkpoint_step)

# Resume from the latest saved checkpoint instead of the freshly initialized model.
if checkpoints:
    multi_model = BertForSequenceClassification.from_pretrained(checkpoints[-1])
    multi_model.to(torch_device)


# Evaluate and save a checkpoint at the end of every epoch; checkpoints are
# written under ./results, which is where the resume logic looks for them.
training_args = TrainingArguments(**{
    "output_dir": "./results",
    "num_train_epochs": EPOCHS,
    "per_device_train_batch_size": BATCH_SIZE,
    "per_device_eval_batch_size": BATCH_SIZE,
    "warmup_steps": 0,
    "weight_decay": 0.01,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
})

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Score raw multi-label model outputs against the reference labels.

    Args:
        predictions: Raw model outputs (logits); a sigmoid is applied here.
        labels: Reference label vectors, passed to ``accuracy_score`` as y_true.
        threshold: A label counts as predicted when its sigmoid probability
            is strictly greater than this value.

    Returns:
        ``{'accuracy': ...}`` as computed by ``accuracy_score``.
        NOTE(review): for 2-D label matrices this is presumably exact-match
        (subset) accuracy — confirm against the sklearn documentation.
    """
    probabilities = torch.sigmoid(torch.tensor(predictions))
    accuracy = accuracy_score(labels, probabilities > threshold)
    return { 'accuracy': accuracy }

def compute_metric(pred: EvalPrediction):
    """Adapter passed to the Trainer: unpack an EvalPrediction and score it.

    Forwards ``pred.predictions`` and ``pred.label_ids`` to
    ``multi_label_metrics`` with its default threshold.
    """
    model_outputs, reference_labels = pred.predictions, pred.label_ids
    return multi_label_metrics(model_outputs, reference_labels)

# Trainer for the multi-label model: trains on the utterance dataset and
# evaluates on the validation utterances.
trainer = Trainer(
    args=training_args,
    model=multi_model,
    compute_metrics=compute_metric,
    eval_dataset=valid_utterances_dataset,
    train_dataset=train_utterances_dataset,
)

# Fine-tune only when no saved checkpoint was found; otherwise the model
# restored from the checkpoint is used as-is.
if len(checkpoints) == 0:
    trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0551,0.053624,0.753299
2,0.0428,0.046412,0.771777
3,0.0346,0.04642,0.78599
4,0.0299,0.046819,0.792284


### 4. Evaluation

In [44]:
# Run the trained multi-label model over the held-out test utterances.
predictions = trainer.predict(test_utterances_dataset)

# Score the raw outputs against the encoded test labels and show the accuracy.
test_metrics = multi_label_metrics(predictions.predictions, test_utterance_labels_encoded)
test_metrics["accuracy"]

0.7990255785627284