- We lose the structure of the dialogue when using the tokenizer in preprocess_data

- Emotions encoded using MultiLabelBinarizer no longer tell us
  how many times a single emotion is present in the dialogue
  or where it occurs

In [4]:
#!pip install torch==1.13.0+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
!pip install transformers==4.30.0
!pip install datasets==2.13.2
!pip install accelerate -U
!pip install evaluate



In [5]:
# system packages
from pathlib import Path
import shutil
import urllib
import tarfile
import sys
import os
# data and numerical management packages
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
# useful during debugging (progress bars)
from tqdm import tqdm
from transformers import set_seed

# Single seed shared by every random number generator seeded in this notebook.
seed = 852

# Seed Python's `random`, NumPy's global generator and the transformers helper,
# in that order.
for apply_seed in (random.seed, np.random.seed, set_seed):
    apply_seed(seed)

In [6]:
# NOTE: despite its name, `dataset_folder` is the path of the JSON file itself,
# resolved against the current working directory.
dataset_folder = Path.cwd() / "MELD_train_efr.json"
df = pd.read_json(dataset_folder)

In [7]:
# Show the loaded frame; pandas elides the middle rows in the rendered output.
df

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[surprise, sadness, surprise, fear]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"
...,...,...,...,...,...
3995,utterance_3995,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3996,utterance_3996,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3997,utterance_3997,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3998,utterance_3998,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
def _fill_missing_triggers(trigger_values):
    """Return a copy of one dialogue's trigger list with missing entries set to 0.0.

    Args:
        trigger_values: Iterable of per-utterance trigger flags; an entry may be
            ``None`` or NaN when no annotation is present.

    Returns:
        A new list of the same length where every missing entry is ``0.0`` and
        every other entry is left unchanged.
    """
    return [0.0 if pd.isna(value) else value for value in trigger_values]


# Build fresh lists instead of mutating the stored ones through label-based
# double indexing: this does not depend on the frame's index labels and the
# cell gives the same result when it is re-run.
triggers = df['triggers'].map(_fill_missing_triggers)

df['triggers'] = triggers

In [9]:
# One row per utterance-level label, then the distinct labels.
emotion_per_utterance = df['emotions'].explode()
emotions = emotion_per_utterance.unique()
emotions

array(['neutral', 'surprise', 'fear', 'sadness', 'joy', 'disgust',
       'anger'], dtype=object)

In [10]:
# Distinct trigger values across all utterances. This rebinds `triggers`, which
# was previously bound to the per-dialogue trigger Series.
trigger_per_utterance = df['triggers'].explode()
triggers = trigger_per_utterance.unique()
triggers

array([0.0, 1.0], dtype=object)

In [11]:
dialogues = df['utterances']
# Number of utterances in every dialogue, in row order.
dialogue_lengths = [len(dialogue) for dialogue in dialogues]
# Longest dialogue and the position of its first occurrence (0, 0 when there are none).
max_len_dialogue = max(dialogue_lengths, default=0)
index = dialogue_lengths.index(max_len_dialogue) if dialogue_lengths else 0
max_len_dialogue, index

(24, 219)

In [12]:
from sklearn.preprocessing import LabelBinarizer
# LabelBinarizer orders its classes itself; sorting here simply makes the
# column order of the one-hot vectors explicit.
sorted_emotions = sorted(emotions)
label_binarizer = LabelBinarizer()
label_binarizer.fit(sorted_emotions)

dialogues = df['emotions']
# Encode each dialogue with a single transform() call instead of one call per
# utterance. The result is the same nested list (one one-hot vector per
# utterance). Empty dialogues are mapped to [] directly because transform()
# rejects empty input.
one_hot_emotions = [
    label_binarizer.transform(list(dialogue_emotion)).tolist() if len(dialogue_emotion) else []
    for dialogue_emotion in dialogues
]

In [13]:
# Replace the string labels with their one-hot encodings.
# NOTE: this overwrites the column in place, so re-running the earlier cell that
# derives `emotions` from df['emotions'] after this point no longer sees label strings.
df['emotions'] = one_hot_emotions

In [14]:
from sklearn.model_selection import train_test_split
# Order-preserving (shuffle=False) split: 80% of the rows go to training and the
# remaining 20% is halved into validation and test.
# NOTE(review): the resulting frames presumably keep the row labels of `df`
# (no index reset is done here) — keep that in mind when indexing them by position.
train_data, temp_data = train_test_split(df, train_size=0.8, shuffle=False)
val_data, test_data = train_test_split(temp_data, test_size=0.5, shuffle=False)

In [15]:
from transformers import Trainer, TrainingArguments

class CustomDataCollator:
    """Collate per-dialogue example dicts into a dict of tensors.

    Every example is expected to be a mapping with the keys ``input_ids``,
    ``attention_mask``, ``emotion_labels`` and ``trigger_label``, each holding
    a tensor.

    Args:
        tokenizer: Tokenizer kept on the instance; it is not used while collating.
        counter: Unused; accepted only so existing call sites keep working.
        max_length: Maximum token length kept on the instance (defaults to 10).
    """

    _KEYS = ('input_ids', 'attention_mask', 'emotion_labels', 'trigger_label')

    def __init__(self, tokenizer, counter=0, max_length=10):
        self.tokenizer = tokenizer
        # Used to be read from an undefined name, which raised NameError on construction.
        self.max_length = max_length

    def __call__(self, examples, index=None):
        """Return the collated tensors for ``examples``.

        Args:
            examples: Sequence of example dicts (see the class docstring).
            index: When given, only ``examples[index]`` is returned (squeezed)
                instead of a stacked batch.

        Returns:
            Dict with the four keys listed in the class docstring. Without
            ``index`` each value is ``torch.stack`` of the squeezed per-example
            tensors, so all examples must then share the same shapes.
        """
        if index is not None:
            chosen = examples[index]
            return {key: chosen[key].squeeze() for key in self._KEYS}

        return {
            key: torch.stack([example[key].squeeze() for example in examples])
            for key in self._KEYS
        }

In [36]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW, Trainer, TrainingArguments

class CustomDataset(Dataset):
    """Dataset yielding one tokenized dialogue (all of its utterances) per item.

    Args:
        dialogues: Sequence (list or pandas Series) of dialogues, each a list of
            utterance strings.
        emotions: Sequence of per-dialogue emotion encodings, one entry per utterance.
        triggers: Sequence of per-dialogue trigger flags, one entry per utterance.
        tokenizer: Callable tokenizer invoked once per utterance with
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``.
        max_length: Length every utterance is padded/truncated to.
    """

    def __init__(self, dialogues, emotions, triggers, tokenizer, max_length=10):
        # Materialise as plain lists so __getitem__ indexes by position.
        # Indexing a pandas Series with an int is label-based, which raised
        # KeyError for slices whose labels do not start at 0 (e.g. a validation split).
        self.dialogues = list(dialogues)
        self.emotions = list(emotions)
        self.triggers = list(triggers)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of dialogues."""
        return len(self.dialogues)

    def __getitem__(self, idx):
        """Tokenize the dialogue at position ``idx``.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (one row per
            utterance), ``emotion_labels`` (float32) and ``trigger_label`` (long).
        """
        dialogue = self.dialogues[idx]
        emotion = self.emotions[idx]
        trigger = self.triggers[idx]

        input_ids_list = []
        attention_mask_list = []

        for utterance in dialogue:
            tokenized_utterance = self.tokenizer(
                utterance,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
            )
            # The tokenizer returns a leading batch axis; extend() adds its rows one by one.
            input_ids_list.extend(tokenized_utterance['input_ids'])
            attention_mask_list.extend(tokenized_utterance['attention_mask'])

        return {
            'input_ids': torch.stack(input_ids_list),
            'attention_mask': torch.stack(attention_mask_list),
            'emotion_labels': torch.tensor(emotion, dtype=torch.float32),
            'trigger_label': torch.tensor(trigger, dtype=torch.long),
        }

In [37]:
class CustomBERTModel(torch.nn.Module):
    """BERT encoder followed by two linear heads: one for emotions, one for triggers.

    The width of each head is taken from the module-level ``emotions`` and
    ``triggers`` collections at construction time.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        hidden_size = self.bert.config.hidden_size
        self.emotion_head = torch.nn.Linear(hidden_size, len(emotions))
        self.trigger_head = torch.nn.Linear(hidden_size, len(triggers))

    def forward(self, input_ids, attention_mask):
        """Encode the inputs with BERT and apply both heads to the pooled output.

        Args:
            input_ids: Token ids passed to BERT as ``input_ids``.
            attention_mask: Mask passed to BERT as ``attention_mask``.

        Returns:
            Tuple ``(emotion_logits, trigger_logits)``.
        """
        encoder_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        pooled = encoder_output['pooler_output']
        return self.emotion_head(pooled), self.trigger_head(pooled)

In [38]:
# Build the tokenizer, the datasets, the model and its optimizer from the
# train/validation frames created by the split above.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# NOTE(review): `test_dataset` is built from `val_data`, not `test_data` — confirm this is intended.
train_dataset = CustomDataset(train_data['utterances'], train_data['emotions'], train_data['triggers'], tokenizer)
test_dataset = CustomDataset(val_data['utterances'], val_data['emotions'], val_data['triggers'], tokenizer)

custom_Bert_Model = CustomBERTModel()
# `AdamW` here is the name imported from `transformers`.
optimizer = AdamW(custom_Bert_Model.parameters(), lr=5e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [39]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    """Macro-F1 between arg-max predictions and arg-max labels, returned as a tensor.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of tensors; both are reduced
            to class indices with ``argmax`` over dim 1.

    Returns:
        A float32 scalar tensor holding the macro-averaged F1 score. It is a
        newly created tensor with ``requires_grad=True``; it is not connected
        to ``predictions``, so calling ``backward()`` on it does not reach the
        model that produced them.
    """
    logits, one_hot_targets = eval_pred

    # Class index per row for the reference labels and for the predictions.
    target_ids = torch.argmax(one_hot_targets, dim=1).cpu().numpy()
    predicted_ids = torch.argmax(logits, dim=1).cpu().numpy()

    # f1_score takes the reference labels first, then the predictions.
    macro_f1 = f1_score(target_ids, predicted_ids, average='macro')

    return torch.tensor(macro_f1, dtype=torch.float32, requires_grad=True)

In [40]:
num_epochs = 1
batch_size = 1

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

# Differentiable objectives. The loop used to back-propagate the macro-F1
# returned by compute_metrics, which is a freshly built tensor with no graph
# back to the model, so no parameter ever received a gradient.
emotion_criterion = torch.nn.CrossEntropyLoss()
trigger_criterion = torch.nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    custom_Bert_Model.train()
    total_loss = 0.0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}', leave=False):
        # Fold the batch axis into the utterance axis so BERT receives
        # (utterances, tokens). Unlike a bare squeeze(), this keeps the
        # utterance axis when a dialogue has a single utterance.
        input_ids = batch['input_ids'].flatten(0, 1)
        attention_mask = batch['attention_mask'].flatten(0, 1)
        emotion_labels = batch['emotion_labels'].flatten(0, 1)
        trigger_label = batch['trigger_label'].flatten(0, 1)
        optimizer.zero_grad()

        emotion_logits, trigger_logits = custom_Bert_Model(input_ids, attention_mask)

        # One-hot emotion rows -> class indices, as expected by CrossEntropyLoss.
        emotion_loss = emotion_criterion(emotion_logits, emotion_labels.argmax(dim=1))
        # Trigger labels are already class indices (long).
        trigger_loss = trigger_criterion(trigger_logits, trigger_label)

        # Train both heads jointly with an unweighted sum of the two losses.
        loss = emotion_loss + trigger_loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Guard against an empty loader so the summary line never divides by zero.
    average_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1}, Average Loss: {average_loss}")




KeyboardInterrupt: 

## Translate the training loop to the Hugging Face `Trainer`

In [None]:
from transformers import Trainer, TrainingArguments

class CustomDataCollator:
    """Collate per-dialogue example dicts into a dict of tensors.

    Every example is expected to be a mapping with the keys ``input_ids``,
    ``attention_mask``, ``emotion_labels`` and ``trigger_label``, each holding
    a tensor.

    Args:
        tokenizer: Tokenizer kept on the instance; it is not used while collating.
        dataset: Dataset kept on the instance; it is not used while collating.
        max_length: Maximum token length kept on the instance.
        index: Optional index kept on the instance (defaults to ``None`` so the
            collator can be built without one).
    """

    _KEYS = ('input_ids', 'attention_mask', 'emotion_labels', 'trigger_label')

    def __init__(self, tokenizer, dataset, max_length, index=None):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # A stray trailing comma used to wrap the dataset in a 1-tuple here.
        self.dataset = dataset
        self.index = index

    def __call__(self, examples, index=None):
        """Return the collated tensors for ``examples``.

        Args:
            examples: Sequence of example dicts (see the class docstring).
            index: When given, only ``examples[index]`` is returned (squeezed)
                instead of a stacked batch.

        Returns:
            Dict with the four keys listed in the class docstring. Without
            ``index`` each value is ``torch.stack`` of the squeezed per-example
            tensors, so all examples must then share the same shapes.
        """
        if index is not None:
            chosen = examples[index]
            return {key: chosen[key].squeeze() for key in self._KEYS}

        return {
            key: torch.stack([example[key].squeeze() for example in examples])
            for key in self._KEYS
        }

In [None]:
class _MultiHeadTrainer(Trainer):
    """Trainer that computes the joint emotion + trigger loss for the custom model.

    The model's ``forward`` only accepts ``input_ids`` and ``attention_mask``
    and returns two logit tensors, so the default loss computation (which
    forwards every batch key to the model and reads a loss from its output)
    cannot be used.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the summed cross-entropy of both heads (and the logits on request)."""
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        emotion_labels = inputs['emotion_labels']

        # Fold every leading axis into one so that each row is a single utterance.
        input_ids = input_ids.reshape(-1, input_ids.shape[-1])
        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1])
        # One-hot emotion rows -> class indices; trigger labels are already indices.
        emotion_targets = emotion_labels.reshape(-1, emotion_labels.shape[-1]).argmax(dim=-1)
        trigger_targets = inputs['trigger_label'].reshape(-1)

        emotion_logits, trigger_logits = model(input_ids, attention_mask)
        loss = (
            torch.nn.functional.cross_entropy(emotion_logits, emotion_targets)
            + torch.nn.functional.cross_entropy(trigger_logits, trigger_targets)
        )

        if return_outputs:
            return loss, {'emotion_logits': emotion_logits, 'trigger_logits': trigger_logits}
        return loss


training_args = TrainingArguments(
    output_dir="./custom_bert_trainer_output",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    save_steps=10,  # adjust as needed
    save_total_limit=2,
    logging_steps=10,  # adjust as needed
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# `dataset` was an undefined name here; the collator is given the training set.
data_collator = CustomDataCollator(tokenizer, train_dataset, max_length=10, index=None)

# Trainer has no `compute_loss` constructor argument; the loss lives in the subclass above.
trainer = _MultiHeadTrainer(
    model=custom_Bert_Model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# A single call runs all `num_train_epochs` epochs over the whole dataset.
# The first positional argument of train() is a checkpoint to resume from,
# not a sample index, so it must not be called once per example.
trainer.train()