<a href="https://colab.research.google.com/github/DanieleBaiocco/NLPProject/blob/dev_2/nlpproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import pandas as pd
import os
from sklearn.preprocessing import LabelBinarizer
import numpy as np
from collections import defaultdict
from transformers import BertTokenizer, BertModel
from pathlib import Path
import pickle
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Optimizer
from tqdm import tqdm
import torch.optim as optim
from sklearn.metrics import f1_score

In [2]:
# Hugging Face model identifier used for the tokenizer and the encoder.
model_card = 'bert-base-uncased'

# Google Drive mount point and the dataset file read from it.
mount_directory = "/content/drive"
dataset_path = os.path.join(mount_directory, 'MyDrive/MELD/MELD_train_efr.json')

# Mount Drive so that `dataset_path` becomes readable.
drive.mount(mount_directory)

Mounted at /content/drive


In [3]:
def fit_onehotencoder(data: pd.core.series.Series) -> LabelBinarizer:
  """Fit a LabelBinarizer on all labels contained in a Series of label sequences.

  Args:
    data: Series whose values are array-like label sequences; they are
      concatenated into one flat array before fitting.

  Returns:
    The fitted LabelBinarizer.
  """
  all_labels = np.concatenate(data.values)
  # `fit` returns the estimator itself, so it can be returned directly.
  return LabelBinarizer().fit(all_labels)

In [4]:
# NOTE (author's remark, translated from Italian): this is wrong — there must be a
# token_type_ids entry for each token, and it must refer to a particular speaker.
def map_to_order_of_occurrence(data: list) -> list:
    """Replace every element by the 1-based rank of its first appearance.

    Args:
        data: sequence of hashable items (e.g. speaker names).

    Returns:
        A list of the same length where equal items share the same integer and
        integers are handed out in order of first occurrence, starting at 1.
    """
    first_seen_rank = {}
    ranks = []
    for element in data:
        if element not in first_seen_rank:
            # A new element gets the next free rank.
            first_seen_rank[element] = len(first_seen_rank) + 1
        ranks.append(first_seen_rank[element])
    return ranks

In [5]:
def replace_none_with_zero(data: list) -> list:
    """Return a copy of `data` in which every None entry is replaced by 0."""
    cleaned = []
    for value in data:
        cleaned.append(value if value is not None else 0)
    return cleaned

In [6]:
def tokenize_dialogue(dialogue_text: list, tokenizer, max_length = None) -> list:
    """Tokenize a whole dialogue into one flat token list.

    The result has the layout ``[CLS] utt_1 [SEP] utt_2 [SEP] ...``: the
    tokenizer's CLS token first, then every utterance's tokens followed by the
    tokenizer's SEP token.

    Args:
        dialogue_text: list of utterance strings.
        tokenizer: object exposing ``tokenize(str)``, ``cls_token`` and ``sep_token``.
        max_length: optional cap on the number of returned tokens. Truncation is
            a plain cut, so trailing utterances (and their SEP tokens) may be lost.

    Returns:
        The flat list of tokens, truncated to `max_length` when it is given.
    """
    flattened_tokens = [tokenizer.cls_token]
    for utterance in dialogue_text:
        flattened_tokens.extend(tokenizer.tokenize(utterance))
        flattened_tokens.append(tokenizer.sep_token)

    # Identity comparison is the correct test for "argument not supplied";
    # slicing is a no-op when the list is already short enough.
    if max_length is not None:
        flattened_tokens = flattened_tokens[:max_length]
    return flattened_tokens

In [7]:
def add_padding(x: list, max_x_length: int, pad_value, pad_length = None) -> list:
    """Pad `x` at the end until it has `max_x_length` entries.

    Args:
        x: sequence to pad. A list of scalars when `pad_length` is None, otherwise
            a 2-D array-like whose rows have `pad_length` columns.
        max_x_length: target number of entries (rows).
        pad_value: value used for padding.
        pad_length: when given, every padding entry is a row of `pad_length`
            copies of `pad_value` and the result is a NumPy array.

    Returns:
        A new list (`pad_length` is None) or a NumPy array (`pad_length` given).
        Inputs that are already `max_x_length` long or longer are returned
        unpadded and are never truncated.
    """
    # Clamp at zero so both branches treat an over-long input the same way;
    # np.tile rejects a negative repetition count.
    num_pad_values = max(max_x_length - len(x), 0)
    if pad_length is None:
        return x + [pad_value] * num_pad_values

    rows = np.asarray(x)
    if rows.size == 0:
        # An empty input has no column dimension yet; give it one so that it
        # can be concatenated with the padding rows.
        rows = rows.reshape(0, pad_length)
    pad_rows = np.tile([pad_value] * pad_length, (num_pad_values, 1))
    return np.concatenate((rows, pad_rows), axis=0)

In [8]:
# BERT tokenizer for the `model_card` checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    model_card,
    do_lower_case=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
# Raw dataset as read from the JSON file.
df = pd.read_json(dataset_path)

# One-hot encoder fitted on every emotion label found in the dataset.
emotions_onehotencoder = fit_onehotencoder(df['emotions'])

# Preprocessed frame. 'dialogues_ids' and 'attention_masks' are declared here but
# are not filled in this cell.
column_names = ['emotions', 'speakers', 'triggers', 'dialogues', 'dialogues_ids', 'attention_masks']
df_new = pd.DataFrame(columns = column_names)
df_new['speakers'] = df['speakers'].apply(map_to_order_of_occurrence)
df_new['emotions'] = df['emotions'].apply(emotions_onehotencoder.transform)
df_new['triggers'] = df['triggers'].apply(replace_none_with_zero)
df_new['dialogues'] = df['utterances'].apply(tokenize_dialogue, args=(tokenizer,))

In [10]:
# Width of a one-hot emotion vector, read from the first dialogue's encoded
# emotions (one row per utterance, one column per class).
num_emotions = df_new['emotions'][0].shape[1]
# Largest number of utterances found in any dialogue.
max_num_utterances = df['utterances'].map(len).max()

In [11]:
def save_dataframe(df):
    """Pickle `df` to ``<cwd>/dataframes/df_MELD_efr.pkl``.

    The ``dataframes`` folder is created under the current working directory
    when it does not exist yet; an existing pickle is overwritten.

    Args:
        df: object exposing ``to_pickle(path)`` (a pandas DataFrame).
    """
    _folder = Path.cwd() / "dataframes"
    # exist_ok=True replaces the separate exists() check, so there is no window
    # between checking for the folder and creating it.
    _folder.mkdir(parents=True, exist_ok=True)

    df_path = _folder / ('df_MELD_efr' + '.pkl')
    df.to_pickle(df_path)

In [12]:
# Persist the preprocessed frame so later sessions can reload it.
save_dataframe(df=df_new)

In [13]:
def load_dataframe():
    """Load the pickled dataframe stored at 'dataframes/df_MELD_efr.pkl'.

    The path is relative to the current working directory.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: if the pickle file does not exist.
    """
    df_path = 'dataframes/df_MELD_efr.pkl'
    if os.path.exists(df_path):
        # pickle can execute arbitrary code on load: only read files you wrote.
        with open(df_path, 'rb') as file:
            return pickle.load(file)
    raise FileNotFoundError("{0} dataframe does not exist!".format(df_path))

In [14]:
def split_dataframe(orginal_df: pd.DataFrame, seed: int):
    """Split a frame into train / validation / test parts (80% / 10% / 10%).

    Args:
        orginal_df: frame to split.
        seed: random state used for both splits.

    Returns:
        ``(train, validation, test)``, each with a fresh 0..n-1 index.
    """
    # First hold out 20% of the rows, then cut that hold-out set in half.
    train, holdout = train_test_split(orginal_df, test_size=0.2, random_state=seed)
    validation, test = train_test_split(holdout, test_size=0.5, random_state=seed)
    return tuple(part.reset_index(drop=True) for part in (train, validation, test))

In [15]:
# Fixed seed so the train / validation / test partition is reproducible.
df_train, df_val, df_test = split_dataframe(orginal_df=df_new, seed=42)

In [16]:
class CustomDataset(Dataset):
    """Padded, tensor-ready view over the per-dialogue columns.

    At construction time every dialogue is padded to the longest dialogue in
    `dialogues`, and the per-utterance columns (speakers, emotions, triggers) are
    padded to `max_num_utterances`. Token ids are produced with the
    notebook-level `tokenizer`.

    Args:
        speakers: Series of per-utterance speaker ids (lists of ints).
        dialogues: Series of token lists.
        emotions: Series of 2-D one-hot arrays (utterances x classes).
        triggers: Series of per-utterance trigger values.
        device: device on which `__getitem__` places the returned tensors.
        pad_token: token string used to pad the dialogues.
        max_num_utterances: length to which per-utterance columns are padded.
    """

    def __init__(self, speakers: pd.core.series.Series,
                 dialogues:  pd.core.series.Series,
                 emotions:  pd.core.series.Series,
                 triggers:  pd.core.series.Series,
                 device,
                 pad_token: str,
                 max_num_utterances):
        self.max_dialogue_length = dialogues.apply(len).max()
        self.max_num_utterances = max_num_utterances
        self.dialogues = dialogues.apply(lambda x: add_padding(x, self.max_dialogue_length, pad_token))
        self.dialogues_ids = self.dialogues.apply(lambda x: tokenizer.convert_tokens_to_ids(x))
        # 1 for real tokens, 0 for padding.
        self.attention_masks = self.dialogues.apply(lambda x: [1 if token != pad_token else 0 for token in x])
        self.speakers = speakers.apply(lambda x: add_padding(x, self.max_num_utterances, pad_value=0))
        # Positional access: works whatever the index labels of `emotions` are.
        num_emotion_classes = len(emotions.iloc[0][0])
        self.emotions = emotions.apply(lambda x: add_padding(x, self.max_num_utterances, pad_value = 0, pad_length = num_emotion_classes))
        self.triggers = triggers.apply(lambda x: add_padding(x, self.max_num_utterances, pad_value=0))
        self.device = device

    def __len__(self):
        """Number of dialogues in the dataset."""
        return len(self.dialogues)

    def __getitem__(self, idx):
        """Return ``(speakers, dialogues_ids, dialogues_masks, emotions, triggers)``.

        All tensors are placed on the device given to the constructor; the first
        three are ``long`` tensors, the last two ``float32``.
        """
        # Use the device stored on the instance, not a notebook-level global.
        target_device = self.device
        speakers = torch.tensor(self.speakers.iloc[idx], dtype=torch.long).to(target_device)
        dialogues_ids = torch.tensor(self.dialogues_ids.iloc[idx], dtype=torch.long).to(target_device)
        dialogues_masks = torch.tensor(self.attention_masks.iloc[idx], dtype=torch.long).to(target_device)
        emotions = torch.tensor(self.emotions.iloc[idx], dtype=torch.float32).to(target_device)
        triggers = torch.tensor(self.triggers.iloc[idx], dtype=torch.float32).to(target_device)
        return speakers, dialogues_ids, dialogues_masks, emotions, triggers

In [17]:
def create_dataloader(df: pd.core.frame.DataFrame, device, tokenizer, max_num_utterances, batch_size, shuffle: bool = True) -> torch.utils.data.dataloader.DataLoader :
    """Wrap a preprocessed frame in a CustomDataset and return a DataLoader over it.

    Args:
        df: frame with 'speakers', 'dialogues', 'emotions' and 'triggers' columns.
        device: device handed to the dataset for the tensors it returns.
        tokenizer: only its `pad_token` is read here.
        max_num_utterances: padding length for the per-utterance columns.
        batch_size: number of dialogues per batch.
        shuffle: whether the loader reshuffles the data at every epoch. Defaults
            to True (the previous hard-coded behaviour); pass False for
            validation / test loaders when a stable order is wanted.

    Returns:
        A DataLoader yielding batches of the dataset's 5-tuples.
    """
    dataset = CustomDataset(speakers = df['speakers'],
                            dialogues = df['dialogues'],
                            emotions = df['emotions'],
                            triggers = df['triggers'],
                            device = device,
                            pad_token = tokenizer.pad_token,
                            max_num_utterances = max_num_utterances)
    return DataLoader(dataset, batch_size = batch_size, shuffle = shuffle)

In [18]:
batch_size = 2
device =  "cpu"
# One loader per split, built with identical settings and in the same order.
dataloader_train, dataloader_val, dataloader_test = (
    create_dataloader(split_df, device, tokenizer, max_num_utterances, batch_size)
    for split_df in (df_train, df_val, df_test)
)

In [19]:
class CustomBERTModel(nn.Module):
    """BERT encoder with an emotion classifier and a GRU-based trigger predictor.

    Each utterance is represented by the BERT hidden state of the [SEP] token
    that closes it. Emotions are predicted from that representation; triggers
    are predicted by a bidirectional GRU over the concatenation of the
    representation, the emotion probabilities and the per-utterance speaker id.

    Args:
        model_card: name or path of the pretrained BERT checkpoint.
        tokenizer: tokenizer whose `sep_token_id` locates the utterance ends.
        num_emotions: number of emotion classes.
        max_num_utterances: number of utterance slots per dialogue.
        gru_hidden_size: hidden size of each GRU direction.
        freeze_embedding_layer: when True, BERT's embedding parameters are frozen.
    """

    def __init__(self, model_card, tokenizer, num_emotions, max_num_utterances, gru_hidden_size, freeze_embedding_layer = False):
        super().__init__()

        self.tokenizer = tokenizer
        self.max_num_utterances = max_num_utterances
        self.gru_hidden_size = gru_hidden_size

        # Load the pretrained BERT encoder. Device placement is left to the caller
        # (`model.to(device)`), so that every sub-module ends up on the same
        # device and the class does not depend on a notebook-level global.
        self.bert_model = BertModel.from_pretrained(model_card)

        if freeze_embedding_layer:
            for param in self.bert_model.embeddings.parameters():
                param.requires_grad = False

        self.representation_length = self.bert_model.config.hidden_size

        # Classifier for emotion prediction; outputs probabilities over classes.
        self.emotion_classifier = nn.Sequential(
            nn.Linear(self.representation_length, 512),
            nn.ReLU(),
            nn.Linear(512, num_emotions),
            nn.Softmax(dim=2)
        )

        # GRU input = utterance representation + emotion probabilities + speaker id.
        self.trigger_prediction = nn.GRU(input_size=self.representation_length + num_emotions + 1, hidden_size=self.gru_hidden_size, batch_first=True, bidirectional=True)
        self.linear_trigger = nn.Linear(self.gru_hidden_size * 2, 1)

    def forward(self, token_type_ids, input_ids, attention_mask):
        """Predict emotions and triggers for a batch of dialogues.

        Args:
            token_type_ids: per-utterance speaker ids, shape (batch, max_num_utterances).
            input_ids: token ids, shape (batch, sequence_length).
            attention_mask: attention mask for `input_ids`.

        Returns:
            ``(emotion_predictions, trigger_output_sigmoid, dialogue_masks)``:
            emotion probabilities of shape (batch, max_num_utterances, num_emotions),
            trigger probabilities of shape (batch, max_num_utterances), and a mask
            with 1 for real utterance slots and 0 for padded ones. Predictions
            are zeroed in padded slots.
        """
        bert_output = self.bert_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        batch_size = input_ids.shape[0]

        # new_zeros keeps the buffers on the same device / dtype as the BERT output.
        sep_representations = bert_output.new_zeros((batch_size, self.max_num_utterances, self.representation_length))
        dialogue_masks = bert_output.new_zeros((batch_size, self.max_num_utterances))

        # Collect the [SEP] representations: the i-th [SEP] of a dialogue fills
        # the i-th utterance slot.
        is_sep = input_ids == self.tokenizer.sep_token_id
        for idx in range(batch_size):
            sep_positions = is_sep[idx].nonzero(as_tuple=True)[0]
            num_utterances = sep_positions.numel()
            dialogue_masks[idx, :num_utterances] = 1
            sep_representations[idx, :num_utterances, :] = bert_output[idx, sep_positions, :]

        emotion_predictions = self.emotion_classifier(sep_representations) * dialogue_masks.unsqueeze(-1)

        # Explicit cast: the speaker ids arrive as integers.
        speaker_feature = token_type_ids.unsqueeze(-1).to(sep_representations.dtype)
        concatenated_input = torch.cat([sep_representations, emotion_predictions, speaker_feature], dim=-1)
        trigger_output, _ = self.trigger_prediction(concatenated_input)
        trigger_output_single_value = self.linear_trigger(trigger_output).squeeze(-1)
        trigger_output_sigmoid = torch.sigmoid(trigger_output_single_value) * dialogue_masks
        return emotion_predictions, trigger_output_sigmoid, dialogue_masks

In [None]:
gru_hidden_size = 256
model = CustomBERTModel(
    model_card,
    tokenizer,
    num_emotions=num_emotions,
    max_num_utterances=max_num_utterances,
    gru_hidden_size=gru_hidden_size,
)
# Move every sub-module to the configured device (`to` returns the module itself).
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
class CustomTraining():
    """Training, validation and test loops for the joint emotion / trigger model.

    The model is called as ``model(speakers, dialogues_ids, dialogues_masks)`` and
    must return ``(emotion_pred, trigger_pred, dialogue_mask)``. Predictions are
    expected to be probabilities in [0, 1]; ``dialogue_mask`` is expected to hold
    1 for every real utterance slot, followed by 0 for the padded slots.

    Args:
        training_loader: loader yielding
            ``(speakers, dialogues_ids, dialogues_masks, emotions, triggers)`` batches.
        validation_loader: loader used by `validation`.
        test_loader: loader used by `test`.
        emotions_onehotencoder: encoder that produced the one-hot emotion labels
            (stored on the instance).
        device: device identifier (stored on the instance).
        epochs: maximum number of epochs run by `train`.
        seed: random seed (stored on the instance).
    """

    # Keys of the per-epoch history dictionaries, in the order in which
    # `train_step` and `validation` return their values.
    _HISTORY_KEYS = ('loss', 'f1_sequence_emotion', 'f1_flattened_emotion',
                     'f1_sequence_trigger', 'f1_flattened_trigger')

    def __init__(self, training_loader: DataLoader,
                 validation_loader: DataLoader,
                 test_loader: DataLoader,
                 emotions_onehotencoder,
                 device: str,
                 epochs=20,
                 seed=42):
        self.training_loader = training_loader
        self.validation_loader = validation_loader
        self.test_loader = test_loader
        self.emotions_onehotencoder = emotions_onehotencoder
        self.epochs = epochs
        self.device = device
        self.seed = seed

    def _forward_step(self, speakers, dialogues_ids, dialogues_masks, model):
        """Run the model on one batch and return its three outputs."""
        emotion_pred, trigger_pred, dialogue_mask = model(speakers, dialogues_ids, dialogues_masks)
        return emotion_pred, trigger_pred, dialogue_mask

    def _compute_loss(self, emotion_pred, emotions, trigger_pred, triggers):
        """Sum of the emotion and trigger binary cross-entropy losses.

        The predictions are already probabilities (softmax / sigmoid applied by
        the model), so the plain BCE is used; the ``*_with_logits`` variant would
        squash them a second time.
        """
        emotion_loss = F.binary_cross_entropy(emotion_pred, emotions)
        trigger_loss = F.binary_cross_entropy(trigger_pred, triggers)
        return emotion_loss + trigger_loss

    def _remove_pad(self, batch_idx, target, predictions, dialogue_length):
        """Keep the first `dialogue_length` entries of one batch element."""
        target_nopad = target[batch_idx, :dialogue_length]
        predictions_nopad = predictions[batch_idx, :dialogue_length]
        return target_nopad, predictions_nopad

    def _remove_pad_predictions(self, batch_idx, emotions, emotion_predictions, triggers, trigger_predictions, dialogue_mask):
        """Strip the padded utterance slots of one dialogue from labels and predictions."""
        # The mask is a run of ones followed by zeros, so its sum is the number of
        # real utterances; unlike searching for the first zero, this also works
        # for a dialogue that fills every slot.
        dialogue_length = int(dialogue_mask[batch_idx].sum().item())
        emotions_nopad, emotion_pred_nopad = self._remove_pad(batch_idx, emotions, emotion_predictions, dialogue_length)
        triggers_nopad, trigger_pred_nopad = self._remove_pad(batch_idx, triggers, trigger_predictions, dialogue_length)
        return emotions_nopad, emotion_pred_nopad, triggers_nopad, trigger_pred_nopad

    def _update_metric_arrays(self, emotions_nopad, emotion_pred_nopad, triggers_nopad, triggers_pred_nopad,
                              flattened_emotions, flattened_emotions_pred, flattened_triggers, flattened_triggers_pred,
                              f1_sequence_emotions, f1_sequence_triggers):
        """Append one dialogue's labels, predictions and per-dialogue F1 to the accumulators.

        The accumulator lists are modified in place so that the caller sees the update.
        """
        emotions_list = emotions_nopad.tolist()
        emotions_pred_list = emotion_pred_nopad.tolist()
        triggers_list = triggers_nopad.tolist()
        triggers_pred_list = triggers_pred_nopad.tolist()

        flattened_emotions.extend(emotions_list)
        flattened_emotions_pred.extend(emotions_pred_list)
        flattened_triggers.extend(triggers_list)
        flattened_triggers_pred.extend(triggers_pred_list)
        f1_sequence_emotions.append(f1_score(emotions_list, emotions_pred_list, average = 'micro'))
        f1_sequence_triggers.append(f1_score(triggers_list, triggers_pred_list, average = 'micro'))

    def _get_metric_results(self, flattened_emotions, flattened_emotions_pred, flattened_triggers, flattened_triggers_pred, f1_sequence_emotions, f1_sequence_triggers):
        """Return the running sequence-averaged and flattened micro-F1 scores.

        Returns zeros when nothing has been accumulated yet.
        """
        if not f1_sequence_emotions or not flattened_emotions:
            return 0.0, 0.0, 0.0, 0.0
        avg_f1_sequence_emotion = sum(f1_sequence_emotions) / len(f1_sequence_emotions)
        avg_f1_sequence_trigger = sum(f1_sequence_triggers) / len(f1_sequence_triggers)
        f1_flattened_emotion = f1_score(flattened_emotions, flattened_emotions_pred, average='micro')
        f1_flattened_trigger = f1_score(flattened_triggers, flattened_triggers_pred, average='micro')
        return avg_f1_sequence_emotion, avg_f1_sequence_trigger, f1_flattened_emotion, f1_flattened_trigger

    def _compute_metrics(self, emotions, emotion_pred, triggers, trigger_pred, dialogue_mask,
                         flattened_emotions, flattened_emotions_pred, flattened_triggers, flattened_triggers_pred,
                         f1_sequence_emotions, f1_sequence_triggers):
        """Fold one batch into the accumulators and return the running metrics.

        Returns:
            ``(avg_f1_sequence_emotion, avg_f1_sequence_trigger,
            f1_flattened_emotion, f1_flattened_trigger)``.
        """
        # One-hot / probabilities -> class indices; trigger probabilities -> 0/1.
        emotions = torch.argmax(emotions, dim=-1)
        emotion_pred = torch.argmax(emotion_pred, dim=-1)
        trigger_pred = (trigger_pred > 0.5).float()
        for batch_idx in range(emotion_pred.size(0)):
            emotions_nopad, emotion_pred_nopad, triggers_nopad, triggers_pred_nopad = self._remove_pad_predictions(
                batch_idx, emotions, emotion_pred, triggers, trigger_pred, dialogue_mask)
            if emotions_nopad.numel() == 0:
                # Nothing to score for a dialogue without real utterances.
                continue
            self._update_metric_arrays(emotions_nopad, emotion_pred_nopad, triggers_nopad, triggers_pred_nopad,
                                       flattened_emotions, flattened_emotions_pred, flattened_triggers, flattened_triggers_pred,
                                       f1_sequence_emotions, f1_sequence_triggers)
        return self._get_metric_results(flattened_emotions, flattened_emotions_pred,
                                        flattened_triggers, flattened_triggers_pred,
                                        f1_sequence_emotions, f1_sequence_triggers)

    def _set_loop_info(self, loop, epoch_number, loss, avg_loss, avg_f1_sequence_emotion, f1_flattened_emotion, avg_f1_sequence_trigger, f1_flattened_trigger, description=None):
        """Refresh the progress-bar description and postfix.

        `description` replaces the default ``Epoch i/N`` label; it is used for the
        validation and test loops, which have no epoch number.
        """
        if description is None:
            description = f'Epoch {epoch_number + 1}/{self.epochs}'
        loop.set_description(description)
        loop.set_postfix({'loss': loss.item(), 'loss_average': avg_loss,
                          'f1_sequence_emotion': f'{avg_f1_sequence_emotion:.2}',
                          'f1_flattened_emotion': f'{f1_flattened_emotion:.2}',
                          'f1_sequence_trigger': f'{avg_f1_sequence_trigger:.2}',
                          'f1_flattened_trigger': f'{f1_flattened_trigger:.2}',})

    def train_step(self, model: nn.Module, optimizer: Optimizer, epoch_number):
        """Run one optimisation pass over the training loader.

        Returns:
            ``(avg_loss, avg_f1_sequence_emotion, f1_flattened_emotion,
            avg_f1_sequence_trigger, f1_flattened_trigger)``; all zeros when the
            loader is empty.
        """
        total_loss = 0.0
        avg_loss = 0.0
        avg_f1_sequence_emotion = avg_f1_sequence_trigger = 0.0
        f1_flattened_emotion = f1_flattened_trigger = 0.0
        flattened_emotions, flattened_emotions_pred, flattened_triggers, flattened_triggers_pred, f1_sequence_emotions, f1_sequence_triggers = [], [], [], [], [], []

        loop = tqdm(enumerate(self.training_loader, 1), total=len(self.training_loader))
        for train_step, data in loop:
            optimizer.zero_grad()
            speakers, dialogues_ids, dialogues_masks, emotions, triggers = data
            emotion_pred, trigger_pred, dialogue_mask = self._forward_step(speakers, dialogues_ids, dialogues_masks, model)
            loss = self._compute_loss(emotion_pred, emotions, trigger_pred, triggers)
            loss.backward()
            optimizer.step()
            avg_f1_sequence_emotion, avg_f1_sequence_trigger, f1_flattened_emotion, f1_flattened_trigger = self._compute_metrics(
                emotions, emotion_pred, triggers, trigger_pred, dialogue_mask,
                flattened_emotions, flattened_emotions_pred, flattened_triggers, flattened_triggers_pred,
                f1_sequence_emotions, f1_sequence_triggers)
            total_loss += loss.item()
            avg_loss = total_loss / train_step
            self._set_loop_info(loop, epoch_number, loss, avg_loss, avg_f1_sequence_emotion, f1_flattened_emotion, avg_f1_sequence_trigger, f1_flattened_trigger)
        return avg_loss, avg_f1_sequence_emotion, f1_flattened_emotion, avg_f1_sequence_trigger, f1_flattened_trigger

    def train(self, model, optimizer, patience):
        """Train for up to `self.epochs` epochs with early stopping on validation loss.

        Training stops once the validation loss has failed to improve for
        `patience` consecutive epochs.

        Returns:
            ``(train_history, val_history)``: dictionaries mapping each metric
            name to the list of its per-epoch values.
        """
        train_history = {key: [] for key in self._HISTORY_KEYS}
        val_history = {key: [] for key in self._HISTORY_KEYS}
        best_val_loss = float('inf')
        early_stopping_counter = 0

        for epoch in range(self.epochs):
            model.train()
            train_results = self.train_step(model, optimizer, epoch)
            val_results = self.validation(model)
            for key, train_value, val_value in zip(self._HISTORY_KEYS, train_results, val_results):
                train_history[key].append(train_value)
                val_history[key].append(val_value)

            # `validation` returns a tuple; the loss is its first element.
            val_loss = val_results[0]
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                early_stopping_counter = 0
            else:
                early_stopping_counter += 1

            # Check if training should stop
            if early_stopping_counter >= patience:
                print(f'Early stopping at epoch {epoch}...')
                break
        return train_history, val_history

    def _evaluate(self, model, loader, description):
        """Gradient-free pass over `loader` shared by `validation` and `test`.

        Returns:
            ``(avg_loss, metrics, buffers)`` where `metrics` is the 4-tuple returned
            by `_compute_metrics` and `buffers` holds the six accumulator lists
            (flattened labels / predictions and per-dialogue F1 scores).
        """
        model.eval()
        total_loss = 0.0
        avg_loss = 0.0
        metrics = (0.0, 0.0, 0.0, 0.0)
        buffers = ([], [], [], [], [], [])
        with torch.no_grad():
            loop = tqdm(enumerate(loader, 1), total=len(loader))
            for step, data in loop:
                speakers, dialogues_ids, dialogues_masks, emotions, triggers = data
                emotion_pred, trigger_pred, dialogue_mask = self._forward_step(speakers, dialogues_ids, dialogues_masks, model)
                loss = self._compute_loss(emotion_pred, emotions, trigger_pred, triggers)
                metrics = self._compute_metrics(emotions, emotion_pred, triggers, trigger_pred, dialogue_mask, *buffers)
                total_loss += loss.item()
                avg_loss = total_loss / step
                avg_f1_sequence_emotion, avg_f1_sequence_trigger, f1_flattened_emotion, f1_flattened_trigger = metrics
                self._set_loop_info(loop, None, loss, avg_loss, avg_f1_sequence_emotion, f1_flattened_emotion,
                                    avg_f1_sequence_trigger, f1_flattened_trigger, description=description)
        return avg_loss, metrics, buffers

    def validation(self, model):
        """Evaluate `model` on the validation loader.

        Returns:
            ``(avg_loss, avg_f1_sequence_emotion, f1_flattened_emotion,
            avg_f1_sequence_trigger, f1_flattened_trigger)``.
        """
        avg_loss, metrics, _ = self._evaluate(model, self.validation_loader, 'Validation')
        avg_f1_sequence_emotion, avg_f1_sequence_trigger, f1_flattened_emotion, f1_flattened_trigger = metrics
        return avg_loss, avg_f1_sequence_emotion, f1_flattened_emotion, avg_f1_sequence_trigger, f1_flattened_trigger

    def test(self, model):
        """Evaluate `model` on the test loader.

        Returns:
            ``(all_labels, all_preds)``: two lists with one ``(emotion_class_index,
            trigger)`` pair per non-padded utterance, in evaluation order.
        """
        _, _, buffers = self._evaluate(model, self.test_loader, 'Test')
        flattened_emotions, flattened_emotions_pred, flattened_triggers, flattened_triggers_pred = buffers[:4]
        all_labels = list(zip(flattened_emotions, flattened_triggers))
        all_preds = list(zip(flattened_emotions_pred, flattened_triggers_pred))
        return all_labels, all_preds

In [None]:
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
trainer = CustomTraining(
    training_loader=dataloader_train,
    validation_loader=dataloader_val,
    test_loader=dataloader_test,
    emotions_onehotencoder=emotions_onehotencoder,
    device=device,
)
# Single training pass over the training loader, reported as epoch 1.
trainer.train_step(model, optimizer, 0)