<a href="https://colab.research.google.com/github/DanieleBaiocco/NLPProject/blob/main/nlpproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import pandas as pd
import os
from sklearn.preprocessing import LabelBinarizer
import numpy as np
from collections import defaultdict
from transformers import BertTokenizer
from pathlib import Path
import pickle
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch

In [2]:
# Mount Google Drive first; the dataset path below lives inside the mount point.
mount_directory = "/content/drive"
drive.mount(mount_directory)

# JSON file read by the data-loading cell.
dataset_path = os.path.join(mount_directory, 'MyDrive/MELD/MELD_train_efr.json')

# Checkpoint name handed to the tokenizer loader.
model_card = 'bert-base-uncased'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def fit_onehotencoder(data: pd.core.series.Series) -> LabelBinarizer:
    """Fit a ``LabelBinarizer`` on every label contained in ``data``.

    Args:
        data: Series whose entries are sequences of labels (one sequence per row).

    Returns:
        The fitted ``LabelBinarizer``.
    """
    # Merge the per-row label sequences into one flat array before fitting.
    all_labels = np.concatenate(data.values)
    return LabelBinarizer().fit(all_labels)

In [4]:
def map_to_order_of_occurrence(data: list) -> list:
    """Replace each element with the 1-based rank of its first appearance.

    Example: ``['b', 'a', 'b']`` -> ``[1, 2, 1]``.
    """
    first_seen = {}
    ranks = []
    for item in data:
        # A new item receives the next unused rank; a repeated one keeps its rank.
        if item not in first_seen:
            first_seen[item] = len(first_seen) + 1
        ranks.append(first_seen[item])
    return ranks

In [5]:
def replace_none_with_zero(data: list) -> list:
    """Return a copy of ``data`` in which every ``None`` entry becomes ``0``."""
    cleaned = []
    for value in data:
        cleaned.append(value if value is not None else 0)
    return cleaned

In [6]:
def tokenize_dialogue(dialogue_text: list, tokenizer) -> list:
    """Turn a dialogue into one flat token list.

    The result starts with the tokenizer's CLS token, and every utterance's
    tokens are followed by the tokenizer's SEP token.

    Args:
        dialogue_text: The utterances of one dialogue, in order.
        tokenizer: Object exposing ``tokenize``, ``cls_token`` and ``sep_token``.

    Returns:
        The flat list of tokens (strings, not ids).
    """
    tokens = [tokenizer.cls_token]
    for utterance in dialogue_text:
        tokens.extend(tokenizer.tokenize(utterance))
        tokens.append(tokenizer.sep_token)
    return tokens

In [45]:
def add_padding(x: list, max_x_length: int, pad_value, pad_length = None) -> list:
    """Right-pad ``x`` with ``pad_value`` until it has ``max_x_length`` entries.

    Args:
        x: Sequence to pad. A flat list when ``pad_length`` is ``None``;
            otherwise a 2-D sequence/array whose rows hold ``pad_length`` items.
        max_x_length: Target number of entries (rows) after padding.
        pad_value: Value used to fill the padded positions.
        pad_length: Width of each padding row. ``None`` selects 1-D padding.

    Returns:
        A new list (1-D mode) or a NumPy array (2-D mode). Inputs that are
        already at or beyond ``max_x_length`` are returned unpadded; they are
        never truncated.
    """
    # Clamp at zero so that over-long inputs behave the same in both modes.
    num_pad_values = max(max_x_length - len(x), 0)
    if pad_length is None:
        return x + [pad_value] * num_pad_values

    padding_rows = np.full((num_pad_values, pad_length), pad_value)
    rows = np.asarray(x)
    if rows.size == 0:
        # An empty input has no column dimension; give it one so it can be
        # stacked with the padding rows instead of raising a shape error.
        rows = np.empty((0, pad_length), dtype=padding_rows.dtype)
    return np.concatenate((rows, padding_rows), axis=0)

In [8]:
# Load the pretrained tokenizer for the checkpoint named by `model_card`,
# with lower-casing of the input text enabled.
tokenizer = BertTokenizer.from_pretrained(
    model_card,
    do_lower_case=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
# Read the raw dataset and derive the preprocessed columns from it.
df = pd.read_json(dataset_path)

# 'dialogues_ids' and 'attention_masks' are declared here but not filled in this cell.
column_names = ['emotions', 'speakers', 'triggers', 'dialogues', 'dialogues_ids', 'attention_masks']
df_new = pd.DataFrame(columns=column_names)

# Emotion labels -> binarized rows, using an encoder fitted on all dialogues.
emotions_onehotencoder = fit_onehotencoder(df['emotions'])
df_new['emotions'] = df['emotions'].apply(emotions_onehotencoder.transform)

# Speakers -> integers numbered by first appearance within each dialogue.
df_new['speakers'] = df['speakers'].apply(map_to_order_of_occurrence)

# Missing (None) trigger entries become 0.
df_new['triggers'] = df['triggers'].apply(replace_none_with_zero)

# Each dialogue becomes one flat token list delimited by the special tokens.
df_new['dialogues'] = df['utterances'].apply(tokenize_dialogue, args=(tokenizer,))

In [15]:
def save_dataframe(df):
    """Pickle ``df`` to ``dataframes/df_MELD_efr.pkl`` under the current working directory.

    The ``dataframes`` folder is created first when it does not exist yet.
    """
    target_dir = Path.cwd() / "dataframes"
    if not target_dir.exists():
        target_dir.mkdir(parents=True)
    df.to_pickle(target_dir / 'df_MELD_efr.pkl')

In [16]:
# Write the preprocessed frame to disk as a pickle (see save_dataframe).
save_dataframe(df_new)

In [17]:
def load_dataframe():
    """Load the pickled dataframe stored at ``dataframes/df_MELD_efr.pkl``.

    The path is resolved relative to the current working directory.

    Returns:
        The object unpickled from the file.

    Raises:
        FileNotFoundError: if the pickle file is not present.
    """
    pickle_location = 'dataframes/df_MELD_efr.pkl'
    if os.path.exists(pickle_location):
        # NOTE: unpickling executes arbitrary code; only load files you created.
        with open(pickle_location, 'rb') as handle:
            return pickle.load(handle)
    raise FileNotFoundError("{0} dataframe does not exist!".format(pickle_location))

In [18]:
def split_dataframe(orginal_df: pd.DataFrame, seed: int):
    """Split a dataframe into train / validation / test parts (80% / 10% / 10%).

    Args:
        orginal_df: The dataframe to split.
        seed: Random state used for both splitting steps.

    Returns:
        A ``(train, validation, test)`` tuple; each part has a fresh 0-based index.
    """
    # First carve off 20% as a hold-out set, then halve it into validation and test.
    train_part, holdout = train_test_split(orginal_df, test_size=0.2, random_state=seed)
    validation_part, test_part = train_test_split(holdout, test_size=0.5, random_state=seed)
    return tuple(
        part.reset_index(drop=True)
        for part in (train_part, validation_part, test_part)
    )

In [34]:
# Fixed seed so the train/validation/test partition is the same on every run.
df_train, df_val, df_test = split_dataframe(df_new, seed=42)

In [52]:
class CustomDataset(Dataset):
    """Torch dataset serving padded, tensorised dialogues.

    Every dialogue's token list is padded to the length of the longest one.
    The speaker, emotion and trigger sequences are padded to that same
    token-level length.

    Each item is the tuple
    ``(speakers, dialogues_ids, dialogues_masks, emotions, triggers)``.
    """

    def __init__(self, speakers: pd.core.series.Series,
                 dialogues:  pd.core.series.Series,
                 emotions:  pd.core.series.Series,
                 triggers:  pd.core.series.Series,
                 pad_token: str):
        """Pad all inputs and pre-compute token ids and attention masks.

        Args:
            speakers: Per-dialogue lists of speaker ids.
            dialogues: Per-dialogue flat lists of tokens.
            emotions: Per-dialogue 2-D arrays of encoded emotion rows.
            triggers: Per-dialogue lists of trigger values.
            pad_token: Token appended to short dialogues; positions holding it
                get attention-mask value 0.
        """
        self.max_dialogue_length = dialogues.apply(len).max()
        self.dialogues = dialogues.apply(lambda x: add_padding(x, self.max_dialogue_length, pad_token))
        # NOTE: relies on the module-level `tokenizer` being defined beforehand.
        self.dialogues_ids = self.dialogues.apply(lambda x: tokenizer.convert_tokens_to_ids(x))
        self.attention_masks = self.dialogues.apply(lambda x: [1 if token != pad_token else 0 for token in x])
        self.speakers = speakers.apply(lambda x: add_padding(x, self.max_dialogue_length, pad_value=0))
        # Positional access: label-based `emotions[0]` would fail on a Series
        # whose index does not contain the label 0.
        num_emotion_classes = len(emotions.iloc[0][0])
        self.emotions = emotions.apply(lambda x: add_padding(x, self.max_dialogue_length, pad_value = 0, pad_length = num_emotion_classes))
        self.triggers = triggers.apply(lambda x: add_padding(x, self.max_dialogue_length, pad_value=0))

    def __len__(self):
        """Number of dialogues in the dataset."""
        return len(self.dialogues)

    def __getitem__(self, idx):
        """Return the tensors of the dialogue at position ``idx``."""
        speakers = torch.tensor(self.speakers.iloc[idx], dtype=torch.float32)
        # Token ids are lookup indices, so they must be integer-typed;
        # embedding layers reject floating-point indices.
        dialogues_ids = torch.tensor(self.dialogues_ids.iloc[idx], dtype=torch.long)
        dialogues_masks =  torch.tensor(self.attention_masks.iloc[idx], dtype=torch.float32)
        emotions = torch.tensor(self.emotions.iloc[idx], dtype=torch.float32)
        triggers = torch.tensor(self.triggers.iloc[idx], dtype=torch.float32)
        return  speakers, dialogues_ids, dialogues_masks, emotions, triggers

In [53]:
# Number of dialogues per training batch.
batch_size = 16

# Wrap the training split; padding uses the tokenizer's own pad token.
torch_dataset = CustomDataset(
    speakers=df_train['speakers'],
    dialogues=df_train['dialogues'],
    emotions=df_train['emotions'],
    triggers=df_train['triggers'],
    pad_token=tokenizer.pad_token,
)

# Batches are drawn in shuffled order.
dataloader_train = DataLoader(
    torch_dataset,
    shuffle=True,
    batch_size=batch_size,
)