# Final project

## Imports and Initial Settings

In [1]:
%pip install pandas numpy matplotlib transformers dataset

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import random
import torch
import pandas as pd
from os import path
from sklearn.model_selection import train_test_split
import random
import gc
import torch
import transformers
from tqdm import tqdm
from typing import Callable, Dict, List, Tuple
from timeit import default_timer as timer
from transformers import EncoderDecoderModel, AutoTokenizer, PreTrainedTokenizer, BatchEncoding
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler

2023-11-15 14:36:36.120695: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42

def fix_seed(seed: int) -> None:
    """Seed NumPy, Python's `random` module and PyTorch (CPU and CUDA).

    cuDNN benchmarking is switched off and its deterministic mode switched on.

    Args:
        seed: the value handed to every random number generator.
    """
    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

fix_seed(SEED)

## Dataset Loading

In [4]:
# Folder that holds the dataset files (a relative path).
data_folder = 'Dataset'

def load__dataset(filename:str) -> pd.DataFrame:
    """Read a JSON dataset file from `data_folder` into a DataFrame.

    Args:
        filename: name of the JSON file, joined onto `data_folder`.

    Returns:
        The DataFrame produced by `pd.read_json`.
    """
    # JSON text is UTF-8; without an explicit encoding `open` falls back to the
    # platform default, which can garble non-ASCII characters on some systems.
    with open(path.join(data_folder, filename), encoding='utf-8') as file_obj:
        # NOTE(review): `np.array` is a function, not a dtype - confirm that the
        # 'speakers' entry has the intended effect.
        data = pd.read_json(file_obj, dtype={'episode':str,'speakers':np.array})
        return data

In [5]:
# File name of the training set; `load__dataset` looks for it inside `data_folder`.
training_set_fn = 'MELD_train_efr.json'

dataset = load__dataset(training_set_fn)
# Show the first rows as the cell output.
dataset.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[surprise, sadness, surprise, fear]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"


In [6]:
def get_index_none_triggers(df:pd.DataFrame, column:str) -> list:
    """Return the index labels of the rows whose `column` entry contains a None.

    Args:
        df: the DataFrame to inspect.
        column: name of the column whose entries are checked.

    Returns:
        The labels of `df[column].index`, in index order, for which the
        element-wise comparison of the entry against None matches at least once.
    """
    def contains_none(label):
        # `== None` (not `is None`) is deliberate: NumPy applies the comparison
        # element-wise to the array built from the entry.
        return np.where(np.array(df[column][label]) == None, 1, 0).any()

    return [label for label in df[column].index if contains_none(label)]

def clean_none(df:pd.DataFrame, column:str, indexes:list) -> pd.DataFrame:
    """Replace every None inside the list entries of `column` with 0.0.

    Only the rows whose index label is in `indexes` are rewritten. `df` is
    modified in place and also returned.

    Args:
        df: the DataFrame to clean.
        column: name of the column holding the list entries.
        indexes: index labels of the rows to rewrite.

    Returns:
        The same `df` object, after the replacement.
    """
    for i in indexes:
        cleaned = [el if el is not None else 0.0 for el in df.at[i, column]]
        # `.at` writes straight into the frame. The chained form
        # `df[column][i] = ...` assigns through an intermediate Series, which
        # triggers SettingWithCopyWarning and leaves `df` unchanged when pandas
        # Copy-on-Write is active.
        df.at[i, column] = cleaned
    return df

Check how many rows of the `triggers` column contain a None value.

In [7]:
# Collect the labels of the rows whose trigger list holds a None, report how
# many there are, and print those rows when there is at least one.
indexes_none = get_index_none_triggers(dataset, column='triggers')
print("{} trigger rows have None values.\n".format(len(indexes_none)))
if indexes_none:
    print(dataset['triggers'][indexes_none])

9 trigger rows have None values.

2671                                [0.0, 0.0, 1.0, None]
2693                      [0.0, 0.0, 0.0, 0.0, 1.0, None]
3105    [0.0, 0.0, 0.0, None, 0.0, 0.0, 0.0, 1.0, 1.0,...
3157    [0.0, 0.0, None, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,...
3171    [0.0, 0.0, 0.0, 0.0, 0.0, None, 0.0, 0.0, 1.0,...
3204    [None, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
3266    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, None, 0.0, 1.0,...
3351    [0.0, 0.0, 0.0, None, 0.0, 0.0, 0.0, 0.0, 0.0,...
3359    [0.0, None, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Name: triggers, dtype: object


Now we clean the dataset by replacing every None trigger with 0.0, and then print again the rows that previously contained a None.

In [8]:
# Rewrite the flagged rows, then scan the column again to count the rows that
# still hold a None.
dataset = clean_none(dataset, column='triggers', indexes=indexes_none)
indexes_none_clean = get_index_none_triggers(dataset, column='triggers')
print("\nAfter cleaning, {} trigger rows have None values.\n".format(len(indexes_none_clean)))
# Both the guard and the lookup use the pre-cleaning labels (`indexes_none`),
# so the rows that used to contain a None are printed in their new state.
if indexes_none:
    print(dataset['triggers'][indexes_none])


After cleaning, 0 trigger rows have None values.

2671                                 [0.0, 0.0, 1.0, 0.0]
2693                       [0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
3105    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...
3157    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...
3171    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
3204    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3266    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
3351    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3359    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: triggers, dtype: object


## Train-Val-Test splitting

In [9]:
def _expand_dialogues(dialogues: pd.DataFrame) -> pd.DataFrame:
    """Turn every dialogue row into one sample per utterance.

    The sample built for the i-th utterance of a row holds the row's episode,
    the utterances from the first one up to and including the i-th, and the
    emotion and trigger stored at position i.

    Args:
        dialogues: frame with the 'episode', 'utterances', 'emotions' and
            'triggers' columns; the last three hold one sequence per row.

    Returns:
        A frame with the columns 'episode', 'utterance', 'emotion', 'trigger'
        (also when `dialogues` has no rows).

    Raises:
        IndexError: if a row has fewer emotions or triggers than utterances.
    """
    columns = ['episode', 'utterance', 'emotion', 'trigger']
    records = []
    # Walk the column values positionally: looking rows up with the labels
    # 0..n-1 only works while the frame still has its default index.
    rows = zip(dialogues['episode'], dialogues['utterances'],
               dialogues['emotions'], dialogues['triggers'])
    for episode, utterances, emotions, triggers in rows:
        for end in range(1, len(utterances) + 1):
            records.append({
                'episode': episode,
                # Fresh list with the first `end` utterances.
                'utterance': list(utterances[:end]),
                'emotion': emotions[end - 1],
                'trigger': triggers[end - 1],
            })
    return pd.DataFrame(records, columns=columns)

# Number of dialogue rows before the expansion.
tot = len(dataset)
data = _expand_dialogues(dataset)

In [10]:
# Number of rows in `data`.
len(data)

35000

In [11]:
# Distinct values of the 'episode' column of `data`.
episodes = data['episode'].unique()

In [12]:
# Split at the episode level, so all the samples of an episode end up in the
# same subset: 10% of the episodes form the test set, then the same number of
# episodes is taken from the remainder for validation.
idx_train, idx_test = train_test_split(episodes, test_size=0.1, random_state=SEED)
idx_train, idx_val = train_test_split(idx_train, test_size=len(idx_test), random_state=SEED)

# Keep, for each subset, the samples whose episode was assigned to it.
df_train = data.loc[data['episode'].isin(idx_train)]
df_val = data.loc[data['episode'].isin(idx_val)]
df_test = data.loc[data['episode'].isin(idx_test)]

print('Number of training samples: {}'.format(len(df_train)))
print('Number of test samples: {}'.format(len(df_test)))
print('Number of validation examples: {}'.format(len(df_val)))

Number of training samples: 27850
Number of test samples: 3612
Number of validation examples: 3538


In [13]:
# Preview the first rows of the training subset.
df_train.head()

Unnamed: 0,episode,utterance,emotion,trigger
0,utterance_0,[also I was the point person on my company's t...,neutral,0.0
1,utterance_0,[also I was the point person on my company's t...,neutral,0.0
2,utterance_0,[also I was the point person on my company's t...,neutral,0.0
3,utterance_0,[also I was the point person on my company's t...,neutral,1.0
4,utterance_0,[also I was the point person on my company's t...,surprise,0.0


In [14]:
# Disabled alternative split, kept only as a bare string literal: nothing inside
# it runs, the cell just evaluates to the text. It would split on the rows of
# `dataset` rather than on the unique episodes of `data`.
"""
idx_train, idx_test = train_test_split(dataset.index.to_numpy(),
                                     random_state=SEED,
                                     test_size=0.1)
print('Number of training samples: {}'.format(idx_train.shape[0]))
print('Number of test samples: {}'.format(idx_test.shape[0]))

idx_train, idx_val = train_test_split(idx_train,
                                     random_state=SEED,
                                     test_size=idx_test.shape[0])
print('Number of validation examples: {}'.format(idx_val.shape[0]))

df_train = dataset.iloc[idx_train]
df_val = dataset.iloc[idx_val]
df_test = dataset.iloc[idx_test]
"""

"\nidx_train, idx_test = train_test_split(dataset.index.to_numpy(),\n                                     random_state=SEED,\n                                     test_size=0.1)\nprint('Number of training samples: {}'.format(idx_train.shape[0]))\nprint('Number of test samples: {}'.format(idx_test.shape[0]))\n\nidx_train, idx_val = train_test_split(idx_train,\n                                     random_state=SEED,\n                                     test_size=idx_test.shape[0])\nprint('Number of validation examples: {}'.format(idx_val.shape[0]))\n\ndf_train = dataset.iloc[idx_train]\ndf_val = dataset.iloc[idx_val]\ndf_test = dataset.iloc[idx_test]\n"

In [15]:
# Preview the first rows of the test subset.
df_test.head()

Unnamed: 0,episode,utterance,emotion,trigger
55,utterance_8,"[Hey, Mon.]",neutral,0.0
56,utterance_8,"[Hey, Mon., Hey-hey-hey. You wanna hear someth...",neutral,0.0
57,utterance_8,"[Hey, Mon., Hey-hey-hey. You wanna hear someth...",joy,0.0
58,utterance_8,"[Hey, Mon., Hey-hey-hey. You wanna hear someth...",sadness,1.0
77,utterance_12,"[Hey, Mon.]",neutral,0.0


## Majority Classifier

In [20]:
def update_count_dictionary(key:str, d_em:dict):
    ''' Add one to the occurrence counter that `d_em` keeps for `key`,
        starting the counter at 1 when the key is not in the dictionary yet.
        The dictionary is updated in place; nothing is returned.'''
    d_em[key] = d_em.get(key, 0) + 1

# Count how often each emotion label occurs in the training subset.
emotions_dict = dict()

# A plain loop: the helper is called only for its effect on `emotions_dict`, so
# a list comprehension here would just build a throwaway list of None values.
for emotion in df_train['emotion']:
    update_count_dictionary(emotion, emotions_dict)
emotions_dict

{'neutral': 12055,
 'surprise': 3728,
 'fear': 910,
 'sadness': 2121,
 'joy': 5040,
 'disgust': 852,
 'anger': 3144}

In [25]:
class Majority_Classifier():
    '''Baseline that always predicts the most frequent emotion and trigger.

        Both majority classes are computed by `train` from the emotion and
        trigger columns of a dataset; `forward` returns them whatever its
        input is.'''

    def __init__(self):
        # Placeholder predictions, kept until `train` finds a majority class.
        self.emotion = ""
        self.trigger = 0.0

    def train(self, dataset:pd.DataFrame):
        '''Store the most frequent value of the emotion and trigger columns.

            dataset: a DataFrame with the emotion and trigger columns.
            When several values share the highest count, the one that was
            counted first is kept. A dataset without rows leaves both
            attributes untouched.'''
        emotions_count, triggers_count = dict(), dict()
        for row in dataset[['emotion','trigger']].values:
            update_count_dictionary(row[0], emotions_count)
            update_count_dictionary(row[1], triggers_count)

        # `max` returns the first key that reaches the highest count.
        if emotions_count:
            self.emotion = max(emotions_count, key=emotions_count.get)
        if triggers_count:
            self.trigger = max(triggers_count, key=triggers_count.get)

    def forward(self, utterance):
        '''Return the stored (emotion, trigger) pair; `utterance` is ignored.
            The method mimics the forward call of an ML model, so the baseline
            can be used in the same way.'''
        return self.emotion, self.trigger


# Build the baseline and compute its majority classes from the training subset.
dumb_majority = Majority_Classifier()
dumb_majority.train(df_train)

print("The majority emotion is: {}".format(dumb_majority.emotion))
print("The majority trigger is: {}".format(dumb_majority.trigger))
print()
print("A test for the dumb majority classifier:")
print("\t\tinput: Oh God! I'm dumb..")

# `forward` ignores its argument and returns the stored majority classes.
e, t = dumb_majority.forward("Oh God! I'm dumb..")
print("\t\toutput: emotion:{}, trigger:{}".format(e,t))

The majority emotion is: neutral
The majority trigger is: 0.0

A test for the dumb majority classifier:
		input: Oh God! I'm dumb..
		output: emotion:neutral, trigger:0.0
