In [4]:
import os,re, time, pickle, collections, importlib, datetime, torch, nltk, pandas as pd, numpy as np, time
from chardet import detect
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict, Counter
from wordebd import WORDEBD
from vocab import Vocab, Vectors
from munch import Munch
from cnnlstmseq import CNNLSTMseq
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from nltk.tokenize import word_tokenize
from transformers import BertModel, BertTokenizer
import transformers
from torch.utils.data import DataLoader, TensorDataset, Dataset
from model import batch_graphify, MaskedEdgeAttention, MaskedNLLLoss, \
LSTMModel, GRUModel, DialogRNNModel, DialogueGCNModel, DialogueGCN_DailyModel
from model import DATASET_PATH
%load_ext autoreload
%autoreload 2

In [5]:
key=False

Make sure to specify which dataset to use
- dataset_original
- dataset_drop_noise
- dataset_smote

In [6]:
# dataset_path = "dataset_original"
# dataset_path = "dataset_drop_noise"
# dataset_path = "dataset_smote"
dataset_path = DATASET_PATH

In [7]:
def get_encoding_type(file):
    with open(file, 'rb') as f:
        rawdata = f.read()
    return detect(rawdata)['encoding']

In [8]:
def load_pretrained_glove():
    print("Loading GloVe...")
    glv_vector = {}
    f = open('/embed/glove/glove.840B.300d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word, coefs = values[0], np.asarray(values[1:], dtype='float')
        try:
            glv_vector[word] = coefs
        except ValueError:
            continue
    f.close()
    start_time = time.time()
    print(f"Took {time.time() - start_time} seconds to load pretrained GloVe model.")
    return glv_vector

def encode_labels(encoder, l):
    return encoder[l]

def _read_words(data, convmode=None):
    '''    
    Count the occurrences of all words
    @param convmode: str, None for non conversational scope, 'naive' for classic or naive approach, 'conv' for conversation depth into account (one additional dim and nested values)
    @param data: list of examples
    @return words: list of words (with duplicates)
    '''    
    words = []
    if convmode is None:
        for example in data:
            words += example.split()
    return words

def find_value_ranges(lst):
    value_ranges = []
    start_index = 0

    for i in range(1, len(lst)):
        if lst[i] != lst[i - 1]:
            value_ranges.append((start_index, i - 1))
            start_index = i

    # Add the last range
    value_ranges.append((start_index, len(lst) - 1))

    return value_ranges

In [9]:
df = pd.read_csv('data\DatasetPreparation\X_train.csv', encoding='shift_jis')

# Print the column names
print(df.columns)

columns_to_use = df.columns[1:]

Index(['Unnamed: 0', 'Utterance', 'Speaker', 'Emotion', 'Sentiment',
       'Dialogue_ID', 'Utterance_ID'],
      dtype='object')


In [10]:
# Read the CSV file
X_train = pd.read_csv('data\DatasetPreparation\X_train.csv', encoding='shift_jis', usecols=columns_to_use)
X_test = pd.read_csv('data\DatasetPreparation\X_test.csv', encoding='shift_jis', usecols=columns_to_use)
X_dev = pd.read_csv('data\DatasetPreparation\X_dev.csv', encoding='shift_jis', usecols=columns_to_use)

y_train = pd.read_csv('data\DatasetPreparation\y_train.csv', encoding='shift_jis')
y_test = pd.read_csv('data\DatasetPreparation\y_test.csv', encoding='shift_jis')
y_dev = pd.read_csv('data\DatasetPreparation\y_dev.csv', encoding='shift_jis')

# Display the first three rows
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

print(X_dev.shape)
print(y_dev.shape)

(12176, 6)
(12176, 3)
(3230, 6)
(3230, 3)
(1373, 6)
(1373, 3)


In [11]:
X_train.head(10)

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID
0,also point person company transition system,0,4,neutral,0,0
1,must hand full,1,4,neutral,0,1
2,let talk little bit duty,1,4,neutral,0,3
3,duty right,0,6,positive,0,4
4,heading whole division lot duty,1,4,neutral,0,5
5,see,0,4,neutral,0,6
6,perhaps people dump certain amount,1,4,neutral,0,7
7,good know,0,4,neutral,0,8
8,go detail,1,4,neutral,0,9
9,beg,0,2,negative,0,10


In [12]:
checkFile1 = os.path.isfile("data/dump/" + dataset_path + "/label_encoder.pkl")
checkFile2 = os.path.isfile("data/dump/" + dataset_path + "/label_decoder.pkl")

if not(checkFile1 and checkFile2):
    labels = sorted(set(y_train.Emotion))
    labelEncoder = {label: i for i, label in enumerate(labels)}
    labelDecoder = {i: label for i, label in enumerate(labels)}

    pickle.dump(labelEncoder, open('data/dump/' + dataset_path + '/.pkl', 'wb'))
    pickle.dump(labelDecoder, open('data/dump/' + dataset_path + '/label_decoder.pkl', 'wb'))
else:
    file1 = open('data/dump/' + dataset_path + '/label_encoder.pkl', 'rb')
    file2 = open('data/dump/' + dataset_path + '/label_decoder.pkl', 'rb')
    labelEncoder = pickle.load(file1)
    labelDecoder = pickle.load(file2)
    file1.close()
    file2.close()

In [13]:
labelEncoder

{'anger': 0,
 'disgust': 1,
 'fear': 2,
 'joy': 3,
 'neutral': 4,
 'sadness': 5,
 'surprise': 6}

In [14]:
# Check if the file already exists
checkFile1 = os.path.isfile("data/dump/" + dataset_path + "/labels_train.pkl")
checkFile2 = os.path.isfile("data/dump/" + dataset_path + "/labels_test.pkl")
checkFile3 = os.path.isfile("data/dump/" + dataset_path + "/labels_dev.pkl")

if not (checkFile1 and checkFile2 and checkFile3):
    pickle.dump(X_train["Emotion"], open('data/dump/' + dataset_path + '/labels_train.pkl', 'wb'))
    pickle.dump(X_test["Emotion"], open('data/dump/' + dataset_path + '/labels_test.pkl', 'wb'))
    pickle.dump(X_dev["Emotion"], open('data/dump/' + dataset_path + '/labels_dev.pkl', 'wb'))

Creating an embedding

Testing on smaller data. Uncomment to see the size of updated representations


In [15]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
model = transformers.BertModel.from_pretrained(model_name)

# Define your dialog data
dialogs = [
    "How are you today?",
    "I'm doing well, thank you!",
    "That's good to hear.",
    "Yes, it is.",
    "Do you have any plans for the weekend?",
    "Not really, just relaxing at home.",
    "Sounds nice.",
    "Indeed."
]

# Tokenize and encode the dialogs
encoded_dialogs = [tokenizer.encode(dialog, add_special_tokens=True) for dialog in dialogs]

# Pad sequences to the same length
max_length = max(len(dialog) for dialog in encoded_dialogs)
padded_dialogs = [dialog + [tokenizer.pad_token_id] * (max_length - len(dialog)) for dialog in encoded_dialogs]

# Create attention masks
attention_masks = [[1] * len(dialog) + [0] * (max_length - len(dialog)) for dialog in encoded_dialogs]

# Convert to PyTorch tensors
input_ids = torch.tensor(padded_dialogs)
attention_masks = torch.tensor(attention_masks)

# Obtain the BERT embeddings
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_masks)

# Extract the contextual embeddings (CLS token)
contextual_embeddings = outputs[0][:, 0, :]  # Extract embeddings for the [CLS] token

# Print the shape of the contextual embeddings
print("Shape of contextual embeddings:", contextual_embeddings.shape)

Shape of contextual embeddings: torch.Size([8, 768])


In [16]:
# input_ids

In [17]:
# attention_masks

This is just a duplicate of code above. Using this on train data

In [18]:
rangesTrain = find_value_ranges(X_train["Dialogue_ID"])
print(len(rangesTrain))

rangesTest = find_value_ranges(X_test["Dialogue_ID"])
print(len(rangesTest))

rangesDev = find_value_ranges(X_dev["Dialogue_ID"])
print(len(rangesDev))

2158
577
269


#### Contexualized train data

In [19]:
checkFile = os.path.isfile("embed/" + dataset_path + "/u_prime_BERT_train.pkl")
start_time = time.time()

if not checkFile:
    dialogs = []
    for range_pair, iteration in tqdm(zip(rangesTrain, range(len(rangesTrain))), desc="Processing Ranges"):
        start_idx, end_idx = range_pair
        dialog = list(X_train['Utterance'][start_idx:end_idx + 1])
        dialogs.append(dialog)

    # List to store contextual embeddings for each utterance
    contextualEmbeddingsTrain = []

    # Iterate through each dialog
    for dialog in tqdm(dialogs, desc="Processing Dialogs"):
        # Tokenize and convert dialog to input IDs
        inputs = tokenizer(dialog, return_tensors='pt', padding=True, truncation=True)

        # Get BERT model outputs
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract contextual embeddings (CLS token represents the entire sequence)
        embeddings = outputs.last_hidden_state[:, 0, :].tolist()

        # Store embeddings for each utterance in the dialog
        contextualEmbeddingsTrain.append(torch.tensor(embeddings))

    file_path = f'embed/' + dataset_path + '/u_prime_BERT_train.pkl'
    with open(file_path, 'wb') as file:
            pickle.dump(contextualEmbeddingsTrain, file)

else:
    file_path = f'embed/' + dataset_path + '/u_prime_BERT_train.pkl'
    with open(file_path, 'rb') as file:
        contextualEmbeddingsTrain = pickle.load(file)
        
# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Contexualized train data - Elapsed time: {elapsed_time} seconds")

Contexualized train data - Elapsed time: 0.18141508102416992 seconds


<h4> Contexualize test data

In [20]:
checkFile = os.path.isfile("embed/" + dataset_path + "/u_prime_BERT_test.pkl")
start_time = time.time()

if not checkFile:
    dialogs = []
    for range_pair in tqdm(rangesTest):
        start_idx, end_idx = range_pair            
        dialog = list(X_test['Utterance'][start_idx:end_idx + 1])
        dialogs.append(dialog)

    contextualEmbeddingsTest = []

    for dialog in tqdm(dialogs, desc="Processing Dialogs"):
        inputs = tokenizer(dialog, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].tolist()
        contextualEmbeddingsTest.append(torch.tensor(embeddings))

    file_path = f'embed/' + dataset_path + '/u_prime_BERT_test.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump(contextualEmbeddingsTest, file)

else:
    file_path = f'embed/' + dataset_path + '/u_prime_BERT_test.pkl'
    with open(file_path, 'rb') as file:
        contextualEmbeddingsTest = pickle.load(file)

end_time = time.time()

elapsed_time = end_time - start_time
print(f"Contexualized test data - Elapsed time: {elapsed_time} seconds")

Contexualized test data - Elapsed time: 0.04954648017883301 seconds


<h4> Contexualize val data

In [21]:
checkFile = os.path.isfile("embed/" + dataset_path + "/u_prime_BERT_dev.pkl")
start_time = time.time()

if not checkFile:
    dialogs = []
    for range_pair in tqdm(rangesDev):
        start_idx, end_idx = range_pair            
        dialog = list(X_dev['Utterance'][start_idx:end_idx + 1])
        dialogs.append(dialog)

    contextualEmbeddingsDev = []

    for dialog in tqdm(dialogs, desc="Processing Dialogs"):
        inputs = tokenizer(dialog, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].tolist()
        contextualEmbeddingsDev.append(torch.tensor(embeddings))

    file_path = f'embed/' + dataset_path + '/u_prime_BERT_dev.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump(contextualEmbeddingsDev, file)

else:
    file_path = f'embed/' + dataset_path + '/u_prime_BERT_dev.pkl'
    with open(file_path, 'rb') as file:
        contextualEmbeddingsDev = pickle.load(file)

end_time = time.time()

elapsed_time = end_time - start_time
print(f"Contexualized test data - Elapsed time: {elapsed_time} seconds")

Contexualized test data - Elapsed time: 0.027005672454833984 seconds


<h4> Getting speaker encoder for train set

In [22]:
# Check if the file exists
checkFile = os.path.isfile("data/dump/" + dataset_path + "/speaker_encoder_train.pkl")
encodedSpeakersTrain = []

if not checkFile:
    for range_pair in rangesTrain:
        start_idx, end_idx = range_pair
        speaker_per_dialog = X_train['Speaker'][start_idx:end_idx + 1].copy()
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}
        speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)}

        encoded_speaker = speaker_per_dialog.replace(speaker_encoder)
        encodedSpeakersTrain.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    file_path = 'data/dump/' + dataset_path + '/speaker_encoder_train.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersTrain, rangesTrain], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file
    file = open('data/dump/' + dataset_path + '/speaker_encoder_train.pkl', "rb")
    encodedSpeakersTrain, rangesTrain = pickle.load(file)
    file.close()

<h4> Getting speaker encoder for test set

In [23]:
checkFile = os.path.isfile("data/dump/" + dataset_path + "/speaker_encoder_test.pkl")
encodedSpeakersTest = []

if not checkFile:
    for range_pair in rangesTest:
        start_idx, end_idx = range_pair
        speaker_per_dialog = X_test['Speaker'][start_idx:end_idx + 1].copy()
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}
        speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)}

        encoded_speaker = speaker_per_dialog.replace(speaker_encoder)
        encodedSpeakersTest.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    file_path = 'data/dump/' + dataset_path + '/speaker_encoder_test.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersTest, rangesTest], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file
    file = open('data/dump/' + dataset_path + '/speaker_encoder_test.pkl', "rb")
    encodedSpeakersTest, rangesTest = pickle.load(file)
    file.close()

<h4> Getting speaker encoder for val set

In [24]:
checkFile = os.path.isfile("data/dump/" + dataset_path + "/speaker_encoder_dev.pkl")
encodedSpeakersDev = []

if not checkFile:
    for range_pair in rangesDev:
        start_idx, end_idx = range_pair
        speaker_per_dialog = X_dev['Speaker'][start_idx:end_idx + 1].copy()
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}
        speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)}

        encoded_speaker = speaker_per_dialog.replace(speaker_encoder)
        encodedSpeakersDev.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    file_path = 'data/dump/' + dataset_path + '/speaker_encoder_dev.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersDev, rangesDev], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file
    file = open('data/dump/' + dataset_path + '/speaker_encoder_dev.pkl', "rb")
    encodedSpeakersDev, rangesDev = pickle.load(file)
    file.close()

<h4>Getting data required for graph processing

In [25]:
D_m = 100
D_g = 150
D_p = 150
D_e = 384
D_h = 100
D_a = 100
graph_h=100
n_speakers=2
max_seq_len=110
window_past=0
window_future=5
# vocab_size=vocab_size
n_classes=7
listener_state=False
context_attention='general'
dropout=0.5
nodal_attention=False
no_cuda=True
n_relations = 2 * n_speakers ** 2
att_model = MaskedEdgeAttention(2 * D_e, max_seq_len, no_cuda)
nodal_attention=True
edge_type_mapping = {}
for j in range(n_speakers):
    for k in range(n_speakers):
        edge_type_mapping[str(j) + str(k) + '0'] = len(edge_type_mapping)
        edge_type_mapping[str(j) + str(k) + '1'] = len(edge_type_mapping)

In [26]:
for sample, ranges in zip(contextualEmbeddingsTest, rangesTest):
    if ranges[1] - ranges[0] == 0:
        print(sample.shape, ranges)

torch.Size([1, 768]) (418, 418)
torch.Size([1, 768]) (419, 419)
torch.Size([1, 768]) (422, 422)
torch.Size([1, 768]) (549, 549)
torch.Size([1, 768]) (1611, 1611)
torch.Size([1, 768]) (1612, 1612)
torch.Size([1, 768]) (2116, 2116)
torch.Size([1, 768]) (2117, 2117)
torch.Size([1, 768]) (2442, 2442)
torch.Size([1, 768]) (2546, 2546)


In [27]:
class ContextDataset(Dataset):
    def __init__(self, contextEmbeddings, rangesSet, encodedSpeakersSet):
        # print(contextEmbeddings[0])
        self.contextEmbeddings = contextEmbeddings
        self.rangesSet = rangesSet
        self.encodedSpeakersSet = encodedSpeakersSet

    def __len__(self):
        return len(self.rangesSet)

    def __getitem__(self, idx):
        startIdx, endIdx = self.rangesSet[idx]
#         sequence = self.X_set["sequence"][startIdx:endIdx+1].tolist()
        convs = self.contextEmbeddings[idx]
        qmask = self.encodedSpeakersSet[startIdx: endIdx+1]
        return convs, qmask

# Define the ContextEncoding function
def ContextEncoding(file_path, dataset):
    all_emotions, all_umask, all_seq_lengths = [], [], []
    all_features, all_edge_index, all_edge_norm, all_edge_type, all_edge_index_lengths = [], [], [], [], []
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    for convs, qmask in tqdm(dataloader, desc="Encoding Progress", unit="batch"):
        convs = convs[0]
        textf = convs.unsqueeze(1)

        umask = torch.FloatTensor([[1] * len(textf)])  # Adjust to (1, utterance_size)
#         print(umask)
        lengths = [(umask[j] == 1).nonzero().tolist()[-1][0] + 1 for j in range(len(umask))]
#         print("qmask.shape: ", qmask.shape, lengths)
#         U = cnn_feat_extractor(textf, umask)
#         emotions, hidden = lstm(U)
#         all_emotions.append(emotions)
        
        features, edge_index, \
        edge_norm, edge_type, \
        edge_index_lengths = batch_graphify(textf, 
                                            qmask,
                                            lengths,
                                            window_past,
                                            window_future,
                                            edge_type_mapping,
                                            att_model, 
                                            no_cuda)
        all_umask.append(umask)
        all_seq_lengths.append(lengths)
        all_features.append(features)
        all_edge_index.append(edge_index)
        all_edge_norm.append(edge_norm)
        all_edge_type.append(edge_type)
        all_edge_index_lengths.append(edge_index_lengths)
        
    with open(file_path, 'wb') as file:
        pickle.dump([   all_umask, \
                        all_seq_lengths,
                        all_features, \
                        all_edge_index, \
                        all_edge_norm, \
                        all_edge_type, \
                        all_edge_index_lengths], file)
    
    return all_features, all_edge_index, all_edge_norm, all_edge_type, all_edge_index_lengths

# File paths
file_path1 = 'embed/' + dataset_path + '/pre_h_prime_BERT_train.pkl'
file_path2 = 'embed/' + dataset_path + '/pre_h_prime_BERT_test.pkl'
file_path3 = 'embed/' + dataset_path + '/pre_h_prime_BERT_dev.pkl'

# Check if files exist
checkFile1 = os.path.isfile(file_path1)
checkFile2 = os.path.isfile(file_path2)
checkFile3 = os.path.isfile(file_path3)

if not checkFile1:
    encodedSpeakersFlat = [speaker for dialogue in encodedSpeakersTrain for speaker in dialogue]
    oheEncodedSpeakersFlat = torch.FloatTensor([[1, 0] if x == 0 else [0, 1] for x in encodedSpeakersFlat])
    trainDataset = ContextDataset(contextualEmbeddingsTrain, rangesTrain, oheEncodedSpeakersFlat)
    start_time = time.time()
    
    all_features, \
     all_edge_index, \
     all_edge_norm, \
     all_edge_type, \
     all_edge_index_lengths = ContextEncoding(file_path1, trainDataset)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("It took", elapsed_time, "seconds to encode train text")

if not checkFile2:
    encodedSpeakersFlat = [speaker for dialogue in encodedSpeakersTest for speaker in dialogue]
    oheEncodedSpeakersFlat = torch.FloatTensor([[1, 0] if x == 0 else [0, 1] for x in encodedSpeakersFlat])
    
    testDataset = ContextDataset(contextualEmbeddingsTest, rangesTest, oheEncodedSpeakersFlat)
    start_time = time.time()
    testContext, _, _, _, _ = ContextEncoding(file_path2, testDataset)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("It took", elapsed_time, "seconds to encode test text")
    
if not checkFile3:
    encodedSpeakersFlat = [speaker for dialogue in encodedSpeakersDev for speaker in dialogue]
    oheEncodedSpeakersFlat = torch.FloatTensor([[1, 0] if x == 0 else [0, 1] for x in encodedSpeakersFlat])
    
    devDataset = ContextDataset(contextualEmbeddingsDev, rangesDev, oheEncodedSpeakersFlat)
    start_time = time.time()
    devContext, _, _, _, _ = ContextEncoding(file_path3, devDataset)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("It took", elapsed_time, "seconds to encode test text")

if checkFile1 and checkFile2 and checkFile3:
#     with open(file_path1[0], 'rb') as file1:
#         trainContext = pickle.load(file1)
    with open(file_path1, 'rb') as file1:
        _, _, features, edge_index, \
        edge_norm, edge_type, edge_index_lengths = pickle.load(file1)     
#     with open(file_path2[0], 'rb') as file2:
#         testContext = pickle.load(file2)
#     with open(file_path3[0], 'rb') as file3:
#         devContext = pickle.load(file3)

Unsupervised visualization

In [28]:
# Assuming contextual_embeddings is your list of contextual embeddings

# Flatten the list of contextual embeddings into a single list
flattened_embeddings = [emb for dialogue in contextual_embeddings for emb in dialogue]

# Convert the flattened list into a single tensor
tensor_data = torch.tensor(flattened_embeddings)

# Check the shape of the tensor
print(tensor_data.shape)

torch.Size([6144])


In [29]:
labelDecoder

{0: 'anger',
 1: 'disgust',
 2: 'fear',
 3: 'joy',
 4: 'neutral',
 5: 'sadness',
 6: 'surprise'}

Distribution of labels in train data

In [30]:
# Calculate the counts for each unique label
uniqueLabelsTrain, labelCountsTrain = np.unique(list(X_train["Emotion"]), return_counts=True)

# Print the counts for each unique label
for label, count in zip(uniqueLabelsTrain, labelCountsTrain):
    print(f"{labelDecoder[label]}: {count} train occurrences")

anger: 1445 train occurrences
disgust: 356 train occurrences
fear: 320 train occurrences
joy: 2241 train occurrences
neutral: 5658 train occurrences
sadness: 853 train occurrences
surprise: 1303 train occurrences


Distribution of labels in test data

In [31]:
# Calculate the counts for each unique label
uniqueLabelsTest, labelCountsTest = np.unique(list(X_test["Emotion"]), return_counts=True)

# Print the counts for each unique label
for label, count in zip(uniqueLabelsTest, labelCountsTest):
    print(f"{labelDecoder[label]}: {count} test occurrences")

anger: 500 test occurrences
disgust: 96 test occurrences
fear: 56 test occurrences
joy: 476 test occurrences
neutral: 1541 test occurrences
sadness: 258 test occurrences
surprise: 303 test occurrences
