In [201]:
import os,re, time, pickle, collections, importlib, datetime, torch, nltk, pandas as pd, numpy as np, time
from chardet import detect
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict, Counter
from wordebd import WORDEBD
from vocab import Vocab, Vectors
from munch import Munch
from cnnlstmseq import CNNLSTMseq
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from nltk.tokenize import word_tokenize
from transformers import BertModel, BertTokenizer
import transformers
from torch.utils.data import DataLoader, TensorDataset, Dataset
from model import batch_graphify, MaskedEdgeAttention, MaskedNLLLoss, LSTMModel
from model import DATASET_PATH
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Make sure to specify which dataset to use
- dataset_original
- dataset_drop_noise
- dataset_smote

In [203]:
# dataset_path = "dataset_original"
# dataset_path = "dataset_drop_noise"
# dataset_path = "dataset_smote"
dataset_path = DATASET_PATH

In [204]:
def get_encoding_type(file):
    with open(file, 'rb') as f:
        rawdata = f.read()
    return detect(rawdata)['encoding']

def detect_misspelling(source):
    pass

def replace_spelling(source):
    return re.sub("", "", source)

In [205]:
def preprocess_text(data):
    '''
    Preprocess text data
    @param data: list of text examples
    @return preprocessed_data: list of preprocessed text examples
    '''
    preprocessed_data = []
    for example in data:
        # Convert to lowercase
#         example = example.lower()
        # Remove punctuation
        example = re.sub(r'[^\w\s]', '\'', example)
        preprocessed_data.append(example)
    return preprocessed_data

def load_pretrained_glove():
    print("Loading GloVe...")
    glv_vector = {}
    f = open('/embed/glove/glove.840B.300d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word, coefs = values[0], np.asarray(values[1:], dtype='float')
        try:
            glv_vector[word] = coefs
        except ValueError:
            continue
    f.close()
    start_time = time.time()
    print(f"Took {time.time() - start_time} seconds to load pretrained GloVe model.")
    return glv_vector

def encode_labels(encoder, l):
    return encoder[l]

def _read_words(data, convmode=None):
    '''    
    Count the occurrences of all words
    @param convmode: str, None for non conversational scope, 'naive' for classic or naive approach, 'conv' for conversation depth into account (one additional dim and nested values)
    @param data: list of examples
    @return words: list of words (with duplicates)
    '''    
    words = []
    if convmode is None:
        for example in data:
            words += example.split()
    return words

def find_value_ranges(lst):
    value_ranges = []
    start_index = 0

    for i in range(1, len(lst)):
        if lst[i] != lst[i - 1]:
            value_ranges.append((start_index, i - 1))
            start_index = i

    # Add the last range
    value_ranges.append((start_index, len(lst) - 1))

    return value_ranges

In [206]:
# Read the CSV file
X_train = pd.read_csv('data/' + dataset_path + '/train_sent_emo_dya.csv', encoding='shift_jis')
X_test = pd.read_csv('data/' + dataset_path+ '/test_sent_emo_dya.csv', encoding='utf-8')
X_dev = pd.read_csv('data/' + dataset_path + '/dev_sent_emo_dya.csv', encoding='utf-8')

# Display the first three rows
print(X_train.shape)
print(X_test.shape)
print(X_dev.shape)

(12840, 12)
(3400, 12)
(1462, 12)


In [207]:
rangesTrain = find_value_ranges(X_train["Dialogue_ID"])
print(len(rangesTrain))

rangesTest = find_value_ranges(X_test["Dialogue_ID"])
print(len(rangesTest))

rangesDev = find_value_ranges(X_dev["Dialogue_ID"])
print(len(rangesDev))

2160
577
270


In [208]:
# Define features to drop
drop_features = list(X_train.columns[6:]) 

# Create DataFrame for target labels
y_train = pd.DataFrame()
y_test = pd.DataFrame()
y_dev = pd.DataFrame()

y_train["Emotion"] = X_train["Emotion"].copy()
y_test["Emotion"] = X_test["Emotion"].copy()
y_dev["Emotion"] = X_dev["Emotion"].copy()

y_train["Dialogue_ID"] = X_train["Dialogue_ID"].copy()
y_test["Dialogue_ID"] = X_test["Dialogue_ID"].copy()
y_dev["Dialogue_ID"] = X_dev["Dialogue_ID"].copy()

# Drop features from X_train DataFrame
X_train = X_train.drop(drop_features, axis=1)
X_test = X_test.drop(drop_features, axis=1)
X_dev = X_dev.drop(drop_features, axis=1)

Before spelling correction

In [209]:
# X_train[["Utterance", "Emotion"]][:50]

In [210]:
# X_test[["Utterance", "Emotion"]][:50]

In [211]:
# X_train["Utterance"] = X_train["Utterance"].apply(lambda x: replace_spelling(x))
# X_test["Utterance"] = X_test["Utterance"].apply(lambda x: replace_spelling(x))

# X_train["Utterance"] = preprocess_text(X_train["Utterance"].tolist())
# X_test["Utterance"] = preprocess_text(X_test["Utterance"].tolist())

In [212]:
# Function to remove instances based on index ranges
def removeInstanceTurn1(X_set, Y_set, ranges):
    indices_to_remove = []
    for range_pair in ranges:
        i = range_pair[0]
        j = range_pair[1]
        
        if i == j:
            indices_to_remove.append(i)
#             print(X_set["Utterance"].iloc[i])
#             print(i, " vs ", len(X_train))

    X_set = X_set.drop(X_set.index[indices_to_remove])
    Y_set = Y_set.drop(Y_set.index[indices_to_remove])
    
    # Reset index after dropping rows
    X_set.reset_index(drop=True, inplace=True)
    Y_set.reset_index(drop=True, inplace=True)
    
    return X_set, Y_set


X_train, y_train = removeInstanceTurn1(X_train, y_train, rangesTrain)
X_test, y_test = removeInstanceTurn1(X_test, y_test, rangesTest)
X_dev, y_dev = removeInstanceTurn1(X_dev, y_dev, rangesDev)

# print("X_set_cleaned:")
# print(X_train_cleaned)
# print("\nY_set_cleaned:")
# print(Y_train_cleaned)

In [213]:
checkFile1 = os.path.isfile("data/dump/" + dataset_path + "/label_encoder.pkl")
checkFile2 = os.path.isfile("data/dump/" + dataset_path + "/label_decoder.pkl")

if not(checkFile1 and checkFile2):
    labels = sorted(set(y_train.Emotion))
    labelEncoder = {label: i for i, label in enumerate(labels)}
    labelDecoder = {i: label for i, label in enumerate(labels)}

    pickle.dump(labelEncoder, open('data/dump/' + dataset_path + '/label_encoder.pkl', 'wb'))
    pickle.dump(labelDecoder, open('data/dump/' + dataset_path + '/label_decoder.pkl', 'wb'))
else:
    file1 = open('data/dump/' + dataset_path + '/label_encoder.pkl', 'rb')
    file2 = open('data/dump/' + dataset_path + '/label_decoder.pkl', 'rb')
    labelEncoder = pickle.load(file1)
    labelDecoder = pickle.load(file2)
    file1.close()
    file2.close()

In [214]:
# Apply label encoding to the "Emotion" column in y_train
if not isinstance(y_train["Emotion"][0], np.int64):   
    y_train["Emotion"] = y_train["Emotion"].apply(lambda x: encode_labels(labelEncoder, x))
    y_test["Emotion"] = y_test["Emotion"].apply(lambda x: encode_labels(labelEncoder, x))
    y_dev["Emotion"] = y_dev["Emotion"].apply(lambda x: encode_labels(labelEncoder, x))

# Copy the encoded "Emotion" column from y_train to X_train
X_train["Emotion"] = y_train["Emotion"].copy()
X_test["Emotion"] = y_test["Emotion"].copy()
X_dev["Emotion"] = y_dev["Emotion"].copy()

In [215]:
# Check if the file already exists
checkFile1 = os.path.isfile("data/dump/" + dataset_path + "/labels_train.pkl")
checkFile2 = os.path.isfile("data/dump/" + dataset_path + "/labels_test.pkl")
checkFile3 = os.path.isfile("data/dump/" + dataset_path + "/labels_dev.pkl")

if not (checkFile1 and checkFile2 and checkFile3):
    pickle.dump(X_train["Emotion"], open('data/dump/' + dataset_path + '/labels_train.pkl', 'wb'))
    pickle.dump(X_test["Emotion"], open('data/dump/' + dataset_path + '/labels_test.pkl', 'wb'))
    pickle.dump(X_dev["Emotion"], open('data/dump/' + dataset_path + '/labels_dev.pkl', 'wb'))

Creating an embedding

Testing on smaller data. Uncomment to see the size of updated representations


In [216]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
model = transformers.BertModel.from_pretrained(model_name)

# Define your dialog data
dialogs = [
    "How are you today?",
    "I'm doing well, thank you!",
    "That's good to hear.",
    "Yes, it is.",
    "Do you have any plans for the weekend?",
    "Not really, just relaxing at home.",
    "Sounds nice.",
    "Indeed."
]

# Tokenize and encode the dialogs
encoded_dialogs = [tokenizer.encode(dialog, add_special_tokens=True) for dialog in dialogs]

# Pad sequences to the same length
max_length = max(len(dialog) for dialog in encoded_dialogs)
padded_dialogs = [dialog + [tokenizer.pad_token_id] * (max_length - len(dialog)) for dialog in encoded_dialogs]

# Create attention masks
attention_masks = [[1] * len(dialog) + [0] * (max_length - len(dialog)) for dialog in encoded_dialogs]

# Convert to PyTorch tensors
input_ids = torch.tensor(padded_dialogs)
attention_masks = torch.tensor(attention_masks)

# Obtain the BERT embeddings
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_masks)

# Extract the contextual embeddings (CLS token)
contextual_embeddings = outputs[0][:, 0, :]  # Extract embeddings for the [CLS] token

# Print the shape of the contextual embeddings
print("Shape of contextual embeddings:", contextual_embeddings.shape)

Shape of contextual embeddings: torch.Size([8, 768])


In [217]:
# input_ids

In [218]:
# attention_masks

This is just a duplicate of code above. Using this on train data

In [219]:
rangesTrain = find_value_ranges(X_train["Dialogue_ID"])
print(len(rangesTrain))

rangesTest = find_value_ranges(X_test["Dialogue_ID"])
print(len(rangesTest))

rangesDev = find_value_ranges(X_dev["Dialogue_ID"])
print(len(rangesDev))

2160
577
270


In [220]:
X_train["Utterance"][rangesTrain[0][0]:rangesTrain[0][1]]

0     also I was the point person on my company’s tr...
1                      You must’ve had your hands full.
2                               That I did. That I did.
3         So let’s talk a little bit about your duties.
4                                My duties?  All right.
5     Now you’ll be heading a whole division, so you...
6                                                I see.
7     But there’ll be perhaps 30 people under you so...
8                                         Good to know.
9                                 We can go into detail
10                               No don’t I beg of you!
11    All right then, we’ll have a definite answer f...
12                                             Really?!
Name: Utterance, dtype: object

Testing on small sample

In [221]:
# # Load pre-trained BERT model and tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

# # List of text dialogs
# dialogs = [
#     ["How are you today?", "I'm doing well, thank you!"],
#     ["That's good to hear.", "Yes, it is.", "Do you have any plans for the weekend?", "Not really, just relaxing at home."],
#     ["Sounds nice.", "Indeed."]
# ]

# # List to store contextual embeddings for each utterance
# contextual_embeddings = []

# # Iterate through each dialog
# for dialog in dialogs:
#     # Tokenize and convert dialog to input IDs
#     inputs = tokenizer(dialog, return_tensors='pt', padding=True, truncation=True)
    
#     # Get BERT model outputs
#     with torch.no_grad():
#         outputs = model(**inputs)

#     # Extract contextual embeddings (CLS token represents the entire sequence)
#     embeddings = outputs.last_hidden_state[:, 0, :].tolist()

#     # Store embeddings for each utterance in the dialog
#     contextual_embeddings.append(embeddings)

# # Print the list of contextual embeddings
# print("List of Contextual Embeddings:")
# # for embedding in contextual_embeddings:
# #     print(embedding)

#### Contexualized train data

In [247]:
checkFile = os.path.isfile("embed/" + dataset_path + "/u_prime_BERT_train.pkl")
start_time = time.time()

if not checkFile:
    dialogs = []
    for range_pair, iteration in tqdm(zip(rangesTrain, range(len(rangesTrain))), desc="Processing Ranges"):
        start_idx, end_idx = range_pair
        dialog = list(X_train['Utterance'][start_idx:end_idx + 1])
        dialogs.append(dialog)

    # List to store contextual embeddings for each utterance
    contextualEmbeddingsTrain = []

    # Iterate through each dialog
    for dialog in tqdm(dialogs, desc="Processing Dialogs"):
        # Tokenize and convert dialog to input IDs
        inputs = tokenizer(dialog, return_tensors='pt', padding=True, truncation=True)

        # Get BERT model outputs
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract contextual embeddings (CLS token represents the entire sequence)
        embeddings = outputs.last_hidden_state[:, 0, :].tolist()

        # Store embeddings for each utterance in the dialog
        contextualEmbeddingsTrain.append(torch.tensor(embeddings))

    file_path = f'embed/' + dataset_path + '/u_prime_BERT_train.pkl'
    with open(file_path, 'wb') as file:
            pickle.dump(contextualEmbeddingsTrain, file)

else:
    file_path = f'embed/' + dataset_path + '/u_prime_BERT_train.pkl'
    with open(file_path, 'rb') as file:
        contextualEmbeddingsTrain = pickle.load(file)
        
# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Contexualized train data - Elapsed time: {elapsed_time} seconds")

Processing Ranges: 2160it [00:00, 55304.10it/s]
Processing Dialogs: 100%|██████████████████████████████████████████████████████████| 2160/2160 [07:25<00:00,  4.85it/s]


Contexualized train data - Elapsed time: 447.6954336166382 seconds


<h4> Contexualize test data

In [248]:
checkFile = os.path.isfile("embed/" + dataset_path + "/u_prime_BERT_test.pkl")
start_time = time.time()

if not checkFile:
    dialogs = []
    for range_pair in tqdm(rangesTest):
        start_idx, end_idx = range_pair            
        dialog = list(X_test['Utterance'][start_idx:end_idx + 1])
        dialogs.append(dialog)

    contextualEmbeddingsTest = []

    for dialog in tqdm(dialogs, desc="Processing Dialogs"):
        inputs = tokenizer(dialog, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].tolist()
        contextualEmbeddingsTest.append(torch.tensor(embeddings))

    file_path = f'embed/' + dataset_path + '/u_prime_BERT_test.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump(contextualEmbeddingsTest, file)

else:
    file_path = f'embed/' + dataset_path + '/u_prime_BERT_test.pkl'
    with open(file_path, 'rb') as file:
        contextualEmbeddingsTest = pickle.load(file)

end_time = time.time()

elapsed_time = end_time - start_time
print(f"Contexualized test data - Elapsed time: {elapsed_time} seconds")

100%|█████████████████████████████████████████████████████████████████████████████| 577/577 [00:00<00:00, 38644.53it/s]
Processing Dialogs: 100%|████████████████████████████████████████████████████████████| 577/577 [02:02<00:00,  4.71it/s]


Contexualized test data - Elapsed time: 124.44786024093628 seconds


<h4> Contexualize val data

In [249]:
# rangesDev

In [250]:
checkFile = os.path.isfile("embed/" + dataset_path + "/u_prime_BERT_dev.pkl")
start_time = time.time()

if not checkFile:
    dialogs = []
    for range_pair in tqdm(rangesDev):
        start_idx, end_idx = range_pair            
        dialog = list(X_dev['Utterance'][start_idx:end_idx + 1])
        dialogs.append(dialog)

    contextualEmbeddingsDev = []

    for dialog in tqdm(dialogs, desc="Processing Dialogs"):
        inputs = tokenizer(dialog, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].tolist()
        contextualEmbeddingsDev.append(torch.tensor(embeddings))

    file_path = f'embed/' + dataset_path + '/u_prime_BERT_dev.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump(contextualEmbeddingsDev, file)

else:
    file_path = f'embed/' + dataset_path + '/u_prime_BERT_dev.pkl'
    with open(file_path, 'rb') as file:
        contextualEmbeddingsDev = pickle.load(file)

end_time = time.time()

elapsed_time = end_time - start_time
print(f"Contexualized test data - Elapsed time: {elapsed_time} seconds")

100%|████████████████████████████████████████████████████████████████████████████████████████| 270/270 [00:00<?, ?it/s]
Processing Dialogs: 100%|████████████████████████████████████████████████████████████| 270/270 [00:51<00:00,  5.26it/s]

Contexualized test data - Elapsed time: 51.460155963897705 seconds





<h4> Getting speaker encoder for train set

In [251]:
# Check if the file exists
checkFile = os.path.isfile("data/dump/" + dataset_path + "/speaker_encoder_train.pkl")
encodedSpeakersTrain = []

if not checkFile:
    for range_pair in rangesTrain:
        start_idx, end_idx = range_pair
        speaker_per_dialog = X_train['Speaker'][start_idx:end_idx + 1].copy()
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}
        speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)}

        encoded_speaker = speaker_per_dialog.replace(speaker_encoder)
        encodedSpeakersTrain.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    file_path = 'data/dump/' + dataset_path + '/speaker_encoder_train.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersTrain, rangesTrain], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file
    file = open('data/dump/' + dataset_path + '/speaker_encoder_train.pkl', "rb")
    encodedSpeakersTrain, rangesTrain = pickle.load(file)
    file.close()

<h4> Getting speaker encoder for test set

In [252]:
checkFile = os.path.isfile("data/dump/" + dataset_path + "/speaker_encoder_test.pkl")
encodedSpeakersTest = []

if not checkFile:
    for range_pair in rangesTest:
        start_idx, end_idx = range_pair
        speaker_per_dialog = X_test['Speaker'][start_idx:end_idx + 1].copy()
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}
        speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)}

        encoded_speaker = speaker_per_dialog.replace(speaker_encoder)
        encodedSpeakersTest.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    file_path = 'data/dump/' + dataset_path + '/speaker_encoder_test.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersTest, rangesTest], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file
    file = open('data/dump/' + dataset_path + '/speaker_encoder_test.pkl', "rb")
    encodedSpeakersTest, rangesTest = pickle.load(file)
    file.close()

<h4> Getting speaker encoder for val set

In [253]:
checkFile = os.path.isfile("data/dump/" + dataset_path + "/speaker_encoder_dev.pkl")
encodedSpeakersDev = []

if not checkFile:
    for range_pair in rangesDev:
        start_idx, end_idx = range_pair
        speaker_per_dialog = X_dev['Speaker'][start_idx:end_idx + 1].copy()
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}
        speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)}

        encoded_speaker = speaker_per_dialog.replace(speaker_encoder)
        encodedSpeakersDev.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    file_path = 'data/dump/' + dataset_path + '/speaker_encoder_dev.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersDev, rangesDev], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file
    file = open('data/dump/' + dataset_path + '/speaker_encoder_dev.pkl', "rb")
    encodedSpeakersDev, rangesDev = pickle.load(file)
    file.close()

<h4>Getting data required for graph processing

In [254]:
D_m = 100
D_g = 150
D_p = 150
D_e = int(contextualEmbeddingsTrain[0].shape[1]/2)
D_h = 100
D_a = 100
graph_h=100
n_speakers=2
max_seq_len=110
window_past=0
window_future=5
# vocab_size=vocab_size
n_classes=7
listener_state=False
context_attention='general'
dropout=0.5
nodal_attention=False
no_cuda=True
n_relations = 2 * n_speakers ** 2
att_model = MaskedEdgeAttention(2 * D_e, max_seq_len, no_cuda)
nodal_attention=True
edge_type_mapping = {}
for j in range(n_speakers):
    for k in range(n_speakers):
        edge_type_mapping[str(j) + str(k) + '0'] = len(edge_type_mapping)
        edge_type_mapping[str(j) + str(k) + '1'] = len(edge_type_mapping)

Sample Experiment

In [255]:
# x_data = torch.tensor([
#                         [[1, 2, 3, 0, 0],     # Utterance 1
#                         [4, 5, 0, 0, 0]],    # Utterance 2
#                        [[6, 7, 8, 9, 0],     # Utterance 3
#                         [10, 0, 0, 0, 0]]])  # Utterance 4

# umask_data = torch.tensor([[1, 1],        # Dialogue 1 has 2 utterances
#                            [1, 0]])       # Dialogue 2 has 1 utterance
# # features = cnn_feat_extractor(x_data, umask_data)
# # emotions, hidden = lstm(features)

# textf = x_data.squeeze(0)  # Remove batch dimension (1, utterance_size, embedding_size) -> (utterance_size, embedding_size)
# umask = torch.FloatTensor([[1] * textf.size(0)])  # Adjust to (1, utterance_size)
# lengths = [(umask[j] == 1).nonzero().tolist()[-1][0] + 1 for j in range(len(umask))]

In [256]:
# print(textf)
# print(umask)
# print(lengths)

In [257]:
# encodedSpeakersTrain

June 5 possible mismatch

In [258]:
# for ranges in rangesDev:
#     if ranges[1]-ranges[0] == 0:
#         print("delete")

In [259]:
# max_seq_len = 17
# umask = torch.FloatTensor([[1] * 10 + [0] * (17 - 10)])
# assert umask.size(1) == max_seq_len, f"umask size is {umask.size(1)}, expected {max_seq_len}"
# lengths = [(umask[j] == 1).nonzero().tolist()[-1][0] + 1 for j in range(len(umask))]
# print(umask)
# print(lengths)

In [260]:
class ContextDataset(Dataset):
    def __init__(self, contextEmbeddings, rangesSet, encodedSpeakersSet):
#         print(contextEmbeddings[0])
        self.contextEmbeddings = contextEmbeddings
        self.rangesSet = rangesSet
        self.encodedSpeakersSet = encodedSpeakersSet

    def __len__(self):
        return len(self.rangesSet)

    def __getitem__(self, idx):
        startIdx, endIdx = self.rangesSet[idx]
#         sequence = self.X_set["sequence"][startIdx:endIdx+1].tolist()
        convs = self.contextEmbeddings[idx]
        qmask = self.encodedSpeakersSet[startIdx: endIdx+1]
        return convs, qmask

# Define the ContextEncoding function
def ContextEncoding(file_path, dataset):
    all_emotions, all_umask, all_seq_lengths = [], [], []
    all_features, all_edge_index, all_edge_norm, all_edge_type, all_edge_index_lengths = [], [], [], [], []
    max_seq_len = 30
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    for convs, qmask in tqdm(dataloader, desc="Encoding Progress", unit="batch"):
        convs = convs[0]
        textf = convs.unsqueeze(1)

        umask = torch.FloatTensor([[1] * len(textf) + [0] * (max_seq_len - len(textf))])
        assert umask.size(1) == max_seq_len, f"umask size is {umask.size(1)}, expected {max_seq_len}"
        lengths = [(umask[j] == 1).nonzero().tolist()[-1][0] + 1 for j in range(len(umask))]
        
        features, edge_index, \
        edge_norm, edge_type, \
        edge_index_lengths = batch_graphify(textf, 
                                            qmask,
                                            lengths,
                                            window_past,
                                            window_future,
                                            edge_type_mapping,
                                            att_model, 
                                            no_cuda)
        all_umask.append(umask)
        all_seq_lengths.append(lengths)
        all_features.append(features)
        all_edge_index.append(edge_index)
        all_edge_norm.append(edge_norm)
        all_edge_type.append(edge_type)
        all_edge_index_lengths.append(edge_index_lengths)
        
    with open(file_path, 'wb') as file:
        pickle.dump([   all_umask, \
                        all_seq_lengths,
                        all_features, \
                        all_edge_index, \
                        all_edge_norm, \
                        all_edge_type, \
                        all_edge_index_lengths], file)
    
    return all_features, all_edge_index, all_edge_norm, all_edge_type, all_edge_index_lengths


In [261]:
# File paths
file_path1 = 'embed/' + dataset_path + '/pre_h_prime_BERT_train.pkl'
file_path2 = 'embed/' + dataset_path + '/pre_h_prime_BERT_test.pkl'
file_path3 = 'embed/' + dataset_path + '/pre_h_prime_BERT_dev.pkl'

# Check if files exist
checkFile1 = os.path.isfile(file_path1)
checkFile2 = os.path.isfile(file_path2)
checkFile3 = os.path.isfile(file_path3)

if not checkFile1:
    encodedSpeakersFlat = [speaker for dialogue in encodedSpeakersTrain for speaker in dialogue]
    oheEncodedSpeakersFlat = torch.FloatTensor([[1, 0] if x == 0 else [0, 1] for x in encodedSpeakersFlat])
    trainDataset = ContextDataset(contextualEmbeddingsTrain, rangesTrain, oheEncodedSpeakersFlat)
    start_time = time.time()
    
    all_features, \
     all_edge_index, \
     all_edge_norm, \
     all_edge_type, \
     all_edge_index_lengths = ContextEncoding(file_path1, trainDataset)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("It took", elapsed_time, "seconds to encode train text")

if not checkFile2:
    encodedSpeakersFlat = [speaker for dialogue in encodedSpeakersTest for speaker in dialogue]
    oheEncodedSpeakersFlat = torch.FloatTensor([[1, 0] if x == 0 else [0, 1] for x in encodedSpeakersFlat])
    
    testDataset = ContextDataset(contextualEmbeddingsTest, rangesTest, oheEncodedSpeakersFlat)
    start_time = time.time()
    testContext, _, _, _, _ = ContextEncoding(file_path2, testDataset)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("It took", elapsed_time, "seconds to encode test text")
    
if not checkFile3:
    encodedSpeakersFlat = [speaker for dialogue in encodedSpeakersDev for speaker in dialogue]
    oheEncodedSpeakersFlat = torch.FloatTensor([[1, 0] if x == 0 else [0, 1] for x in encodedSpeakersFlat])
    
    devDataset = ContextDataset(contextualEmbeddingsDev, rangesDev, oheEncodedSpeakersFlat)
    start_time = time.time()
    devContext, _, _, _, _ = ContextEncoding(file_path3, devDataset)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("It took", elapsed_time, "seconds to encode test text")

if checkFile1 and checkFile2 and checkFile3:
#     with open(file_path1[0], 'rb') as file1:
#         trainContext = pickle.load(file1)
    with open(file_path1, 'rb') as file1:
        _, _, features, edge_index, \
        edge_norm, edge_type, edge_index_lengths = pickle.load(file1)     
#     with open(file_path2[0], 'rb') as file2:
#         testContext = pickle.load(file2)
#     with open(file_path3[0], 'rb') as file3:
#         devContext = pickle.load(file3)

Encoding Progress: 100%|███████████████████████████████████████████████████████| 2160/2160 [00:06<00:00, 313.01batch/s]


It took 8.484763145446777 seconds to encode train text


Encoding Progress: 100%|█████████████████████████████████████████████████████████| 577/577 [00:01<00:00, 316.74batch/s]


It took 2.674950361251831 seconds to encode test text


Encoding Progress: 100%|█████████████████████████████████████████████████████████| 270/270 [00:00<00:00, 331.71batch/s]


It took 0.9863080978393555 seconds to encode test text


In [262]:
contextualEmbeddingsTrain[0].shape

torch.Size([14, 768])

In [263]:
print(len(contextualEmbeddingsTrain))
print(len(contextualEmbeddingsTest))
print(len(contextualEmbeddingsDev))

2160
577
270


Unsupervised visualization

In [264]:
# Assuming contextual_embeddings is your list of contextual embeddings

# Flatten the list of contextual embeddings into a single list
flattened_embeddings = [emb for dialogue in contextual_embeddings for emb in dialogue]

# Convert the flattened list into a single tensor
tensor_data = torch.tensor(flattened_embeddings)

# Check the shape of the tensor
print(tensor_data.shape)

torch.Size([6144])


In [265]:
labelDecoder

{0: 'anger',
 1: 'disgust',
 2: 'fear',
 3: 'joy',
 4: 'neutral',
 5: 'sadness',
 6: 'surprise'}

Distribution of labels in train data

In [266]:
# Calculate the counts for each unique label
uniqueLabelsTrain, labelCountsTrain = np.unique(list(X_train["Emotion"]), return_counts=True)

# Print the counts for each unique label
for label, count in zip(uniqueLabelsTrain, labelCountsTrain):
    print(f"{labelDecoder[label]}: {count} train occurrences")

anger: 1500 train occurrences
disgust: 364 train occurrences
fear: 338 train occurrences
joy: 2312 train occurrences
neutral: 5960 train occurrences
sadness: 876 train occurrences
surprise: 1490 train occurrences


Distribution of labels in test data

In [267]:
# Calculate the counts for each unique label
uniqueLabelsTest, labelCountsTest = np.unique(list(X_test["Emotion"]), return_counts=True)

# Print the counts for each unique label
for label, count in zip(uniqueLabelsTest, labelCountsTest):
    print(f"{labelDecoder[label]}: {count} test occurrences")

anger: 516 test occurrences
disgust: 99 test occurrences
fear: 60 test occurrences
joy: 495 test occurrences
neutral: 1615 test occurrences
sadness: 263 test occurrences
surprise: 352 test occurrences


In [278]:
#set this to true if you finish the context encoding, also manually change the file
key = False

In [282]:
def apply_label_decoder(df, columns, decoder):
    for column in columns:
        df[column] = df[column].map(decoder)
    return df

def load_and_concatenate_predictions(dataset_path):
    # Load predictions from pickle files
    file_name = f"data/dump/{dataset_path}/BERT_data_for_classifier/BERT_predictedTest.pkl"
    with open(file_name, 'rb') as file:
        df_predictions_BERT = pickle.load(file)

    file_name = f"data/dump/{dataset_path}/BERT_data_for_classifier/egat_predictedTest.pkl"
    with open(file_name, 'rb') as file:
        df_predictions_BERT_egat = pickle.load(file)

    # Concatenate DataFrames
    df_concatenated = pd.concat([
        X_test[["Utterance", "Emotion"]],
        df_predictions_BERT["predicted_label"].rename("BERT_predicted_label"),
        df_predictions_BERT_egat["predicted_label"].rename("EGAT_predicted_label")
    ], axis=1)

    # Apply label decoder to specified columns
    df_concatenated = apply_label_decoder(df_concatenated, ["Emotion", "BERT_predicted_label", "EGAT_predicted_label"], labelDecoder)

    # Save the DataFrame to CSV
    output_file = f"data/dump/{dataset_path}/BERT_data_for_classifier/test_predictions.csv"
    df_concatenated.to_csv(output_file, index=False)

    return df_concatenated

# Example usage:
if key:
    concatenated_df = load_and_concatenate_predictions(dataset_path)
    print(concatenated_df.head())

                                           Utterance   Emotion  \
0  Why do all you’re coffee mugs have numbers on ...  surprise   
1  Oh. That’s so Monica can keep track. That way ...     anger   
2                                       Y'know what?   neutral   
3                                              Okay.   neutral   
4  Ross, didn't you say that there was an elevato...   neutral   

  BERT_predicted_label EGAT_predicted_label  
0              neutral              neutral  
1              neutral              neutral  
2              neutral              neutral  
3              neutral              neutral  
4              neutral              neutral  


Visualize utterance embeddnig (u') with T-SNE

In [268]:
# labels = torch.tensor(X_train["Emotion"])

In [269]:
# label_decoder

In [270]:
# runTSNE = 1
# if runTSNE:
#     from sklearn.manifold import TSNE
#     import matplotlib.pyplot as plt

#     # List of perplexity values to loop over
#     perplexity_values = [50]

#     # Loop over each perplexity value
#     for perplexity in perplexity_values:
#         # Initialize t-SNE with the current perplexity value
#         tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)

#         # Fit and transform the data using t-SNE
#         h_prime_tsne = tsne.fit_transform(tensor_data.detach().numpy())

#         # Plot the node embeddings with different colors for each label
#         plt.figure(figsize=(10, 8))
#         for label, emotion in zip(range(len(label_encoder)), label_encoder):
#             indices = (labels == label).nonzero().squeeze()
#             plt.scatter(h_prime_tsne[indices, 0], h_prime_tsne[indices, 1], label=f'{emotion}')
#         plt.title(f'Utterance Embeddings (Train) Visualization (t-SNE) - Perplexity {perplexity}', color="white")
#         plt.xlabel('Dimension 1', color="white")
#         plt.ylabel('Dimension 2', color="white")
#         plt.legend()
#         plt.show()

 Visualize utterance embedding (u') with PCA

In [271]:
# pca = PCA(n_components=2)
# pca_result = pca.fit_transform(tensor_data.detach().numpy())

# # Plot the PCA result with color-coded labels
# plt.figure(figsize=(8, 6))
# for label in np.unique(labels):
#     indices = labels == label
#     plt.scatter(pca_result[indices, 0], pca_result[indices, 1], label=f'{label_decoder[label]}', alpha=0.5)
#     plt.title('PCA Visualization of Utterance Embeddings (Train) with Color-Coded Labels')
#     plt.xlabel('Principal Component 1')
#     plt.ylabel('Principal Component 2')
#     plt.legend()
#     plt.grid(True)
#     plt.show()