In [2]:
import os,re, time, pickle, collections, importlib, datetime, torch, nltk, pandas as pd, numpy as np
from chardet import detect
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict, Counter
from wordebd import WORDEBD
from vocab import Vocab, Vectors
from munch import Munch
from cnnlstmseq import CNNLSTMseq
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from nltk.tokenize import word_tokenize

from transformers import BertModel, BertTokenizer
import transformers

# Autoreload extensions (if you're using Jupyter Notebook or IPython)
%load_ext autoreload
%autoreload 2


In [3]:
key=True

In [4]:
def get_encoding_type(file):
    with open(file, 'rb') as f:
        rawdata = f.read()
    return detect(rawdata)['encoding']

def detect_misspelling(source):
    pass

def replace_spelling(source):
    return re.sub("Åf", "'", source)

In [5]:
def preprocess_text(data):
    '''
    Preprocess text data
    @param data: list of text examples
    @return preprocessed_data: list of preprocessed text examples
    '''
    preprocessed_data = []
    for example in data:
        # Convert to lowercase
        example = example.lower()
        # Remove punctuation
        example = re.sub(r'[^\w\s]', '\'', example)
        preprocessed_data.append(example)
    return preprocessed_data

def load_pretrained_glove():
    print("Loading GloVe...")
    glv_vector = {}
    f = open('/embed/glove/glove.840B.300d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word, coefs = values[0], np.asarray(values[1:], dtype='float')
        try:
            glv_vector[word] = coefs
        except ValueError:
            continue
    f.close()
    start_time = time.time()
    print(f"Took {time.time() - start_time} seconds to load pretrained GloVe model.")
    return glv_vector

def encode_labels(encoder, l):
    return encoder[l]

def _read_words(data, convmode=None):
    '''    
    Count the occurrences of all words
    @param convmode: str, None for non conversational scope, 'naive' for classic or naive approach, 'conv' for conversation depth into account (one additional dim and nested values)
    @param data: list of examples
    @return words: list of words (with duplicates)
    '''    
    words = []
    if convmode is None:
        for example in data:
            words += example.split()
    return words

def find_value_ranges(lst):
    value_ranges = []
    start_index = 0

    for i in range(1, len(lst)):
        if lst[i] != lst[i - 1]:
            value_ranges.append((start_index, i - 1))
            start_index = i

    # Add the last range
    value_ranges.append((start_index, len(lst) - 1))

    return value_ranges

In [6]:
# Read the CSV file
X_train = pd.read_csv('data/train_sent_emo_dya.csv', encoding='MacRoman')
X_test = pd.read_csv('data/test_sent_emo_dya.csv', encoding='MacRoman')

# Display the first three rows
print(X_train.shape)
print(X_test.shape)

(12840, 12)
(3400, 12)


In [7]:
# Define features to drop
drop_features = list(X_train.columns[6:]) 

# Create DataFrame for target labels
y_train = pd.DataFrame()
y_test = pd.DataFrame()

y_train["Emotion"] = X_train["Emotion"].copy()
y_test["Emotion"] = X_test["Emotion"].copy()

y_train["Dialogue_ID"] = X_train["Dialogue_ID"].copy()
y_test["Dialogue_ID"] = X_test["Dialogue_ID"].copy()

# Drop features from X_train DataFrame
X_train = X_train.drop(drop_features, axis=1)

print(X_train[:3])
print(y_train[:3])

print(X_test[:3])
print(y_test[:3])

                                           Utterance          Speaker  \
0  also I was the point person on my companyÅfs t...         Chandler   
1                  You mustÅfve had your hands full.  The Interviewer   
2                            That I did. That I did.         Chandler   

   Emotion Sentiment  Dialogue_ID  Utterance_ID  
0  neutral   neutral            0             0  
1  neutral   neutral            0             1  
2  neutral   neutral            0             2  
   Emotion  Dialogue_ID
0  neutral            0
1  neutral            0
2  neutral            0
                                           Utterance Speaker   Emotion  \
0  Why do all you‚Äôre coffee mugs have numbers o...    Mark  surprise   
1  Oh. That‚Äôs so Monica can keep track. That wa...  Rachel     anger   
2                                       Y'know what?  Rachel   neutral   

  Sentiment  Dialogue_ID  Utterance_ID  Old_Dialogue_ID  Old_Utterance_ID  \
0  positive            0             

In [10]:
X_train["Utterance"] = X_train["Utterance"].apply(lambda x: replace_spelling(x))
X_test["Utterance"] = X_test["Utterance"].apply(lambda x: replace_spelling(x))

X_train["Utterance"] = preprocess_text(X_train["Utterance"].tolist())
X_test["Utterance"] = preprocess_text(X_test["Utterance"].tolist())

# Print the first 14 rows of X_train DataFrame
print(X_train[:14])

                                            Utterance          Speaker  \
0   also i was the point person on my company's tr...         Chandler   
1                    you must've had your hands full'  The Interviewer   
2                             that i did' that i did'         Chandler   
3       so let's talk a little bit about your duties'  The Interviewer   
4                              my duties'  all right'         Chandler   
5   now you'll be heading a whole division' so you...  The Interviewer   
6                                              i see'         Chandler   
7   but there'll be perhaps 30 people under you so...  The Interviewer   
8                                       good to know'         Chandler   
9                               we can go into detail  The Interviewer   
10                             no don't i beg of you'         Chandler   
11  all right then' we'll have a definite answer f...  The Interviewer   
12                                    

In [11]:
checkFile1 = os.path.isfile("data/dump/label_encoder.pkl")
checkFile2 = os.path.isfile("data/dump/label_decoder.pkl")

if key:
    labels = sorted(set(y_train.Emotion))
    labelEncoder = {label: i for i, label in enumerate(labels)}
    labelDecoder = {i: label for i, label in enumerate(labels)}

    pickle.dump(labelEncoder, open('data/dump/label_encoder.pkl', 'wb'))
    pickle.dump(labelDecoder, open('data/dump/label_decoder.pkl', 'wb'))
else:
    file1 = open('data/dump/label_encoder.pkl', 'rb')
    file2 = open('data/dump/label_decoder.pkl', 'rb')
    labelEncoder = pickle.load(file1)
    labelDecoder = pickle.load(file2)
    file1.close()
    file2.close()

In [14]:
labelEncoder

{'anger': 0,
 'disgust': 1,
 'fear': 2,
 'joy': 3,
 'neutral': 4,
 'sadness': 5,
 'surprise': 6}

In [15]:
# Apply label encoding to the "Emotion" column in y_train
y_train["Emotion"] = y_train["Emotion"].apply(lambda x: encode_labels(labelEncoder, x))
y_test["Emotion"] = y_test["Emotion"].apply(lambda x: encode_labels(labelEncoder, x))

# Copy the encoded "Emotion" column from y_train to X_train
X_train["Emotion"] = y_train["Emotion"].copy()
X_test["Emotion"] = y_test["Emotion"].copy()

In [16]:
# Check if the file already exists
checkFile1 = os.path.isfile("data/dump/labels_train.pkl")
checkFile2 = os.path.isfile("data/dump/labels_test.pkl")

if key:
    pickle.dump(X_train["Emotion"], open('data/dump/labels_train.pkl', 'wb'))
    pickle.dump(X_test["Emotion"], open('data/dump/labels_test.pkl', 'wb'))

Creating an embedding

Testing on smaller data. Uncomment to see the size of updated representations


In [17]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
model = transformers.BertModel.from_pretrained(model_name)

# Define your dialog data
dialogs = [
    "How are you today?",
    "I'm doing well, thank you!",
    "That's good to hear.",
    "Yes, it is.",
    "Do you have any plans for the weekend?",
    "Not really, just relaxing at home.",
    "Sounds nice.",
    "Indeed."
]

# Tokenize and encode the dialogs
encoded_dialogs = [tokenizer.encode(dialog, add_special_tokens=True) for dialog in dialogs]

# Pad sequences to the same length
max_length = max(len(dialog) for dialog in encoded_dialogs)
padded_dialogs = [dialog + [tokenizer.pad_token_id] * (max_length - len(dialog)) for dialog in encoded_dialogs]

# Create attention masks
attention_masks = [[1] * len(dialog) + [0] * (max_length - len(dialog)) for dialog in encoded_dialogs]

# Convert to PyTorch tensors
input_ids = torch.tensor(padded_dialogs)
attention_masks = torch.tensor(attention_masks)

# Obtain the BERT embeddings
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_masks)

# Extract the contextual embeddings (CLS token)
contextual_embeddings = outputs[0][:, 0, :]  # Extract embeddings for the [CLS] token

# Print the shape of the contextual embeddings
print("Shape of contextual embeddings:", contextual_embeddings.shape)

Shape of contextual embeddings: torch.Size([8, 768])


This is just a duplicate of code above. Using this on train data

In [19]:
rangesTrain = find_value_ranges(X_train["Dialogue_ID"])
print(len(rangesTrain))

rangesTest = find_value_ranges(X_test["Dialogue_ID"])
print(len(rangesTest))

2160
577


Testing on small sample

In [20]:
# # Load pre-trained BERT model and tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

# # List of text dialogs
# dialogs = [
#     ["How are you today?", "I'm doing well, thank you!"],
#     ["That's good to hear.", "Yes, it is.", "Do you have any plans for the weekend?", "Not really, just relaxing at home."],
#     ["Sounds nice.", "Indeed."]
# ]

# # List to store contextual embeddings for each utterance
# contextual_embeddings = []

# # Iterate through each dialog
# for dialog in dialogs:
#     # Tokenize and convert dialog to input IDs
#     inputs = tokenizer(dialog, return_tensors='pt', padding=True, truncation=True)
    
#     # Get BERT model outputs
#     with torch.no_grad():
#         outputs = model(**inputs)

#     # Extract contextual embeddings (CLS token represents the entire sequence)
#     embeddings = outputs.last_hidden_state[:, 0, :].tolist()

#     # Store embeddings for each utterance in the dialog
#     contextual_embeddings.append(embeddings)

# # Print the list of contextual embeddings
# print("List of Contextual Embeddings:")
# # for embedding in contextual_embeddings:
# #     print(embedding)

#### Contexualized train data

In [21]:
# If the file doesn't exist, compute updated representations and save them
if key:
    dialogs = []
    for range_pair, iteration in zip(rangesTrain, range(len(rangesTrain))):
        start_idx, end_idx = range_pair            
        dialog = list(X_train['Utterance'][start_idx:end_idx + 1])
        dialogs.append(dialog)

    # List to store contextual embeddings for each utterance
    contextualEmbeddingsTrain = []

    # Iterate through each dialog
    for dialog in dialogs:
        # Tokenize and convert dialog to input IDs
        inputs = tokenizer(dialog, return_tensors='pt', padding=True, truncation=True)

        # Get BERT model outputs
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract contextual embeddings (CLS token represents the entire sequence)
        embeddings = outputs.last_hidden_state[:, 0, :].tolist()

        # Store embeddings for each utterance in the dialog
        contextualEmbeddingsTrain.append(torch.tensor(embeddings))

#         if iteration % 800 == 0 | iteration == len(ranges):
    file_path = f'embed/u_prime_BERT_train.pkl'
    with open(file_path, 'wb') as file:
            pickle.dump(contextualEmbeddingsTrain, file)

else:
    file_path = f'embed/u_prime_BERT_train.pkl'
    with open(file_path, 'rb') as file:
        contextualEmbeddingsTrain = pickle.load(file)

<h4> Contexualize test data

In [22]:
# If the file doesn't exist, compute updated representations and save them
if key:
    dialogs = []
    for range_pair, iteration in zip(rangesTest, range(len(rangesTest))):
        start_idx, end_idx = range_pair            
        dialog = list(X_train['Utterance'][start_idx:end_idx + 1])
        dialogs.append(dialog)

    # List to store contextual embeddings for each utterance
    contextualEmbeddingsTest = []

    # Iterate through each dialog
    for dialog in dialogs:
        # Tokenize and convert dialog to input IDs
        inputs = tokenizer(dialog, return_tensors='pt', padding=True, truncation=True)

        # Get BERT model outputs
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract contextual embeddings (CLS token represents the entire sequence)
        embeddings = outputs.last_hidden_state[:, 0, :].tolist()

        # Store embeddings for each utterance in the dialog
        contextualEmbeddingsTest.append(torch.tensor(embeddings))

#         if iteration % 800 == 0 | iteration == len(ranges):
    file_path = f'embed/u_prime_BERT_test.pkl'
    with open(file_path, 'wb') as file:
            pickle.dump(contextualEmbeddingsTest, file)

else:
    file_path = f'embed/u_prime_BERT_test.pkl'
    with open(file_path, 'rb') as file:
        contextualEmbeddingsTest = pickle.load(file)

<h4> Getting speaker encoder for train set

In [24]:
# Check if the file exists
checkFile = os.path.isfile("data/dump/speaker_encoder_train.pkl")
encodedSpeakersTrain = []

if key:
    for range_pair in rangesTrain:
        start_idx, end_idx = range_pair
        speaker_per_dialog = X_train['Speaker'][start_idx:end_idx + 1].copy()
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}
        speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)}

        encoded_speaker = speaker_per_dialog.replace(speaker_encoder)
        encodedSpeakersTrain.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    file_path = 'data/dump/speaker_encoder_train.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersTrain, rangesTrain], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file
    file = open('data/dump/speaker_encoder_train.pkl', "rb")
    encodedSpeakersTrain, rangesTrain = pickle.load(file)
    file.close()

<h4> Getting speaker encoder for test set

In [25]:
# Check if the file exists
checkFile = os.path.isfile("data/dump/speaker_encoder_test.pkl")
encodedSpeakersTest = []

if key:
    for range_pair in rangesTest:
        start_idx, end_idx = range_pair
        speaker_per_dialog = X_train['Speaker'][start_idx:end_idx + 1].copy()
        speaker_feature = sorted(set(speaker_per_dialog))
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)}
        speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)}

        encoded_speaker = speaker_per_dialog.replace(speaker_encoder)
        encodedSpeakersTest.append(encoded_speaker)

    # Save encoded speaker list and ranges to a file using pickle
    file_path = 'data/dump/speaker_encoder_test.pkl'
    with open(file_path, 'wb') as file:
        pickle.dump([encodedSpeakersTest, rangesTest], file)
else:
    # Load encoded speaker list and ranges from the existing pickle file
    file = open('data/dump/speaker_encoder_test.pkl', "rb")
    encodedSpeakersTest, rangesTest = pickle.load(file)
    file.close()

Unsupervised visualization

In [26]:
# Assuming contextual_embeddings is your list of contextual embeddings

# Flatten the list of contextual embeddings into a single list
flattened_embeddings = [emb for dialogue in contextual_embeddings for emb in dialogue]

# Convert the flattened list into a single tensor
tensor_data = torch.tensor(flattened_embeddings)

# Check the shape of the tensor
print(tensor_data.shape)

torch.Size([6144])


In [27]:
labelDecoder

{0: 'anger',
 1: 'disgust',
 2: 'fear',
 3: 'joy',
 4: 'neutral',
 5: 'sadness',
 6: 'surprise'}

Distribution of labels in train data

In [30]:
# Calculate the counts for each unique label
uniqueLabelsTrain, labelCountsTrain = np.unique(list(X_train["Emotion"]), return_counts=True)

# Print the counts for each unique label
for label, count in zip(uniqueLabelsTrain, labelCountsTrain):
    print(f"{labelDecoder[label]}: {count} train occurrences")

anger: 1500 train occurrences
disgust: 364 train occurrences
fear: 338 train occurrences
joy: 2312 train occurrences
neutral: 5960 train occurrences
sadness: 876 train occurrences
surprise: 1490 train occurrences


Distribution of labels in test data

In [31]:
# Calculate the counts for each unique label
uniqueLabelsTest, labelCountsTest = np.unique(list(X_test["Emotion"]), return_counts=True)

# Print the counts for each unique label
for label, count in zip(uniqueLabelsTest, labelCountsTest):
    print(f"{labelDecoder[label]}: {count} train occurrences")

anger: 516 train occurrences
disgust: 99 train occurrences
fear: 60 train occurrences
joy: 495 train occurrences
neutral: 1615 train occurrences
sadness: 263 train occurrences
surprise: 352 train occurrences


Visualize utterance embeddnig (u') with T-SNE

In [33]:
# labels = torch.tensor(X_train["Emotion"])

In [34]:
# label_decoder

In [None]:
# runTSNE = 1
# if runTSNE:
#     from sklearn.manifold import TSNE
#     import matplotlib.pyplot as plt

#     # List of perplexity values to loop over
#     perplexity_values = [50]

#     # Loop over each perplexity value
#     for perplexity in perplexity_values:
#         # Initialize t-SNE with the current perplexity value
#         tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)

#         # Fit and transform the data using t-SNE
#         h_prime_tsne = tsne.fit_transform(tensor_data.detach().numpy())

#         # Plot the node embeddings with different colors for each label
#         plt.figure(figsize=(10, 8))
#         for label, emotion in zip(range(len(label_encoder)), label_encoder):
#             indices = (labels == label).nonzero().squeeze()
#             plt.scatter(h_prime_tsne[indices, 0], h_prime_tsne[indices, 1], label=f'{emotion}')
#         plt.title(f'Utterance Embeddings (Train) Visualization (t-SNE) - Perplexity {perplexity}', color="white")
#         plt.xlabel('Dimension 1', color="white")
#         plt.ylabel('Dimension 2', color="white")
#         plt.legend()
#         plt.show()

 Visualize utterance embedding (u') with PCA

In [None]:
# pca = PCA(n_components=2)
# pca_result = pca.fit_transform(tensor_data.detach().numpy())

# # Plot the PCA result with color-coded labels
# plt.figure(figsize=(8, 6))
# for label in np.unique(labels):
#     indices = labels == label
#     plt.scatter(pca_result[indices, 0], pca_result[indices, 1], label=f'{label_decoder[label]}', alpha=0.5)
#     plt.title('PCA Visualization of Utterance Embeddings (Train) with Color-Coded Labels')
#     plt.xlabel('Principal Component 1')
#     plt.ylabel('Principal Component 2')
#     plt.legend()
#     plt.grid(True)
#     plt.show()