In [4]:
import os, re, time, pickle, collections, importlib, datetime, torch
import pandas as pd, numpy as np, pickle
from chardet import detect
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict, Counter
from wordebd import WORDEBD
from vocab import Vocab, Vectors
from munch import Munch
from cnnlstmseq import CNNLSTMseq
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
def get_encoding_type(file):
    with open(file, 'rb') as f:
        rawdata = f.read(),
    return detect(rawdata['encoding'])

def detect_misspelling(source):
    pass
def replace_spelling(source):
    return re.sub("Åf", "'", source)

In [6]:
# referenced from DialogueGCN, mastodon code
def preprocess_text(x):
    for punct in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\'':
        x = x.replace(punct, ' ')
    x = ' '.join(x.split())
    x = x.lower()

    return x

def load_pretrained_glove():
    print("Loading GloVe...")
    glv_vector = {}
    f = open('/embed/glove/glove.840B.300d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word, coefs = values[0], np.asarray(values[1:], dtype='float')
        try:
            glv_vector[word] = coefs

        except ValueError:
            continue
    f.close()
    start_time = time.time()
    print(f"Took {time.time() - start_time} seconds to load pretrained GloVe model.")
    return glv_vector


def encode_labels(encoder, l):
    return encoder[l]

def load_data_from_npy(file_path):
    try:
        data = np.load(file_path, allow_pickle=True)
        if isinstance(data, np.ndarray):
            if data.ndim == 2:
                return pd.DataFrame(data)
            else:
                raise ValueError("The loaded array is not two-dimensional.")
        else:
            raise TypeError("The loaded object is not a NumPy array.")
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"Error: An exception occurred - {str(e)}")
        return None
    
# def sentence_embedding(sentence, embeddings):
#     words = sentence.split()
#     vectors = [embeddings.get(word, np.zeros(300)) for word in words]
#     mean_vector = np.mean(vectors, axis=0)
#     return mean_vector

def _read_words(data, convmode=None):
    '''    
        Count the occurrences of all words
        @param convmode: str, None for non conversational scope, 'naive' for classic or naive approach, 'conv' for conversation depth into account (one additional dim and nested values)
        @param data: list of examples
        @return words: list of words (with duplicates)
    '''    
    words = []
    if convmode is None:
        for example in data:
            words += example.split()     
    return words

def _data_to_nparray(data, vocab, args):
    '''
        Convert the data into a dictionary of np arrays for speed.
    '''
    raw = np.array([e for e in data["Utterance"]], dtype=object)
    doc_label = np.array([x for x in data["Emotion"]], dtype=np.int64)

    # compute the max text length
    text_len = np.array([len(e) for e in data["Utterance"]])
    max_text_len = max(text_len)
    ids = np.array([e for e in data['Dialogue_ID']])
    ids2 = np.array([e for e in data['Utterance_ID']])

    # initialize the big numpy array by <pad>
    text = vocab.stoi['<pad>'] * np.ones([len(data), max_text_len],
                                     dtype=np.int64)

    del_idx = []
    # convert each token to its corresponding id
    for i in range(len(X_train)):
        text[i, :len(X_train['Utterance'][i])] = [vocab.stoi[x] if x in vocab.stoi else vocab.stoi['<unk>']
                for x in X_train['Utterance'][i]]

        # filter out document with only unk and pad
        if np.max(text[i]) < 2:
            del_idx.append(i)

    vocab_size = vocab.vectors.size()[0]


    ## Curation for padding (string instead of list of list)
    raw = [ ["<pad>" if m == ["<pad>", "<pad>", "<pad>", "<pad>", "<pad>"] else m for m in c ] for c in raw ]

    ids, ids2, text_len, text, doc_label, raw = _del_by_idx( [ids, ids2, text_len, text, doc_label, raw], del_idx, 0)
    new_data = {
        'ids': ids,
        'ids2': ids2,
        'text': text,
        'text_len': text_len,
        'label': doc_label,
#         'raw': raw,
        'vocab_size': vocab_size,
    }

    return new_data

def _del_by_idx(array_list, idx, axis):

    '''        
        Delete the specified index for each array in the array_lists",

        @params: array_list: list of np arrays
        @params: idx: list of int
        @params: axis: int

        @return: res: tuple of pruned np arrays
    '''
    if type(array_list) is not list:
        array_list = [array_list]

    # modified to perform operations in place
    for i, array in enumerate(array_list):
        array_list[i] = np.delete(array, idx, axis)

    if len(array_list) == 1:
        return array_list[0]
    else:
        return array_list

def find_value_ranges(lst):
    value_ranges = []
    start_index = 0

    for i in range(1, len(lst)):
        if lst[i] != lst[i - 1]:
            value_ranges.append((start_index, i - 1))
            start_index = i

    # Add the last range
    value_ranges.append((start_index, len(lst) - 1))

    return value_ranges


In [7]:
os.listdir()

['.git',
 '.gitignore',
 '.idea',
 '.ipynb_checkpoints',
 'cnnlstmseq.py',
 'context_encoder.ipynb',
 'data',
 'embed',
 'emotion_classifier.py',
 'GAT.py',
 'LICENSE',
 'prototype_context_encoder.ipynb',
 'README.md',
 'relationtype_encoder.ipynb',
 'vocab.py',
 'wordebd.py',
 '__pycache__']

In [8]:
#not working...
# get_encoding_type("data/train_sent_emo_dya.csv")

In [12]:
X_train = pd.read_csv('data/train_sent_emo_dya.csv', encoding='MacRoman')
X_train[:3]

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Old_Dialogue_ID,Old_Utterance_ID,Season,Episode,StartTime,EndTime
0,also I was the point person on my companyÅfs t...,Chandler,neutral,neutral,0,0,0,0,8,21,"00:16:16,059","00:16:21,731"
1,You mustÅfve had your hands full.,The Interviewer,neutral,neutral,0,1,0,1,8,21,"00:16:21,940","00:16:23,442"
2,That I did. That I did.,Chandler,neutral,neutral,0,2,0,2,8,21,"00:16:23,442","00:16:26,389"


In [26]:
drop_features = list(X_train.keys()[6:])
# drop_features.append("Emotion")
drop_features
y_train = pd.DataFrame()
y_train["Emotion"] = X_train["Emotion"].copy()
X_train = X_train.drop(drop_features, axis=1)
X_train[:3]

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID
0,also I was the point person on my company's tr...,Chandler,neutral,neutral,0,0
1,You must've had your hands full.,The Interviewer,neutral,neutral,0,1
2,That I did. That I did.,Chandler,neutral,neutral,0,2


In [27]:
y_train[:3]

Unnamed: 0,Emotion
0,neutral
1,neutral
2,neutral


In [28]:
X_train["Utterance"] = X_train["Utterance"].apply(lambda x: replace_spelling(x))
X_train[:3]

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID
0,also I was the point person on my company's tr...,Chandler,neutral,neutral,0,0
1,You must've had your hands full.,The Interviewer,neutral,neutral,0,1
2,That I did. That I did.,Chandler,neutral,neutral,0,2


In [29]:
checkFile1 = os.path.isfile("data/dump/label_encoder.pkl")
checkFile2 = os.path.isfile("data/dump/label_decoder.pkl")

if checkFile1 and checkFile2 is False:
    labels = set(y_train.Emotion)
    label_encoder = {label: i for i, label in enumerate(labels)}
    label_decoder = {i: label for i, label in enumerate(labels)}

    pickle.dump(label_encoder, open('data/dump/label_encoder.pkl', 'wb'))
    pickle.dump(label_decoder, open('data/dump/label_decoder.pkl', 'wb'))
    
else:
    file1 = open('data/dump/label_encoder.pkl', 'rb')
    file2 = open('data/dump/label_decoder.pkl', 'rb')
    label_encoder = pickle.load(file1)
    label_decoder = pickle.load(file2)
    file1.close()
    file2.close()

In [30]:
y_train["Emotion"] = y_train["Emotion"].apply(lambda x: encode_labels(label_encoder, x))
y_train[:15]

Unnamed: 0,Emotion
0,3
1,3
2,3
3,3
4,0
5,3
6,3
7,3
8,3
9,3


In [49]:
X_train["Emotion"] = y_train["Emotion"].copy()

In [31]:
## tokenize all sentences ##
checkFile = os.path.isfile("data/dump/tokenizer.pkl")

if checkFile is False:
    all_text = list(X_train.Utterance)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_text)
    pickle.dump(tokenizer, open('data/dump/tokenizer.pkl', 'wb'))
else:
    file = open('data/dump/tokenizer.pkl', 'rb')
    tokenizer = pickle.load(file)
    file.close()

In [32]:
## convert the sentences into sequences ## 
train_sequence = tokenizer.texts_to_sequences(list(X_train.Utterance)) 
X_train['sentence_length'] = [len(item) for item in train_sequence] 
 
max_num_tokens = 250 

train_sequence = pad_sequences(train_sequence, maxlen=max_num_tokens, padding='post') 
 
X_train['sequence'] = list(train_sequence)

In [None]:
# tokenizer.word_counts

Idk why glove embeddings and toknizers were used in the orig source code...

In [33]:
## save the data in pickle format ##
checkFile = os.path.isfile("data/dump/per_dialog_ids.pkl")
if checkFile is False:
    dialogSpeakers, dialogInputSeq, dialogInputMaxSeqLen, dialogLabels = {}, {}, {}, {}
    X_train_dialog_ids = set(X_train.Dialogue_ID)
    all_data = X_train.copy()
    # all_data = X_train.append(X_test, ignore_index=True).append(X_valid, ignore_index=True)

    for item in list(X_train_dialog_ids):
        X_df = all_data[all_data.Dialogue_ID == item]
        y_df = y_train[y_train.Dialogue_ID == item] 

        dialogSpeakers[item] = list(X_df.Speaker)
        dialogInputSeq[item] = list(X_df.sequence)
        dialogInputMaxSeqLen[item] = max(list(X_df.sentence_length))
        dialogLabels[item] = list(y_df.Emotion)

    pickle.dump([dialogSpeakers, dialogInputSeq, dialogInputMaxSeqLen, dialogLabels, X_train_dialog_ids],
                open('data/dump/per_dialog_ids.pkl', 'wb'))
else:
    file = open('data/dump/per_dialog_ids.pkl', "rb")
    dialogSpeakers, dialogInputSeq, dialogInputMaxSeqLen, dialogLabels, X_train_dialog_ids = pickle.load(file)
    file.close()

In [None]:
## save pretrained embedding matrix ## 
# file = open('data/dump/glv_embedding_matrix.npy', "rb") 
# if file is None: 
#     glv_vector = load_pretrained_glove() 
#     word_vector_length = len(glv_vector['the'])#dim=300 
#     word_index = tokenizer.word_index 
#     inv_word_index = {v: k for k, v in word_index.items()} 
#     num_unique_words = len(word_index) 
#     glv_embedding_matrix = np.zeros((num_unique_words + 1, word_vector_length)) 
 
#     for j in range(1, num_unique_words + 1): 
#         glv_embedding_matrix[j] = glv_vector.get(inv_word_index[j], np.random.randn(word_vector_length) / 200) 
 
#     np.save('data/dump/glv_embedding_matrix.npy', glv_embedding_matrix) 
#     print('Done. Completed preprocessing.') 
# else: 
#     glv_embedding_matrix = np.load('data/dump/glv_embedding_matrix.npy', allow_pickle=True) 
#     file.close()

In [None]:
# vocab_size, embedding_dim = glv_embedding_matrix.shape
# vocab_size, embedding_dim

In [34]:
file_path = os.path.join(os.getcwd(), "data/wiki-news-300d-1M.vec")
if os.path.isfile(file_path):
    print(f"{file_path} exists")
else:
    print(f"The file does not exist in the current directory.")

C:\Users\edayo\Downloads\4y2t\THSST-2\ug_thesis\ER_GAT\data/wiki-news-300d-1M.vec exists


In [35]:
vectors = Vectors(name="wiki-news-300d-1M.vec", url="data/", cache="data/")
vectors.cache(name="data/wiki-news-300d-1M.vec", url="data/", cache="data/")

  0%|                                                                                       | 0/999994 [00:00<?, ?it/s]Skipping token b'999994' with 1-dimensional vector [b'300']; likely a header
100%|████████████████████████████████████████████████████████████████████████| 999994/999994 [02:04<00:00, 8014.83it/s]


In [36]:
print(vectors.vectors.shape)

torch.Size([999994, 300])


In [37]:
vocab = Vocab(counter=collections.Counter(_read_words(X_train.Utterance)),
                  vectors=vectors,
                  specials=['<pad>', '<unk>'],
                  min_freq=5)

In [38]:
# print word embedding statistics 
wv_size = vocab.vectors.size() 
print('Total num. of words: {}, word vector dimension: {}'.format( 
   wv_size[0], 
   wv_size[1]))

Total num. of words: 2120, word vector dimension: 300


In [39]:
ebd = WORDEBD(vocab, False)
ebd

WORDEBD(
  (embedding_layer): Embedding(2120, 300)
)

In [40]:
args = Munch({
    "cnn_filter_sizes":[3,4,5],
    "cnn_num_filters":100,
    "cuda":-1,
    "mode":"train",
    "snapshot":'',
})

Creating an embedding

In [41]:
model = CNNLSTMseq(ebd, args) # ProtoSeq

In [43]:
print("{}, Building embedding".format(
    datetime.datetime.now().strftime('%y/%m/%d %H:%M:%S')), flush=True),
if args.snapshot != '':
    if args.multitask:
        print("{}, Loading pretrained embedding from {}".format(
            datetime.datetime.now().strftime('%y/%m/%d %H:%M:%S'),
            '%s_%s.ebd' % (args.snapshot, args.task),
            ))
        model.load_state_dict(  torch.load( '%s_%s.ebd' % (args.snapshot, args.task) ), strict=False  )
    else:   
        # load pretrained models,
        print("{}, Loading pretrained embedding from {}".format(
            datetime.datetime.now().strftime('%y/%m/%d %H:%M:%S'),
            '{}.ebd'.format(args.snapshot)
            ))
        model.load_state_dict(  torch.load( '{}.ebd'.format(args.snapshot) ), strict=False  )
# if args.cuda != -1: ,
#     model.cuda(args.cuda),
# else: ,

24/01/25 23:44:37, Building embedding


In [46]:
model.train()

CNNLSTMseq(
  (ebd): WORDEBD(
    (embedding_layer): Embedding(2120, 300)
  )
  (convs): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (lstm): LSTM(300, 150, bidirectional=True)
)

In [50]:
# Convert everything into np array for fast data loading
_X_train = _data_to_nparray(X_train, vocab, args)

  arr = asarray(arr)


In [51]:
_X_train

{'ids': array([   0,    0,    0, ..., 2159, 2159, 2159]),
 'ids2': array([0, 1, 2, ..., 4, 5, 6]),
 'text': array([[6, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 6, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 6, ..., 0, 0, 0],
        [1, 1, 6, ..., 0, 0, 0]], dtype=int64),
 'text_len': array([84, 32, 23, ..., 36,  5, 60]),
 'label': array([3, 3, 3, ..., 0, 3, 5], dtype=int64),
 'vocab_size': 2120}

In [52]:
model.eval()

CNNLSTMseq(
  (ebd): WORDEBD(
    (embedding_layer): Embedding(2120, 300)
  )
  (convs): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (lstm): LSTM(300, 150, bidirectional=True)
)

Testing on smaller data. Uncomment to see the size of updated representations

In [55]:
# sample data 
data = [ 
#     ["how are you", "I am great how about you", "good too"], 
    ["hes"], 
    # ... more conversations ... 
] 
tmp_in = []         
for conversation in data: 
    turn_indices = [torch.tensor([vocab.stoi[word] if word in vocab.stoi else vocab.stoi['<unk>'] for word in turn]) 
                for turn in conversation] 
#     print((turn_indices)) 
    # Pad sequences to a fixed length (adjust this based on your model requirements) 
    max_seq_len = max(max(len(turn), 5) for turn in turn_indices) 
 
    padded_turns = [torch.nn.functional.pad(turn, pad=(0, max_seq_len - len(turn))) for turn in turn_indices] 
 
    # Stack the padded turns along a new dimension 
    batched_input = torch.stack(padded_turns) 
    input_data = {'Utterance': batched_input} 
    tmp_in = max_seq_len 
    print(model.ebd(input_data["Utterance"], None).size()) 
    print(len(model.ebd(input_data["Utterance"], None))) 
 
    model(input_data) 

torch.Size([1, 5, 300])
1


This is just a duplicate of code above. Using this on train data

In [61]:
updated_representations = [] 
 
checkFile = os.path.isfile("embed/updated_representation_list.pkl") 
 
if checkFile is False: 
    for range_pair in ranges: 
        start_idx, end_idx = range_pair 
    #     print(start_idx , \   \", end_idx) 
        conversation = X_train['Utterance'][start_idx:end_idx+1] 
    #     conversation = X_train['Utterance'][247:249] 
 
        turn_indices = [torch.tensor([vocab.stoi[word] if word in vocab.stoi else vocab.stoi['<unk>'] for word in turn]) 
                    for turn in conversation] 
        max_seq_len = max(max(len(turn), 5) for turn in turn_indices) 
        padded_turns = [torch.nn.functional.pad(turn, pad=(0, max_seq_len - len(turn))) for turn in turn_indices] 
 
        # Stack the padded turns along a new dimension 
        batched_input = torch.stack(padded_turns) 
        input_data = {'Utterance': batched_input} 
        output_representation = model(input_data) 
 
        updated_representations.append(output_representation) 
     
     
    file_path = 'embed/updated_representation_list.pkl' 
    # Save the list to a file using pickle 
    with open(file_path, 'wb') as file: 
        pickle.dump(updated_representations, file) 
     
else: 
    file_path = 'embed/updated_representation_list.pkl' 
 
    # Load the list from the file using pickle 
    with open(file_path, 'rb') as file: 
        updated_representations = pickle.load(file)

In [63]:
updated_representations[0]

tensor([[-2.8721e-01,  5.8134e-01, -1.3142e-01,  ...,  1.8101e-02,
         -4.6824e-04,  1.9901e-02],
        [-1.6920e-01,  1.8220e-01, -1.2245e-01,  ...,  1.3620e-02,
         -2.0732e-03,  8.3473e-03],
        [-8.1502e-02,  7.7161e-02, -6.6144e-02,  ...,  1.3882e-02,
          3.4588e-03, -1.4834e-03],
        ...,
        [-4.1162e-03,  2.6335e-02,  2.8706e-02,  ..., -1.6475e-01,
         -1.3978e-01,  2.8344e-02],
        [-1.7579e-02,  1.8380e-02,  3.3130e-02,  ..., -2.5659e-01,
         -2.2489e-01,  1.5857e-02],
        [-2.9680e-02,  8.5039e-03,  3.3814e-02,  ..., -3.8804e-01,
         -2.8153e-01,  1.1250e-03]], requires_grad=True)

In [56]:
ranges = find_value_ranges(_X_train["ids"])
ranges[:10]

[(0, 13),
 (14, 18),
 (19, 31),
 (32, 39),
 (40, 46),
 (47, 57),
 (58, 60),
 (61, 61),
 (62, 80),
 (81, 82)]

In [58]:
checkFile = os.path.isfile( "data/dump/speaker_encoder.pkl") 
encoded_speaker_list = [] 
if checkFile is False: 
    for range_pair in ranges: 
        start_idx, end_idx = range_pair 
        speaker_per_dialog = X_train['Speaker'][start_idx:end_idx+1].copy() 
        speaker_feature = set(speaker_per_dialog) 
        speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)} 
        speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)} 
        # print( "ID:  ",  range_pair,    ", speaker_encoder) 
        # print( "ID:  ",  speaker_per_dialog) 
 
        encoded_speaker = speaker_per_dialog.replace(speaker_encoder) 
        encoded_speaker_list.append(encoded_speaker) 
 
    file_path = 'data/dump/speaker_encoder.pkl' 
    with open(file_path, 'wb') as file: 
        pickle.dump([encoded_speaker_list, ranges], file) 
#     pickle.dump([encoded_speaker_list, ranges], open('data/dump/speaker_encoder.pkl')) 
else: 
    file = open('data/dump/speaker_encoder.pkl',  "rb ") 
    encoded_speaker_list = pickle.load(file) 
    file.close()

In [59]:
encoded_speaker_list

[0     1
 1     0
 2     1
 3     0
 4     1
 5     0
 6     1
 7     0
 8     1
 9     0
 10    1
 11    0
 12    1
 13    0
 Name: Speaker, dtype: int64,
 14    0
 15    1
 16    0
 17    1
 18    0
 Name: Speaker, dtype: int64,
 19    1
 20    2
 21    3
 22    0
 23    3
 24    0
 25    3
 26    0
 27    3
 28    0
 29    3
 30    0
 31    3
 Name: Speaker, dtype: int64,
 32    2
 33    0
 34    1
 35    0
 36    1
 37    1
 38    1
 39    1
 Name: Speaker, dtype: int64,
 40    0
 41    0
 42    0
 43    0
 44    0
 45    0
 46    0
 Name: Speaker, dtype: int64,
 47    1
 48    2
 49    3
 50    0
 51    3
 52    0
 53    3
 54    0
 55    3
 56    0
 57    3
 Name: Speaker, dtype: int64,
 58    1
 59    0
 60    1
 Name: Speaker, dtype: int64,
 61    0
 Name: Speaker, dtype: int64,
 62    1
 63    3
 64    0
 65    4
 66    0
 67    0
 68    2
 69    5
 70    4
 71    5
 72    4
 73    5
 74    4
 75    5
 76    4
 77    5
 78    4
 79    4
 80    4
 Name: Speaker, dtype: int64,
 