Most codes of context encoder is referenced from ProtoSeq by Gaël Guibon https://github.com/gguibon/protoseq <br>

For the vocab library, I downloaded the source code from https://github.com/vincentzlt/torchtext 

Put all libraries here

In [93]:
# !pip install torchtext

In [94]:
%load_ext autoreload
%autoreload 2
import os, re, time, pickle, collections, importlib, datetime

import pandas as pd, numpy as np, pickle
from chardet import detect

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from collections import defaultdict, Counter

from wordebd import WORDEBD
from vocab import Vocab, Vectors
from munch import Munch
from cnnlstmseq import CNNLSTMseq


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Cleaning methods

In [95]:
def get_encoding_type(file):
    with open(file, 'rb') as f:
        rawdata = f.read()
    return detect(rawdata)['encoding']

def detect_misspelling(source):
    pass

def replace_spelling(source):    
    return re.sub("Åf", "'", source)

Context Encoder

In [96]:
# referenced from DialogueGCN, mastodon code
def preprocess_text(x):
    # Use regex to replace punctuations with spaces
    x = re.sub(r'[^\w\s]', ' ', x)

    # Replace multiple whitespaces with a single space
    x = ' '.join(x.split())

    # Convert to lowercase
    x = x.lower()

    return x

def load_pretrained_glove(file_path='embed/glove/glove.840B.300d.txt', vector_dimension=300):
    glv_vector = {}
    start_time = time.time()
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            
            # Skip lines with unexpected formats or incomplete data
            if len(values) < vector_dimension + 1 or not all(v.replace('.', '').isdigit() or v.startswith('-') for v in values[1:]):
                continue
            
            word = values[0]
            coefs = np.asarray(values[1:vector_dimension+1], dtype='float')
            glv_vector[word] = coefs
            print(f"Took {time.time() - start_time} seconds to load pretrained GloVe model.")
    return glv_vector

def encode_labels(encoder, l):
    return encoder[l]

def load_data_from_npy(file_path):
    try:
        data = np.load(file_path, allow_pickle=True)
        if isinstance(data, np.ndarray):
            if data.ndim == 2:
                return pd.DataFrame(data)
            else:
                raise ValueError("The loaded array is not two-dimensional.")
        else:
            raise TypeError("The loaded object is not a NumPy array.")
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"Error: An exception occurred - {str(e)}")
        return None
    
# def sentence_embedding(sentence, embeddings):
#     words = sentence.split()
#     vectors = [embeddings.get(word, np.zeros(300)) for word in words]
#     mean_vector = np.mean(vectors, axis=0)
#     return mean_vector

def _read_words(data, convmode=None):
    '''
        Count the occurrences of all words
        @param convmode: str, None for non conversational scope, 'naive' for classic or naive approach, 'conv' for conversation depth into account (one additional dim and nested values)
        @param data: list of examples
        @return words: list of words (with duplicates)
    '''
    words = []
    if convmode is None:
        for example in data:
            words += example     
    return words

Put main function here

In [97]:
print(get_encoding_type("data/train_sent_emo_dya.csv"))

MacRoman


In [98]:
X_train = pd.read_csv('data/train_sent_emo_dya.csv', encoding='MacRoman')

In [99]:
drop_features = list(X_train.keys()[6:])
drop_features.append("Emotion")
drop_features
y_train = X_train[["Emotion", "Dialogue_ID"]].copy()
X_train = X_train.drop(drop_features, axis=1)
X_train[:10]

Unnamed: 0,Utterance,Speaker,Sentiment,Dialogue_ID,Utterance_ID
0,also I was the point person on my companyÅfs t...,Chandler,neutral,0,0
1,You mustÅfve had your hands full.,The Interviewer,neutral,0,1
2,That I did. That I did.,Chandler,neutral,0,2
3,So letÅfs talk a little bit about your duties.,The Interviewer,neutral,0,3
4,My duties? All right.,Chandler,positive,0,4
5,"Now youÅfll be heading a whole division, so yo...",The Interviewer,neutral,0,5
6,I see.,Chandler,neutral,0,6
7,But thereÅfll be perhaps 30 people under you s...,The Interviewer,neutral,0,7
8,Good to know.,Chandler,neutral,0,8
9,We can go into detail,The Interviewer,neutral,0,9


In [100]:
y_train[:15]

Unnamed: 0,Emotion,Dialogue_ID
0,neutral,0
1,neutral,0
2,neutral,0
3,neutral,0
4,surprise,0
5,neutral,0
6,neutral,0
7,neutral,0
8,neutral,0
9,neutral,0


In [101]:
X_train["Utterance"] = X_train["Utterance"].apply(lambda x: replace_spelling(x)).apply(lambda x: preprocess_text(x))
X_train[:10]

Unnamed: 0,Utterance,Speaker,Sentiment,Dialogue_ID,Utterance_ID
0,also i was the point person on my company s tr...,Chandler,neutral,0,0
1,you must ve had your hands full,The Interviewer,neutral,0,1
2,that i did that i did,Chandler,neutral,0,2
3,so let s talk a little bit about your duties,The Interviewer,neutral,0,3
4,my duties all right,Chandler,positive,0,4
5,now you ll be heading a whole division so you ...,The Interviewer,neutral,0,5
6,i see,Chandler,neutral,0,6
7,but there ll be perhaps 30 people under you so...,The Interviewer,neutral,0,7
8,good to know,Chandler,neutral,0,8
9,we can go into detail,The Interviewer,neutral,0,9


In [102]:
## encode the emotion labels ##
file1 = open('data/dump/label_encoder.pkl', 'rb')
file2 = open('data/dump/label_decoder.pkl', 'rb')

if file1 and file2 is None:
    labels = set(y_train.Emotion)
    label_encoder = {label: i for i, label in enumerate(labels)}
    label_decoder = {i: label for i, label in enumerate(labels)}

    pickle.dump(label_encoder, open('data/dump/label_encoder.pkl', 'wb'))
    pickle.dump(label_decoder, open('data/dump/label_decoder.pkl', 'wb'))
    
else:
    label_encoder = pickle.load(file1)
    label_decoder = pickle.load(file2)
    file1.close()
    file2.close()

In [103]:
y_train["Emotion"] = y_train["Emotion"].apply(lambda x: encode_labels(label_encoder, x))
y_train[:15]

Unnamed: 0,Emotion,Dialogue_ID
0,1,0
1,1,0
2,1,0
3,1,0
4,5,0
5,1,0
6,1,0
7,1,0
8,1,0
9,1,0


In [104]:
## tokenize all sentences ##
file = open('data/dump/tokenizer.pkl', 'rb')

if file is None:
    all_text = list(X_train.Utterance)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_text)
    pickle.dump(tokenizer, open('data/dump/tokenizer.pkl', 'wb'))
else:
    tokenizer = pickle.load(file)
    file.close()

In [105]:
## convert the sentences into sequences ##
train_sequence = tokenizer.texts_to_sequences(list(X_train.Utterance))
X_train['sentence_length'] = [len(item) for item in train_sequence]

max_num_tokens = 250

train_sequence = pad_sequences(train_sequence, maxlen=max_num_tokens, padding='post')

X_train['sequence'] = list(train_sequence)

In [106]:
(tokenizer.word_counts)

OrderedDict([('also', 26),
             ('i', 5701),
             ('was', 616),
             ('the', 2463),
             ('point', 9),
             ('person', 27),
             ('on', 668),
             ('my', 861),
             ('company', 8),
             ('s', 2444),
             ('transition', 1),
             ('from', 142),
             ('kl', 1),
             ('5', 6),
             ('to', 2121),
             ('gr', 1),
             ('6', 10),
             ('system', 10),
             ('you', 4441),
             ('must', 30),
             ('ve', 267),
             ('had', 176),
             ('your', 489),
             ('hands', 13),
             ('full', 10),
             ('that', 1856),
             ('did', 262),
             ('so', 869),
             ('let', 224),
             ('talk', 85),
             ('a', 2067),
             ('little', 198),
             ('bit', 24),
             ('about', 428),
             ('duties', 3),
             ('all', 659),
             ('right', 60

<b>Idk why glove embeddings and toknizers were used in the orig source code...

In [107]:
## save the data in pickle format ##
file = open('data/dump/per_dialog_ids.pkl', "rb")
if file is None:
    dialogSpeakers, dialogInputSeq, dialogInputMaxSeqLen, dialogLabels = {}, {}, {}, {}
    X_train_dialog_ids = set(X_train.Dialogue_ID)
    all_data = X_train.copy()
    # all_data = X_train.append(X_test, ignore_index=True).append(X_valid, ignore_index=True)

    for item in list(X_train_dialog_ids):
        X_df = all_data[all_data.Dialogue_ID == item]
        y_df = y_train[y_train.Dialogue_ID == item]

        dialogSpeakers[item] = list(X_df.Speaker)
        dialogInputSeq[item] = list(X_df.sequence)
        dialogInputMaxSeqLen[item] = max(list(X_df.sentence_length))
        dialogLabels[item] = list(y_df.Emotion)

    pickle.dump([dialogSpeakers, dialogInputSeq, dialogInputMaxSeqLen, dialogLabels, X_train_dialog_ids], 
                open('data/dump/per_dialog_ids.pkl', 'wb'))
else:
    dialogSpeakers, dialogInputSeq, dialogInputMaxSeqLen, dialogLabels, X_train_dialog_ids = pickle.load(file)
    file.close()

In [108]:
## save pretrained embedding matrix ##
file = open('data/dump/glv_embedding_matrix.npy', "rb")
if file is None:
    glv_vector = load_pretrained_glove()
    word_vector_length = len(glv_vector['the'])#dim=300
    word_index = tokenizer.word_index
    inv_word_index = {v: k for k, v in word_index.items()}
    num_unique_words = len(word_index)
    glv_embedding_matrix = np.zeros((num_unique_words + 1, word_vector_length))

    for j in range(1, num_unique_words + 1):
        glv_embedding_matrix[j] = glv_vector.get(inv_word_index[j], np.random.randn(word_vector_length) / 200)

    np.save('data/dump/glv_embedding_matrix.npy', glv_embedding_matrix)
    print('Done. Completed preprocessing.')
else:
    glv_embedding_matrix = np.load('data/dump/glv_embedding_matrix.npy', allow_pickle=True)
    file.close()

In [109]:
vocab_size, embedding_dim = glv_embedding_matrix.shape
vocab_size, embedding_dim

(5409, 300)

No need for Word2Vec, I don't know why it was written in the ref source code

In [110]:
# vectors = Vectors(name="wiki-news-300d-1M.vec", cache="data/", unk_init=['<pad>', '<unk>'])

In [111]:
# vectors.vectors.shape

In [112]:
# vocab = Vocab(ordered_dict=tokenizer.word_counts, specials=['<pad>', '<unk>'], min_freq=5)

In [113]:
# vocab.set_default_index(vocab['<unk>'])

In [114]:
# vocab['hi']

In [115]:
vectors = Vectors(name="wiki-news-300d-1M.vec", url="data/", cache="data/")
vectors.cache(name="data/wiki-news-300d-1M.vec", url="data/", cache="data/")

In [116]:
# file_path = os.path.join(os.getcwd(), "data/wiki-news-300d-1M.vec")
# if os.path.isfile(file_path):
#     print(f"The file exists in the current directory.")
# else:
#     print(f"The file does not exist in the current directory.")

In [117]:
print(vectors.vectors.shape)

torch.Size([999994, 300])


In [118]:
vocab = Vocab(counter=collections.Counter(_read_words(X_train.Utterance)), 
              vectors=vectors,
              specials=['<pad>', '<unk>'], 
              min_freq=5)

Main func

In [120]:
ebd = WORDEBD(vocab, False)

In [121]:
ebd

WORDEBD(
  (embedding_layer): Embedding(40, 300)
)

In [122]:
args = Munch({
    "cnn_filter_sizes":[3,4,5],
    "cnn_num_filters":100,
    "cuda":-1,
    "mode":"train",
    "snapshot":'',
})

Creating an embedding

In [123]:
model = CNNLSTMseq(ebd, args) # ProtoSeq

In [124]:
print("{}, Building embedding".format(
    datetime.datetime.now().strftime('%y/%m/%d %H:%M:%S')), flush=True)

if args.snapshot != '':
    if args.multitask:

        print("{}, Loading pretrained embedding from {}".format(
            datetime.datetime.now().strftime('%y/%m/%d %H:%M:%S'),
            '%s_%s.ebd' % (args.snapshot, args.task)
            ))
        model.load_state_dict(  torch.load( '%s_%s.ebd' % (args.snapshot, args.task) ), strict=False  )

    else:    
        # load pretrained models
        print("{}, Loading pretrained embedding from {}".format(
            datetime.datetime.now().strftime('%y/%m/%d %H:%M:%S'),
            '{}.ebd'.format(args.snapshot)
            ))
        model.load_state_dict(  torch.load( '{}.ebd'.format(args.snapshot) ), strict=False  )

# if args.cuda != -1: 
#     model.cuda(args.cuda)
# else: 
#     model

24/01/18 16:34:39, Building embedding


In [125]:
model.train()

CNNLSTMseq(
  (ebd): WORDEBD(
    (embedding_layer): Embedding(40, 300)
  )
  (convs): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (lstm): LSTM(300, 150, bidirectional=True)
)

In [None]:
# dir(model.train())