In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding, LSTM, Concatenate, Reshape, GRU, Bidirectional
from keras import optimizers
from keras.models import load_model
from keras.metrics import categorical_accuracy
from sklearn.utils import class_weight
from fastText import load_model
import pickle
import os
import re
import io
import sys
from keras.models import Model

# Paths to the training and testing data files. This data can be downloaded from a link, details of which will be provided.
# Built with os.path.join so the notebook runs on any OS, not only on Windows.
trainDataPath = os.path.join("data", "train.txt")
testDataPath = os.path.join("data", "devwithoutlabels.txt")
# Output file that will be generated. This file can be directly submitted.
# It must name a file: the solution-writing cell opens this path for writing,
# which cannot succeed for a directory such as "/".
solutionPath = "test.txt"
# Directory where the word-vector file is saved, and the name of that file.
gloveDir = "vectors"
vectorName = "vectors.txt"

Using TensorFlow backend.


In [2]:
# Label id -> emotion name, in label-id order.
label2emotion = dict(enumerate(["others", "happy", "sad", "angry"]))
# Emotion name -> label id (the inverse of label2emotion).
emotion2label = {emotion: label for label, emotion in label2emotion.items()}

# Number of classes - Happy, Sad, Angry, Others
NUM_CLASSES = len(label2emotion)

# Upper limit on the number of tokens extracted using keras.preprocessing.text.Tokenizer
MAX_NB_WORDS = 15000
# All sentences having fewer words than this will be padded
MAX_SEQUENCE_LENGTH = 24
# The dimension of the word embeddings
EMBEDDING_DIM = 300

# The batch size to be chosen for training the model.
BATCH_SIZE = 128
# The dimension of the representations learnt by the LSTM model
LSTM_DIM = 600
# Fraction of the units to drop for the linear transformation of the inputs. Ref - https://keras.io/layers/recurrent/
DROPOUT = 0.5
# Number of epochs to train a model for
NUM_EPOCHS = 15
LEARNING_RATE = 0.0001

In [3]:
def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations, labels and the separate turns.

    The file is read as UTF-8, its first line is skipped as a header, and every
    remaining line is expected to be tab-separated: id, turn 1, turn 2, turn 3
    and (in "train" mode) the emotion label.
    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. Any other value (e.g. "test") doesn't return labels.
    Output:
        indices : List of conversation IDs (first column parsed as int)
        conversations : List of 3 turn conversations, lower-cased, repeated spaces collapsed
                        and each turn separated by the <eos> tag
        labels : [Only available in "train" mode] List of integer labels looked up in emotion2label
        u1, u2, u3 : Lists with the first, second and third turn of every conversation. Punctuation
                     is spaced out, but case and repeated spaces are left as they are.
    """
    # Both are loop invariants, so build them once instead of once per line.
    repeatedChars = ['.', '?', '!', ',']
    duplicateSpacePattern = re.compile(r'\ +')

    indices = []
    conversations = []
    labels = []
    u1 = []
    u2 = []
    u3 = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()  # skip the header row
        for line in finput:
            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            for c in repeatedChars:
                # Dropping the empty pieces left by split() is what collapses a
                # run of the same character into a single occurrence.
                pieces = [piece for piece in line.split(c) if piece != '']
                line = (' ' + c + ' ').join(pieces)

            fields = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                labels.append(emotion2label[fields[4]])

            conv = ' <eos> '.join(fields[1:4])

            u1.append(fields[1])
            u2.append(fields[2])
            u3.append(fields[3])

            # Remove any duplicate spaces
            conv = duplicateSpacePattern.sub(' ', conv)

            indices.append(int(fields[0]))
            conversations.append(conv.lower())

    if mode == "train":
        return indices, conversations, labels, u1, u2, u3
    return indices, conversations, u1, u2, u3

In [4]:
def getEmbeddingMatrix(wordIndex):
    """Populate an embedding matrix using a word-index. If the word "happy" has an index 19,
       the 19th row in the embedding matrix should contain the embedding vector for the word "happy".

    The vectors are read from the text file os.path.join(gloveDir, vectorName), one
    "<word> <v1> ... <vN>" entry per line. As a side effect, the parsed word -> vector
    mapping and the resulting matrix are pickled next to that file as
    "<vectorName>_index.pickle" and "<vectorName>_matrix.pickle".
    Input:
        wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
    Output:
        embeddingMatrix : A (len(wordIndex) + 1, EMBEDDING_DIM) matrix where row i holds the
                          embedding of the word with index i. Row 0 and the rows of words
                          missing from the vector file are all zeros.
    """
    vectorPath = os.path.join(gloveDir, vectorName)
    embeddingsIndex = {}

    if vectorName.startswith('glove'):
        # Load the embedding vectors from the GloVe file
        with io.open(vectorPath, encoding="utf8") as f:
            for line in f:
                values = line.split(' ')
                embeddingsIndex[values[0]] = np.array([float(val) for val in values[1:]])
        print('Found %s word vectors.' % len(embeddingsIndex))
    else:
        with io.open(vectorPath, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
            for lineNum, line in enumerate(fin):
                tokens = line.rstrip().split(' ')
                if lineNum == 0 and len(tokens) == 2 and EMBEDDING_DIM != 1:
                    # Looks like a word2vec/fastText style "<count> <dim>" header, not a
                    # word vector. Skip it so it is not stored as a one-element "vector".
                    continue
                embeddingsIndex[tokens[0]] = list(map(float, tokens[1:]))

    with open(vectorPath + '_index.pickle', 'wb') as handle:
        pickle.dump(embeddingsIndex, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Minimum word index of any word is 1, hence the extra row.
    embeddingMatrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))
    for word, i in wordIndex.items():
        embeddingVector = embeddingsIndex.get(word)
        if embeddingVector is not None:
            # words not found in embedding index will be all-zeros.
            embeddingMatrix[i] = embeddingVector

    with open(vectorPath + '_matrix.pickle', 'wb') as handle:
        pickle.dump(embeddingMatrix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return embeddingMatrix

In [5]:
def buildModel(embeddingMatrix):
    """Constructs the architecture of the model
    Input:
        embeddingMatrix : The embedding matrix to be loaded in the embedding layer.
    Output:
        model : A compiled three-input model. Each turn is embedded with one shared, frozen
                embedding layer and encoded by its own bidirectional LSTM. The three turn
                encodings are then read as a length-3 sequence by a further LSTM, followed
                by a softmax layer over NUM_CLASSES.
    """
    # One integer-index input per conversation turn.
    x1 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input1')
    x2 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input2')
    x3 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input3')

    # A single embedding layer is shared by all three turns; trainable=False keeps
    # the pre-trained vectors fixed during training.
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                               EMBEDDING_DIM,
                               weights=[embeddingMatrix],
                               input_length=MAX_SEQUENCE_LENGTH,
                               trainable=False)
    emb1 = embeddingLayer(x1)
    emb2 = embeddingLayer(x2)
    emb3 = embeddingLayer(x3)

    # Separate (non-shared) bidirectional encoders, one per turn.
    lstm1 = Bidirectional(LSTM(LSTM_DIM, dropout=DROPOUT))(emb1)
    lstm2 = Bidirectional(LSTM(LSTM_DIM, dropout=DROPOUT))(emb2)
    lstm3 = Bidirectional(LSTM(LSTM_DIM, dropout=DROPOUT))(emb3)

    # Concatenate the three turn encodings and reshape them into a sequence of
    # three steps of width 2*LSTM_DIM, so the top LSTM sees the turns in order.
    inp = Concatenate(axis=-1)([lstm1, lstm2, lstm3])
    inp = Reshape((3, 2*LSTM_DIM, ))(inp)

    out = LSTM(LSTM_DIM, dropout=DROPOUT)(inp)
    out = Dense(NUM_CLASSES, activation='softmax')(out)

    # Use the canonical class name rather than the lowercase alias.
    adam = optimizers.Adam(lr=LEARNING_RATE, amsgrad=True)
    model = Model([x1, x2, x3], out)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['acc'])
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

In [6]:
print("Processing training data...")
trainIndices, trainTexts, labels, u1_train, u2_train, u3_train = preprocessData(trainDataPath, mode="train")
# Keyword arguments work on old and new scikit-learn alike; recent releases
# reject positional arguments for this function.
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
print(class_weights)

print("Processing test data...")
testIndices, testTexts, u1_test, u2_test, u3_test = preprocessData(testDataPath, mode="test")

print("Extracting tokens...")
# The vocabulary is fitted on all three turns of the training conversations only.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(u1_train+u2_train+u3_train)

u1_trainSequences = tokenizer.texts_to_sequences(u1_train)
u2_trainSequences = tokenizer.texts_to_sequences(u2_train)
u3_trainSequences = tokenizer.texts_to_sequences(u3_train)

u1_testSequences = tokenizer.texts_to_sequences(u1_test)
u2_testSequences = tokenizer.texts_to_sequences(u2_test)
u3_testSequences = tokenizer.texts_to_sequences(u3_test)

wordIndex = tokenizer.word_index
print("Found %s unique tokens." % len(wordIndex))

print("Populating embedding matrix...")
embeddingMatrix = getEmbeddingMatrix(wordIndex)

u1_data = pad_sequences(u1_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
u2_data = pad_sequences(u2_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
u3_data = pad_sequences(u3_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print("Shape of training data tensor: ", u1_data.shape)
print("Shape of label tensor: ", labels.shape)

# Randomize data
# Shuffle by row position rather than by the conversation IDs read from the
# file: the IDs are only valid as row positions if they are exactly 0..N-1.
shuffleOrder = np.random.permutation(len(u1_data))
# Keep trainIndices aligned with the shuffled rows, so trainIndices[k] is the
# ID of the conversation now stored in row k.
trainIndices = [trainIndices[position] for position in shuffleOrder]

u1_data = u1_data[shuffleOrder]
u2_data = u2_data[shuffleOrder]
u3_data = u3_data[shuffleOrder]

labels = labels[shuffleOrder]

Processing training data...
[0.50441531 1.77704454 1.38019403 1.36941518]
Processing test data...
Extracting tokens...
Found 16830 unique tokens.
Populating embedding matrix...
Shape of training data tensor:  (30160, 24)
Shape of label tensor:  (30160, 4)


In [None]:
model = buildModel(embeddingMatrix)

# Keras documents `class_weight` as a dict mapping class index -> weight, so
# convert the array returned by compute_class_weight. Its order follows
# np.unique(labels), i.e. the sorted label ids, so position == class index as
# long as every class occurs in the training data.
classWeightMap = class_weights if isinstance(class_weights, dict) else dict(enumerate(class_weights))

model.fit([u1_data, u2_data, u3_data], labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, class_weight=classWeightMap, verbose=2)
# round() before int(): the float product can land just below the intended
# integer, and plain truncation would then name the file one step too low.
model.save('EP%d_LR%de-5_LDim%d_BS%d.h5' % (NUM_EPOCHS, int(round(LEARNING_RATE*(10**5))), LSTM_DIM, BATCH_SIZE))

In [None]:
print("Creating solution file...")
# Pad the test turns exactly like the training turns before predicting.
u1_testData = pad_sequences(u1_testSequences, maxlen=MAX_SEQUENCE_LENGTH)
u2_testData = pad_sequences(u2_testSequences, maxlen=MAX_SEQUENCE_LENGTH)
u3_testData = pad_sequences(u3_testSequences, maxlen=MAX_SEQUENCE_LENGTH)
predictions = model.predict(
    [u1_testData, u2_testData, u3_testData], batch_size=BATCH_SIZE)
# Keep only the index of the most probable class for every conversation.
predictions = predictions.argmax(axis=1)

with io.open(solutionPath, "w", encoding="utf8") as fout:
    fout.write(
        '\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n')
    # Re-read the raw test file so the original (unprocessed) turns are written
    # out; row k of `predictions` belongs to the k-th line after the header.
    with io.open(testDataPath, encoding="utf8") as fin:
        fin.readline()
        for lineNum, line in enumerate(fin):
            fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
            fout.write(label2emotion[predictions[lineNum]] + '\n')
print("Completed. Model parameters: ")
# %g instead of %.3f: a learning rate such as 1e-4 would otherwise print as 0.000.
print("Learning rate : %g, LSTM Dim : %d, Dropout : %.3f, Batch_size : %d"
      % (LEARNING_RATE, LSTM_DIM, DROPOUT, BATCH_SIZE))

In [None]:
# Write the tokenizer vocabulary, one word per line, next to the word vectors.
words = list(wordIndex)
# os.path.join keeps the path portable; a hard-coded back-slash separator
# only works on Windows.
with open(os.path.join(gloveDir, 'words.txt'), 'w', encoding='utf8') as f:
    for item in words:
        f.write("%s\n" % item)

In [None]:
# Preview the vocabulary; echoing the whole list would flood the notebook output.
words[:20]

In [7]:
# Display the padded turn-1 index matrix as left by the data-preparation cell.
u1_data

array([[   0,    0,    0, ...,  168,   67,  614],
       [   0,    0,    0, ...,   99,    3,    2],
       [   0,    0,    0, ...,    2,   23, 1518],
       ...,
       [   0,    0,    0, ..., 1860,  292,  353],
       [   0,    0,    0, ...,    1,  195,    2],
       [   0,    0,    0, ...,   36,   30,   19]])

In [16]:
# Reorder the raw turn-1 texts with trainIndices; presumably a sanity check that
# they line up row-by-row with the reordered u1_data.
np.array(u1_train)[trainIndices]

array(['U had ur lunch  ? ', 'i think i will get better talking to you',
       'Do you know cooking', ..., 'My gf is going to her native place 😭',
       'When did I meet you', 'Just like that'], dtype='<U309')

In [14]:
# Look up the tokenizer index assigned to the word 'lunch'.
wordIndex['lunch']

614