### Imports

In [128]:
import csv
import emoji
from keras import optimizers
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
import re
import time

### Utils

#### Global Constants

In [61]:
# Dataset splits (paths are relative to the notebook's working directory).
TRAIN_FILE = "data/train.txt"
DEV_FILE = "data/dev.txt"
TEST_FILE = "data/test.txt"

# Column names: the three conversation turns, the label column (kept as a
# one-element list), and the column that receives the joined turns.
TURNS_NAMES = ['turn1', 'turn2', 'turn3']
LABEL = ['label']
CONCATENATED_TURNS = 'turns'

##### Emoticons map

In [19]:
# Maps an emoji or ASCII emoticon to a replacement token. Every token starts
# with a space. The categories are: generic, happy, sad and angry.
EMOTICONS_MAP = {
    "😘": " emoticon",
    "😍": " happyemoticon",
    "😁": " happyemoticon",
    "😭": " sademoticon",
    "😑": " sademoticon",
    "😻": " happyemoticon",
    "😂": " happyemoticon",
    "👍": " emoticon",
    "😀": " happyemoticon",
    ":D": " happyemoticon",
    "🙂": " happyemoticon",
    "<3": " happyemoticon",
    "😓": " sademoticon",
    "😒": " angryemoticon",
    "😈": " emoticon",
    "👿": " angryemoticon",
    "🖑": " happyemoticon",
    "😾": " emoticon",
    "😠": " angryemoticon",
    "👻": " emoticon",
    ":(": " sademoticon",
    ":)": " happyemoticon",
    "xD": " happyemoticon",
    "💔": " sademoticon",
    "😥": " emoticon",
    "😞": " sademoticon",
    "😤": " angryemoticon",
    "😃": " happyemoticon",
    "😦": " sademoticon",
    ":3": " emoticon",
    "😼": " emoticon",
    "😏": " happyemoticon",
    "😱": " sademoticon",
    "😬": " sademoticon",
    "🙁": " sademoticon",
    "</3": " sademoticon",
    "😺": " happyemoticon",
    "😣": " angryemoticon",
    "😢": " sademoticon",
    "😆": " happyemoticon",
    "😄": " happyemoticon",
    "😅": " happyemoticon",
    ":-)": " happyemoticon",
    "😊": " happyemoticon",
    "😕": " sademoticon",
    "😽": " happyemoticon",
    "🙀": " angryemoticon",
    "🤣": " happyemoticon",
    "🤐": " emoticon",
    "😡": " sademoticon",
    "👌": " happyemoticon",
    "😮": " emoticon",
    "❤️": " happyemoticon",
    "🙄": " happyemoticon",
    "😿": " sademoticon",
    "😉": " happyemoticon",
    "😋": " happyemoticon",
    "😐": " emoticon",
    "😹": " happyemoticon",
    "😴": " sademoticon",
    "💤": " emoticon",
    "😜": " happyemoticon",
    "😇": " happyemoticon",
    "😔": " sademoticon",
    "😩": " sademoticon",
    "❤": " happyemoticon",
    "😲": " emoticon",
    "😫": " sademoticon",
    "😳": " sademoticon",
    "😰": " sademoticon",
}
# Number of distinct entries in the map.
print(len(EMOTICONS_MAP))

70


#### print_model

In [None]:
def print_model(model_summary, parameters, accuracy, file_name="models/experiments.txt"):
    """Append one experiment record to a plain-text log file.

    The record is a fixed header line followed by the model summary, a
    ``=`` delimiter, the parameters, a ``-`` delimiter and the accuracy,
    each on its own line.

    Args:
        model_summary: Description of the model; rendered with ``%s``.
        parameters: Description of the hyper-parameters; rendered with ``%s``.
        accuracy: Score of the experiment; converted with ``str``.
        file_name: Path of the log file. It is opened in append mode and its
            parent directory is created when it does not exist yet.
    """
    import os  # local import: only this helper needs it

    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        # open(..., "a") creates a missing file but not a missing directory.
        os.makedirs(parent_dir, exist_ok=True)

    delimiter = "=============================================="
    acc_delim = "----------------------------------------------"
    format_string = "===Experiment===\n%s\n%s\n%s\n%s\n%s\n"
    record = format_string % (model_summary,
                              delimiter,
                              parameters,
                              acc_delim,
                              str(accuracy))
    # Explicit encoding so the log does not depend on the platform default.
    with open(file_name, "a", encoding="utf8") as f:
        f.write(record)

#### Data manipulation

In [3]:
def parse_file(file_path):
    """Read a tab-separated, UTF-8 encoded text file into a DataFrame.

    Args:
        file_path: Path of the file to read.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with a tab
        separator; the first line of the file supplies the column names.
    """
    # newline='\n': only "\n" ends a line and no newline translation is done.
    with open(file_path, newline='\n', encoding='utf8') as csvfile:
        return pd.read_csv(csvfile, sep="\t")

##### Load data

In [4]:
# Read the three dataset splits, in train / dev / test order.
train_data, dev_data, test_data = map(parse_file, (TRAIN_FILE, DEV_FILE, TEST_FILE))

##### Preprocess

In [5]:
def concatenate_turns(df, delim="fullstop"):
    """Join the three turn columns of every row into one lower-cased string.

    The turns are separated by ``delim`` with a single space on each side.
    The result is stored in the ``CONCATENATED_TURNS`` column of ``df``.

    Args:
        df: DataFrame holding the columns listed in ``TURNS_NAMES``.
        delim: Token placed between consecutive turns.

    Returns:
        The same ``df`` object, modified in place.
    """
    first, second, third = TURNS_NAMES[0], TURNS_NAMES[1], TURNS_NAMES[2]
    merged = []
    for _, record in df.iterrows():
        text = "%s %s %s %s %s" % (record[first], delim,
                                   record[second], delim,
                                   record[third])
        merged.append(text.lower())
    df[CONCATENATED_TURNS] = pd.Series(merged, index=df.index)
    return df

In [6]:
def emoticons_replace(df):
    """Replace emoji in the concatenated-turns column with plain-text names.

    Each row's text is passed through ``emoji.demojize``. Every ``:name:``
    span found afterwards then loses its surrounding colons
    (``:smiley:`` -> ``smiley``). The pattern also matches a bare ``::``,
    which is therefore removed.

    Args:
        df: DataFrame with a ``CONCATENATED_TURNS`` column of strings.

    Returns:
        The same ``df`` object, modified in place.
    """
    for index, row in df.iterrows():
        turns = emoji.demojize(row[CONCATENATED_TURNS])
        # Raw string: "\w" in a normal string literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        for emoj in re.findall(r":\w*:", turns):
            # remove delimiters ":"  (:smiley: -> smiley)
            turns = turns.replace(emoj, emoj[1:-1])
        df.at[index, CONCATENATED_TURNS] = turns
    return df

In [7]:
# One shared tokenizer instance, reused by every tokenize_turns call.
tweet_tokenizer = TweetTokenizer()


def tokenize_turns(df):
    """Tokenize the concatenated-turns column with ``tweet_tokenizer``.

    Args:
        df: DataFrame with a ``CONCATENATED_TURNS`` column of strings.

    Returns:
        The same ``df`` object; its ``CONCATENATED_TURNS`` column now holds
        the tokenizer's output for each row instead of the raw string.
    """
    token_lists = []
    for _, record in df.iterrows():
        token_lists.append(tweet_tokenizer.tokenize(record[CONCATENATED_TURNS]))
    df[CONCATENATED_TURNS] = pd.Series(token_lists, index=df.index)
    return df

In [53]:
# Preprocess train and dev: join the turns first, then rewrite the emoji.
# Both helpers modify the frame they receive and return it.
train = concatenate_turns(train_data)
train = emoticons_replace(train)

dev = concatenate_turns(dev_data)
dev = emoticons_replace(dev)

In [103]:
# Ordinal encoding (disabled; the labels are one-hot encoded with pd.get_dummies instead)
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# print(le.fit(train[LABEL]))
# print(le.classes_)
# print(train[LABEL])
# print(le.transform(train[LABEL]))
# print(train[LABEL])

# One Hot encoding

In [114]:
# The vocabulary is built from the train and dev text together.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
all_text = pd.concat([train[CONCATENATED_TURNS], dev[CONCATENATED_TURNS]])

# The tokenizer has to be created here: no earlier cell defines it, so a fresh
# kernel would otherwise fail with a NameError.
# NOTE(review): default Tokenizer arguments are assumed; confirm them against
# the settings used for the recorded experiments.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_text)

X_train = pad_sequences(tokenizer.texts_to_sequences(train[CONCATENATED_TURNS]))
# Pad/truncate dev to the training length. The model's Embedding layer is
# built with input_length=X_train.shape[1], so both arrays must share it.
X_dev = pad_sequences(tokenizer.texts_to_sequences(dev[CONCATENATED_TURNS]),
                      maxlen=X_train.shape[1])

Y_train = pd.get_dummies(train[LABEL])
# Align the dev columns with the train columns so both targets use the same
# class order and width, even if a class is missing from dev.
Y_dev = pd.get_dummies(dev[LABEL]).reindex(columns=Y_train.columns, fill_value=0)

#### Model

In [119]:
# Inspect the fitted tokenizer: its attribute names, then the number of
# distinct words it counted.
# The full tables are available too, but they are very long:
#   print(tokenizer.word_counts)
#   print(tokenizer.word_index)
print(dir(tokenizer), len(tokenizer.word_counts), sep="\n")

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'char_level', 'document_count', 'filters', 'fit_on_sequences', 'fit_on_texts', 'index_docs', 'lower', 'num_words', 'sequences_to_matrix', 'split', 'texts_to_matrix', 'texts_to_sequences', 'texts_to_sequences_generator', 'word_counts', 'word_docs', 'word_index']
16974


In [123]:
# +1 leaves room for index 0, the default padding value of pad_sequences.
vocabulary_size = len(tokenizer.word_counts) + 1


epochs = 10
embed_dim = 256
lstm_out = 128
batch_size = 128
drop_out = 0.3
# The labels are one-hot encoded and the output layer is a softmax, so the
# matching loss is categorical cross-entropy. With 'binary_crossentropy'
# Keras also resolves the 'accuracy' metric to binary accuracy, which
# overstates multi-class performance.
loss_fct = 'categorical_crossentropy'
activation_fct = 'softmax'
# Label written to the experiment log only; keep it in sync with the
# optimizer object actually passed to model.compile().
optimizer = "Adam-0.01"

# Human-readable description of the run.
parameters = """Epochs:%s\nEmbed_dim: %s\nLstm_out: %s\nBatch size: %s\nDrop_out: %s
Loss_fct: %s\nActivaion_fct: %s\nOptimizer: %s\n
""" %(str(epochs), str(embed_dim), str(lstm_out), str(batch_size), str(drop_out), loss_fct,
      activation_fct, optimizer)

In [130]:
# Embedding -> bidirectional LSTM -> dropout -> dense output layer.
model = Sequential()
model.add(Embedding(vocabulary_size, embed_dim,input_length = X_train.shape[1]))
model.add(Bidirectional(LSTM(lstm_out)))
model.add(Dropout(drop_out))
# NOTE(review): 4 output units assumes four label classes; confirm against
# the number of columns in Y_train.
model.add(Dense(4,activation=activation_fct))

# Only `adam` is passed to compile(); `rmsprop` and `sgd` are created here
# but not used in this cell.
adam = optimizers.Adam(lr=0.01)
rmsprop = optimizers.RMSprop(lr=0.005)#, rho=0.9, epsilon=None, decay=0.0)
sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss = loss_fct, optimizer=adam, metrics = ['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 163, 256)          4345600   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 1028      
Total params: 4,740,868
Trainable params: 4,740,868
Non-trainable params: 0
_________________________________________________________________
None
