### Imports

In [108]:
import csv
import emoji
from keras import backend as K
from keras import optimizers
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.engine import Layer
from keras.layers import Dense, Embedding, LSTM, Bidirectional
from keras.layers import Dropout, Input, TimeDistributed, PReLU
from keras.models import Sequential, Model, load_model, save_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
import re
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import tensorflow as tf
import tensorflow_hub as hub
import time

In [2]:
# try:
#     from nltk.corpus import words
# except LookupError:
#     import nltk
#     print("Downloading nltk words...")
#     nltk.download("words")
#     from nltk.corpus import words

#### Elmo layer

In [109]:
# Create a custom layer that allows us to update weights (lambda layers do not have trainable parameters!)

class ElmoEmbeddingLayer(Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))

        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['default']
        return result

#     def compute_mask(self, inputs, mask=None):
#         return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.dimensions)

### Utils

#### Global Constants

In [4]:
TRAIN_FILE = 'data/train.txt'
DEV_FILE = 'data/dev.txt'
TEST_FILE = 'data/test.txt'

TURNS_NAMES = ["turn1", "turn2", "turn3"]
LABEL = ["label"]
CONCATENATED_TURNS = "turns"

##### Emoticons map

In [5]:
EMOTICONS_MAP = {
    '😘': ' emoticon',
    '😍': ' happyemoticon',
    '😁': ' happyemoticon',
    '😭': ' sademoticon',
    '😑': ' sademoticon',
    '😻': ' happyemoticon',
    '😂': ' happyemoticon',
    '👍': ' emoticon',
    '😀': ' happyemoticon',
    ':D': ' happyemoticon',
    '🙂':  ' happyemoticon',
    '<3': ' happyemoticon',
    '😓' : ' sademoticon',
    '😒' : ' angryemoticon',
    '😈' : ' emoticon',
    '👿' : ' angryemoticon',
    '🖑' : ' happyemoticon',
    '😾' : ' emoticon',
    '😠' : ' angryemoticon',
    '👻' : ' emoticon',
    ':(' : ' sademoticon',
    ':)' : ' happyemoticon',
    'xD' : ' happyemoticon',
    '💔' : ' sademoticon',
    '😥' : ' emoticon',
    '😞' : ' sademoticon',
    '😤' : ' angryemoticon',
    '😃' : ' happyemoticon',
    '😦' : ' sademoticon',
    ':3' : ' emoticon',
    '😼' : ' emoticon',
    '😏' : ' happyemoticon',
    '😱' : ' sademoticon',
    '😬' : ' sademoticon',
    '🙁' : ' sademoticon',
    '</3' : ' sademoticon',
    '😺' : ' happyemoticon',
    '😣' : ' angryemoticon',
    '😢' : ' sademoticon',
    '😆' : ' happyemoticon',
    '😄' : ' happyemoticon',
    '😅' : ' happyemoticon',
    ':-)' : ' happyemoticon',
    '😊' : ' happyemoticon',
    '😕' : ' sademoticon',
    '😽' : ' happyemoticon',
    '🙀' : ' angryemoticon',
    '🤣' : ' happyemoticon',
    '🤐' : ' emoticon',
    '😡' : ' sademoticon',
    '👌' : ' happyemoticon', 
    '😮' : ' emoticon',
    '❤️' : ' happyemoticon',
    '🙄' : ' happyemoticon',
    '😿' : ' sademoticon',
    '😉' : ' happyemoticon',
    '😋' : ' happyemoticon',
    '😐' : ' emoticon',
    '😹' : ' happyemoticon',
    '😴' : ' sademoticon',
    '💤' : ' emoticon',
    '😜' : ' happyemoticon',
    '😇' : ' happyemoticon',
    '😔' : ' sademoticon',
    '😩' : ' sademoticon',
    '❤' : ' happyemoticon',
    '😲' : ' emoticon',
    '😫' : ' sademoticon',
    '😳' : ' sademoticon',
    '😰' : ' sademoticon',
}
print(len(EMOTICONS_MAP.keys()))

70


#### print_model

In [6]:
def print_model(model_summary, parameters, accuracy, file_name="models/experiments.txt"):
    with open(file_name, "a") as f:
        delimiter = "=============================================="
        acc_delim = "----------------------------------------------"
        format_string = "===Experiment===\n%s\n%s\n%s\n%s\n%s\n"
        f.write(format_string % (model_summary,
                                 delimiter,
                                 parameters,
                                 acc_delim,
                                 str(accuracy)))

#### Data manipulation

In [110]:
def parse_file(file_path):
    output_dict = dict()
    with open(file_path, newline='\n', encoding='utf8') as csvfile:
        return pd.read_csv(csvfile, sep="\t")

##### Load data

In [27]:
train_data = parse_file(TRAIN_FILE)
dev_data = parse_file(DEV_FILE)
test_data = parse_file(TEST_FILE)

##### Preprocess

In [111]:
def concatenate_turns(df, delim="fullstop"):
    turns = [("%s %s %s %s %s" %
                 (row[TURNS_NAMES[0]], delim,
                  row[TURNS_NAMES[1]], delim,
                  row[TURNS_NAMES[2]])).lower()
                 for index, row in df.iterrows()]
    df[CONCATENATED_TURNS] = pd.Series(turns, index=df.index)
    return df

In [29]:
def emoticons_replace(df):
    for index, row in df.iterrows():
        turns = emoji.demojize(row[CONCATENATED_TURNS])
        # remove delimiters ":"  (:smiley: -> smiley)
        for emoj in re.findall(":\w*:", turns):
            turns  = turns.replace(emoj, emoj[1:-1])
        df.at[index, CONCATENATED_TURNS] = turns
    return df

In [11]:
# tweet_tokenizer = TweetTokenizer()
# def tokenize_turns(df):
#     turns = [tweet_tokenizer.tokenize(row[CONCATENATED_TURNS]) 
#                 for idx, row in df.iterrows()]
#     df[CONCATENATED_TURNS] = pd.Series(turns, index=df.index)
#     return df

In [96]:
train = emoticons_replace(concatenate_turns(train_data))
dev = emoticons_replace(concatenate_turns(dev_data))
test = emoticons_replace(concatenate_turns(test_data))

In [None]:
# Ordinal encoding
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# print(le.fit(train[LABEL]))
# print(le.classes_)
# print(train[LABEL])
# print(le.transform(train[LABEL]))
# print(train[LABEL])

# One Hot encoding

In [95]:
max_sentence = 189 # 163,82,189
### angry: [1 0 0 0]
### happy: [0 1 0 0]
### others: [0 0 1 0]
### sad: [0 0 0 1]
labels = {0: 'angry',
          1: 'happy',
          2: 'others',
          3: 'sad'}

In [None]:
tokenizer = Tokenizer()
all_text = train[CONCATENATED_TURNS]
all_text = all_text.append(dev[CONCATENATED_TURNS])
all_text = all_text.append(test[CONCATENATED_TURNS])

# TODO
# alternative: fit_on_english corpora: 
# tokenizer.fit_on_texts(words.words())
tokenizer.fit_on_texts(all_text)
X_train = pad_sequences(tokenizer.texts_to_sequences(train[CONCATENATED_TURNS]),
                        maxlen=max_sentence)
X_dev = pad_sequences(tokenizer.texts_to_sequences(dev[CONCATENATED_TURNS]),
                      maxlen=max_sentence)
X_test = pad_sequences(tokenizer.texts_to_sequences(test[CONCATENATED_TURNS]),
                       maxlen=max_sentence)


In [112]:
### angry: [1 0 0 0]
### happy: [0 1 0 0]
### others: [0 0 1 0]
### sad: [0 0 0 1]
Y_train = pd.get_dummies(train[LABEL]).as_matrix()
# for i, t in enumerate(train[LABEL].iterrows()):
#     if t[1]['label']=='others':
#         print(Y_train[i])
#         break
Y_dev = pd.get_dummies(dev[LABEL]).as_matrix()
Y_test = pd.get_dummies(test[LABEL]).as_matrix()

In [None]:
print(dir(tokenizer))

for tok in tokenizer.word_count:
    if tok[1] == 1:
        print(tok)

### Model

#### Model eval metrics

In [107]:
class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict, average='micro')
        _val_recall = recall_score(val_targ, val_predict, average='micro')
        _val_precision = precision_score(val_targ, val_predict, average='micro')
        print(_val_f1)
        print(_val_recall)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))
        print_metrics_predicted(val_predict, val_targ)
        return
 
metrics = Metrics()

def print_metrics_predicted(X,Y):
    tp =[0,0,0,0]
    fp =[0,0,0,0]
    fn =[0,0,0,0]
    for i,pred in enumerate(predicts):
        p = np.argmax(pred)
        y = np.argmax(Y[i])
        if p == y:
            tp[p] += 1
        else:
            fp[p] +=1
            fn[y] +=1
    prec = sum(tp)/sum(tp+fp)
    rec = sum(tp)/sum(tp+fn)
    print("F1 all")
    print(2*prec*rec/(prec+rec)) 
    print("***")
    for i in range(4):
        print("F1 %s: " % labels[i])
        prec = tp[i]/(tp[i]+fp[i])
        rec = tp[i]/(tp[i]+fn[i])
        print(2*prec*rec/(prec+rec))
        print("****")
    tp.pop(2)
    fp.pop(2)
    fn.pop(2)
    print("F1 happy angry sad")
    prec = sum(tp)/sum(tp+fp)
    rec = sum(tp)/sum(tp+fn)
    f1= 2*prec*rec/(prec+rec)
    print(f1)

def print_metrics(model,X,Y, file_name):
    predicts = (np.asarray(model.predict(X))).round()
    tp =[0,0,0,0]
    fp =[0,0,0,0]
    fn =[0,0,0,0]
    for i,pred in enumerate(predicts):
        p = np.argmax(pred)
        y = np.argmax(Y[i])
        if p == y:
            tp[p] += 1
        else:
            fp[p] +=1
            fn[y] +=1
    prec = sum(tp)/sum(tp+fp)
    rec = sum(tp)/sum(tp+fn)
    with open(file_name, 'a') as f:
        print("F1 all", file=f)
        print(2*prec*rec/(prec+rec), file=f) 
        print("***", file=f)
        for i in range(4):
            print("F1 %s: " % labels[i], file=f)
            prec = tp[i]/(tp[i]+fp[i])
            rec = tp[i]/(tp[i]+fn[i])
            print(2*prec*rec/(prec+rec), file=f)
            print("****", file=f)
        tp.pop(2)
        fp.pop(2)
        fn.pop(2)
        print("F1 happy angry sad", file=f)
        prec = sum(tp)/sum(tp+fp)
        rec = sum(tp)/sum(tp+fn)
        f1= 2*prec*rec/(prec+rec)
        print(f1, file=f)
        return f1

#### Model params

In [106]:
vocabulary_size = len(tokenizer.word_counts) + 1

epochs = 10
embed_dim = 256
lstm_out = 128
batch_size = 128
drop_out = 0.3
loss_fct = 'binary_crossentropy'
activation_fct = 'softmax'
optimizer = "Adam-0.01"

parameters = """Epochs:%s\nEmbed_dim: %s\nLstm_out: %s\nBatch size: %s\nDrop_out: %s
Loss_fct: %s\nActivaion_fct: %s\nOptimizer: %s\n
""" %(str(epochs), str(embed_dim), str(lstm_out), str(batch_size), str(drop_out), loss_fct,
      activation_fct, optimizer)

NameError: name 'tokenizer' is not defined

#### Model train

In [124]:
input_text = Input(shape=(1,), dtype="string")
embedding = ElmoEmbeddingLayer()(input_text)
# bilstm = Bidirectional(LSTM(lstm_out), input_shape=(1024,))(embedding)
dense = Dense(512, activation='relu')(embedding)
dense = PReLU()(dense)
dr = Dropout(drop_out)(dense)
dense = Dense(256)(dr)
dense = PReLU()(dense)
dr = Dropout(drop_out)(dense)

pred = Dense(4, activation='softmax')(dense)

model = Model(inputs=[input_text], outputs=pred)
adam = optimizers.Adam(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
model.summary()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0401 19:04:36.349662  3648 saver.py:1483] Saver not created because there are no variables in the graph to restore


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_36 (InputLayer)        (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_35 (Elm (None, 1024)              4         
_________________________________________________________________
dense_43 (Dense)             (None, 512)               524800    
_________________________________________________________________
p_re_lu_17 (PReLU)           (None, 512)               512       
_________________________________________________________________
dropout_22 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_44 (Dense)             (None, 256)               131328    
_________________________________________________________________
p_re_lu_18 (PReLU)           (None, 256)               256       
__________

In [122]:
# input_text = Input(shape=(1,), dtype="string")
# embedding = ElmoEmbeddingLayer()(input_text)
# # bilstm = Bidirectional(LSTM(lstm_out), input_shape=(1024,))(embedding)
# # dr = Dropout(drop_out)(bilstm)
# pred = Dense(4, activation=activation_fct)(embedding)

# model = Model(inputs=[input_text], outputs=pred)
# adam = optimizers.Adam(lr=0.01)
# model.compile(loss=loss_fct, optimizer=adam, metrics=['accuracy'])
# model.summary()

In [78]:
# model = Sequential()
# # model.add(Input(shape=(1,), dtype='string'))
# model.add(ElmoEmbeddingLayer()(Input(shape=(1,))))
# # model.add(Embedding(vocabulary_size, embed_dim,input_length = X_train.shape[1]))
# model.add(Bidirectional(LSTM(lstm_out)))
# model.add(Dropout(drop_out))
# model.add(Dense(4,activation=activation_fct))
# adam = optimizers.Adam(lr=0.01)
# rmsprop = optimizers.RMSprop(lr=0.005)#, rho=0.9, epsilon=None, decay=0.0)
# sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
# model.compile(loss = loss_fct, optimizer=adam, metrics = ['accuracy'])
# print(model.summary())

In [89]:
MODEL_CHECKPOINT = 'models/2_elmo-dense-prelu'

In [126]:
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
print(sess)

<tensorflow.python.client.session.Session object at 0x000002A6EE4218D0>


In [123]:
model.fit(train[CONCATENATED_TURNS], Y_train, epochs=2, verbose=1,
          batch_size=batch_size,
          validation_data=(dev[CONCATENATED_TURNS], Y_dev),
          callbacks=[metrics,
                     EarlyStopping(),
                     ModelCheckpoint(MODEL_CHECKPOINT, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
                    ])

Train on 30160 samples, validate on 2755 samples
Epoch 1/2


KeyboardInterrupt: 

In [36]:
# model.fit(X_train, Y_train, epochs=epochs, verbose=1, batch_size=batch_size,
#           validation_data=(X_dev, Y_dev),
#           callbacks=[metrics,
#                      EarlyStopping(),
#                      ModelCheckpoint(MODEL_CHECKPOINT, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
#                     ])

#### Model eval

In [None]:
results = model.evaluate(X_test, Y_test, callbacks=[metrics])

In [None]:
print(results)
print(model.metrics_names)

In [44]:
f1 = print_metrics(model, test[CONCATENATED_TURNS], Y_test, "Elmo-2epochs-results.txt")
print(f1)

0.33487052146151114


#### Model save & load

In [45]:
MODEL_PATH = '%s-%s.json' % (MODEL_CHECKPOINT, str(f1))
MODEL_W_PATH = '%s-%s.h5' % (MODEL_CHECKPOINT, str(f1))
OPTS_PATH = 'models/1_opts-elmo-bilstm-dr-dense'
# with open(OPTS_PATH, 'w') as f:
#     f.write(str(model.get_config()))
#     f.write("\n%s" % parameters)

In [None]:
model.save(MODEL_PATH)

In [46]:
def load_saved_model(model_path):
    return load_model(model_path)

In [53]:
loaded_model = load_model("models/1_emb-bilstm-dr-dense-0.6213307240704501.json")

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


W0401 17:39:31.756818  3648 deprecation.py:506] From f:\python 36 64\lib\site-packages\keras\backend\tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [56]:
loaded_model = load_model("models/1_elmo-dense-dense",
                          custom_objects={'ElmoEmbeddingLayer': ElmoEmbeddingLayer})

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0401 17:44:39.113742  3648 saver.py:1483] Saver not created because there are no variables in the graph to restore


ValueError: An operation has `None` for gradient. Please make sure that all of your ops have a gradient defined (i.e. are differentiable). Common ops without gradient: K.argmax, K.round, K.eval.

In [54]:
f1 = print_metrics(loaded_model, test[CONCATENATED_TURNS], Y_test, OPTS_PATH)
print(f1)

ValueError: Error when checking input: expected embedding_6_input to have shape (189,) but got array with shape (1,)

In [None]:
print(np.shape(X_train))
print(np.shape(Y_train))

print(np.shape(X_dev))

print(np.shape(Y_dev))


print(np.shape(X_test))

print(np.shape(Y_test))


In [None]:
# y_true = [[0,0,0,1], [0,1,0,0], [1,0,0,0]]
# y_pred = [[0,0,0,1], [1,0,0,0], [1,0,0,0]]
# # print(f1_score(y_true, y_pred, average='macro'))
# print(f1_score(y_true, y_pred, average='micro')) 
# # print(f1_score(y_true, y_pred, average='weighted')) 
# # print(f1_score(y_true, y_pred, average='samples')) 

In [None]:
import inspect as i
import sys
sys.stdout.write(i.getsource(model.evaluate))