### Imports

In [144]:
import csv
import emoji
from keras import optimizers
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
import re
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import time

In [None]:
# try:
#     from nltk.corpus import words
# except LookupError:
#     import nltk
#     print("Downloading nltk words...")
#     nltk.download("words")
#     from nltk.corpus import words

### Utils

#### Global Constants

In [4]:
TRAIN_FILE = 'data/train.txt'
DEV_FILE = 'data/dev.txt'
TEST_FILE = 'data/test.txt'

TURNS_NAMES = ["turn1", "turn2", "turn3"]
LABEL = ["label"]
CONCATENATED_TURNS = "turns"

##### Emoticons map

In [19]:
EMOTICONS_MAP = {
    '😘': ' emoticon',
    '😍': ' happyemoticon',
    '😁': ' happyemoticon',
    '😭': ' sademoticon',
    '😑': ' sademoticon',
    '😻': ' happyemoticon',
    '😂': ' happyemoticon',
    '👍': ' emoticon',
    '😀': ' happyemoticon',
    ':D': ' happyemoticon',
    '🙂':  ' happyemoticon',
    '<3': ' happyemoticon',
    '😓' : ' sademoticon',
    '😒' : ' angryemoticon',
    '😈' : ' emoticon',
    '👿' : ' angryemoticon',
    '🖑' : ' happyemoticon',
    '😾' : ' emoticon',
    '😠' : ' angryemoticon',
    '👻' : ' emoticon',
    ':(' : ' sademoticon',
    ':)' : ' happyemoticon',
    'xD' : ' happyemoticon',
    '💔' : ' sademoticon',
    '😥' : ' emoticon',
    '😞' : ' sademoticon',
    '😤' : ' angryemoticon',
    '😃' : ' happyemoticon',
    '😦' : ' sademoticon',
    ':3' : ' emoticon',
    '😼' : ' emoticon',
    '😏' : ' happyemoticon',
    '😱' : ' sademoticon',
    '😬' : ' sademoticon',
    '🙁' : ' sademoticon',
    '</3' : ' sademoticon',
    '😺' : ' happyemoticon',
    '😣' : ' angryemoticon',
    '😢' : ' sademoticon',
    '😆' : ' happyemoticon',
    '😄' : ' happyemoticon',
    '😅' : ' happyemoticon',
    ':-)' : ' happyemoticon',
    '😊' : ' happyemoticon',
    '😕' : ' sademoticon',
    '😽' : ' happyemoticon',
    '🙀' : ' angryemoticon',
    '🤣' : ' happyemoticon',
    '🤐' : ' emoticon',
    '😡' : ' sademoticon',
    '👌' : ' happyemoticon', 
    '😮' : ' emoticon',
    '❤️' : ' happyemoticon',
    '🙄' : ' happyemoticon',
    '😿' : ' sademoticon',
    '😉' : ' happyemoticon',
    '😋' : ' happyemoticon',
    '😐' : ' emoticon',
    '😹' : ' happyemoticon',
    '😴' : ' sademoticon',
    '💤' : ' emoticon',
    '😜' : ' happyemoticon',
    '😇' : ' happyemoticon',
    '😔' : ' sademoticon',
    '😩' : ' sademoticon',
    '❤' : ' happyemoticon',
    '😲' : ' emoticon',
    '😫' : ' sademoticon',
    '😳' : ' sademoticon',
    '😰' : ' sademoticon',
}
print(len(EMOTICONS_MAP.keys()))

70


#### print_model

In [None]:
def print_model(model_summary, parameters, accuracy, file_name="models/experiments.txt"):
    with open(file_name, "a") as f:
        delimiter = "=============================================="
        acc_delim = "----------------------------------------------"
        format_string = "===Experiment===\n%s\n%s\n%s\n%s\n%s\n"
        f.write(format_string % (model_summary,
                                 delimiter,
                                 parameters,
                                 acc_delim,
                                 str(accuracy)))

#### Data manipulation

In [5]:
def parse_file(file_path):
    output_dict = dict()
    with open(file_path, newline='\n', encoding='utf8') as csvfile:
        return pd.read_csv(csvfile, sep="\t")

##### Load data

In [6]:
train_data = parse_file(TRAIN_FILE)
dev_data = parse_file(DEV_FILE)
test_data = parse_file(TEST_FILE)

##### Preprocess

In [7]:
def concatenate_turns(df, delim="fullstop"):
    turns = [("%s %s %s %s %s" %
                 (row[TURNS_NAMES[0]], delim,
                  row[TURNS_NAMES[1]], delim,
                  row[TURNS_NAMES[2]])).lower()
                 for index, row in df.iterrows()]
    df[CONCATENATED_TURNS] = pd.Series(turns, index=df.index)
    return df

In [8]:
def emoticons_replace(df):
    for index, row in df.iterrows():
        turns = emoji.demojize(row[CONCATENATED_TURNS])
        # remove delimiters ":"  (:smiley: -> smiley)
        for emoj in re.findall(":\w*:", turns):
            turns  = turns.replace(emoj, emoj[1:-1])
        df.at[index, CONCATENATED_TURNS] = turns
    return df

In [9]:
tweet_tokenizer = TweetTokenizer()
def tokenize_turns(df):
    turns = [tweet_tokenizer.tokenize(row[CONCATENATED_TURNS]) 
                for idx, row in df.iterrows()]
    df[CONCATENATED_TURNS] = pd.Series(turns, index=df.index)
    return df

In [10]:
train = emoticons_replace(concatenate_turns(train_data))
dev = emoticons_replace(concatenate_turns(dev_data))
test = emoticons_replace(concatenate_turns(test_data))

In [103]:
# Ordinal encoding
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# print(le.fit(train[LABEL]))
# print(le.classes_)
# print(train[LABEL])
# print(le.transform(train[LABEL]))
# print(train[LABEL])

# One Hot encoding

In [117]:
max_sentence = 189 # 163,82,189
### angry: [1 0 0 0]
### happy: [0 1 0 0]
### others: [0 0 1 0]
### sad: [0 0 0 1]
labels = {0: 'angry',
          1: 'happy',
          2: 'others',
          3: 'sad'}

In [111]:
tokenizer = Tokenizer()
all_text = train[CONCATENATED_TURNS]
all_text = all_text.append(dev[CONCATENATED_TURNS])
all_text = all_text.append(test[CONCATENATED_TURNS])

# TODO
# alternative: fit_on_english corpora: 
# tokenizer.fit_on_texts(words.words())
tokenizer.fit_on_texts(all_text)
X_train = pad_sequences(tokenizer.texts_to_sequences(train[CONCATENATED_TURNS]),
                        maxlen=max_sentence)
X_dev = pad_sequences(tokenizer.texts_to_sequences(dev[CONCATENATED_TURNS]),
                      maxlen=max_sentence)
X_test = pad_sequences(tokenizer.texts_to_sequences(test[CONCATENATED_TURNS]),
                       maxlen=max_sentence)
### angry: [1 0 0 0]
### happy: [0 1 0 0]
### others: [0 0 1 0]
### sad: [0 0 0 1]
Y_train = pd.get_dummies(train[LABEL]).as_matrix()
# for i, t in enumerate(train[LABEL].iterrows()):
#     if t[1]['label']=='others':
#         print(Y_train[i])
#         break
Y_dev = pd.get_dummies(dev[LABEL]).as_matrix()
Y_test = pd.get_dummies(test[LABEL]).as_matrix()

In [112]:
print(dir(tokenizer))

for tok in tokenizer.word_count:
    if tok[1] == 1:
        print(tok)

#### Model

#### Model eval metrics

In [136]:
class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict, average='micro')
        _val_recall = recall_score(val_targ, val_predict, average='micro')
        _val_precision = precision_score(val_targ, val_predict, average='micro')
        print(_val_f1)
        print(_val_recall)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))
        return
 
metrics = Metrics()

def print_metrics(model,X,Y, file_name):
    predicts = (np.asarray(model.predict(X))).round()
    tp =[0,0,0,0]
    fp =[0,0,0,0]
    fn =[0,0,0,0]
    for i,pred in enumerate(predicts):
        p = np.argmax(pred)
        y = np.argmax(Y[i])
        if p == y:
            tp[p] += 1
        else:
            fp[p] +=1
            fn[y] +=1
    prec = sum(tp)/sum(tp+fp)
    rec = sum(tp)/sum(tp+fn)
    with open(file_name, 'a') as f:
        print("F1 all", file=f)
        print(2*prec*rec/(prec+rec), file=f) 
        print("***", file=f)
        for i in range(4):
            print("F1 %s: " % labels[i], file=f)
            prec = tp[i]/(tp[i]+fp[i])
            rec = tp[i]/(tp[i]+fn[i])
            print(2*prec*rec/(prec+rec), file=f)
            print("****", file=f)
        tp.pop(2)
        fp.pop(2)
        fn.pop(2)
        print("F1 happy angry sad", file=f)
        prec = sum(tp)/sum(tp+fp)
        rec = sum(tp)/sum(tp+fn)
        f1= 2*prec*rec/(prec+rec)
        print(f1, file=f)
        return f1

#### Model params

In [122]:
vocabulary_size = len(tokenizer.word_counts) + 1

epochs = 10
embed_dim = 256
lstm_out = 128
batch_size = 128
drop_out = 0.3
loss_fct = 'binary_crossentropy'
activation_fct = 'softmax'
optimizer = "Adam-0.01"

parameters = """Epochs:%s\nEmbed_dim: %s\nLstm_out: %s\nBatch size: %s\nDrop_out: %s
Loss_fct: %s\nActivaion_fct: %s\nOptimizer: %s\n
""" %(str(epochs), str(embed_dim), str(lstm_out), str(batch_size), str(drop_out), loss_fct,
      activation_fct, optimizer)

#### Model train

In [145]:
model = Sequential()
model.add(Embedding(vocabulary_size, embed_dim,input_length = X_train.shape[1]))
model.add(Bidirectional(LSTM(lstm_out)))
model.add(Dropout(drop_out))
model.add(Dense(4,activation=activation_fct))
adam = optimizers.Adam(lr=0.01)
rmsprop = optimizers.RMSprop(lr=0.005)#, rho=0.9, epsilon=None, decay=0.0)
sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss = loss_fct, optimizer=adam, metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 189, 256)          4783104   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 1028      
Total params: 5,178,372
Trainable params: 5,178,372
Non-trainable params: 0
_________________________________________________________________
None


In [151]:
MODEL_CHECKPOINT = 'models/1_emb-bilstm-dr-dense'

In [150]:
model.fit(X_train, Y_train, epochs=epochs, verbose=1, batch_size=batch_size,
          validation_data=(X_dev, Y_dev),
          callbacks=[metrics,
                     EarlyStopping(),
                     ModelCheckpoint(MODEL_CHECKPOINT, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
                    ])

Train on 30160 samples, validate on 2755 samples
Epoch 1/10
0.8631578947368421
— val_f1: 0.875875 — val_precision: 0.888972 — val_recall 0.863158
Epoch 00000: val_acc improved from -inf to 0.93884, saving model to models/1_emb-bilstm-dr-dense
Epoch 2/10
0.8686025408348458
— val_f1: 0.880265 — val_precision: 0.892245 — val_recall 0.868603
Epoch 00001: val_acc improved from 0.93884 to 0.94093, saving model to models/1_emb-bilstm-dr-dense
Epoch 3/10
0.86497277676951
— val_f1: 0.871776 — val_precision: 0.878687 — val_recall 0.864973
Epoch 00002: val_acc did not improve


<keras.callbacks.History at 0x1f2f95d0ef0>

#### Model eval

In [66]:
results = model.evaluate(X_test, Y_test, callbacks=[metrics])

In [64]:
print(results)
print(model.metrics_names)

[0.25542368246338765, 0.9074695951970222]
['loss', 'acc']


In [153]:
f1 = print_metrics(model, X_test, Y_test, OPTS_PATH)
print(f1)

0.6213307240704501


#### Model save & load

In [154]:
MODEL_PATH = '%s-%s.json' % (MODEL_CHECKPOINT, str(f1))
MODEL_W_PATH = '%s-%s.h5' % (MODEL_CHECKPOINT, str(f1))
OPTS_PATH = 'models/1_opts-emb-bilstm-dr-dense'
with open(OPTS_PATH, 'w') as f:
    f.write(str(model.get_config()))
    f.write("\n%s" % parameters)

In [155]:
model.save(MODEL_PATH)

In [None]:
def load_model(model_path):
    return load_model(model_path)

In [120]:
loaded_model = load_model(MODEL_PATH)

In [59]:
print(np.shape(X_train))
print(np.shape(Y_train))

print(np.shape(X_dev))

print(np.shape(Y_dev))


print(np.shape(X_test))

print(np.shape(Y_test))


(30160, 163)
(30160, 4)
(2755, 82)
(2755, 4)
(5509, 189)
(5509, 4)


In [31]:
# y_true = [[0,0,0,1], [0,1,0,0], [1,0,0,0]]
# y_pred = [[0,0,0,1], [1,0,0,0], [1,0,0,0]]
# # print(f1_score(y_true, y_pred, average='macro'))
# print(f1_score(y_true, y_pred, average='micro')) 
# # print(f1_score(y_true, y_pred, average='weighted')) 
# # print(f1_score(y_true, y_pred, average='samples')) 

ValueError: multiclass-multioutput is not supported

In [None]:
import inspect as i
import sys
sys.stdout.write(i.getsource(model.evaluate))