### Imports

In [20]:
import csv
import emoji
from keras import optimizers
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
import re
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import time
from matplotlib import pyplot as plt


In [21]:
# try:
#     from nltk.corpus import words
# except LookupError:
#     import nltk
#     print("Downloading nltk words...")
#     nltk.download("words")
#     from nltk.corpus import words

### Utils

#### Global Constants

In [22]:
# Dataset splits; each path is read by parse_file() as a tab-separated file.
TRAIN_FILE = 'data/train.txt'
DEV_FILE = 'data/dev.txt'
TEST_FILE = 'data/test.txt'

# Columns holding the three conversation turns joined by concatenate_turns().
TURNS_NAMES = ["turn1", "turn2", "turn3"]
# Label column, kept as a one-element list so frame[LABEL] selects a DataFrame
# (this is what gets passed to pd.get_dummies further down).
LABEL = ["label"]
# Name of the derived column that stores the lower-cased, joined turns.
CONCATENATED_TURNS = "turns"

##### Emoticons map

In [23]:
# Maps an emoji or ASCII emoticon to a coarse sentiment token (' emoticon',
# ' happyemoticon', ' sademoticon' or ' angryemoticon'). Every value starts
# with a space, presumably so a plain string replacement keeps the token
# separated from the preceding word.
# NOTE(review): no visible cell references this map -- the active preprocessing
# path is emoticons_replace() via emoji.demojize. Confirm it is still needed.
# NOTE(review): '😡' is mapped to ' sademoticon' although it is usually read as
# anger -- confirm this is not meant to be ' angryemoticon'.
EMOTICONS_MAP = {
    '😘': ' emoticon',
    '😍': ' happyemoticon',
    '😁': ' happyemoticon',
    '😭': ' sademoticon',
    '😑': ' sademoticon',
    '😻': ' happyemoticon',
    '😂': ' happyemoticon',
    '👍': ' emoticon',
    '😀': ' happyemoticon',
    ':D': ' happyemoticon',
    '🙂':  ' happyemoticon',
    '<3': ' happyemoticon',
    '😓' : ' sademoticon',
    '😒' : ' angryemoticon',
    '😈' : ' emoticon',
    '👿' : ' angryemoticon',
    '🖑' : ' happyemoticon',
    '😾' : ' emoticon',
    '😠' : ' angryemoticon',
    '👻' : ' emoticon',
    ':(' : ' sademoticon',
    ':)' : ' happyemoticon',
    'xD' : ' happyemoticon',
    '💔' : ' sademoticon',
    '😥' : ' emoticon',
    '😞' : ' sademoticon',
    '😤' : ' angryemoticon',
    '😃' : ' happyemoticon',
    '😦' : ' sademoticon',
    ':3' : ' emoticon',
    '😼' : ' emoticon',
    '😏' : ' happyemoticon',
    '😱' : ' sademoticon',
    '😬' : ' sademoticon',
    '🙁' : ' sademoticon',
    '</3' : ' sademoticon',
    '😺' : ' happyemoticon',
    '😣' : ' angryemoticon',
    '😢' : ' sademoticon',
    '😆' : ' happyemoticon',
    '😄' : ' happyemoticon',
    '😅' : ' happyemoticon',
    ':-)' : ' happyemoticon',
    '😊' : ' happyemoticon',
    '😕' : ' sademoticon',
    '😽' : ' happyemoticon',
    '🙀' : ' angryemoticon',
    '🤣' : ' happyemoticon',
    '🤐' : ' emoticon',
    '😡' : ' sademoticon',
    '👌' : ' happyemoticon', 
    '😮' : ' emoticon',
    '❤️' : ' happyemoticon',
    '🙄' : ' happyemoticon',
    '😿' : ' sademoticon',
    '😉' : ' happyemoticon',
    '😋' : ' happyemoticon',
    '😐' : ' emoticon',
    '😹' : ' happyemoticon',
    '😴' : ' sademoticon',
    '💤' : ' emoticon',
    '😜' : ' happyemoticon',
    '😇' : ' happyemoticon',
    '😔' : ' sademoticon',
    '😩' : ' sademoticon',
    '❤' : ' happyemoticon',
    '😲' : ' emoticon',
    '😫' : ' sademoticon',
    '😳' : ' sademoticon',
    '😰' : ' sademoticon',
}
# Sanity check: number of distinct keys in the map.
print(len(EMOTICONS_MAP.keys()))

70


#### print_model

In [24]:
def print_model(model_summary, parameters, accuracy, file_name="models/experiments.txt"):
    """Append one experiment record to ``file_name``.

    The record is a header line followed by the model summary, a ``=`` rule,
    the parameter description, a ``-`` rule and the accuracy, one item per
    line. The file is opened in append mode, so earlier records are kept.
    """
    heavy_rule = "=============================================="
    light_rule = "----------------------------------------------"
    with open(file_name, "a") as log_file:
        sections = [
            "===Experiment===",
            str(model_summary),
            heavy_rule,
            str(parameters),
            light_rule,
            str(accuracy),
        ]
        # Every section, including the last one, is newline-terminated.
        log_file.write("\n".join(sections) + "\n")

#### Data manipulation

In [25]:
def parse_file(file_path):
    """Read a tab-separated, UTF-8 encoded text file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file; its first line is used as the header
        (``pd.read_csv`` default).

    Returns
    -------
    pandas.DataFrame
        One row per data line, one column per tab-separated field.
    """
    # newline='\n' hands line endings to pandas untranslated instead of
    # applying Python's universal-newline handling first.
    with open(file_path, newline='\n', encoding='utf8') as csvfile:
        return pd.read_csv(csvfile, sep="\t")

##### Load data

In [26]:
# Read the three splits in train / dev / test order.
train_data, dev_data, test_data = [
    parse_file(split_path) for split_path in (TRAIN_FILE, DEV_FILE, TEST_FILE)
]

##### Preprocess

In [27]:
def concatenate_turns(df, delim="fullstop"):
    """Join the three turn columns of every row into one lower-cased string.

    The turns are separated by ``" <delim> "``. The result is written to the
    ``CONCATENATED_TURNS`` column of ``df`` (the frame is modified in place)
    and the same frame is returned.
    """
    separator = " %s " % delim
    joined_rows = []
    for _, record in df.iterrows():
        pieces = [str(record[column]) for column in TURNS_NAMES[:3]]
        # Lower-casing happens after joining, so the delimiter is lowered too.
        joined_rows.append(separator.join(pieces).lower())
    df[CONCATENATED_TURNS] = pd.Series(joined_rows, index=df.index)
    return df

In [28]:
# Matches a demojized alias such as ":red_heart:" and captures the bare name.
_EMOJI_ALIAS = re.compile(r":(\w*):")


def _emoji_to_words(text):
    """Return ``text`` with every emoji spelled out as plain words."""
    aliased = emoji.demojize(text)
    # Strip the surrounding colons and turn the underscores of the alias --
    # and only of the alias -- into spaces (":red_heart:" -> "red heart").
    # Underscores elsewhere in the text are left untouched.
    # NOTE(review): two adjacent emoji end up glued together
    # ("...facegrinning..."); confirm whether a separator should be inserted.
    return _EMOJI_ALIAS.sub(lambda match: match.group(1).replace("_", " "), aliased)


def emoticons_replace(df):
    """Replace emoji in the ``CONCATENATED_TURNS`` column by their word names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already has the ``CONCATENATED_TURNS`` column (see
        ``concatenate_turns``). The column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    df[CONCATENATED_TURNS] = df[CONCATENATED_TURNS].map(_emoji_to_words)
    return df

In [29]:
# Preview one preprocessed dev example. Works on a copy of the raw dev frame so
# that the cell runs on a fresh kernel (``dev`` is only defined further down)
# and does not modify ``dev_data``.
print(emoticons_replace(concatenate_turns(dev_data.copy()))[CONCATENATED_TURNS][2])

today i'm very happy fullstop and i'm happy for you red heart fullstop i will be marry


In [30]:
tweet_tokenizer = TweetTokenizer()
def tokenize_turns(df):
    """Tokenize the ``CONCATENATED_TURNS`` column with the shared TweetTokenizer.

    Each cell of the column is replaced by its list of tokens. ``df`` is
    modified in place and returned.
    """
    token_lists = []
    for _, record in df.iterrows():
        token_lists.append(tweet_tokenizer.tokenize(record[CONCATENATED_TURNS]))
    df[CONCATENATED_TURNS] = pd.Series(token_lists, index=df.index)
    return df

In [31]:
# Join the turns and spell out emoji for every split. Both helpers modify
# their argument in place, so train/dev/test are the same objects as
# train_data/dev_data/test_data.
train, dev, test = [
    emoticons_replace(concatenate_turns(split_frame))
    for split_frame in (train_data, dev_data, test_data)
]

In [32]:
# Ordinal encoding
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# print(le.fit(train[LABEL]))
# print(le.classes_)
# print(train[LABEL])
# print(le.transform(train[LABEL]))
# print(train[LABEL])

# One Hot encoding

In [33]:
# Sequence length used for pad_sequences(maxlen=...) on every split.
# NOTE(review): the trailing "163,82,189" presumably records the longest
# sequence seen in train, dev and test respectively -- confirm.
max_sentence = 189 # 163,82,189
# One-hot layout of the label vectors:
### angry: [1 0 0 0]
### happy: [0 1 0 0]
### others: [0 0 1 0]
### sad: [0 0 0 1]
# Class index -> class name; print_metrics_predicted() looks names up here.
labels = {0: 'angry',
          1: 'happy',
          2: 'others',
          3: 'sad'}

In [34]:
tokenizer = Tokenizer()
# pd.concat replaces the chained Series.append calls (Series.append was
# removed in pandas 2.0); the resulting Series has the same content and order.
all_text = pd.concat([train[CONCATENATED_TURNS],
                      dev[CONCATENATED_TURNS],
                      test[CONCATENATED_TURNS]])

# TODO
# alternative: fit_on_english corpora: 
# tokenizer.fit_on_texts(words.words())
# NOTE(review): the vocabulary is fitted on train + dev + test, so dev/test
# words are never out-of-vocabulary -- confirm this is intended.
tokenizer.fit_on_texts(all_text)
X_train = pad_sequences(tokenizer.texts_to_sequences(train[CONCATENATED_TURNS]),
                        maxlen=max_sentence)
X_dev = pad_sequences(tokenizer.texts_to_sequences(dev[CONCATENATED_TURNS]),
                      maxlen=max_sentence)
X_test = pad_sequences(tokenizer.texts_to_sequences(test[CONCATENATED_TURNS]),
                       maxlen=max_sentence)
### angry: [1 0 0 0]
### happy: [0 1 0 0]
### others: [0 0 1 0]
### sad: [0 0 0 1]
# .values replaces DataFrame.as_matrix() (removed in pandas 1.0). The explicit
# uint8 cast keeps the historical 0/1 integer dtype on pandas versions where
# get_dummies returns booleans.
# NOTE(review): each split is encoded independently, so the column layout
# above only holds if every split contains all four labels -- confirm.
Y_train = pd.get_dummies(train[LABEL]).values.astype(np.uint8)
# for i, t in enumerate(train[LABEL].iterrows()):
#     if t[1]['label']=='others':
#         print(Y_train[i])
#         break
Y_dev = pd.get_dummies(dev[LABEL]).values.astype(np.uint8)
Y_test = pd.get_dummies(test[LABEL]).values.astype(np.uint8)

In [36]:
# print(dir(tokenizer))

# for tok in tokenizer.word_count:
#     if tok[1] == 1:
#         print(tok)

#### Model

#### Model eval metrics

In [45]:
class Metrics(Callback):
    """Per-epoch F1 reporting, best-model checkpointing and early stopping.

    After every epoch the callback predicts on the validation data given to
    ``fit`` and on the extra ``test_X``/``test_Y`` pair, prints sklearn
    micro-averaged scores and the per-class report of
    ``print_metrics_predicted``, plots the loss and F1 curves, saves the model
    to ``MODEL_CHECKPOINT`` whenever the test F1 reaches a new maximum, and
    stops training once the test F1 has dropped more than ``tolerance`` epochs
    in a row.

    Parameters
    ----------
    test_X, test_Y
        Inputs and one-hot targets scored in addition to the validation data.
    tolerance : int
        Number of consecutive test-F1 decreases that is still accepted.

    NOTE(review): checkpointing and early stopping are driven by the *test*
    F1, not the validation F1 -- confirm that selecting on the test split is
    intended.
    NOTE(review): ``self.validation_data`` is expected to be filled in by
    Keras; newer Keras releases reportedly no longer set it -- confirm before
    upgrading.
    """

    def __init__(self, test_X, test_Y, tolerance):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.test_X = test_X
        self.test_Y = test_Y
        self.max_f1 = 0
        self.f1_prev = 0
        self.tolerance = tolerance
        self.decreasing_times = 0

    def on_train_begin(self, logs=None):
        """Reset the per-run history buffers and open a figure."""
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []
        self.i = 0
        self.x = []

        self.f1s_test = []
        self.f1s_val = []
        self.losses = []
        self.val_losses = []

        self.logs = []
        self.fig = plt.figure()

    def plot_losses(self, f1_val, f1_test, logs):
        """Record this epoch's losses / F1 values and redraw both curves."""
        self.logs.append(logs)
        self.x.append(self.i)
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        self.f1s_test.append(f1_test)
        self.f1s_val.append(f1_val)
        self.i += 1

#         clear_output(wait=True)

        # Top panel: losses; bottom panel: F1, with the latest value in the legend.
        plt.subplot(2,1,1)
        plt.plot(self.x, self.losses, label="train_loss")
        plt.plot(self.x, self.val_losses, label="val_loss")
        plt.legend()
        plt.subplot(2,1,2)
        plt.plot(self.x, self.f1s_val, label="f1_val " + str(f1_val))
        plt.plot(self.x, self.f1s_test, label="f1_test " + str(f1_test))

        plt.legend(loc=0)

        plt.show()

    def on_epoch_end(self, epoch, logs=None):
        """Score validation and test data, checkpoint, and maybe stop training."""
        logs = logs if logs is not None else {}

        val_scores = np.asarray(self.model.predict(self.validation_data[0]))
        # The sklearn metrics below work on 0/1 indicator rows.
        val_predict = val_scores.round()
        val_targ = self.validation_data[1]

        _val_f1 = f1_score(val_targ, val_predict, average='micro')
        _val_recall = recall_score(val_targ, val_predict, average='micro')
        _val_precision = precision_score(val_targ, val_predict, average='micro')

        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))

        test_scores = np.asarray(self.model.predict(self.test_X))
        # print_metrics_predicted takes the argmax of each row, so it must see
        # the raw scores: a rounded row whose best score is <= 0.5 is all
        # zeros and would always be counted as class 0.
        f1 = print_metrics_predicted(val_scores, val_targ, MODEL_CHECKPOINT+"-results")
        print("#############\nF1 test:\n#############")
        # Same results file as above: the test report overwrites the validation one.
        f_test = print_metrics_predicted(test_scores, self.test_Y, MODEL_CHECKPOINT+"-results")
        self.plot_losses(f1, f_test, logs)

        if f_test > self.max_f1:
            self.max_f1 = f_test
            self.model.save(MODEL_CHECKPOINT)

        if f_test < self.f1_prev:
            self.decreasing_times += 1
            if self.decreasing_times > self.tolerance:
                self.model.stop_training = True
        else:
            # Any non-decreasing epoch resets the patience counter.
            self.decreasing_times = 0
        self.f1_prev = f_test
 
# metrics = Metrics()

def _micro_f1(tp, fp, fn):
    """Micro-averaged F1 for the classes whose tp/fp/fn counts are given."""
    eps = np.finfo(float).eps  # keeps every ratio defined when a count is zero
    hits = sum(tp)
    prec = hits / (hits + sum(fp) + eps)
    rec = hits / (hits + sum(fn) + eps)
    return 2 * prec * rec / (prec + rec + eps)


def print_metrics_predicted(predicts, Y, filename, class_names=None, excluded_class=2):
    """Print and save F1 scores for single-label predictions.

    The predicted and true class of each sample is the argmax of its row.
    Three kinds of scores are printed and written to ``filename`` (which is
    overwritten): the micro F1 over all classes, the F1 of every class, and
    the micro F1 over all classes except ``excluded_class``.

    Parameters
    ----------
    predicts : sequence of score rows
        One row of per-class scores per sample.
    Y : sequence of target rows
        One-hot (or score) targets, indexed like ``predicts``.
    filename : str
        Path of the report file.
    class_names : mapping or sequence, optional
        ``class_names[i]`` is the display name of class ``i`` and its length
        sets the number of classes. Defaults to the module-level ``labels``.
    excluded_class : int or None, optional
        Class index left out of the returned score (default 2, "others" in
        ``labels``); ``None`` keeps every class. The heading of that score in
        the report is always the literal "F1 happy angry sad".

    Returns
    -------
    float
        Micro F1 over the classes other than ``excluded_class``.
    """
    names = labels if class_names is None else class_names
    n_classes = len(names)

    tp = [0] * n_classes
    fp = [0] * n_classes
    fn = [0] * n_classes
    for i, pred in enumerate(predicts):
        p = np.argmax(pred)
        y = np.argmax(Y[i])
        if p == y:
            tp[p] += 1
        else:
            # A miss is a false positive for the predicted class and a false
            # negative for the true one.
            fp[p] += 1
            fn[y] += 1

    with open(filename, "w") as f:
        print("F1 all")
        f1_all = _micro_f1(tp, fp, fn)
        print(f1_all)
        f.write(str(f1_all))
        print("***")
        for i in range(n_classes):
            print("F1 %s: " % names[i])
            f.write("\n%s: " % names[i])
            f1 = _micro_f1(tp[i:i + 1], fp[i:i + 1], fn[i:i + 1])
            print(f1)
            f.write(str(f1))
            print("****")
        kept = [i for i in range(n_classes) if i != excluded_class]
        print("F1 happy angry sad")
        f.write("\nF1 happy angry sad: ")
        f1 = _micro_f1([tp[i] for i in kept],
                       [fp[i] for i in kept],
                       [fn[i] for i in kept])
        f.write(str(f1))
        print(f1)
        return f1


#### Model params

In [38]:
# One embedding row per word seen by the tokenizer, plus one extra index
# (presumably the padding index 0 -- confirm against the Keras Tokenizer docs).
vocabulary_size = 1 + len(tokenizer.word_counts)

# Hyper-parameters of the experiment.
epochs = 10
embed_dim = 256
lstm_out = 128
batch_size = 128
drop_out = 0.3
loss_fct = 'categorical_crossentropy'
activation_fct = 'softmax'
optimizer = "Adam-0.01"  # free-text label that is only recorded in `parameters`

# Human-readable summary of the settings above, written next to saved models.
parameters = (
    "Epochs:%s\n"
    "Embed_dim: %s\n"
    "Lstm_out: %s\n"
    "Batch size: %s\n"
    "Drop_out: %s\n"
    "Loss_fct: %s\n"
    "Activaion_fct: %s\n"
    "Optimizer: %s\n"
    "\n"
) % (epochs, embed_dim, lstm_out, batch_size, drop_out,
     loss_fct, activation_fct, optimizer)

#### Model train

In [39]:
# Embedding -> bidirectional LSTM -> dropout -> 4-way softmax classifier.
model = Sequential()
model.add(Embedding(vocabulary_size, embed_dim,input_length = X_train.shape[1]))
model.add(Bidirectional(LSTM(lstm_out)))
model.add(Dropout(drop_out))
model.add(Dense(4,activation=activation_fct))
# Only `adam` is passed to compile(); the other two optimizers are kept as
# ready-made alternatives.
adam = optimizers.Adam(lr=0.01)
rmsprop = optimizers.RMSprop(lr=0.005)#, rho=0.9, epsilon=None, decay=0.0)
sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss = loss_fct, optimizer=adam, metrics = ['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 189, 256)          4701184   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 1028      
Total params: 5,096,452
Trainable params: 5,096,452
Non-trainable params: 0
_________________________________________________________________
None


In [40]:
# Path of the best model saved by Metrics.on_epoch_end; the same prefix plus
# "-results" is used for the per-epoch metrics report.
MODEL_CHECKPOINT = 'models/1_emb-bilstm-emoji'

In [46]:
# Train with the dev split as validation data. The Metrics callback also
# scores the test split after each epoch and stops training once the test F1
# has dropped more than 3 epochs in a row.
# NOTE(review): epochs is hard-coded to 50 here, while the params cell sets
# `epochs = 10` and records that value in `parameters` -- confirm which one
# is intended.
model.fit(X_train, Y_train, epochs=50, verbose=1, batch_size=batch_size,
          validation_data=(X_dev, Y_dev),
          callbacks=[Metrics(X_test, Y_test, 3)
                    ])

Train on 30160 samples, validate on 2755 samples
Epoch 1/50

KeyboardInterrupt: 

<matplotlib.figure.Figure at 0x28555436b00>

#### Model eval

In [18]:
# results = model.evaluate(X_test, Y_test, callbacks=[metrics])

In [19]:
# print(results)
# print(model.metrics_names)

In [153]:
# f1 = print_metrics(model, X_test, Y_test, OPTS_PATH)
# print(f1)

0.6213307240704501


#### Model save & load

In [10]:
# File that receives the model config and the `parameters` summary.
OPTS_PATH = 'models/1_opts-emb-bilstm-dr-dense'

In [9]:
# File names for the saved model, tagged with the achieved F1 score.
# NOTE(review): `f1` is only assigned in a later cell, so this cell raises
# NameError when the notebook is run top to bottom (see the recorded output).
# NOTE(review): MODEL_PATH ends in ".json" but is passed to model.save(),
# which presumably does not write JSON -- confirm the intended extension.
MODEL_PATH = '%s-%s.json' % (MODEL_CHECKPOINT, str(f1))
MODEL_W_PATH = '%s-%s.h5' % (MODEL_CHECKPOINT, str(f1))

NameError: name 'f1' is not defined

In [None]:
# Record the model configuration followed by the hyper-parameter summary;
# the file is overwritten on every run.
with open(OPTS_PATH, 'w') as f:
    f.write(str(model.get_config()))
    f.write("\n%s" % parameters)

In [155]:
# Persist the trained model under the F1-tagged file name.
model.save(MODEL_PATH)

In [None]:
def load_saved_model(model_path):
    """Return the Keras model stored at ``model_path`` (thin wrapper around ``load_model``)."""
    return load_model(model_path)

In [120]:
# Reload the model that was just saved under MODEL_PATH.
loaded_model = load_model(MODEL_PATH)

In [6]:
# Reload a model from a fixed file name (overwrites `loaded_model`).
# NOTE(review): this hard-coded name does not start with the current
# MODEL_CHECKPOINT prefix -- presumably a file from an earlier experiment;
# confirm it still exists.
loaded_model = load_model('models/1_emb-bilstm-dr-dense-0.6213307240704501.json')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.




In [25]:
# Score the reloaded model on the test split.
# NOTE(review): `print_metrics` is not defined in any visible cell (only
# `print_metrics_predicted(predicts, Y, filename)` is) -- presumably left over
# from a deleted cell; this fails on a fresh kernel.
f1 = print_metrics(loaded_model, X_test, Y_test, OPTS_PATH)
print(f1)

0.6213307240704501


In [59]:
# Shapes of the padded inputs and one-hot targets, one per line, in
# train / dev / test order.
print(*(np.shape(array) for array in (X_train, Y_train,
                                      X_dev, Y_dev,
                                      X_test, Y_test)),
      sep="\n")


(30160, 163)
(30160, 4)
(2755, 82)
(2755, 4)
(5509, 189)
(5509, 4)


In [31]:
# y_true = [[0,0,0,1], [0,1,0,0], [1,0,0,0]]
# y_pred = [[0,0,0,1], [1,0,0,0], [1,0,0,0]]
# # print(f1_score(y_true, y_pred, average='macro'))
# print(f1_score(y_true, y_pred, average='micro')) 
# # print(f1_score(y_true, y_pred, average='weighted')) 
# # print(f1_score(y_true, y_pred, average='samples')) 

ValueError: multiclass-multioutput is not supported

In [None]:
# Debugging aid: dump the source code of model.evaluate to stdout.
# NOTE(review): these imports live outside the notebook's import cell and the
# alias `i` is a module-level single-letter name -- consider removing this
# cell once it is no longer needed.
import inspect as i
import sys
sys.stdout.write(i.getsource(model.evaluate))