In [None]:
!pip install emoji

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

import emoji
import re
import csv
from keras import optimizers
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout, Input, Reshape
from keras.models import Sequential, Model
from keras.preprocessing.sequence import pad_sequences

from keras.layers import GlobalAveragePooling2D
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import time
import tensorflow as tf
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt


In [None]:
# Root directory that holds the input datasets.
path_prefix = '../input/'

# The three EmoContext splits, each addressed relative to the input root.
TRAIN_FILE, DEV_FILE, TEST_FILE = (
    path_prefix + relative_path
    for relative_path in ('emocontext/train.txt',
                          'emocontext/dev.txt',
                          'emocontext/test.txt')
)

# Column names used by the preprocessing helpers.
TURNS_CONCAT = "turns"                     # column holding the three turns joined together
LABEL = ["label"]                          # label column, kept as a one-element list
TURNS_NAMES = ["turn1", "turn2", "turn3"]  # the individual conversation turns
def parse_file(file_path):
    """Read a tab-separated, UTF-8 encoded text file into a DataFrame.

    Args:
        file_path: Path of the file to read.

    Returns:
        pandas.DataFrame built by ``pd.read_csv`` with ``sep="\\t"``.
    """
    # newline='\n' makes Python hand the text to pandas without translating
    # line endings.
    with open(file_path, newline='\n', encoding='utf8') as csvfile:
        return pd.read_csv(csvfile, sep="\t")

# Parse the three splits in train / dev / test order.
train_data, dev_data, test_data = map(parse_file, (TRAIN_FILE, DEV_FILE, TEST_FILE))

In [None]:
def concatenate_turns(df, delim="fullstop"):
    """Join the three turn columns of every row into one lower-cased string.

    The turns are separated by ``delim`` with single spaces around it, and the
    result is stored in the ``TURNS_CONCAT`` column of ``df`` (which is
    modified in place).

    Args:
        df: DataFrame containing the columns named in ``TURNS_NAMES``.
        delim: Separator token placed between consecutive turns.

    Returns:
        The same DataFrame object, with the ``TURNS_CONCAT`` column set.
    """
    first, second, third = TURNS_NAMES[0], TURNS_NAMES[1], TURNS_NAMES[2]
    joined_rows = []
    for _, record in df.iterrows():
        pieces = (record[first], delim, record[second], delim, record[third])
        joined_rows.append(" ".join(str(piece) for piece in pieces).lower())
    df[TURNS_CONCAT] = pd.Series(joined_rows, index=df.index)
    return df
def emoticons_replace(df):
    """Replace emoji in every turn column by their textual names.

    Each turn is passed through ``emoji.demojize`` and every ``:name:`` token
    in the result is reduced to ``name`` with its underscores turned into
    spaces (e.g. ``:smiley:`` -> ``smiley``). Underscores outside of such
    tokens are left untouched.

    Args:
        df: DataFrame containing the columns named in ``TURNS_NAMES``.
            It is modified in place.

    Returns:
        The same DataFrame object.
    """
    # Raw string: "\w" inside a normal string literal is an invalid escape.
    emoji_token = re.compile(r":(\w*):")

    def _plain_name(match):
        # Drop the ":" delimiters and space out only the emoji name itself.
        return match.group(1).replace("_", " ")

    for index, row in df.iterrows():
        for column in TURNS_NAMES:
            text = emoji.demojize(row[column])
            df.at[index, column] = emoji_token.sub(_plain_name, text)
    return df

In [None]:
# Both helpers modify and return the frame they receive, so train/dev/test
# refer to the same objects as train_data/dev_data/test_data.
train, dev, test = (
    concatenate_turns(emoticons_replace(split_frame))
    for split_frame in (train_data, dev_data, test_data)
)

In [None]:
# Single TweetTokenizer instance (default arguments) used by tokenize_turns.
tweet_tokenizer = TweetTokenizer()
def tokenize_turns(df):
    """Tokenize the ``TURNS_CONCAT`` column with the module-level tweet tokenizer.

    The column is overwritten in place: every value is replaced by the token
    list returned by ``tweet_tokenizer.tokenize``.

    Args:
        df: DataFrame with a ``TURNS_CONCAT`` column of strings.

    Returns:
        The same DataFrame object.
    """
    token_lists = []
    for _, record in df.iterrows():
        token_lists.append(tweet_tokenizer.tokenize(record[TURNS_CONCAT]))
    df[TURNS_CONCAT] = pd.Series(token_lists, index=df.index)
    return df
# tokenize_turns returns the frame it was given, so the *_tok names refer to
# the same (now tokenized) frames as train/dev/test.
train_tok, dev_tok, test_tok = (
    tokenize_turns(split_frame) for split_frame in (train, dev, test)
)

In [None]:
from keras.preprocessing.text import Tokenizer
# Fit one Keras tokenizer on the token lists of the train, dev and test splits.
tokenizer = Tokenizer()
for split_frame in (train_tok, dev_tok, test_tok):
    tokenizer.fit_on_texts(split_frame[TURNS_CONCAT])

# VOCABULARY is the tokenizer's own word_index dict (not a copy), so the 'unk'
# entry added here is also seen by the tokenizer.
VOCABULARY = tokenizer.word_index
VOCABULARY['unk'] = 0
vocabulary_size = len(VOCABULARY) + 1

embed_dim = 200
# NOTE(review): the trailing numbers are presumably the longest sequence found
# in each split -- confirm.
max_sentence = 189 # 163,82,189

In [None]:
# Location of the GloVe Twitter vectors; the file name indicates the 200-dimensional variant.
glove_vectors_file = path_prefix+"glove-global-vectors-for-word-representation/glove.twitter.27B.200d.txt"

In [None]:
# Each line holds a token followed by its space-separated vector components;
# split once on the first space and parse the remainder as a float array.
with open(glove_vectors_file, "r", encoding="utf8") as glove:
    glove_wordmap = {
        token: np.fromstring(components, sep=" ")
        for token, components in (line.split(" ", 1) for line in glove)
    }

In [None]:
# One row per vocabulary index; rows stay all-zero for words GloVe does not know.
embedding_matrix = np.zeros((vocabulary_size, embed_dim))
for token, row_index in VOCABULARY.items():
    pretrained = glove_wordmap.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row_index] = pretrained

In [None]:
### One-hot layout of the label matrices built below:
### angry: [1 0 0 0]
### happy: [0 1 0 0]
### others: [0 0 1 0]
### sad: [0 0 0 1]
# NOTE(review): this layout assumes every split contains all four labels, so
# that pd.get_dummies yields the same four columns for each -- confirm.

labels = {0: 'angry',
          1: 'happy',
          2: 'others',
          3: 'sad'}

# Token lists -> integer sequences, padded to max_sentence.
X_train = pad_sequences(tokenizer.texts_to_sequences(
                            train_tok[TURNS_CONCAT]),
                        maxlen=max_sentence)
X_dev = pad_sequences(tokenizer.texts_to_sequences(dev_tok[TURNS_CONCAT]), maxlen=max_sentence)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_tok[TURNS_CONCAT]), maxlen=max_sentence)

# DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values returns the
# same ndarray and is available in old and new pandas alike.
Y_train = pd.get_dummies(train[LABEL]).values
Y_dev = pd.get_dummies(dev[LABEL]).values
Y_test = pd.get_dummies(test[LABEL]).values

In [None]:
class Metrics(Callback):
    """Keras callback that reports F1 scores after each epoch and stops early.

    After every epoch the model is scored on the validation data and on the
    extra evaluation set passed to the constructor, the loss/F1 history is
    plotted, and training is asked to stop once the F1 on the extra set has
    decreased for more than ``tolerance`` consecutive epochs.
    """

    def __init__(self, test_X, test_Y, tolerance):
        """Store the extra evaluation set and the early-stopping tolerance.

        Args:
            test_X: Model inputs of the extra evaluation set.
            test_Y: Targets matching ``test_X`` (arg-max gives the class).
            tolerance: Number of consecutive epochs with a decreasing F1 on
                the extra set that is tolerated before training is stopped.
        """
        # Let the Keras base class initialise its own attributes first.
        super().__init__()
        self.test_X = test_X
        self.test_Y = test_Y
        self.max_f1 = 0
        self.f1_prev = 0
        self.tolerance = tolerance
        self.decreasing_times = 0

    def on_train_begin(self, logs=None):
        """Reset the per-run history containers and open a figure."""
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []
        self.i = 0
        self.x = []

        self.f1s_test = []
        self.f1s_val = []
        self.losses = []
        self.val_losses = []

        self.logs = []
        self.fig = plt.figure()

    def plot_losses(self, f1_val,f1_test, logs):
        """Append the epoch's numbers to the history and redraw both plots.

        Args:
            f1_val: F1 obtained on the validation data this epoch.
            f1_test: F1 obtained on the extra evaluation set this epoch.
            logs: Keras logs dict; ``loss`` and ``val_loss`` are read from it.
        """
        self.logs.append(logs)
        self.x.append(self.i)
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        self.f1s_test.append(f1_test)
        self.f1s_val.append(f1_val)
        self.i += 1

        # Upper panel: training and validation loss.
        plt.subplot(2,1,1)
        plt.plot(self.x, self.losses, label="train_loss")
        plt.plot(self.x, self.val_losses, label="val_loss")
        plt.legend()
        # Lower panel: F1 curves; the legend shows the best value so far.
        plt.subplot(2,1,2)
        plt.plot(self.x, self.f1s_val, label="f1_val " + '{:.4f}'.format(max(self.f1s_val)))
        plt.plot(self.x, self.f1s_test, label="f1_test " + '{:.4f}'.format(max(self.f1s_test)))

        plt.legend(loc=0)

        plt.show()

    def on_epoch_end(self, epoch, logs=None):
        """Score validation and extra sets, plot, and update early stopping."""
        logs = logs or {}

        # NOTE(review): relies on Keras filling in self.validation_data with
        # [inputs, targets, ...]; newer Keras releases may leave it unset --
        # confirm for the installed version.
        val_probs = np.asarray(self.model.predict(self.validation_data[0]))
        val_predict = val_probs.round()
        val_targ = self.validation_data[1]

        _val_f1 = f1_score(val_targ, val_predict, average='micro')
        _val_recall = recall_score(val_targ, val_predict, average='micro')
        _val_precision = precision_score(val_targ, val_predict, average='micro')

        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))

        test_probs = np.asarray(self.model.predict(self.test_X))

        # print_metrics_predicted takes the arg-max of every row, so it must
        # see the raw scores: a rounded row in which no class exceeds 0.5 is
        # all zeros and its arg-max would always be class 0.
        f1 = print_metrics_predicted(val_probs, val_targ)
        print("#############\nF1 test:\n#############")
        f_test = print_metrics_predicted(test_probs, self.test_Y)
        self.plot_losses(f1, f_test, logs)

        if f_test > self.max_f1:
            self.max_f1 = f_test
        # Count consecutive epochs in which the extra-set F1 went down.
        if f_test < self.f1_prev:
            self.decreasing_times += 1
            if self.decreasing_times > self.tolerance:
                self.model.stop_training = True
        else:
            self.decreasing_times = 0
        self.f1_prev = f_test
 
# metrics = Metrics()

def print_metrics_predicted(predicts,Y,filename=None):
    """Print micro and per-class F1 scores and return the F1 without 'others'.

    The predicted and true class of each sample are the arg-max of the
    corresponding rows of ``predicts`` and ``Y``. The function prints the
    micro F1 over all four classes, the F1 of each class (named through the
    module-level ``labels`` dict) and finally the micro F1 computed without
    class index 2.

    Args:
        predicts: Iterable of per-class score vectors.
        Y: Targets, indexable by the position of each prediction.
        filename: Not used.

    Returns:
        The micro-averaged F1 over class indices 0, 1 and 3.
    """
    eps = np.finfo(float).eps
    class_ids = range(4)
    excluded_id = 2

    def f_measure(n_tp, n_fp, n_fn):
        # eps keeps every division defined when a count is zero.
        precision = n_tp/(n_tp + n_fp + eps)
        recall = n_tp/(n_tp + n_fn + eps)
        return 2*precision*recall/(precision + recall + eps)

    hits = [0 for _ in class_ids]
    false_pos = [0 for _ in class_ids]
    false_neg = [0 for _ in class_ids]
    for position, scores in enumerate(predicts):
        guessed = np.argmax(scores)
        actual = np.argmax(Y[position])
        if guessed != actual:
            false_pos[guessed] += 1
            false_neg[actual] += 1
        else:
            hits[guessed] += 1

    print("F1 all")
    print(f_measure(sum(hits), sum(false_pos), sum(false_neg)))
    print("***")
    for cls in class_ids:
        print("F1 %s: " % labels[cls])
        print(f_measure(hits[cls], false_pos[cls], false_neg[cls]))
        print("****")

    kept = [cls for cls in class_ids if cls != excluded_id]
    print("F1 happy angry sad")
    f1 = f_measure(sum(hits[cls] for cls in kept),
                   sum(false_pos[cls] for cls in kept),
                   sum(false_neg[cls] for cls in kept))
    print(f1)
    return f1


In [None]:
class Attention(tf.keras.Model):
    """Additive attention that pools a sequence of features into one vector."""

    def __init__(self, units):
        """Create the three dense projections.

        Args:
            units: Output size of the two projections that are summed before
                the tanh non-linearity.
        """
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Weight ``features`` along axis 1 and sum them.

        Args:
            features: Sequence of feature vectors
                (assumes shape (batch, time, dim) -- TODO confirm).
            hidden: State the features are scored against; an axis is inserted
                at position 1 so it broadcasts over axis 1 of ``features``.

        Returns:
            Tuple ``(context_vector, attention_weights)``: the weighted sum of
            ``features`` over axis 1, and the softmax weights used for it.
        """
        query = tf.expand_dims(hidden, 1)
        energies = self.V(tf.nn.tanh(self.W1(features) + self.W2(query)))
        attention_weights = tf.nn.softmax(energies, axis=1)
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

In [None]:
# NOTE(review): create_model() and the model.fit(...) call in this notebook
# use their own literal values, so these settings look informational only --
# confirm before relying on them.

# Training schedule
batch_size = 128
epochs = 10

# Network shape and regularisation
lstm_out = 128
drop_out = 0.3
activation_fct = 'softmax'

# Objective and optimizer description
loss_fct = 'binary_crossentropy'
optimizer = "Adam-0.01"

In [None]:
def create_model():
    """Build and compile the bidirectional-LSTM classifier.

    Architecture: embedding (initialised from ``embedding_matrix``, trainable)
    -> Bidirectional LSTM(256) -> Dropout(0.4) -> Dense(128, relu)
    -> Dropout(0.2) -> Dense(64, relu) -> Dropout(0.2) -> Dense(4, softmax).

    Returns:
        The compiled Keras ``Model`` (RMSprop with lr=0.005,
        binary cross-entropy loss, accuracy metric).
    """
    input_layer = Input(shape=(max_sentence,), dtype='int32')

    embedding_layer = Embedding(vocabulary_size,
                                embed_dim,
                                weights=[embedding_matrix],
                                input_length=max_sentence,
                                trainable=True)(input_layer)
    bi_lstm = Bidirectional(LSTM(256))(embedding_layer)
    dropout = Dropout(0.4)(bi_lstm)
    dense = Dense(128,activation='relu')(dropout)
    dropout = Dropout(0.2)(dense)
    dense = Dense(64,activation='relu')(dropout)
    dropout = Dropout(0.2)(dense)
    dense = Dense(4,activation='softmax')(dropout)

    # Only the optimizer that is actually passed to compile() is created.
    rmsprop = optimizers.RMSprop(lr=0.005)

    model = Model(inputs=[input_layer], outputs=dense)
    # NOTE(review): binary cross-entropy is paired here with a 4-way softmax
    # output; with this loss Keras may report per-output binary accuracy for
    # 'accuracy' -- confirm this pairing is intended.
    model.compile(loss = 'binary_crossentropy', optimizer=rmsprop, metrics = ['accuracy'])
    return model

In [None]:
# Build the compiled network and print its layer-by-layer summary.
model = create_model()
model.summary()

In [None]:
import keras
import IPython
# Write a diagram of the model (with tensor shapes, without layer names) to a
# PNG file and display that file as the cell output.
# Disabled alternative that renders the diagram as SVG:
# SVG(model_to_dot(model).create(prog='dot', format='svg'))
keras.utils.plot_model(model, to_file='test_keras_plot_model.png', show_shapes=True,show_layer_names=False)
IPython.display.Image('test_keras_plot_model.png')

In [None]:
# The callback scores the test split after every epoch and may stop training
# early (tolerance of 3 consecutive decreases).
f1_monitor = Metrics(X_test, Y_test, 3)
model.fit(
    X_train, Y_train,
    batch_size=128,
    epochs=50,
    verbose=1,
    validation_data=(X_dev, Y_dev),
    callbacks=[f1_monitor],
)