# Install required package and Download Data

In [0]:
! pip install emoji
! wget -q https://storage.googleapis.com/allianz-course/data/glove.6B.50d.txt

# Import packages

In [0]:
import csv

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.metrics import accuracy_score
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation, Embedding
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing import sequence

try:
    # Current home of train_test_split (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy module, removed in scikit-learn 0.20; kept for very old installs.
    from sklearn.cross_validation import train_test_split

np.random.seed(0)

# **Prepare Input Data**

## Read emojify data in csv

In [0]:
# Load the emojify dataset from its hosted CSV into a DataFrame.
# NOTE(review): the name `csv` rebinds the standard-library `csv` module
# imported in the imports cell; after this cell `csv` is the DataFrame.
csv_path  = 'https://storage.googleapis.com/allianz-course/data/emojify_data.csv'
csv = pd.read_csv(csv_path)

In [0]:
csv.head(5)

Based on the definition below, each score label maps to one emoji:

![alt text](https://storage.googleapis.com/allianz-course/data/emoji.jpg =340x380)

In [0]:
# Maps a class label (as a string key) to the emoji text handed to emoji.emojize:
# label "0" is given as raw unicode code points, the others as ":alias:" names.
emoji_dictionary = {"0": "\u2764\uFE0F",    # :heart: prints a black instead of red heart depending on the font
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}

In [0]:
def label_to_emoji(label):
    """
    Converts a label (int or string) into the corresponding emoji code (string) ready to be printed.

    Arguments:
    label -- class id; str(label) must be a key of emoji_dictionary

    Returns:
    the emoji as a unicode string

    Raises:
    KeyError -- if str(label) is not a key of emoji_dictionary
    """
    # Look the label up outside the try block so a bad label surfaces as KeyError.
    code = emoji_dictionary[str(label)]
    try:
        # Spelling understood by emoji < 2.0.
        return emoji.emojize(code, use_aliases=True)
    except TypeError:
        # emoji >= 2.0 removed `use_aliases`; aliases are selected via `language`.
        return emoji.emojize(code, language='alias')

In [0]:
# Add a human-readable emoji column derived from the numeric 'Score' label.
csv['emoji'] = csv['Score'].apply(label_to_emoji)

In [0]:
csv.head(10)

In [0]:
all_data = np.asarray(csv['Text'])
all_label = np.asarray(csv['Score'], dtype=int)

In [0]:
print(f"The are {len(all_data)} in total ")

In [0]:
def get_max_length(arr):
    """
    Returns the largest number of whitespace-separated words in any sentence of arr.

    Arguments:
    arr -- iterable of sentences (strings), e.g. a list or a 1-D numpy array

    Returns:
    maxLen -- word count of the sentence with the most words; 0 if arr is empty
    """
    # Compare word counts, not character counts: the longest string is not
    # necessarily the sentence with the most words.
    maxLen = max((len(sentence.split()) for sentence in arr), default=0)
    return maxLen

In [0]:
MAX_SEQUENCE_LENGTH = get_max_length(all_data)

print('There are {} words in longest sentance.'.format(MAX_SEQUENCE_LENGTH))

## Separate training and testing data

In [0]:
# Hold out 20% of the sentences (and their labels) for testing.
# NOTE(review): no random_state is passed, so the shuffle relies on NumPy's
# global RNG state (seeded in the imports cell); the split is only repeatable
# when the notebook is run top to bottom — consider passing random_state.
train_data, test_data, train_label, test_label = train_test_split(all_data, all_label, test_size=0.2)

In [0]:
print(f"The are {len(train_data)} in training data ")
print(f"The are {len(test_data)} in testing data ")

# Embedding layers

## Get pretrained word vector

In [0]:
glove_file = 'glove.6B.50d.txt'

pretrained_dic = {}   # word -> embedding vector (np.float64 array)
words = set()         # every word that has a vector in the file

# Each line is "<word> <v1> <v2> ...". Open with an explicit UTF-8 encoding so
# decoding does not depend on the platform default, and use `with` so the file
# is closed even if a line fails to parse.
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        line = line.strip().split()
        if not line:
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        curr_word = line[0]
        pretrained_dic[curr_word] = np.array(line[1:], dtype=np.float64)
        words.add(curr_word)

In [0]:
# Show a few items in pretrained dictionary
print("pretrained_dic\n")

{k: pretrained_dic[k] for k in list(pretrained_dic)[10:15]}

In [0]:
print('There are {} words in the dictionary'.format(len(pretrained_dic)))
print('Length of vector for a word: {}'.format(len(pretrained_dic['is'])))

## Create word index

In [0]:
print(list(words)[0:10])

In [0]:
word_to_index = {}

In [0]:
# Number the vocabulary alphabetically starting at 1, leaving index 0 unused
# by any word.
word_to_index.update(
    (key, position) for position, key in enumerate(sorted(words), start=1)
)
# Next free index, i.e. one past the last assigned id.
index = len(words) + 1

In [0]:
# Show a few items in word_to_index
print("word_to_index\n")
{k: word_to_index[k] for k in list(word_to_index)[60000:60010]}

## Creating embedding matrix

In [0]:
EMBEDDING_DIM = 50

# One row per word index plus row 0, which no word uses and therefore stays zero.
embedding_matrix = np.zeros((1 + len(word_to_index), EMBEDDING_DIM))

for word, i in word_to_index.items():
    embedding_vector = pretrained_dic.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: keep its row as zeros.
        continue
    embedding_matrix[i] = embedding_vector


In [0]:
embedding_matrix

## Sentence to Index

![alt text](https://storage.googleapis.com/allianz-course/data/embedding_layer.jpg =800x300)

In [0]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into a zero-padded matrix of word indices.

    Each sentence is lower-cased and split on whitespace. Words missing from
    word_to_index are mapped to 0 (the same value used for padding), and
    sentences longer than max_len are truncated to their first max_len words.

    Arguments:
    X -- sequence of sentences (list or 1-D numpy array of strings), length m
    word_to_index -- dict mapping each known word to its integer index
    max_len -- number of columns of the output; words beyond it are dropped

    Returns:
    X_indices -- numpy array of shape (m, max_len) holding the word indices
    """
    m = len(X)
    X_indices = np.zeros((m, max_len))
    for i, sentence in enumerate(X):
        # Truncate so an over-long sentence cannot index past the last column.
        sentence_words = sentence.lower().split()[:max_len]
        for j, w in enumerate(sentence_words):
            # Out-of-vocabulary words fall back to 0 instead of raising KeyError.
            X_indices[i, j] = word_to_index.get(w, 0)
    return X_indices

In [0]:
test_sentence = np.array(['this book is nice', 'lets go shopping', 'hello'])
max_len = get_max_length(test_sentence)
test_indices = sentences_to_indices(test_sentence, word_to_index, max_len=max_len)

In [0]:
test_indices

# **Build Model**

![alt text](https://storage.googleapis.com/allianz-course/data/lstm_arch.jpg =700x550 )

In [0]:
# Reset Keras' global state so re-running this cell builds a fresh graph.
K.clear_session()

# Input: one row of MAX_SEQUENCE_LENGTH integer word indices per sentence.
sequence_input = Input(shape = (MAX_SEQUENCE_LENGTH,), dtype = 'int32')

# Embedding layer initialised from the pretrained matrix; trainable=False keeps
# those vectors frozen during training.
# NOTE(review): mask_zero is not set, so zero-padded positions are fed to the
# LSTMs like any other timestep — confirm this is intended.
embedding_layer = Embedding(len(word_to_index)+1,
                            EMBEDDING_DIM,
                            weights = [embedding_matrix], 
                            input_length = MAX_SEQUENCE_LENGTH,
                            trainable = False)

embeddings = embedding_layer(sequence_input)

# Two stacked 128-unit LSTMs: the first returns the full sequence so the second
# can consume it; the second returns only its final output. Dropout (rate 0.5)
# follows each LSTM.
X = LSTM(128, return_sequences=True)(embeddings)
X = Dropout(0.5)(X)
X = LSTM(128, return_sequences=False)(X)
X = Dropout(0.5)(X)
# Softmax over 5 outputs, one per entry of emoji_dictionary.
X = Dense(5, activation='softmax')(X)

model = Model(sequence_input, X)

model.summary()

In [0]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# **Train Model**

In [0]:
train_indices = sentences_to_indices(train_data, word_to_index, MAX_SEQUENCE_LENGTH)

In [0]:
train_indices

In [0]:
train_label_oh = keras.utils.to_categorical(train_label, num_classes = 5)

In [0]:
epoch = 20
train_his = model.fit(train_indices, train_label_oh, epochs=epoch, batch_size=32, shuffle=True, validation_split = 0.2)



In [0]:
# Plot training vs. validation accuracy per epoch.
history = train_his.history
# Keras records accuracy under 'acc' in older TF 1.x releases and under
# 'accuracy' in TF 2.x; pick whichever key this run produced.
acc_key = 'acc' if 'acc' in history else 'accuracy'
epoch_axis = np.arange(1, epoch+1)

plt.figure(figsize=(15,7))

plt.plot(epoch_axis, history[acc_key], label='train_acc', lw=3)
plt.plot(epoch_axis, history['val_' + acc_key], label='val_acc', lw=3)

plt.ylabel('Acc', family='serif', size=14)
plt.xlabel('Epoch #', family='serif', size=14)
plt.xticks(epoch_axis)
plt.xlim([1, epoch])
plt.legend(prop={'size':14, 'family':'serif'})
plt.title('Acc.',size=14, family= 'serif')
plt.show()

# **Test Model**

In [0]:
# Encode the held-out sentences, get the model's per-class outputs, and take
# the arg-max over the last axis as the predicted label.
# NOTE(review): `test_indices` reuses the name of the small demo array built in
# the "Sentence to Index" section; after this cell it refers to the test set.
test_indices = sentences_to_indices(test_data, word_to_index, MAX_SEQUENCE_LENGTH)
test_pro = model.predict(test_indices)
test_predict_label = test_pro.argmax(axis=-1)

In [0]:
test_acc = accuracy_score(test_label, test_predict_label)
print('Test Accuracy: {}'.format(test_acc))

In [0]:
# Try the trained model on a few hand-written sentences and print the emoji
# of the highest-scoring class next to each one.
x_test = np.array(['I eat a lot','I am angry','I like play ball'])
X_test_indices = sentences_to_indices(x_test, word_to_index, MAX_SEQUENCE_LENGTH)
pred = model.predict(X_test_indices)
for sentence, class_scores in zip(x_test, pred):
    num = np.argmax(class_scores)
    print(' prediction: ' + sentence + label_to_emoji(num).strip())