In [None]:
%config Completer.use_jedi = False
import numpy as np
from utils import *
import matplotlib.pyplot as plt
import pandas as pd
from keras.layers import *
from keras import Model

%matplotlib inline

In [None]:
# Class label (as a string) -> emoji alias shown for that class.
emoji_dictionary = dict([
    ("0", ":heart:"),
    ("1", ":baseball:"),
    ("2", ":smile:"),
    ("3", ":disappointed:"),
    ("4", ":fork_and_knife:"),
])

In [None]:
# Read the training CSV first, then the test CSV; each call yields an (X, Y) pair.
(X_train, Y_train), (X_test, Y_test) = (
    extract_X_Y(csv_path)
    for csv_path in ('./data/train_emoji.csv', './data/test_emoji.csv')
)

### One-Hot encoding labels

In [None]:
# np.asarray accepts both objects exposing `.values` (presumably pandas Series
# returned by extract_X_Y) and plain ndarrays, so this cell still works when it
# is re-run after the labels have been converted to NumPy arrays further down.
oh_Y_train = one_hot(np.asarray(Y_train), 5)
oh_Y_test = one_hot(np.asarray(Y_test), 5)

In [None]:
# Report the shape of the inputs and of the one-hot encoded labels.
shape_report = (
    ("X_train Shape:", X_train),
    ("Y_train Shape:", oh_Y_train),
    ("X_test Shape:", X_test),
    ("Y_test Shape:", oh_Y_test),
)
for caption, array in shape_report:
    print(caption, array.shape)

In [None]:
# Show one training example with its emoji and its one-hot encoded label.
idx = 50
sample_label = Y_train[idx]
sample_emoji = label_to_emoji(str(sample_label), emoji_dictionary)
print(X_train[idx], sample_emoji)
print("Label index %d is one-hot encoded as:" % sample_label, oh_Y_train[idx])

### Implementing Emojifier V-1

![Emojifier V-1](./images/Emojifier-V1.png)

#### Inputs and outputs
* The input of the model is a string corresponding to a sentence (e.g. "I love you").
* The output will be a probability vector of shape (1,5), (there are 5 emojis to choose from).
* The (1,5) probability vector is passed to an argmax layer, which extracts the index of the emoji with the highest probability.

### GloVe Word Vectors

[GloVe](https://nlp.stanford.edu/projects/glove/) is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. 

#### Read GloVe File

In [None]:
# read_glove_vecs yields three objects, unpacked below in the order they are returned.
glove_data = read_glove_vecs('./GloVe/glove.6B.50d.txt')
words_to_index, index_to_words, word_to_vec_map = glove_data

In [None]:
# Length of a single word vector, using 'food' as the probe word.
food_vector = word_to_vec_map.get('food')
print("Word_to_vector map Shape:", len(food_vector))

In [None]:
# Word -> index lookup.
word = 'food'
word_idx = words_to_index.get(word)
forward_message = "Word '%s' has index %d in GloVe vector." % (word, word_idx)
print(forward_message)

# Index -> word lookup.
idx = 250000
word = index_to_words.get(idx)
reverse_message = "Index %d belong to the word '%s'." % (idx, word)
print(reverse_message)

In [None]:
# Average the word vectors of one example sentence and display the result.
sample_sentence = "Morrocan couscous is my favorite dish"
avg = sentence_to_avg(sample_sentence, word_to_vec_map)
print("avg = \n", avg)

#### Implementing the V1 model

Now that the sentence-averaging function is defined and the GloVe word vectors are loaded, it's time to construct the model. The model uses the cross-entropy cost function:

$$ z^{(i)} = W . avg^{(i)} + b$$

$$ a^{(i)} = softmax(z^{(i)})$$

$$ \mathcal{L}^{(i)} = - \sum_{k = 0}^{n_y - 1} Y_{oh,k}^{(i)} * log(a^{(i)}_k)$$

And the gradients are computed as:

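$$ dz^{(i)} = \frac{\partial \mathcal{L}^{(i)}}{\partial z^{(i)}} = a^{(i)} - Y_{oh}^{(i)}$$

$$ dW^{(i)} = \frac{\partial \mathcal{L}^{(i)}}{\partial W} = dz^{(i)} \, (avg^{(i)})^T$$

$$ db^{(i)} = \frac{\partial \mathcal{L}^{(i)}}{\partial b} = dz^{(i)}$$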

***Note:*** $Y_{oh}$ denotes the one-hot encoding of $Y$.

In [None]:
class EmojifierV1:
    """Softmax classifier over averaged word vectors (Emojifier V-1).

    Every sentence is reduced to a single vector with ``sentence_to_avg`` and
    fed to one linear layer followed by ``softmax``. The parameters are trained
    with per-example gradient descent on the cross-entropy loss.

    Parameters
    ----------
    word_to_vector_map : mapping
        Maps a word to its embedding vector; passed to ``sentence_to_avg``.
    lr : float
        Learning rate of the gradient-descent updates.
    epochs : int
        Number of passes over the training data.
    verbose : int or bool
        When truthy, the cost is printed every 100 epochs.
    n_classes : int
        Number of output classes (defaults to the 5 emojis of this notebook).
    """

    def __init__(self, word_to_vector_map, lr=0.01, epochs=400, verbose=1, n_classes=5):
        self.lr = lr
        self.epochs = epochs
        self.verbose = verbose
        self.word_to_vector_map = word_to_vector_map
        self.n_classes = n_classes
        # Learned parameters; they stay None until fit() has been called.
        self.W = None
        self.b = None

    def _embedding_dim(self):
        """Return the length of the word vectors stored in the map."""
        try:
            any_vector = next(iter(self.word_to_vector_map.values()))
        except StopIteration:
            raise ValueError("word_to_vector_map is empty") from None
        return len(any_vector)

    def _check_is_fitted(self):
        """Raise a clear error when the model is used before training."""
        if self.W is None or self.b is None:
            raise RuntimeError("EmojifierV1 must be fitted before it can predict")

    def fit(self, X, Y):
        """Train the classifier.

        X is a NumPy array of sentences and Y the matching integer labels.
        Returns the list of per-example costs, in the order they were computed.
        """
        assert isinstance(X, np.ndarray), "X must be a numpy.ndarray of sentences"
        costs = []
        m = X.shape[0]
        n_y = self.n_classes
        # Use the map given to the constructor, not a notebook-level global.
        n_h = self._embedding_dim()

        self.W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
        self.b = np.zeros((n_y,))

        Y_oh = one_hot(Y, n_y)

        cost = None
        for t in range(self.epochs):
            for i in range(m):
                avg = sentence_to_avg(X[i], self.word_to_vector_map)

                # Forward pass.
                z = np.matmul(self.W, avg) + self.b
                a = softmax(z)

                cost = -(np.matmul(Y_oh[i], np.log(a)))
                costs.append(cost)

                # Backward pass: gradients of the cross-entropy loss.
                dz = a - Y_oh[i]
                dW = np.dot(dz.reshape(n_y, 1), avg.reshape(1, n_h))
                db = dz

                self.W = self.W - self.lr * dW
                self.b = self.b - self.lr * db

            # The reported cost is the one of the last example seen in the epoch.
            if self.verbose and t % 100 == 0:
                print("Epoch: " + str(t) + " --- cost = " + str(cost))
        return costs

    def predict(self, X):
        """Return an (m, 1) float array with the predicted class index per sentence."""
        self._check_is_fitted()
        m = X.shape[0]
        pred = np.zeros((m, 1))

        for j in range(m):
            avg = sentence_to_avg(X[j], self.word_to_vector_map)

            Z = np.dot(self.W, avg) + self.b
            A = softmax(Z)
            pred[j] = np.argmax(A)
        return pred

    def evaluate(self, X, Y):
        """Return the fraction of sentences in X whose prediction equals Y."""
        pred = self.predict(X)
        accuracy = np.mean(pred == Y.reshape(Y.shape[0], 1))
        return accuracy

In [None]:
# Convert every split independently. np.asarray hands back an existing ndarray
# unchanged, so the cell is safe to re-run and does not rely on the type of
# X_train alone to decide whether the other three variables need converting.
X_train, Y_train, X_test, Y_test = (
    np.asarray(split) for split in (X_train, Y_train, X_test, Y_test)
)

In [None]:
# fit() draws the initial weights from NumPy's global RNG; seed it so that a
# fresh "Restart & Run All" reproduces the same costs and accuracy.
np.random.seed(0)

model = EmojifierV1(word_to_vec_map)
cost_list = model.fit(X_train, Y_train)
accuracy = model.evaluate(X_train, Y_train)
print("Accuracy: %.5f" % accuracy)

In [None]:
# Score the trained model on both splits, then report the two accuracies.
train_acc, test_acc = (
    model.evaluate(sentences, labels)
    for sentences, labels in ((X_train, Y_train), (X_test, Y_test))
)
print("Training set accuracy: %.5f" % train_acc)
print('Test set accuracy: %.5f' % test_acc)

In [None]:
def print_predictions(sentences, labels, emoji_dictionary):
    """Print every sentence followed by the emoji of the label at the same position.

    ``labels[i]`` is converted with ``str`` before being handed to
    ``label_to_emoji`` together with ``emoji_dictionary``.
    """
    for position, text in enumerate(sentences):
        emoji_text = label_to_emoji(str(labels[position]), emoji_dictionary)
        print(text, emoji_text)

In [None]:
X_my_sentences = np.array(["i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy"])
# Labels we would expect for the sentences above (kept for reference).
Y_my_labels = np.array(["0", "0", "2", "1", "4", "3"])

predictions = model.predict(X_my_sentences)
# Show what the model actually predicted rather than the hand-written labels:
# flatten the prediction array and turn each class index into the string key
# used by emoji_dictionary.
predicted_labels = [str(int(p)) for p in predictions.ravel()]
print_predictions(X_my_sentences, predicted_labels, emoji_dictionary)

In [None]:
sentence = np.array(["I do not like you"])
prediction = model.predict(sentence)
# One sentence -> one label. Pass it as a one-element list so that
# print_predictions indexes a real sequence of labels instead of the
# characters of a string.
predicted_label = str(int(prediction[0][0]))
print_predictions(sentence, [predicted_label], emoji_dictionary)

Amazing! 
* Because *adore* has a similar embedding as *love*, the algorithm has generalized correctly even to a word it has never seen before. 
* Words such as *heart*, *dear*, *beloved* or *adore* have embedding vectors similar to *love*. 

#### Word ordering isn't considered in this model
* Note that the model doesn't get the following sentence correct:
>I do not like you ❤

* And it predicts the same label as for:
>I love you ❤

* This algorithm ignores word ordering, so is not good at understanding phrases like "I do not like you" 

#### Confusion matrix
* Printing the confusion matrix can also help understand which classes are more difficult for your model. 
* A confusion matrix shows how often an example whose label is one class ("actual" class) is mislabeled by the algorithm with a different class ("predicted" class).

In [None]:
def plot_confusion_matrix(y_actu, y_pred, title='Confusion matrix', cmap=plt.cm.gray_r):
    """Plot the confusion matrix of actual versus predicted labels.

    Parameters
    ----------
    y_actu : array-like
        True labels.
    y_pred : array-like
        Predicted labels; flattened to one dimension before cross-tabulating.
    title : str
        Title drawn above the matrix.
    cmap : matplotlib colormap
        Colormap used for the cells.

    The cross-tabulation is built with ``margins=True``, so the plot also
    contains the "All" totals row and column.
    """
    df_confusion = pd.crosstab(
        y_actu,
        np.ravel(y_pred),
        rownames=['Actual'],
        colnames=['Predicted'],
        margins=True,
    )

    plt.matshow(df_confusion, cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # Rows and columns can differ in number (e.g. a class that is never
    # predicted), so each axis gets tick positions matching its own labels.
    plt.xticks(np.arange(len(df_confusion.columns)), df_confusion.columns, rotation=45)
    plt.yticks(np.arange(len(df_confusion.index)), df_confusion.index)
    plt.ylabel(df_confusion.index.name)
    plt.xlabel(df_confusion.columns.name)

In [None]:
print(Y_test.shape)
pred_test = model.predict(X_test)
# Header line with one emoji per predicted class.
print(
    '            ' + label_to_emoji("0", emoji_dictionary)
    + '    ' + label_to_emoji("1", emoji_dictionary)
    + '    ' + label_to_emoji("2", emoji_dictionary)
    + '    ' + label_to_emoji("3", emoji_dictionary)
    + '   ' + label_to_emoji("4", emoji_dictionary)
)
# Flatten the (m, 1) predictions instead of hard-coding the test-set size.
print(pd.crosstab(Y_test, pred_test.ravel(), rownames=['Actual'], colnames=['Predicted'], margins=True))
plot_confusion_matrix(Y_test, pred_test)

#### V1 Model Conclusion
- Even with only 127 training examples, you can get a reasonably good model for Emojifying.
    - This is due to the generalization power that word vectors provide.
- Emojify-V1 will perform poorly on sentences such as **"This movie is not good and not enjoyable"**
    - It doesn't understand combinations of words.
    - It just averages all the words' embedding vectors together, without considering the ordering of words.

## Emojifier-V2: Using LSTMs in Keras: 

Let's build an LSTM model that takes word **sequences** as input!
* This model will be able to account for the word ordering. 
* Emojifier-V2 will continue to use pre-trained word embeddings to represent words.
* We will feed word embeddings into an LSTM.
* The LSTM will learn to predict the most appropriate emoji. 

![Emojifier-V2](./images/emojifier-v2.png)

In [None]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert an array of sentences into a zero-padded matrix of word indices.

    Parameters
    ----------
    X : np.ndarray
        Array of shape (m,) holding one sentence (string) per entry.
    word_to_index : dict
        Maps a lower-cased word to its integer index.
    max_len : int
        Number of columns of the result.

    Returns
    -------
    np.ndarray
        Float array of shape (m, max_len). Row i holds the indices of the
        known words of sentence i in order; unused positions stay 0.

    Words missing from ``word_to_index`` are skipped and sentences with more
    than ``max_len`` known words are truncated, so arbitrary user input cannot
    raise KeyError or IndexError.
    """
    m = X.shape[0]
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        # Split on single spaces, lower-case and strip tab characters.
        tokens = (raw.lower().replace('\t', '') for raw in X[i].split(' '))
        known = [word_to_index[tok] for tok in tokens if tok != '' and tok in word_to_index]
        known = known[:max_len]
        if known:
            X_indices[i, :len(known)] = known
    return X_indices

In [None]:
# Small sanity check of sentences_to_indices on three short sentences.
X1 = np.array([
    "funny lol",
    "lets play baseball",
    "food is ready for you",
])
X1_indices = sentences_to_indices(X1, words_to_index, max_len=5)
print("X1 =", X1)
print("X1_indices =\n", X1_indices)

In [None]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Build a frozen Keras ``Embedding`` layer initialised with the given word vectors.

    Parameters
    ----------
    word_to_vec_map : dict
        Maps a word to its embedding vector.
    word_to_index : dict
        Maps a word to the row of the embedding matrix that holds its vector.

    Returns
    -------
    Embedding
        Non-trainable layer whose weight matrix has ``len(word_to_index) + 1``
        rows; rows that no word maps to remain all zeros.

    Raises
    ------
    ValueError
        If ``word_to_vec_map`` is empty, since the embedding size cannot be inferred.
    """
    # One extra row beyond the vocabulary size — presumably because the word
    # indices start at 1; TODO confirm against read_glove_vecs.
    vocab_len = len(word_to_index) + 1

    # Infer the embedding size from any stored vector instead of relying on a
    # specific word being part of the vocabulary.
    try:
        sample_vector = next(iter(word_to_vec_map.values()))
    except StopIteration:
        raise ValueError("word_to_vec_map is empty") from None
    emb_dim = np.asarray(sample_vector).shape[0]

    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    embedding_layer = Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=False)
    # The layer must be built before its weights can be assigned.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    return embedding_layer

In [None]:
# Build the layer once and inspect the shape of its weight list.
embedding_layer = pretrained_embedding_layer(word_to_vec_map, words_to_index)
embedding_weights = np.asarray(embedding_layer.get_weights())
print(embedding_weights.shape)

### Emojifier-V2 (Keras)

In [None]:
def Emojify_V2_Keras(input_shape, word_to_vec_map, word_to_index):
    """Assemble the Emojifier-V2 Keras model.

    The graph is: int32 index input -> pretrained embedding -> LSTM(128,
    returning sequences) -> Dropout(0.5) -> LSTM(128) -> Dropout(0.5) ->
    Dense(5) -> softmax activation.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input example, passed to ``Input``.
    word_to_vec_map, word_to_index : dict
        Forwarded to ``pretrained_embedding_layer``.

    Returns
    -------
    Model
        The (uncompiled) Keras model.
    """
    sentence_indices = Input(shape=input_shape, dtype='int32')

    # Look up the pretrained vector of every word index.
    embed = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    word_vectors = embed(sentence_indices)

    # First LSTM emits the whole sequence so the second one can consume it.
    hidden = LSTM(128, return_sequences=True)(word_vectors)
    hidden = Dropout(0.5)(hidden)
    hidden = LSTM(128)(hidden)
    hidden = Dropout(0.5)(hidden)

    logits = Dense(5)(hidden)
    probabilities = Activation('softmax')(logits)

    return Model(inputs=[sentence_indices], outputs=probabilities)

In [None]:
# Largest number of words in any training sentence. Taking the longest
# sentence by character count and then counting its words can under-estimate
# this, which would leave the index matrix too narrow for some sentences.
maxLen = max(len(sentence.split()) for sentence in X_train)
X_train_indices = sentences_to_indices(X_train, words_to_index, maxLen)
Y_train_oh = one_hot(Y_train, C = 5)

In [None]:
# Instantiate the LSTM model for sequences of maxLen word indices and show its layers.
model = Emojify_V2_Keras(
    input_shape=(maxLen,),
    word_to_vec_map=word_to_vec_map,
    word_to_index=words_to_index,
)
model.summary()

In [None]:
# Categorical cross-entropy matches the one-hot encoded targets used for training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the index-encoded sentences against their one-hot labels.
model.fit(
    X_train_indices,
    Y_train_oh,
    batch_size=32,
    epochs=50,
    shuffle=True,
)

#### Mislabeled sentences

In [None]:
y_test_oh = one_hot(Y_test, 5)
X_test_indices = sentences_to_indices(X_test, words_to_index, maxLen)
pred = model.predict(X_test_indices)
# Most probable class per test sentence, computed once for the whole batch.
predicted_classes = np.argmax(pred, axis=1)
for i in range(len(X_test)):
    num = predicted_classes[i]
    # Only list the sentences the model got wrong.
    if num != Y_test[i]:
        print('Expected emoji: ' + label_to_emoji(str(Y_test[i]), emoji_dictionary) + '\n\tprediction: '+ X_test[i] + label_to_emoji(str(num), emoji_dictionary).strip())

### Test your own sentence!

In [None]:
x_test = np.array(['I need help'])
# Use a separate name so the test-set index matrix (X_test_indices) is not
# overwritten by this single custom sentence.
x_test_indices = sentences_to_indices(x_test, words_to_index, maxLen)
print(x_test[0] +' '+  label_to_emoji(str(np.argmax(model.predict(x_test_indices))), emoji_dictionary))