In [1]:
%config Completer.use_jedi = False
from torch.nn.functional import one_hot
import pandas as pd
import emoji
import torch
import numpy as np
from utils import one_hot, softmax, predict

In [2]:
emoji_dictionary = {"0": ":heart:",
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}

In [3]:
def label_to_emoji(target_label):
    assert type(target_label) == str
    emoji_unicode = emoji_dictionary.get(target_label)
    return emoji.emojize(emoji_unicode, use_aliases=True)

In [4]:
def extract_X_Y(file_path):
    data = pd.read_csv(file_path, error_bad_lines=False)
    # In case of prior manilulations.    
    if 'Unnamed: 2' in data.columns or 'Unnamed: 3' in data.columns:
        data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3'])
    
    data.columns = ['phrase', 'label']
    X = data.get('phrase')
    Y = data.get('label')
    return X, Y

In [5]:
X_train, Y_train = extract_X_Y('./data/train_emoji.csv')
X_test, Y_test = extract_X_Y('./data/test_emoji.csv')

### One-Hot encoding labels

In [6]:
oh_Y_train = one_hot(Y_train.values, 5)
oh_Y_test = one_hot(Y_test.values, 5)

In [7]:
print("X_train Shape:", X_train.shape)
print("Y_train Shape:", oh_Y_train.shape)
print("X_test Shape:", X_test.shape)
print("Y_test Shape:", oh_Y_test.shape)

X_train Shape: (131,)
Y_train Shape: (131, 5)
X_test Shape: (55,)
Y_test Shape: (55, 5)


In [8]:
idx = 50
print(X_train[idx], label_to_emoji(str(Y_train[idx])))
print("Label index %d is one-hot encoded as:" % Y_train[idx], oh_Y_train[idx])

I think I will end up alone 😞
Label index 3 is one-hot encoded as: [0. 0. 0. 1. 0.]


### Implementing Emojifier V-1

![Emojifier V-1](./images/Emojifier-V1.png)

#### Inputs and outputs
* The input of the model is a string corresponding to a sentence (e.g. "I love you). 
* The output will be a probability vector of shape (1,5), (there are 5 emojis to choose from).
* The (1,5) probability vector is passed to an argmax layer, which extracts the index of the emoji with the highest probability.

### Glove Word Vector

[GloVe](https://nlp.stanford.edu/projects/glove/) is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. 

#### Read GloVe File

In [9]:
def read_glove_vecs(file_path):
    print("Loading Glove Model..")
    f = open(file_path,'r', errors = 'ignore', encoding='utf8')
    gloveModel = {}
    words = set()
    for line in f:
        try:
            splitLines = line.split()
            word = splitLines[0]
            words.add(word)
            wordEmbedding = np.array([float(value) for value in splitLines[1:]])
            gloveModel[word] = wordEmbedding
        except:
            pass

    i = 1
    words_to_index = {}
    index_to_words = {}
    for w in sorted(words):
        words_to_index[w] = i
        index_to_words[i] = w
        i = i + 1
    print(len(gloveModel),"words loaded!")
    return words_to_index, index_to_words, gloveModel

In [10]:
words_to_index, index_to_words, word_to_vec_map = read_glove_vecs('./GloVe/glove.6B.50d.txt')

Loading Glove Model..
400000 words loaded!


In [11]:
print("Word_to_vector map Shape:", len(word_to_vec_map.get('food')))

Word_to_vector map Shape: 50


In [12]:
word = 'food'
word_idx = words_to_index.get(word)

print("Word '%s' has index %d in GloVe vector." % (word, word_idx))

idx = 250000
word = index_to_words.get(idx)
print("Index %d belong to the word '%s'." % (idx, word))

Word 'food' has index 151204 in GloVe vector.
Index 250000 belong to the word 'morpheus'.


In [22]:
def sentence_to_avg(sentence, word_to_vector):
    words = (sentence.lower()).split()
    avg = np.zeros((50,))
    total = 0
    for w in words:
        total += word_to_vec_map[w]
    avg = total / float(len(words))
    return avg

In [23]:
avg = sentence_to_avg("Morrocan couscous is my favorite dish", word_to_vec_map)
print("avg = \n", avg)

avg = 
 [-0.008005    0.56370833 -0.50427333  0.258865    0.55131103  0.03104983
 -0.21013718  0.16893933 -0.09590267  0.141784   -0.15708967  0.18525867
  0.6495785   0.38371117  0.21102167  0.11301667  0.02613967  0.26037767
  0.05820667 -0.01578167 -0.12078833 -0.02471267  0.4128455   0.5152061
  0.38756167 -0.898661   -0.535145    0.33501167  0.68806933 -0.2156265
  1.797155    0.10476933 -0.36775333  0.750785    0.10282583  0.348925
 -0.27262833  0.66768    -0.10706167 -0.283635    0.59580117  0.28747333
 -0.3366635   0.23393817  0.34349183  0.178405    0.1166155  -0.076433
  0.1445417   0.09808667]


#### Implementing the V1 model

Now that the sentences's average function is created and GloVe word vector is loaded, its time to construct the model. In this case the model uses cross-entropy cost function:

$$ z^{(i)} = W . avg^{(i)} + b$$

$$ a^{(i)} = softmax(z^{(i)})$$

$$ \mathcal{L}^{(i)} = - \sum_{k = 0}^{n_y - 1} Y_{oh,k}^{(i)} * log(a^{(i)}_k)$$

And the gradients are computed as:

$$ \frac{d}{dx}Z = a^{(i)} - Y_{oh}^{(i)}$$

$$ \frac{d}{dx}W = \frac{d}{dx}Z{(i)} . avg^{(i)}$$

$$ \frac{d}{dx}b = \frac{d}{dx}Z^{(i)}$$

***Note:*** The $Y_{oh}$ denotes one-hoted $Y$.

In [25]:
def model(X, Y, word_to_vector_map, lr=0.01, epochs=400):

    m = X.shape[0]                          
    n_y = 5                                  
    n_h = 50
    
    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
    b = np.zeros((n_y,))
    
    Y_oh = one_hot(Y, n_y) 
    
    for t in range(epochs):
        for i in range(m):
            avg = sentence_to_avg(X[i], word_to_vec_map)
            
            z = np.matmul(W, avg) + b
            a = softmax(z)
            
            cost = -(np.matmul(Y_oh[i], np.log(a)))
            
            dz = a - Y_oh[i]
            dW = np.dot(dz.reshape(n_y,1), avg.reshape(1, n_h))
            db = dz

            W = W - lr * dW
            b = b - lr * db
        
        if t % 100 == 0:
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            pred = predict(X, Y, W, b, word_to_vec_map) #predict is defined in emo_utils.py

    return pred, W, b

In [26]:
if str(type(X_train)) != "<class 'numpy.ndarray'>":
    X_train = X_train.values
    Y_train = Y_train.values
    X_test = X_test.values
    Y_test = Y_test.values

In [27]:
pred, W, b = model(X_train, Y_train, word_to_vec_map)
print(pred)

Epoch: 0 --- cost = 1.9938412874344404
Accuracy: 0.31297709923664124
Epoch: 100 --- cost = 0.08066259933502952
Accuracy: 0.9312977099236641
Epoch: 200 --- cost = 0.04833138260591153
Accuracy: 0.9465648854961832
Epoch: 300 --- cost = 0.03840423410146641
Accuracy: 0.9694656488549618
[[2.]
 [3.]
 [0.]
 [4.]
 [0.]
 [3.]
 [2.]
 [3.]
 [1.]
 [3.]
 [3.]
 [1.]
 [3.]
 [2.]
 [3.]
 [2.]
 [3.]
 [1.]
 [2.]
 [3.]
 [0.]
 [2.]
 [2.]
 [2.]
 [1.]
 [4.]
 [3.]
 [3.]
 [4.]
 [0.]
 [3.]
 [4.]
 [2.]
 [0.]
 [3.]
 [2.]
 [2.]
 [3.]
 [4.]
 [2.]
 [2.]
 [0.]
 [2.]
 [3.]
 [0.]
 [3.]
 [2.]
 [4.]
 [3.]
 [0.]
 [3.]
 [3.]
 [3.]
 [4.]
 [2.]
 [1.]
 [1.]
 [1.]
 [2.]
 [3.]
 [1.]
 [0.]
 [0.]
 [0.]
 [3.]
 [4.]
 [4.]
 [2.]
 [2.]
 [1.]
 [2.]
 [0.]
 [3.]
 [2.]
 [2.]
 [0.]
 [3.]
 [3.]
 [1.]
 [2.]
 [1.]
 [2.]
 [2.]
 [4.]
 [3.]
 [3.]
 [2.]
 [4.]
 [0.]
 [0.]
 [3.]
 [3.]
 [3.]
 [3.]
 [2.]
 [0.]
 [1.]
 [2.]
 [3.]
 [0.]
 [2.]
 [2.]
 [2.]
 [3.]
 [2.]
 [2.]
 [2.]
 [4.]
 [1.]
 [1.]
 [3.]
 [3.]
 [4.]
 [1.]
 [2.]
 [1.]
 [1.]
 [3.]
 [1.]
 [0.

In [28]:
print("Training set:")
pred_train = predict(X_train, Y_train, W, b, word_to_vec_map)
print('Test set:')
pred_test = predict(X_test, Y_test, W, b, word_to_vec_map)

Training set:
Accuracy: 0.9770992366412213
Test set:
Accuracy: 0.8545454545454545
