# Emojify 😃

### Import packages

In [1]:
import emoji
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import keras.layers as tfl
import pandas as pd

### Helper functions

In [2]:
def read_glove_vecs(glove_file):
    """
    Reads a GloVe text file and builds the vocabulary lookup tables.

    Every non-blank line is split on whitespace: the first token is the word,
    the remaining tokens are the components of its vector.

    Inputs:
        glove_file -- path to the GloVe text file

    Returns:
        words_to_index -- dictionary mapping each word to its index (starting at 1, words in sorted order)
        index_to_words -- dictionary mapping each index back to its word
        word_to_vec_map -- dictionary mapping each word to its vector (numpy array of float64)
    """
    word_to_vec_map = {}

    # the GloVe text files are UTF-8; being explicit avoids depending on the platform default encoding
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()

            # skip blank lines (e.g. a stray empty line at the end of the file)
            if not tokens:
                continue

            word_to_vec_map[tokens[0]] = np.array(tokens[1:], dtype=np.float64)

    # indices start at 1, so index 0 is never assigned to a word
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w

    return words_to_index, index_to_words, word_to_vec_map

In [3]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.

    Each sentence is lower-cased and split on whitespace. Words that are not in
    word_to_index are skipped (they do not consume a position). Sentences with
    fewer than max_len known words are right-padded with zeros; sentences with
    more are truncated to their first max_len known words.

    Inputs:
        X -- array of sentences (strings), of shape (m,)
        word_to_index -- a dictionary containing each word mapped to its index
        max_len -- number of columns of the output, i.e. maximum number of words kept per sentence

    Returns:
        X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """

    # get number of examples
    m = X.shape[0]

    # initialize indices matrix with zeros (zero doubles as the padding value)
    X_indices = np.zeros((m, max_len))

    # loop over examples
    for i in range(m):

        # indices of the known words of the lower-cased sentence, in order
        known = [word_to_index[word]
                 for word in X[i].lower().split()
                 if word in word_to_index]

        # truncate instead of writing past the last column
        # (sentences may be longer than the max_len they are encoded with)
        known = known[:max_len]

        if known:
            X_indices[i, :len(known)] = known

    return X_indices

In [4]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index, trainable=False):
    """
    Creates a Keras Embedding() layer and loads in the pre-trained GloVe vectors.

    Inputs:
        word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
        word_to_index -- dictionary mapping from words to their indices in the vocabulary
        trainable -- whether the embedding weights are updated during training.
                     Defaults to False so the pre-trained vectors are kept as they are
                     instead of being re-fitted on the training data.

    Returns:
        embedding_layer -- pretrained layer Keras instance
    """

    # adding 1 to fit Keras embedding (valid indices are 0 .. len(word_to_index))
    vocab_size = len(word_to_index) + 1

    # dimensionality of the word vectors, read from an arbitrary entry
    # (no need to materialize the whole key list just to look at one vector)
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Initialize the embedding matrix as a numpy array of zeros.
    # Rows whose index is not assigned to any word stay all-zero.
    emb_matrix = np.zeros((vocab_size, emb_dim))

    # Set each row "idx" of the embedding matrix to be
    # the word vector representation of the idx'th word of the vocabulary
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct input and output sizes
    embedding_layer = tfl.Embedding(vocab_size, emb_dim, trainable=trainable)

    # Build the embedding layer so that its weight exists before it is overwritten
    embedding_layer.build((None,))

    # Set the weights of the embedding layer to the embedding matrix. The layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [5]:
def label_to_emoji(label):
    """
    Converts a label (int or string) into the corresponding emoji (string) ready to be printed.

    The label is looked up (as a string key) in the module-level emoji_dictionary
    and the resulting code is passed through emoji.emojize with alias names enabled.
    """
    code = emoji_dictionary[str(label)]

    try:
        # emoji >= 2.0 removed `use_aliases`; alias names are selected with language='alias'
        return emoji.emojize(code, language='alias')
    except (TypeError, KeyError):
        # older emoji releases: no 'alias' language (KeyError) or no `language` argument (TypeError)
        return emoji.emojize(code, use_aliases=True)

### Create the model

In [19]:
def Emojify(input_shape, word_to_vec_map, word_to_index,
            num_classes=5, lstm_units=128, dropout_rate=0.5):
    """
    Function creating the Emojify model's graph.

    Architecture: Input -> pretrained Embedding -> LSTM (full sequence) -> Dropout
                  -> LSTM (last state) -> Dropout -> Dense -> softmax

    Inputs:
        input_shape -- shape of the input, usually (max_len,)
        word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation
        word_to_index -- dictionary mapping from words to their indices in the vocabulary
        num_classes -- number of output classes (units of the final Dense layer), 5 by default
        lstm_units -- dimensionality of the hidden state of both LSTM layers, 128 by default
        dropout_rate -- dropout probability applied after each LSTM layer, 0.5 by default

    Returns:
        model -- a model instance in Keras
    """

    # Define sentence_indices as the input of the graph.
    sentence_indices = tfl.Input(shape=input_shape, dtype='int32')

    # Create the embedding layer pretrained with GloVe Vectors
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

    # Propagate sentence_indices through the embedding layer
    embeddings = embedding_layer(sentence_indices)

    # First LSTM returns the whole sequence so that it can feed the second LSTM
    X = tfl.LSTM(lstm_units, return_sequences=True)(embeddings)
    X = tfl.Dropout(dropout_rate)(X)

    # Second LSTM returns only its last hidden state
    X = tfl.LSTM(lstm_units)(X)
    X = tfl.Dropout(dropout_rate)(X)

    # Propagate X through a Dense layer with one unit per class
    X = tfl.Dense(num_classes)(X)

    # Add a softmax activation
    X = tfl.Activation('softmax')(X)

    # Create the model
    model = keras.Model(inputs=sentence_indices, outputs=X)

    return model

### Read the data

In [20]:
# Load the GloVe vocabulary: word -> index, index -> word and word -> vector lookups
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(
    '/home/abdalla/Datasets/emoji/glove.6B.50d.txt'
)

In [21]:
# Keep only the first two columns of each CSV (column 0: sentence, column 1: label)
# NOTE(review): read_csv treats the first row as a header by default -- confirm the files have one
train_frame = pd.read_csv('/home/abdalla/Datasets/emoji/train_emoji.csv')
train = train_frame.iloc[:, :2].to_numpy()
del train_frame

test_frame = pd.read_csv('/home/abdalla/Datasets/emoji/test_emoji.csv')
test = test_frame.iloc[:, :2].to_numpy()
del test_frame

In [22]:
# Split each table into sentences (first column) and integer labels (second column)
X_train = train[:, 0]
y_train = np.asarray(train[:, 1], dtype=int)

X_test = test[:, 0]
y_test = np.asarray(test[:, 1], dtype=int)

In [23]:
# Largest number of words in any training sentence.
# (Counting the words of the sentence with the most *characters* can underestimate this.)
maxLen = max(len(sentence.split()) for sentence in X_train)

In [24]:
# Encode the training sentences as padded index rows and one-hot encode the 5 labels
X_train_indices = sentences_to_indices(X_train, word_to_index, max_len=maxLen)
y_train_oh = np.eye(5)[y_train.ravel()]

In [25]:
# Encode the test sentences with the same length as the training set and one-hot encode the 5 labels
X_test_indices = sentences_to_indices(X_test, word_to_index, maxLen)
y_test_oh = np.eye(5)[y_test.ravel()]

### Train the model

In [26]:
# Build the network for inputs of maxLen word indices and print its layer summary
model = Emojify(
    input_shape=(maxLen,),
    word_to_vec_map=word_to_vec_map,
    word_to_index=word_to_index,
)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 50)            20000050  
_________________________________________________________________
lstm_2 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_2 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 645 

In [27]:
# Categorical cross-entropy matches the one-hot labels and the softmax output
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [28]:
# Train for 30 epochs; the test split is used as validation data after every epoch
model.fit(
    X_train_indices,
    y_train_oh,
    batch_size=32,
    epochs=30,
    shuffle=True,
    validation_data=(X_test_indices, y_test_oh),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f11490a08d0>

### Test the model

In [29]:
# Maps the class label (as a string key "0".."4") to the code that emoji.emojize() renders.
emoji_dictionary = {
    str(label): code
    for label, code in enumerate([
        "\u2764\uFE0F",        # raw code points: :heart: prints a black instead of red heart depending on the font
        ":baseball:",
        ":smile:",
        ":disappointed:",
        ":fork_and_knife:",
    ])
}

In [30]:
# Hand-written sentences used to try out the trained model
t = [
    "let's go play tennis",
    "hello world!",
    "I love meat",
    "that's awesome",
    "I adore you!",
    "how sad",
]

In [31]:
# Encode all demo sentences in one go and predict them as a single batch.
# Separate names are used so that the encoded test set (X_test_indices) is not overwritten.
demo_sentences = np.array(t)
demo_indices = sentences_to_indices(demo_sentences, word_to_index, maxLen)

# one row of class probabilities per sentence -> most likely class per row
demo_labels = np.argmax(model.predict(demo_indices), axis=1)

for sentence, label in zip(demo_sentences, demo_labels):
    print(sentence + ' ' + label_to_emoji(label))

let's go play tennis ⚾
hello world! 😄
I love meat 🍴
that's awesome 😄
I adore you! ❤️
how sad 😞


# DONE :D