In [1]:
import numpy as np
import emoji
import os
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

<div class="alert alert-block alert-info">
<b>How are we Emojifying:</b> We use the <code>emoji</code> library to render emojis from the textual representations stored in <code>emoji_dictionary</code>. <code>emoji_dictionary</code> maps each label key ("0" to "4") to the representation of one of the five emojis used in this notebook.
</div>

In [2]:
# Emoji representations ordered by label key; position i is stored under key str(i).
_emoji_codes = (
    "\u2764\uFE0F",      # "0": heart, stored as a raw Unicode sequence
    ":baseball:",        # "1"
    ":smile:",           # "2"
    ":disappointed:",    # "3"
    ":fork_and_knife:",  # "4"
)
# Label key (as a string) -> representation that is later passed to emoji.emojize.
emoji_dictionary = {str(key): code for key, code in enumerate(_emoji_codes)}

In [3]:
# Show every label key next to the emoji it renders to.
# `language='alias'` resolves shortcodes such as ":smile:"; it replaces the
# `use_aliases=True` keyword, which was removed from emoji.emojize in emoji 2.0.
for key, value in emoji_dictionary.items():
    print('The key: ',key,' corresponds to ' ,emoji.emojize(value, language='alias'),' this emoji')

The key:  0  corresponds to  ❤️  this emoji
The key:  1  corresponds to  ⚾  this emoji
The key:  2  corresponds to  😄  this emoji
The key:  3  corresponds to  😞  this emoji
The key:  4  corresponds to  🍴  this emoji


<div class="alert alert-block alert-info">
<b>Loading train and test data:</b> The train and test data are provided as separate files, so no train/test split is needed later. Each dataset has two columns: the first contains a sentence and the second contains the emoji key associated with that sentence.
</div>

In [4]:
# Both CSV files are read without a header row; only columns 0 and 1 are kept.
df_train = pd.read_csv('train_emoji.csv', usecols=[0, 1], header=None)
df_test = pd.read_csv('tesss.csv', usecols=[0, 1], header=None)

# Column 0 feeds the X arrays and column 1 the Y arrays, as plain NumPy arrays.
X_train = df_train[0].values
Y_train = df_train[1].values
X_test = df_test[0].values
Y_test = df_test[1].values

In [5]:
# Peek at the first four rows of the training frame.
print('The train data looks like this ')
df_train.head(n=4)

The train data looks like this 


Unnamed: 0,0,1
0,never talk to me again,3
1,I am proud of your achievements,2
2,It is the worst day in my life,3
3,Miss you so much,0


<div class="alert alert-block alert-info">
<b>Visualizing data with Emojis:</b> We display a few sentences from the data together with the emoji assigned to each of them.
</div>

In [6]:
print('We create a function to access the emoji using its key ')


def f_access_emoji(x):
    """Return the rendered emoji for the label key `x`.

    Arguments:
    x -- label key; it is converted with str() before the lookup, so both
         the integer 3 and the string "3" address the same entry

    Returns:
    The string produced by emoji.emojize for emoji_dictionary[str(x)].

    Raises:
    KeyError -- if str(x) is not a key of emoji_dictionary
    """
    # `language='alias'` replaces the `use_aliases=True` keyword, which was
    # removed from emoji.emojize in emoji 2.0.
    return emoji.emojize(emoji_dictionary[str(x)], language='alias')

We create a function to access the emoji using its key 


In [7]:
print('We consider few random samples and print the statement and the emoji it will follow as below:- ')
# Print three hand-picked training sentences next to the emoji their label maps to.
for sample_idx in (9, 40, 80):
    print(X_train[sample_idx], f_access_emoji(Y_train[sample_idx]))

We consider few random samples and print the statement and the emoji it will follow as below:- 
I want to go play ⚾
we made it 😄
I am so impressed by your dedication to this project 😄


<div class="alert alert-block alert-info">
<b>Loading pre-trained GloVe embeddings:</b> We now load the pre-trained 50-dimensional GloVe word vectors into a dictionary that maps each word to its vector.
</div>

In [8]:
# Dimensionality of the GloVe vectors to load; it selects the glove.6B.<dim>d.txt file.
EMBEDDING_DIM = 50
# os.path.join uses the separator of the running platform, so the same cell works
# on Windows, Linux and macOS (hard-coded '\\' separators only resolve on Windows).
path = os.path.join(os.getcwd(), 'glove.6B', 'glove.6B.' + str(EMBEDDING_DIM) + 'd.txt')
print('Loading word vectors...')
word2vec = {}
with open(path, encoding='utf8') as f:
    for line in f:
        # Each line holds a word followed by the components of its vector.
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
# Set of every word that has a vector.
words = set(word2vec)

print('Found %s word vectors.' % len(word2vec))


Loading word vectors...
Found 400000 word vectors.


<div class="alert alert-block alert-info">
<b>Averaging vector representation of a sentence:</b> We represent a sentence by the element-wise average of the GloVe vectors of its words. This gives a single fixed-length vector per sentence (50-dimensional here), regardless of how many words the sentence contains.
</div>

In [9]:
def sentence_to_avg(sentence, word_to_vec_map):
    """
    Average the word vectors of a sentence into one vector.

    The sentence is lower-cased and split on whitespace. Words that have no
    entry in word_to_vec_map are skipped, and the average is taken over the
    words that were found.

    Arguments:
    sentence -- string, one sentence
    word_to_vec_map -- dictionary mapping words to 1-D numpy vectors that all
                       have the same length

    Returns:
    avg -- float64 numpy array with the same length as the vectors in
           word_to_vec_map; all zeros when no word of the sentence is in the
           map (this includes the empty sentence)
    """
    # Take the dimension from the map itself instead of hard-coding it;
    # 50 is only the fallback for an empty map.
    first_vec = next(iter(word_to_vec_map.values()), None)
    dim = 50 if first_vec is None else len(first_vec)

    total = np.zeros(dim)
    found = 0
    for token in sentence.lower().split():
        vec = word_to_vec_map.get(token)
        if vec is None:
            # Out-of-vocabulary word: it contributes nothing to the average.
            continue
        total += vec
        found += 1

    # Guard against dividing by zero, which would fill the result with NaN.
    return total / found if found else total

In [10]:
def softmax(x):
    """Return the softmax of `x`, normalised over all of its elements.

    The maximum of `x` is subtracted before exponentiating so that large
    inputs do not overflow; this shift does not change the result.
    """
    shifted_exp = np.exp(x - np.max(x))
    return shifted_exp / shifted_exp.sum()

<div class="alert alert-block alert-info">
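<b>Predicting Emojis:</b> For every sentence we compute the softmax probability of each emoji and keep the index of the most probable one. The predictions are returned as a NumPy array of shape (m, 1), and the accuracy against the given labels is printed.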
</div>

In [11]:
def predict(X, Y, W, b, word_to_vec_map):
    """
    Given X (sentences) and Y (emoji indices), predict emojis and print the accuracy of the model over the given set.

    Arguments:
    X -- input data, numpy array of m sentences (strings)
    Y -- labels, containing index of the label emoji, numpy array with m entries (shape (m,) or (m, 1))
    W -- weight matrix of the softmax layer, of shape (n_y, n_h)
    b -- bias of the softmax layer, of shape (n_y,)
    word_to_vec_map -- dictionary mapping words to their vector representation

    Returns:
    pred -- numpy array of shape (m, 1) with the predicted emoji index of every sentence
    """
    m = X.shape[0]
    pred = np.zeros((m, 1))

    for j in range(m):                       # Loop over the given examples
        # Reuse the shared helper so that prediction and training build the
        # sentence vector in exactly the same way.
        avg = sentence_to_avg(X[j], word_to_vec_map)

        # Forward propagation
        Z = np.dot(W, avg) + b
        A = softmax(Z)
        pred[j] = np.argmax(A)

    # Fraction of sentences whose predicted index equals the label.
    accuracy = np.mean(pred == Y.reshape(Y.shape[0], 1))
    print("Accuracy: "  + str(accuracy))

    return pred

In [12]:
def model(X, Y, word_to_vec_map, learning_rate = 0.01, num_iterations = 400):
    """
    Train a softmax classifier on averaged word vectors with stochastic gradient descent.

    Arguments:
    X -- input data, numpy array of m sentences (strings)
    Y -- labels, numpy array of m integers between 0 and 4 (one of the n_y = 5 emoji classes)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation
    learning_rate -- learning_rate for the stochastic gradient descent algorithm
    num_iterations -- number of passes over the whole data set

    Returns:
    pred -- vector of predictions made with the final W and b, numpy-array of shape (m, 1)
    W -- weight matrix of the softmax layer, of shape (n_y, n_h)
    b -- bias of the softmax layer, of shape (n_y,)
    """

    np.random.seed(1)                      # fixed seed: the initial W is the same on every run
    m = Y.shape[0]                         # number of training examples
    n_y = 5                                # number of emoji classes
    # Size of the sentence vector: taken from the embeddings, 50 only as fallback for an empty map.
    first_vec = next(iter(word_to_vec_map.values()), None)
    n_h = 50 if first_vec is None else len(first_vec)

    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
    b = np.zeros((n_y,))

    # One-hot encode the labels; row i is the target distribution of example i.
    Y_oh = np.eye(n_y)[Y.reshape(-1)]

    for t in range(num_iterations):
        for i in range(m):
            avg = sentence_to_avg(X[i],word_to_vec_map)

            # Forward pass through the softmax layer.
            z = W.dot(avg)+b
            a = softmax(z)

            # Cross-entropy of this single example.
            cost = -np.sum(np.multiply(Y_oh[i],np.log(a)))

            # Gradients of the cross-entropy with respect to z, W and b.
            dz = a - Y_oh[i]
            dW = np.dot(dz.reshape(n_y,1), avg.reshape(1, n_h))
            db = dz

            # One gradient step per example.
            W = W - learning_rate * dW
            b = b - learning_rate * db

        if t % 100 == 0:
            # `cost` is the loss of the last example of this epoch, not an epoch average.
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            predict(X, Y, W, b, word_to_vec_map)

    # Compute the returned predictions with the final parameters. Taking them from
    # the last logged epoch would return predictions that do not match W and b,
    # and would leave `pred` unassigned when num_iterations is 0.
    print("Final --- after " + str(num_iterations) + " epochs")
    pred = predict(X, Y, W, b, word_to_vec_map)

    return pred, W, b

<div class="alert alert-block alert-info">
<b>Training a model:</b> Here we train the model defined above to get the results needed.
</div>

In [13]:
# Fit the softmax layer on the training sentences for 1000 passes over the data.
pred, W, b = model(
    X=X_train,
    Y=Y_train,
    word_to_vec_map=word2vec,
    num_iterations=1000,
)

Epoch: 0 --- cost = 1.9520498647328826
Accuracy: 0.3484848484848485
Epoch: 100 --- cost = 0.07971818659533625
Accuracy: 0.9318181818181818
Epoch: 200 --- cost = 0.0445636922039511
Accuracy: 0.9545454545454546
Epoch: 300 --- cost = 0.034322673638104054
Accuracy: 0.9696969696969697
Epoch: 400 --- cost = 0.02906976766810495
Accuracy: 0.9772727272727273
Epoch: 500 --- cost = 0.02566924985114963
Accuracy: 0.9772727272727273
Epoch: 600 --- cost = 0.023182173795051348
Accuracy: 0.9772727272727273
Epoch: 700 --- cost = 0.021228282746351104
Accuracy: 0.9848484848484849
Epoch: 800 --- cost = 0.019625903386799655
Accuracy: 0.9848484848484849
Epoch: 900 --- cost = 0.01827744285213552
Accuracy: 0.9848484848484849


In [14]:
# Score the trained parameters on both splits; predict() prints the accuracy itself.
print("Training set:")
pred_train = predict(X=X_train, Y=Y_train, W=W, b=b, word_to_vec_map=word2vec)

print('Test set:')
pred_test = predict(X=X_test, Y=Y_test, W=W, b=b, word_to_vec_map=word2vec)

Training set:
Accuracy: 0.9848484848484849
Test set:
Accuracy: 0.9107142857142857


<div class="alert alert-block alert-info">
<b>Actual Implementation with Sentences:</b> Here we feed the model a few sentences of our own, together with the emojis we expect for them, and check that its predictions match.
</div>

In [15]:
# Hand-written (sentence, expected emoji key) pairs used as a small sanity check.
my_examples = [
    ("i adore you", 0),
    ("i love you", 0),
    ("funny lol", 2),
    ("lets play with a ball", 1),
    ("Nice shot played", 1),
    ("food is ready", 4),
    ("not feeling happy", 3),
]
X_my_sentences = np.array([sentence for sentence, _ in my_examples])
# Labels are stored as a column, one row per sentence.
Y_my_labels = np.array([[label] for _, label in my_examples])

pred = predict(X_my_sentences, Y_my_labels , W, b, word2vec)

Accuracy: 1.0


In [16]:
# Pair every sentence with its predicted class index and render the matching emoji.
for sentence, predicted_label in zip(X_my_sentences, pred[:, 0]):
    print(sentence, f_access_emoji(int(predicted_label)))

i adore you ❤️
i love you ❤️
funny lol 😄
lets play with a ball ⚾
Nice shot played ⚾
food is ready 🍴
not feeling happy 😞
