In [1]:
# Notebook outline
# 1. How to work with the emoji package
# 2. How to process CSV files / custom datasets (not a built-in Keras dataset)
# 3. How to use transfer learning (pre-trained GloVe word embeddings)
# 4. Build an LSTM model
# 5. Stack LSTM layers
# 6. Predictions (input: a sentence, output: the most suitable emoji)

## Step 1- How to work with emoji package

In [2]:
!pip install emoji



In [2]:
import emoji as emoji

In [3]:
#emoji.EMOJI_UNICODE

In [4]:
# Maps each class label (as a string) to the emoji shown for that class.
# Entries are either a raw unicode sequence or an alias understood by emoji.emojize.
emoji_dictionary = {
    str(label): symbol
    for label, symbol in enumerate((
        "\u2764\uFE0F",  # raw code points: the :heart: alias can print a black instead of a red heart depending on the font
        ":baseball:",
        ":beaming_face_with_smiling_eyes:",
        ":downcast_face_with_sweat:",
        ":fork_and_knife:",
    ))
}

In [5]:
# Quick check: emojize turns a ":alias:" string into the actual emoji character
emoji.emojize(":fork_and_knife:")

'🍴'

In [6]:
# Render every class emoji once, in label order, as a visual sanity check
for emoji_code in emoji_dictionary.values():
    rendered = emoji.emojize(emoji_code)
    print(rendered)

❤️
⚾
😁
😓
🍴


In [7]:
# We use these 5 emojis as the output of the sentiment classifier (5 output classes)

## Step 2: Processing a custom dataset

In [8]:
import numpy as np
import pandas as pd

In [9]:
# The CSV files are read without a header row, so columns get integer labels.
# Later cells use column 0 as the sentence and column 1 as the class label.
train = pd.read_csv('dataset/train_emoji.csv',header=None)
test = pd.read_csv('dataset/test_emoji.csv',header=None)

In [79]:
train.head()  # preview the first rows; the last two columns are not used in this notebook

Unnamed: 0,0,1,2,3
0,never talk to me again,3,,
1,I am proud of your achievements,2,,
2,It is the worst day in my life,3,,
3,Miss you so much,0,,[0]
4,food is life,4,,


In [80]:
#print(train[0])

In [81]:
# Show the first 10 training sentences next to the emoji of their class label
data = train.values
for i in range(10):
    sentence, label = data[i][0], data[i][1]
    label_emoji = emoji.emojize(emoji_dictionary[str(label)])
    print(sentence, label_emoji)

never talk to me again 😓
I am proud of your achievements 😁
It is the worst day in my life 😓
Miss you so much ❤️
food is life 🍴
I love you mum ❤️
Stop saying bullshit 😓
congratulations on your acceptance 😁
The assignment is too long  😓
I want to go play ⚾


In [82]:
#convert sentences to embeddings and emojis to one-hot vectors

In [115]:
# Load the pre-trained GloVe vectors from glove.6B.50d.txt into a
# word -> float32 vector dictionary used by getOutputEmbeddings.
embeddings = {}
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        # each line holds the token followed by its coefficients
        word = values[0]
        coeffs = np.asarray(values[1:], dtype='float32')
        embeddings[word] = coeffs
# no explicit f.close(): the with-statement closes the file on exit
print(len(embeddings))

400000


## Step 4 - Converting sentences to vectors (creating the output of the embedding layer)

In [84]:
def getOutputEmbeddings(X, embedding_index=None, max_len=10, embedding_dim=50):
    """Convert a collection of sentences into a zero-padded tensor of word vectors.

    Parameters
    ----------
    X : pandas.Series (or any object with ``shape`` and integer get/set)
        Sentences, either raw strings or already-tokenised lists of words.
        Side effect kept from the original implementation: every string
        entry is replaced in place by its list of tokens, because the
        prediction-display cell joins ``Xt[i]`` back into a sentence.
        Entries that are already token lists are accepted, so the cell
        can be re-run safely.
    embedding_index : mapping, optional
        Word -> vector lookup. Defaults to the module-level ``embeddings``.
    max_len : int, optional
        Number of word slots per sentence (default 10). Longer sentences
        are truncated instead of raising an IndexError.
    embedding_dim : int, optional
        Length of each word vector (default 50).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(X.shape[0], max_len, embedding_dim)``. Slots for
        padding and for words missing from the lookup stay all-zero.
    """
    if embedding_index is None:
        embedding_index = embeddings

    n_sentences = X.shape[0]
    # axes: (sentence, word position, embedding dimension)
    embedding_matrix_output = np.zeros((n_sentences, max_len, embedding_dim))

    for ix in range(n_sentences):
        sentence = X[ix]
        if isinstance(sentence, str):
            tokens = sentence.split()
            X[ix] = tokens  # in-place tokenisation relied on by later cells
        else:
            tokens = list(sentence)

        for jx, token in enumerate(tokens[:max_len]):
            vector = embedding_index.get(token.lower())
            if vector is not None:
                # out-of-vocabulary words keep their zero vector
                embedding_matrix_output[ix][jx] = vector

    return embedding_matrix_output

In [85]:
from keras.utils import to_categorical

In [86]:
# Take explicit copies of the sentence columns: getOutputEmbeddings tokenises
# its input in place, and writing into a column that still belongs to the
# DataFrame triggers pandas' SettingWithCopyWarning.
XT = train[0].copy()
Xt = test[0].copy()

# One-hot encode the class labels (column 1) into 5 classes
YT = to_categorical(train[1], num_classes=5)
Yt = to_categorical(test[1], num_classes=5)

print(XT[0])
print(XT.shape)
print(Xt.shape)

never talk to me again
(132,)
(56,)


In [87]:
# Build the (sentences, 10, 50) embedding tensors for both splits.
# Note: getOutputEmbeddings replaces each sentence in XT / Xt by its list of
# words, which is why XT[0] prints as a list below.
embed_matrix_train = getOutputEmbeddings(XT)
embed_matrix_test = getOutputEmbeddings(Xt)

print(XT[0])

['never', 'talk', 'to', 'me', 'again']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [88]:
# Confirm the tensor shapes of the train and test embeddings
for embedding_tensor in (embed_matrix_train, embed_matrix_test):
    print(embedding_tensor.shape)


(132, 10, 50)
(56, 10, 50)


## Step 5: Define RNN/LSTM model 

In [89]:
from keras.layers import *
from keras.models import Sequential

In [90]:
# Stacked LSTM classifier over (10 words x 50-d embedding) inputs, 5 softmax outputs
model = Sequential()
# first LSTM returns the whole sequence so the second LSTM can consume it
model.add(LSTM(64, input_shape=(10, 50), return_sequences=True))  # hidden state: 64 dim
model.add(Dropout(0.4))
# only the first layer declares the input shape; this layer receives the
# (10, 64) sequence from the previous LSTM and returns its last hidden state
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(5))
model.add(Activation('softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 10, 64)            29440     
_________________________________________________________________
dropout_5 (Dropout)          (None, 10, 64)            0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 325       
_________________________________________________________________
activation_3 (Activation)    (None, 5)                 0         
Total params: 62,789
Trainable params: 62,789
Non-trainable params: 0
__________________________________________________

In [91]:
# TRAIN MODEL

In [92]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Save the model to best_model.h5 whenever the validation loss improves
checkpt = ModelCheckpoint(
    "best_model.h5",
    monitor='val_loss',
    verbose=True,
    save_best_only=True,
)
# Early stopping watches validation accuracy with a patience of 10 epochs
earlystop = EarlyStopping(monitor='val_acc', patience=10)

# 20% of the training data is held out for validation
hist = model.fit(
    embed_matrix_train,
    YT,
    batch_size=32,
    epochs=40,
    shuffle=True,
    validation_split=0.2,
    callbacks=[checkpt, earlystop],
)

Train on 105 samples, validate on 27 samples
Epoch 1/40

Epoch 00001: val_loss improved from inf to 1.61768, saving model to best_model.h5
Epoch 2/40

Epoch 00002: val_loss did not improve from 1.61768
Epoch 3/40

Epoch 00003: val_loss did not improve from 1.61768
Epoch 4/40

Epoch 00004: val_loss did not improve from 1.61768
Epoch 5/40

Epoch 00005: val_loss improved from 1.61768 to 1.60899, saving model to best_model.h5
Epoch 6/40

Epoch 00006: val_loss improved from 1.60899 to 1.54177, saving model to best_model.h5
Epoch 7/40

Epoch 00007: val_loss improved from 1.54177 to 1.46553, saving model to best_model.h5
Epoch 8/40

Epoch 00008: val_loss improved from 1.46553 to 1.36747, saving model to best_model.h5
Epoch 9/40

Epoch 00009: val_loss improved from 1.36747 to 1.32752, saving model to best_model.h5
Epoch 10/40

Epoch 00010: val_loss did not improve from 1.32752
Epoch 11/40

Epoch 00011: val_loss did not improve from 1.32752
Epoch 12/40

Epoch 00012: val_loss improved from 1.327

In [93]:
# Sequential.predict_classes has been removed from recent Keras/TensorFlow
# releases; the argmax over the softmax outputs gives the same class indices.
pred = np.argmax(model.predict(embed_matrix_test), axis=-1)

print(pred)

[4 3 2 2 2 2 1 2 4 2 1 2 0 2 1 3 2 2 3 2 0 0 4 2 3 1 2 0 1 2 0 1 0 2 0 1 2
 3 4 2 1 0 0 1 2 2 2 2 0 1 1 0 3 2 3 0]


In [94]:
# Evaluate on the test embeddings; returns the loss followed by the 'acc' metric set in compile
model.evaluate(embed_matrix_test,Yt)



[1.7502823386873518, 0.5714285969734192]

In [95]:
# For up to 30 test sentences print: the sentence, the true emoji, the predicted emoji
for i in range(min(30, len(pred))):
    sentence = Xt[i]
    # Xt holds token lists once getOutputEmbeddings has tokenised it in place;
    # a raw string is printed as is instead of being joined character by character
    print(sentence if isinstance(sentence, str) else ' '.join(sentence))
    true_label = str(np.argmax(Yt[i]))
    predicted_label = str(pred[i])
    print(emoji.emojize(emoji_dictionary[true_label]))
    print(emoji.emojize(emoji_dictionary[predicted_label]))

I want to eat
🍴
🍴
he did not answer
😓
😓
he got a raise
😁
😁
she got me a present
❤️
😁
ha ha ha it was so funny
😁
😁
he is a good friend
❤️
😁
I am upset
❤️
⚾
We had such a lovely dinner tonight
❤️
😁
where is the food
🍴
🍴
Stop making this joke ha ha ha
😁
😁
where is the ball
⚾
⚾
work is hard
😓
😁
This girl is messing with me
😓
❤️
are you serious ha ha
😁
😁
Let us go play baseball
⚾
⚾
This stupid grader is not working
😓
😓
work is horrible
😓
😁
Congratulation for having a baby
😁
😁
stop messing around
😓
😓
any suggestions for dinner
🍴
😁
I love taking breaks
❤️
❤️
you brighten my day
😁
❤️
I boiled rice
🍴
🍴
she is a bully
😓
😁
Why are you feeling bad
😓
😓
I am upset
😓
⚾
I worked during my birthday
😓
😁
My grandmother is the love of my life
❤️
❤️
enjoy your break
😁
⚾
valentine day is near
❤️
😁


## Django application

In [96]:
# Persist the architecture as JSON and the weights as a separate HDF5 file
with open("model.json", "w") as json_out:
    json_out.write(model.to_json())

model.save_weights("model.h5")

In [97]:
from keras.models import model_from_json

In [98]:
# Rebuild the model from the saved JSON architecture, then restore its weights
with open("model.json", "r") as json_in:
    architecture_json = json_in.read()

model = model_from_json(architecture_json)
model.load_weights("model.h5")

In [99]:
# Sample sentence for a single prediction, also wrapped in a one-element Series
test_str = "hello how are you"
X = pd.Series([test_str])
print(type(X))

<class 'pandas.core.series.Series'>


In [119]:
def getOutputEmbeddings(X, embedding_index=None, max_len=10, embedding_dim=50):
    """Convert one sentence into a zero-padded tensor of word vectors.

    Parameters
    ----------
    X : str
        The sentence; it is split on whitespace and each word is lower-cased
        before the lookup.
    embedding_index : mapping, optional
        Word -> vector lookup. Defaults to the module-level ``embeddings``.
    max_len : int, optional
        Number of word slots (default 10). Extra words are dropped instead
        of raising an IndexError.
    embedding_dim : int, optional
        Length of each word vector (default 50).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, max_len, embedding_dim)``, i.e. a batch holding
        this single sentence. Slots for padding and for words missing from
        the lookup stay all-zero.
    """
    if embedding_index is None:
        embedding_index = embeddings

    tokens = X.split()
    embedding_matrix_output = np.zeros((1, max_len, embedding_dim))
    for jx, token in enumerate(tokens[:max_len]):
        vector = embedding_index.get(token.lower())
        if vector is not None:
            # out-of-vocabulary words keep their zero vector
            embedding_matrix_output[0][jx] = vector

    return embedding_matrix_output

In [120]:
# Embed the sample sentence as a batch of one, the shape the model expects
emb_X = getOutputEmbeddings(test_str)

In [121]:
# Sequential.predict_classes has been removed from recent Keras/TensorFlow
# releases; the argmax over the softmax outputs gives the same class index.
p = np.argmax(model.predict(emb_X), axis=-1)

In [123]:
# Predicted class index for the single input sentence (str(p[0]) is its key in emoji_dictionary)
p[0]

2