In [98]:
import numpy as np
import pandas as pd
import emoji
from keras.models import Sequential #because we are dealing with sequential datasets
from keras.layers import Dense, LSTM, SimpleRNN, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [99]:
data=pd.read_csv('emoji_data.csv', header=None) #becasue in my file header is not there....so it will start reading from the zeroth line
data.head()

Unnamed: 0,0,1
0,French macaroon is so tasty,4
1,work is horrible,3
2,I am upset,3
3,throw the ball,1
4,Good joke,2


In [100]:
emoji_dict = {
    0: ":red_heart:",
    1: ":baseball:",
    2: ":grinning_face_with_big_eyes:",
    3: ":disappointed_face:",
    4: ":fork_and_knife_with_plate:",
 
}

def label_to_emoji(label):
    return emoji.emojize(emoji_dict[label])

for key, value in emoji_dict.items():
    print(key, label_to_emoji(key))


0 ❤️
1 ⚾
2 😃
3 😞
4 🍽️


In [101]:
X=data[0].values # storing the sentences
Y=data[1].values #expected emoji label
print(len(X))
print(len(Y))
Ytrain = to_categorical(Y)  #here we are doing on hot encoding of labels
print("Categorical Ytrain:", Ytrain)

183
183
Categorical Ytrain: [[0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]


In [102]:
# Embeddings
file=open('glove.6B.100d.txt', 'r', encoding='utf8')
content=file.readlines()
file.close()   #this is the word embedding file for some words



In [103]:
embeddings={}
for line in content:
    line=line.split() # it will split the line where there is a comma
    embeddings[line[0]]=np.array(line[1:], dtype=float)




In [104]:
def get_maxlen(data):
    maxlen=0
    for i in data:
        maxlen=max(maxlen, len(i))
    return maxlen

    

In [105]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(X)  #here we want to tokenize each sentence placed in X
word2index=tokenizer.word_index #here we are making word 2 index dictionary

print(word2index)

{'i': 1, 'you': 2, 'is': 3, 'the': 4, 'a': 5, 'so': 6, 'am': 7, 'my': 8, 'to': 9, 'this': 10, 'are': 11, 'ha': 12, 'for': 13, 'she': 14, 'he': 15, 'me': 16, 'not': 17, 'love': 18, 'your': 19, 'want': 20, 'have': 21, 'it': 22, 'got': 23, 'like': 24, 'did': 25, 'baseball': 26, 'food': 27, 'was': 28, 'do': 29, 'joke': 30, 'stop': 31, 'will': 32, 'miss': 33, 'life': 34, 'ball': 35, 'good': 36, 'what': 37, 'go': 38, 'job': 39, 'funny': 40, 'bad': 41, 'day': 42, 'great': 43, 'dinner': 44, 'that': 45, 'with': 46, 'at': 47, 'of': 48, 'game': 49, 'we': 50, 'again': 51, 'said': 52, 'yes': 53, 'lol': 54, 'and': 55, 'down': 56, 'had': 57, 'her': 58, 'fun': 59, 'smile': 60, 'lot': 61, 'working': 62, 'him': 63, 'cute': 64, 'on': 65, 'lets': 66, 'messing': 67, 'us': 68, 'play': 69, 'exercise': 70, 'lost': 71, 'never': 72, 'where': 73, 'can': 74, 'well': 75, 'much': 76, 'valentine': 77, 'restaurant': 78, 'awesome': 79, 'likes': 80, 'such': 81, 'shouting': 82, 'proud': 83, 'bravo': 84, 'two': 85, 'fore

In [106]:
Xtokens=tokenizer.texts_to_sequences(X)  #in this we are giving numbers to each word in the list of sentences
maxlen=get_maxlen(Xtokens)
# print(maxlen)
Xtrain=pad_sequences(Xtokens, maxlen=maxlen, padding='post', truncating='post')
print(Xtokens)




[[103, 104, 3, 6, 105], [106, 3, 107], [1, 7, 108], [109, 4, 35], [36, 30], [37, 3, 19, 110, 26, 49], [1, 111, 112], [31, 67, 113], [1, 20, 114, 27], [115, 68, 38, 69, 26], [2, 11, 116, 10, 70], [117, 50, 71, 51], [36, 39], [12, 12, 12, 22, 28, 6, 40], [1, 32, 21, 5, 118, 119], [120, 11, 2, 121, 41], [1, 20, 9, 30], [1, 72, 52, 53, 13, 10], [4, 122, 3, 123], [73, 3, 4, 35], [1, 7, 124], [12, 12, 12, 54], [14, 52, 53], [15, 23, 5, 125], [126, 3, 127, 1, 21], [15, 74, 128, 129, 75], [1, 18, 9, 4, 130, 55, 131], [29, 2, 24, 132], [2, 133, 134, 10, 135], [1, 33, 2, 6, 76], [1, 24, 19, 136], [14, 23, 16, 5, 137], [32, 2, 138, 8, 77], [2, 139, 4, 140], [141, 3, 56, 13, 5, 78], [77, 42, 3, 142], [43, 6, 79], [29, 2, 21, 5, 35], [15, 74, 17, 29, 143], [15, 80, 26], [50, 57, 81, 5, 144, 44, 145], [146, 11, 147], [15, 3, 5, 36, 148], [72, 149, 9, 16, 51], [1, 33, 58], [27, 3, 34], [1, 7, 150, 59], [6, 41, 45, 2, 151, 152, 46, 68], [29, 2, 20, 9, 153, 16, 13, 44], [1, 24, 9, 60], [15, 25, 154, 15

In [107]:
 # Model

embed_size = 100
embedding_matrix=np.zeros((len(word2index)+1, embed_size))  
# we did len(word2index)+1 because their is no word with index 0....so we are intitalizing with a zero

for word, i in word2index.items():
    embed_vector=embeddings[word]
    embedding_matrix[i]=embed_vector

embedding_matrix

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [-0.046539,  0.61966 ,  0.56647 , ..., -0.37616 , -0.032502,
         0.8062  ],
       [-0.49886 ,  0.76602 ,  0.89751 , ..., -0.41179 ,  0.40539 ,
         0.78504 ],
       ...,
       [-0.46263 ,  0.069864,  0.69095 , ..., -0.29174 ,  0.32041 ,
         0.21202 ],
       [ 0.073242,  0.11134 ,  0.62281 , ...,  0.53417 , -0.1646  ,
        -0.27516 ],
       [ 0.29019 ,  0.80497 ,  0.31187 , ..., -0.33603 ,  0.45998 ,
        -0.11278 ]])

In [108]:
# this is the main part
model=Sequential([
    Embedding(input_dim=len(word2index)+1,output_dim= embed_size, input_length=maxlen, weights=[embedding_matrix], trainable=False),#embeddings are pre trained so they remain fixed
    LSTM(units=16, return_sequences=True), #this ensures that lstm layer returns the full output (its many to many)
    LSTM(units=4),
    Dense(5, activation='softmax') #fully connected layer.....here 5 is number of classes we want to classify
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [109]:
model.fit(Xtrain, Ytrain, epochs=300)

Epoch 1/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.1807 - loss: 1.6108
Epoch 2/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3059 - loss: 1.5959 
Epoch 3/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.3191 - loss: 1.5792 
Epoch 4/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3391 - loss: 1.5672 
Epoch 5/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3014 - loss: 1.5587 
Epoch 6/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3452 - loss: 1.5393 
Epoch 7/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3874 - loss: 1.5260 
Epoch 8/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3740 - loss: 1.5043 
Epoch 9/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x2707f280410>

In [112]:
test = ["I'm very excited", "I feel very lonely", "lets eat lunch"]
test_seq=tokenizer.texts_to_sequences(test)
Xtest=pad_sequences(test_seq, maxlen=maxlen, padding='post', truncating='post')

y_pred=model.predict(Xtest)

for i in range(len(test)):
    print(test[i], label_to_emoji(np.argmax(y_pred[i])))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step
I'm very excited 😃
I feel very lonely 😞
lets eat lunch 🍽️
