In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Dropout, LSTM, GRU
from keras.preprocessing import sequence

In [2]:
# Load a labelled sentence dataset from a CSV file
def read_csv(file_name):
    """Read the ``sentence`` and ``label`` columns of a CSV file.

    Args:
        file_name: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A ``(sentences, labels)`` tuple of NumPy arrays; the labels are cast
        to ``int``.
    """
    table = pd.read_csv(file_name)
    sentences = np.array(table["sentence"])
    labels = np.array(table["label"], dtype=int)  # class ids are integers
    return sentences, labels

In [3]:
# Locations of the two dataset splits on the mounted Google Drive
train_csv = "/content/drive/MyDrive/Dataset/Emoji_Text_Classification/train.csv"
test_csv = "/content/drive/MyDrive/Dataset/Emoji_Text_Classification/test.csv"

# Sentences and integer labels for each split
X_train, Y_train = read_csv(train_csv)
X_test, Y_test = read_csv(test_csv)

In [4]:
# Get max length of sentences, measured in space-separated tokens.
# The maximum is taken over the token count of every training sentence: the
# sentence with the most characters is not necessarily the one with the most
# words, so selecting by character length first could under-estimate it.
max_len = max(len(sentence.split(" ")) for sentence in X_train)
max_len

10

In [5]:
# Map a class id onto the emoji that represents it
def label_to_emoji(label):
    """Return the emoji stored at position ``label`` of the class table.

    Args:
        label: Integer class id used to index the emoji table.

    Returns:
        The emoji string found at that index.
    """
    emoji_by_class = ["❤️", "🏐", "😄", "😞", "🍴"]
    chosen = emoji_by_class[label]
    return chosen

# Show one training sentence next to the emoji of its label
index = 10
sample_sentence, sample_label = X_train[index], Y_train[index]
print(sample_sentence, label_to_emoji(sample_label))

she did not answer my text  😞


In [6]:
# Number of sentences in each class, as a {label: count} mapping
unique, counts = np.unique(Y_train, return_counts=True)
{label_id: n_sentences for label_id, n_sentences in zip(unique, counts)}

{0: 22, 1: 19, 2: 38, 3: 36, 4: 17}

## Emojifier-V1

In [7]:
# One-hot encode the integer labels of both splits.
# The number of classes is the count of distinct labels seen in training.
num_classes = np.unique(Y_train).size

Y_train_oh = tf.keras.utils.to_categorical(Y_train, num_classes=num_classes)
Y_test_oh = tf.keras.utils.to_categorical(Y_test, num_classes=num_classes)

In [8]:
# Compare one integer label with its one-hot encoding
index = 5
raw_label, encoded_label = Y_train[index], Y_train_oh[index]
print(raw_label, "is converted into one hot", encoded_label)

0 is converted into one hot [1. 0. 0. 0. 0.]


In [None]:
# Download feature vectors and extract
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glov.6B

In [10]:
# Read feature vectors and save them
def read_glov_vectors(glove_file):
  """Parse a GloVe-style text file into a word -> vector dictionary.

  Every non-empty line is split on whitespace; the first field is the word and
  the remaining fields are its vector components.

  Args:
    glove_file: Path of the UTF-8 encoded text file.

  Returns:
    Dict mapping each word to a 1-D ``np.float64`` array of its components.
  """
  words_to_vec = dict()
  # The context manager closes the file handle even if parsing fails.
  with open(glove_file, encoding="utf8") as f:
    for line in f:
      fields = line.strip().split()
      if not fields:  # tolerate blank lines instead of raising IndexError
        continue
      word, vec = fields[0], fields[1:]
      words_to_vec[word] = np.array(vec, dtype=np.float64)
  return words_to_vec

In [11]:
# Load the GloVe file into the word -> vector table used by the rest of the notebook
glove_path = "/content/glov.6B/glove.6B.50d.txt"
words_to_vec = read_glov_vectors(glove_path)

# Test the output of read_glov_vectors function
words_to_vec["hello"]

array([-0.38497 ,  0.80092 ,  0.064106, -0.28355 , -0.026759, -0.34532 ,
       -0.64253 , -0.11729 , -0.33257 ,  0.55243 , -0.087813,  0.9035  ,
        0.47102 ,  0.56657 ,  0.6985  , -0.35229 , -0.86542 ,  0.90573 ,
        0.03576 , -0.071705, -0.12327 ,  0.54923 ,  0.47005 ,  0.35572 ,
        1.2611  , -0.67581 , -0.94983 ,  0.68666 ,  0.3871  , -1.3492  ,
        0.63512 ,  0.46416 , -0.48814 ,  0.83827 , -0.9246  , -0.33722 ,
        0.53741 , -1.0616  , -0.081403, -0.67111 ,  0.30923 , -0.3923  ,
       -0.55002 , -0.68827 ,  0.58049 , -0.11626 ,  0.013139, -0.57654 ,
        0.048833,  0.67204 ])

In [12]:
#  Convert sentences to the average of the word vectors
def sentence_to_avg(sentence, word_vectors=None):
  """Average the embedding vectors of the words in ``sentence``.

  Args:
    sentence: Text to embed; it is lower-cased and split on whitespace.
    word_vectors: Optional word -> 1-D vector mapping. When omitted, the
      notebook-level ``words_to_vec`` table is used.

  Returns:
    1-D array holding the mean of the vectors of the words found in the
    table. A zero vector is returned when none of the words is known, which
    includes the empty sentence.

  Raises:
    ValueError: If the embedding table is empty.
  """
  if word_vectors is None:
    word_vectors = words_to_vec
  if not word_vectors:
    raise ValueError("word_vectors must contain at least one entry")
  # Take the dimensionality from the table itself instead of hard-coding 50.
  emb_dim = next(iter(word_vectors.values())).shape[0]

  words = sentence.lower().split() # Convert uppercase to lowercase
  sum_vectors = np.zeros((emb_dim, ))
  known_words = 0
  for w in words:
    # Out-of-vocabulary words are skipped rather than raising KeyError.
    if w in word_vectors:
      sum_vectors += word_vectors[w]
      known_words += 1
  if known_words == 0:
    # Nothing to average: avoid the 0/0 division that would produce NaNs.
    return sum_vectors
  return sum_vectors / known_words

In [13]:
# Test sentence_to_avg function: display the averaged vector of a sample sentence
sentence_to_avg("Pasta is my favorite food")

array([ 0.242832  ,  0.370774  , -0.524396  ,  0.018644  ,  0.568756  ,
        0.0219878 , -0.48206322, -0.152204  ,  0.235412  ,  0.1979466 ,
       -0.178818  ,  0.3203976 ,  0.3379962 ,  0.1399654 ,  0.56775044,
        0.118648  , -0.04531252,  0.335416  ,  0.149832  , -0.522814  ,
        0.095746  , -0.0468764 ,  0.5508066 ,  0.39369132,  0.275182  ,
       -1.275018  , -0.76076   ,  0.449102  ,  0.7542772 , -0.2332608 ,
        2.82554   ,  0.287742  , -0.325976  ,  0.608572  , -0.020543  ,
        0.286476  , -0.24984   ,  0.899408  ,  0.38995   , -0.270266  ,
        0.3004734 ,  0.315962  , -0.2408782 ,  0.1586226 ,  0.5400462 ,
        0.412066  , -0.1657008 , -0.253566  ,  0.3091806 ,  0.371192  ])

In [14]:
# Embed every training sentence as the mean of its word vectors
X_train_avg = np.array([sentence_to_avg(sentence) for sentence in X_train])

# Feature matrix and one-hot label matrix should have the same number of rows
X_train_avg.shape, Y_train_oh.shape

((132, 50), (132, 5))

In [15]:
# Create model(using perceptron)
class EmojiNet_V1(Model):
    """Single dense softmax layer applied to a sentence feature vector.

    The number of outputs is read from the notebook-level ``num_classes``.
    """

    def __init__(self):
        super().__init__()
        # The only layer of the network: one softmax unit per class
        self.dense = Dense(
            units=num_classes,
            activation='softmax',
            input_shape=(50,),
        )

    def call(self, x):
        """Apply the dense softmax layer to ``x`` and return its output."""
        return self.dense(x)

In [None]:
# Build the perceptron, configure its training, then fit it on the averaged sentence vectors
model = EmojiNet_V1()

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.fit(x=X_train_avg, y=Y_train_oh, epochs=400, shuffle=True)

In [17]:
# Evaluation: embed the test sentences the same way as the training ones, then score the model
X_test_avg = np.array([sentence_to_avg(sentence) for sentence in X_test])
model.evaluate(X_test_avg, Y_test_oh)



[0.628527820110321, 0.8392857313156128]

In [18]:
# Inference on a few hand-written sentences
X_me = np.array(["not sad", "i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy and funny"])
Y_me = np.array([[2], [0], [0], [2], [1], [4], [3]])  # presumably the expected labels; not used below

# Average the word vectors of each sentence and ask the model for class scores
X_me_avg = np.array([sentence_to_avg(text) for text in X_me])
pred = model.predict(X_me_avg)

# Print every sentence with the emoji of its highest-scoring class
for text, class_scores in zip(X_me, pred):
    print(text, label_to_emoji(np.argmax(class_scores)))

not sad 😞
i adore you ❤️
i love you ❤️
funny lol 😄
lets play with a ball 🏐
food is ready 🍴
not feeling happy and funny 😄


## Emojifier-V2: Using RNNs

In [19]:
# Define model
class EmojiNet_V2(Model):
    """Two LSTM layers, each followed by dropout, feeding a softmax classifier.

    The number of outputs is read from the notebook-level ``num_classes``.
    """

    def __init__(self):
        super().__init__()

        # First recurrent layer returns its output at every time step so the
        # second recurrent layer receives a sequence.
        self.lstm_1 = LSTM(units=2048, return_sequences=True)
        self.dropout_1 = Dropout(rate=0.3)
        # Second recurrent layer returns only its final output.
        self.lstm_3 = LSTM(units=4096)
        self.dropout_2 = Dropout(rate=0.3)
        self.dense = Dense(units=num_classes, activation='softmax')

    def call(self, x):
        """Run ``x`` through both LSTM/dropout stages and the output layer."""
        per_step = self.dropout_1(self.lstm_1(x))
        summary = self.dropout_2(self.lstm_3(per_step))
        return self.dense(summary)

In [20]:
# Instantiate the recurrent model and configure its training
model = EmojiNet_V2()

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# Fix the size of all sentences to max_len
def convert_sentences_to_embeddings(X, word_vectors=None, seq_len=None):
    """Turn sentences into a zero-padded tensor of word embeddings.

    Args:
        X: Sequence (NumPy array or list) of sentence strings. Each sentence
            is lower-cased and split on whitespace.
        word_vectors: Optional word -> 1-D vector mapping. When omitted, the
            notebook-level ``words_to_vec`` table is used.
        seq_len: Number of time steps reserved per sentence. When omitted,
            the notebook-level ``max_len`` is used.

    Returns:
        Array of shape ``(len(X), seq_len, emb_dim)``. Row ``[i, j]`` holds
        the vector of the j-th word of sentence ``i``; positions past the end
        of a sentence and positions of unknown words stay at zero. Words
        beyond ``seq_len`` are dropped.

    Raises:
        ValueError: If the embedding table is empty.
    """
    if word_vectors is None:
        word_vectors = words_to_vec
    if seq_len is None:
        seq_len = max_len
    if not word_vectors:
        raise ValueError("word_vectors must contain at least one entry")
    # Read the dimensionality from the table itself rather than from a magic word.
    emb_dim = next(iter(word_vectors.values())).shape[0]

    emb_matrix = np.zeros((len(X), seq_len, emb_dim))
    for i, sentence in enumerate(X):
        # Slicing truncates sentences longer than seq_len, which would
        # otherwise index past the end of the time axis.
        for j, word in enumerate(sentence.lower().split()[:seq_len]):
            vec = word_vectors.get(word)
            if vec is not None:  # unknown words keep their all-zero row
                emb_matrix[i, j, :] = vec
    return emb_matrix

In [None]:
# Test convert_sentences_to_embeddings function on a few short sentences:
# print the sentences first, then their padded embedding tensor
X_me = np.array(["funny lol", "lets play baseball", "food is ready for you"])
print(X_me, convert_sentences_to_embeddings(X_me), sep="\n")

In [23]:
# Embed the whole training set as padded word-vector sequences and check the resulting shape
X_train_embs = convert_sentences_to_embeddings(X_train)
X_train_embs.shape

(132, 10, 50)

In [None]:
# Train the recurrent model on the padded embedding sequences
model.fit(x=X_train_embs, y=Y_train_oh, batch_size=4, epochs=200, shuffle=True)

In [25]:
# Evaluation: embed the test sentences, report the tensor shape, then score the model
X_test_embs = convert_sentences_to_embeddings(X_test)
print(X_test_embs.shape)
model.evaluate(x=X_test_embs, y=Y_test_oh)

(56, 10, 50)


[1.1150354146957397, 0.8214285969734192]

In [26]:
# Inference on a few hand-written sentences with the recurrent model
X_me = np.array(["not happy", "i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy", "not feeling happy and funny"])
X_me_embed = convert_sentences_to_embeddings(X_me)

pred = model.predict(X_me_embed)

# Print every sentence with the emoji of its highest-scoring class
for text, class_scores in zip(X_me, pred):
    print(text, label_to_emoji(np.argmax(class_scores)))

not happy 😞
i adore you ❤️
i love you ❤️
funny lol 😄
lets play with a ball 🏐
food is ready 🍴
not feeling happy 😞
not feeling happy and funny 😄
