In [120]:
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.utils import np_utils
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
import emoji
from sklearn.metrics import f1_score


In [99]:
featFile = open('train_tweets_us_all.text', "r", encoding="utf-8");
labelFile = open('train_tweets_us_all.labels', "r", encoding="utf-8");

testFeat = open('us_test.text', "r", encoding="utf-8");
testLabel = open('us_test.labels', "r", encoding="utf-8");

validationFeat = open('us_trial.text', "r", encoding="utf-8");
validationLabel = open('us_trial.labels', "r", encoding="utf-8");

X_train = featFile.readlines();
Y_train = labelFile.readlines();

X_test = testFeat.readlines();
Y_test = testLabel.readlines();

X_valid = validationFeat.readlines();
Y_valid = validationLabel.readlines();

In [100]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
words_to_index = tokenizer.word_index
len(words_to_index)

In [103]:
def read_glove_vector(glove_vec):
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            w_line = line.split()
            curr_word = w_line[0]
            word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)


    return word_to_vec_map
    

In [104]:
word_to_vec_map = read_glove_vector('./glove.6B.50d.txt')

In [105]:
maxLen = 50


In [106]:
vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        emb_matrix[index, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)

In [107]:
def lstm_model(input_shape):

    X_indices = Input(input_shape)

    embeddings = embedding_layer(X_indices)

    X = LSTM(128, return_sequences=True)(embeddings)

    X = Dropout(0.6)(X)

    X = LSTM(128, return_sequences=True)(X)

    X = Dropout(0.6)(X)

    X = LSTM(128)(X)

    X = Dense(20, activation='sigmoid')(X)

    model = Model(inputs=X_indices, outputs=X)

    return model

In [108]:
model = lstm_model((maxLen,))
model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        [(None, 50)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 50)            14871100  
_________________________________________________________________
lstm_8 (LSTM)                (None, 50, 128)           91648     
_________________________________________________________________
dropout_9 (Dropout)          (None, 50, 128)           0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 50, 128)           131584    
_________________________________________________________________
dropout_10 (Dropout)         (None, 50, 128)           0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 128)               1315

In [109]:
X_train_indices = tokenizer.texts_to_sequences(X_train)
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
X_train_indices.shape

(405987, 50)

In [111]:
Y_train_int = np.array(list(map(int, Y_train)))
Y_train_hot = np_utils.to_categorical(Y_train_int)

In [113]:
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

In [115]:
model.fit(X_train_indices, Y_train_hot, batch_size=1024, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x21c04fc7b48>

In [116]:
X_test_indices = tokenizer.texts_to_sequences(X_test)
X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

In [117]:
Y_test_int = np.array(list(map(int, Y_test)))
Y_test_hot = np_utils.to_categorical(Y_test_int)

In [118]:
model.evaluate(X_test_indices, Y_test_hot)



[2.5743672847747803, 0.248539999127388]

In [127]:
# preds = model.predict(X_test_indices);
predicted_classes = np.argmax(preds, axis=1)
f1_score(Y_test_int, predicted_classes, average=None)


array([0.4056255 , 0.        , 0.28767123, 0.        , 0.23489933,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.08041775, 0.01212121, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.01027617, 0.        , 0.        ])

In [124]:
# Creating dictionary for some emoji's, consisting of key - number and value - emoji 
emoji_dict = { 0 : ":red_heart:", 1 : ":heart_eyes:", 2 : ":joy:", 3 : ":two_hearts:", 4 : ":fire:", 5: ":blush:", 6: ":sunglasses:",
             7: ":sparkles:", 8: ":blue_heart:", 9: ":kissing_heart:", 10: ":camera:", 11: ":us:", 12: ":sunny:", 13: ":purple_heart:",
             14: ":wink:", 15: ":100:", 16: ":grin:", 17: ":christmas_tree:", 18: "📸", 19: ":stuck_out_tongue_winking_eye:"}

# Printing each emoji icon by emojizing each emoji
# for ix in emoji_dict.keys():
#     print (ix,end=" ")
#     print (emoji.emojize(emoji_dict[ix], use_aliases=True, variant="emoji_type"))

In [128]:
# Printing the sentences with the predicted and labled emoji
# for ix in range(X_test_indices.shape[0]):
#     if (emoji.emojize(emoji_dict[predicted_classes[ix]], use_aliases=True) == emoji.emojize(emoji_dict[Y_test_int[ix]], use_aliases=True)):
#         print (emoji.emojize(emoji_dict[predicted_classes[ix]], use_aliases=True),end=" ")
#         print (emoji.emojize(emoji_dict[Y_test_int[ix]], use_aliases=True))

In [130]:
model.save_weights('./model_lstm_weights.hdf5')