In [2]:
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.utils import np_utils
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
import emoji
from sklearn.metrics import f1_score


In [3]:
featFile = open('train_tweets_us_all.text', "r", encoding="utf-8");
labelFile = open('train_tweets_us_all.labels', "r", encoding="utf-8");

testFeat = open('us_test.text', "r", encoding="utf-8");
testLabel = open('us_test.labels', "r", encoding="utf-8");

validationFeat = open('us_trial.text', "r", encoding="utf-8");
validationLabel = open('us_trial.labels', "r", encoding="utf-8");

X_train = featFile.readlines();
Y_train = labelFile.readlines();

X_test = testFeat.readlines();
Y_test = testLabel.readlines();

X_valid = validationFeat.readlines();
Y_valid = validationLabel.readlines();

In [4]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
words_to_index = tokenizer.word_index
len(words_to_index)

297422

In [5]:
def read_glove_vector(glove_vec):
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            w_line = line.split()
            curr_word = w_line[0]
            word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)


    return word_to_vec_map

In [6]:
word_to_vec_map = read_glove_vector('./glove.6B.50d.txt')

In [7]:
maxLen = 50

In [8]:
vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        emb_matrix[index, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)

In [9]:
def conv1d_model(input_shape):

    X_indices = Input(input_shape)

    embeddings = embedding_layer(X_indices)

    X = Conv1D(512,1,activation='relu')(embeddings)

    X = MaxPooling1D(1)(X)

    X = Conv1D(256,3,activation='relu')(X)

    X = MaxPooling1D(3)(X)

    X = Conv1D(256,3,activation='relu')(X)
    X = Dropout(0.8)(X)
    X = MaxPooling1D(3)(X)

    X = GlobalMaxPooling1D()(X)

    X = Dense(256, activation='relu')(X)
    X = Dense(20, activation='sigmoid')(X)

    model = Model(inputs=X_indices, outputs=X)

    return model

In [10]:
model_1d = conv1d_model((maxLen,))
model_1d.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 50)            14871100  
_________________________________________________________________
conv1d (Conv1D)              (None, 50, 512)           26112     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 50, 512)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 48, 256)           393472    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 16, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 14, 256)           196864

In [11]:
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model_1d.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

In [12]:
X_train_indices = tokenizer.texts_to_sequences(X_train)
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
X_train_indices.shape

(405987, 50)

In [13]:
Y_train_int = np.array(list(map(int, Y_train)))
Y_train_hot = np_utils.to_categorical(Y_train_int)

In [14]:
model_1d.fit(X_train_indices, Y_train_hot, batch_size=1024, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x26290aec6c8>

In [15]:
X_test_indices = tokenizer.texts_to_sequences(X_test)
X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')
X_test_indices.shape

(50000, 50)

In [16]:
Y_test_int = np.array(list(map(int, Y_test)))
Y_test_hot = np_utils.to_categorical(Y_test_int)

In [17]:
model_1d.evaluate(X_test_indices, Y_test_hot)



[2.5649657249450684, 0.2885400056838989]

In [19]:
preds = model_1d.predict(X_test_indices);
predicted_classes = np.argmax(preds, axis=1)
f1_score(Y_test_int, predicted_classes, average=None)

array([0.59345605, 0.21189411, 0.2903927 , 0.14310714, 0.23725313,
       0.01674107, 0.00963391, 0.00217155, 0.        , 0.        ,
       0.17073171, 0.1377513 , 0.31342651, 0.        , 0.        ,
       0.        , 0.        , 0.32211538, 0.00082508, 0.        ])

In [20]:
# Creating dictionary for some emoji's, consisting of key - number and value - emoji 
emoji_dict = { 0 : ":red_heart:", 1 : ":heart_eyes:", 2 : ":joy:", 3 : ":two_hearts:", 4 : ":fire:", 5: ":blush:", 6: ":sunglasses:",
             7: ":sparkles:", 8: ":blue_heart:", 9: ":kissing_heart:", 10: ":camera:", 11: ":us:", 12: ":sunny:", 13: ":purple_heart:",
             14: ":wink:", 15: ":100:", 16: ":grin:", 17: ":christmas_tree:", 18: "📸", 19: ":stuck_out_tongue_winking_eye:"}

# Printing each emoji icon by emojizing each emoji
# for ix in emoji_dict.keys():
#     print (ix,end=" ")
#     print (emoji.emojize(emoji_dict[ix], use_aliases=True, variant="emoji_type"))

In [21]:
# Printing the sentences only with correct predicted emoji
# for ix in range(X_test_indices.shape[0]):
#     if (emoji.emojize(emoji_dict[predicted_classes[ix]], use_aliases=True) == emoji.emojize(emoji_dict[Y_test_int[ix]], use_aliases=True)):
#         print (emoji.emojize(emoji_dict[predicted_classes[ix]], use_aliases=True),end=" ")
#         print (emoji.emojize(emoji_dict[Y_test_int[ix]], use_aliases=True))

In [22]:
model_1d.save_weights('./model_con1vd_weights.hdf5')