In [None]:
import pandas as pd
import numpy as np
import re
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import urllib.request
import zipfile
import os
from keras.models import Sequential
from keras.layers import Embedding,Bidirectional,LSTM,GRU,Dense
import nltk
from nltk.tokenize import word_tokenize
import warnings
# Fetch nltk's 'punkt' package (presumably the data word_tokenize needs in clean_text — TODO confirm).
nltk.download('punkt')
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides deprecation notices from pandas/keras.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Emotion labels; position i is the class index used for the labels and for the model's output units.
class_names=['joy','fear','anger','sadness','neutral']
# Derived from the label list so the class count and the labels cannot drift apart.
num_classes=len(class_names)
# Number of components kept from each pretrained word vector.
embed_num_dims=300
# Every tokenised text is padded/truncated to this many word ids.
max_seq_len=500

In [None]:
# Load the pre-split train/test CSV files.
data_train=pd.read_csv('drive/My Drive/data_train.csv',encoding='utf-8')
data_test=pd.read_csv('drive/My Drive/data_test.csv',encoding='utf-8')
# Raw text and string labels of each split.
X_train=data_train['Text']
X_test=data_test['Text']
y_train=data_train['Emotion']
y_test=data_test['Emotion']
# Full corpus: train rows first, then test rows, re-indexed from 0.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data=pd.concat([data_train,data_test],ignore_index=True)

In [None]:
def clean_text(data):
    """Strip hashtags and @-mentions from a string and tokenise what is left.

    Args:
        data: Raw text as a single string.

    Returns:
        Whatever ``word_tokenize`` returns for the stripped text.
    """
    # Hashtags are removed first, then mentions; a tag is the marker character
    # followed by a run of word characters, digits and dots.
    for tag_pattern in (r"(#[\d\w\.]+)", r"(@[\d\w\.]+)"):
        data=re.sub(tag_pattern, '', data)
    return word_tokenize(data)

In [None]:
# Clean each split once, rebuilding every text as a space-joined token string.
texts_train=[' '.join(clean_text(text)) for text in X_train]
texts_test=[' '.join(clean_text(text)) for text in X_test]
# `data` is data_train followed by data_test, so the full corpus is exactly the
# two cleaned splits concatenated; cleaning data.Text again would tokenise
# every text a second time for the same result.
texts=texts_train+texts_test

a bit ? I 'm extremely annoyed that he did n't phone me when he promised me that he would ! He 's such a liar .


In [None]:
# Fit the vocabulary on the full corpus (train and test texts together).
tokenizer=Tokenizer()
tokenizer.fit_on_texts(texts)
# word -> integer id mapping learned by the tokenizer.
index_of_words=tokenizer.word_index
# One more than the number of words (presumably because id 0 is left for padding — TODO confirm).
vocab_size=1+len(index_of_words)
# Convert each split into lists of word ids.
sequence_train,sequence_test=(
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (texts_train,texts_test)
)
print('Number of unique words: {}'.format(len(index_of_words)))

Number of unique words: 12088


In [None]:
# Bring every id sequence of both splits to exactly max_seq_len entries.
X_train_pad,X_test_pad=(
    pad_sequences(sequences,maxlen=max_seq_len)
    for sequences in (sequence_train,sequence_test)
)

array([[    0,     0,     0, ...,   119,    51,   345],
       [    0,     0,     0, ...,    37,   277,   154],
       [    0,     0,     0, ...,    16,     2,  1210],
       ...,
       [    0,     0,     0, ...,   876,     4,   909],
       [    0,     0,     0, ...,     1,     6,   117],
       [    0,     0,     0, ..., 10259,   173,    13]], dtype=int32)

In [None]:
# Label string -> class index.
encoding={'joy':0,'fear':1,'anger':2,'sadness':3,'neutral':4}
# Map the string labels of each split to integer ids...
y_train=[encoding[x] for x in data_train.Emotion]
y_test=[encoding[x] for x in data_test.Emotion]
# ...and one-hot encode them. The width is pinned explicitly: when it is left
# out, to_categorical infers it from the largest label present, so a split
# that lacks the highest-numbered class would come out narrower than the
# model's num_classes outputs.
y_train=to_categorical(y_train,num_classes=num_classes)
y_test=to_categorical(y_test,num_classes=num_classes)

In [None]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a text vector file.

    Each usable line of the file is ``<word> <v1> <v2> ...`` separated by
    whitespace. Row ``word_index[word]`` of the result receives the first
    ``embedding_dim`` values of that word's vector; words that never appear
    in the file keep an all-zero row.

    Args:
        filepath: Path to the UTF-8 encoded vector file.
        word_index: Mapping from word to integer row index.
        embedding_dim: Number of columns of the returned matrix.

    Returns:
        numpy array of shape ``(len(word_index) + 1, embedding_dim)``.

    Raises:
        ValueError: If the vector of a word present in ``word_index`` contains
            a field that cannot be parsed as a float.
    """
    vocab_size=len(word_index)+1
    embedding_matrix=np.zeros((vocab_size,embedding_dim))
    # Be explicit about the encoding instead of relying on the platform
    # default, which is not UTF-8 everywhere.
    with open(filepath,encoding='utf-8') as f:
        for line in f:
            fields=line.split()
            # Skip blank lines and lines carrying fewer than embedding_dim
            # values. This covers the "<count> <dim>" header of .vec files,
            # whose single value would otherwise be broadcast across a whole
            # row whenever the count happens to be a vocabulary word.
            if len(fields)<=embedding_dim:
                continue
            word,vector=fields[0],fields[1:]
            if word in word_index:
                idx=word_index[word]
                embedding_matrix[idx] = np.array(vector,dtype=np.float32)[:embedding_dim]
    return embedding_matrix

In [None]:
fname='embeddings/wiki-news-300d-1M.vec'
# Download and unpack the vectors only when the extracted file is not already on disk.
if not os.path.isfile(fname):
    archive='wiki-news-300d-1M.vec.zip'
    print('Downloading word vectors...')
    urllib.request.urlretrieve(
        'https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip',
        archive,
    )
    print('Unzipping...')
    with zipfile.ZipFile(archive, 'r') as zip_ref:
        zip_ref.extractall('embeddings')
    print('done.')
    # The archive is deleted once its contents have been extracted.
    os.remove(archive)
# One row per tokenizer word id, filled from the pretrained vectors.
embedd_matrix=create_embedding_matrix(fname,index_of_words,embed_num_dims)
# Last expression of the cell: displays the matrix shape.
embedd_matrix.shape

Downloading word vectors...
Unzipping...
done.


(12089, 300)

In [None]:
# Embedding layer initialised from the pretrained matrix and kept frozen
# (trainable=False), so its weights are not updated during training.
embedd_layer=Embedding(
    vocab_size,
    embed_num_dims,
    input_length=max_seq_len,
    weights=[embedd_matrix],
    trainable=False,
)

In [None]:
gru_output_size=128
bidirectional=True
# Recurrent encoder: a GRU with input and recurrent dropout, wrapped in
# Bidirectional when the flag above is set.
recurrent_layer=GRU(units=gru_output_size,dropout=0.2,recurrent_dropout=0.2)
if bidirectional:
    recurrent_layer=Bidirectional(recurrent_layer)
# Stack: frozen embeddings -> recurrent encoder -> softmax over the num_classes emotions.
model=Sequential()
model.add(embedd_layer)
model.add(recurrent_layer)
model.add(Dense(num_classes, activation='softmax'))



In [None]:
# Multi-class setup: categorical cross-entropy against the one-hot labels,
# Adam optimiser, accuracy reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 300)          3626700   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               330240    
_________________________________________________________________
dense (Dense)                (None, 5)                 1285      
Total params: 3,958,225
Trainable params: 331,525
Non-trainable params: 3,626,700
_________________________________________________________________


In [None]:
batch_size=128
epochs=15
# NOTE(review): the test split is passed as validation data, so the validation
# metrics reported per epoch do not come from a separate hold-out set.
model.fit(
    X_train_pad,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test_pad,y_test),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
message=['I am so happy I am going to cry.']
# Run the message through the same cleaning/tokenisation as the training
# texts, so the tokenizer sees it in the form it was fitted on.
cleaned_message=[' '.join(clean_text(text)) for text in message]
seq=tokenizer.texts_to_sequences(cleaned_message)
padded=pad_sequences(seq,maxlen=max_seq_len)
# One row of softmax scores per message, in class_names order.
pred=model.predict(padded)
print('Message:'+str(message))
# Report the highest-scoring class by name, then the raw scores.
print('Emotion:',class_names[np.argmax(pred[0])])
print('Probabilities:',pred)

Message:['I am so happy I am going to cry.']
Emotion: [[9.576078e-01 3.789285e-03 3.220642e-03 3.461444e-02 7.678335e-04]]


In [None]:
# Persist the trained model together with its optimizer state. The model's own
# save() is used because `tf` is never imported in this notebook; the dropped
# keyword arguments were all passed as None.
model.save('drive/My Drive/textmodel1',overwrite=True,include_optimizer=True)

In [None]:
# Reload a saved model through the load_model helper imported at the top of
# the notebook (`tf` itself is never imported here).
# NOTE(review): this reads 'textmodel2' while the save cell writes 'textmodel1' — confirm the intended path.
textmodel2=load_model('drive/My Drive/textmodel2',custom_objects=None,compile=True)