In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../Data/text_emotion.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [3]:
print('Number of tweets : ', df.size)

Number of tweets :  160000


## Different Sentiments

In [4]:
classes = set(df['sentiment'])

In [5]:
classes

{'anger',
 'boredom',
 'empty',
 'enthusiasm',
 'fun',
 'happiness',
 'hate',
 'love',
 'neutral',
 'relief',
 'sadness',
 'surprise',
 'worry'}

In [6]:
no_classes = len(classes)

# Imports

In [7]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Embedding,Bidirectional,LSTM,GRU,Dense

from nltk.tokenize import word_tokenize

import re
import numpy as np

from sklearn.model_selection import train_test_split

In [8]:
def clean_text(data):
    data=re.sub(r"(#[\d\w\.]+)", '', data)
    data=re.sub(r"(@[\d\w\.]+)", '', data)
    data=word_tokenize(data)
    return data

In [9]:
df['cleaned_text'] = df['content'].apply(clean_text)

In [10]:
df.head()

Unnamed: 0,tweet_id,sentiment,author,content,cleaned_text
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,"[i, know, i, was, listenin, to, bad, habit, ea..."
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,"[Layin, n, bed, with, a, headache, ughhhh, ......"
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,"[Funeral, ceremony, ..., gloomy, friday, ...]"
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,"[wants, to, hang, out, with, friends, SOON, !]"
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,"[We, want, to, trade, with, someone, who, has,..."


In [11]:
texts = [' '.join(i) for i in df['cleaned_text']]

In [12]:
df_train, df_test = train_test_split(df, test_size=0.3)

In [13]:
texts_train= [' '.join(i) for i in df_train['cleaned_text']]
texts_test = [' '.join(i) for i in df_test['cleaned_text']]

In [14]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(texts)
sequence_train=tokenizer.texts_to_sequences(texts_train)
sequence_test=tokenizer.texts_to_sequences(texts_test)
index_of_words=tokenizer.word_index
vocab_size=len(index_of_words)+1

In [15]:
max_len = max([len(i) for i in texts])
embed_num_dims = 300
class_names = list(classes)

In [16]:
class_mapping = {}
for idx, name in enumerate(class_names):
    class_mapping[name] = idx 

In [17]:
X_train_paded = pad_sequences(sequence_train, maxlen=max_len)
X_test_paded = pad_sequences(sequence_test, maxlen = max_len)

In [18]:
y_train = [class_mapping[i] for i in df_train.sentiment]
y_test = [class_mapping[i] for i in df_test.sentiment]

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [19]:
def create_embedding_matrix(filepath,word_index,embedding_dim):
    vocab_size=len(word_index)+1
    embedding_matrix=np.zeros((vocab_size,embedding_dim))
    with open(filepath, 'rb') as f:
        for line in f:
            word,*vector=line.split()
            if word in word_index:
                idx=word_index[word]
                embedding_matrix[idx] = np.array(vector,dtype=np.float32)[:embedding_dim]
    return embedding_matrix

fname='./embeddings/wiki-news-300d-1M.vec'
embedd_matrix=create_embedding_matrix(fname,index_of_words,embed_num_dims)

In [23]:
embedd_layer=Embedding(vocab_size,embed_num_dims,input_length=max_len,weights=[embedd_matrix],trainable=False)

gru_output_size=128

bidirectional=True

model=Sequential()
model.add(embedd_layer)
model.add(Bidirectional(GRU(units=gru_output_size,dropout=0.2,recurrent_dropout=0.2)))
model.add(Dense(no_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])



In [None]:
batch_size=128
epochs=8
hist=model.fit(X_train_paded, y_train, batch_size=batch_size,
               epochs=epochs,
               validation_data=(X_test_paded,y_test))

Epoch 1/8

In [None]:
message=['I am sad.']
seq=tokenizer.texts_to_sequences(message)
padded=pad_sequences(seq,maxlen=max_seq_len)
pred=model.predict(padded)
print('Message:'+str(message))
print('Emotion:',class_names[np.argmax(pred)])