In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
df = pd.read_csv('../Data/text_emotion.csv')

## Different Sentiments

In [3]:
classes = set(df['sentiment'])
no_classes = len(classes)

# Imports

In [4]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Embedding,Bidirectional,LSTM,GRU,Dense

from nltk.tokenize import word_tokenize

import re
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from nltk.corpus import stopwords

## Pre-processing

In [5]:
stop = stopwords.words('english')
df['content'] = df['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [6]:
def clean_text(data):
    data=re.sub(r"(#[\d\w\.]+)", '', data)
    data = data.lower()
    data=word_tokenize(data)
    return data

In [7]:
df['cleaned_text'] = df['content'].apply(clean_text)

In [8]:
df.head()

Unnamed: 0,tweet_id,sentiment,author,content,cleaned_text
0,1956967341,empty,xoshayzers,@tiffanylue know listenin bad habit earlier st...,"[@, tiffanylue, know, listenin, bad, habit, ea..."
1,1956967666,sadness,wannamama,Layin n bed headache ughhhh...waitin call...,"[layin, n, bed, headache, ughhhh, ..., waitin,..."
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,"[funeral, ceremony, ..., gloomy, friday, ...]"
3,1956967789,enthusiasm,czareaquino,wants hang friends SOON!,"[wants, hang, friends, soon, !]"
4,1956968416,neutral,xkilljoyx,@dannycastillo We want trade someone Houston t...,"[@, dannycastillo, we, want, trade, someone, h..."


In [9]:
texts = [' '.join(i) for i in df['cleaned_text']]

In [10]:
df_train, df_test = train_test_split(df, test_size=0.3)
df_train, df_val = train_test_split(df_train, test_size=0.3)

In [11]:
texts_train= [' '.join(i) for i in df_train['cleaned_text']]
texts_val = [' '.join(i) for i in df_val['cleaned_text']]
texts_test = [' '.join(i) for i in df_test['cleaned_text']]

In [12]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(texts)

sequence_train = tokenizer.texts_to_sequences(texts_train)
sequence_val   = tokenizer.texts_to_sequences(texts_val)
sequence_test  = tokenizer.texts_to_sequences(texts_test)


index_of_words=tokenizer.word_index
vocab_size=len(index_of_words)+1

In [13]:
max_len = max([len(i) for i in texts])
embed_num_dims = 300
class_names = list(classes)

In [14]:
class_mapping = {}
for idx, name in enumerate(class_names):
    class_mapping[name] = idx 

In [15]:
X_train_paded = pad_sequences(sequence_train, maxlen = max_len)
X_val_paded = pad_sequences(sequence_val, maxlen = max_len)
X_test_paded = pad_sequences(sequence_test, maxlen = max_len)

In [16]:
y_train = [class_mapping[i] for i in df_train.sentiment]
y_val = [class_mapping[i] for i in df_val.sentiment]
y_test = [class_mapping[i] for i in df_test.sentiment]

y_train = to_categorical(y_train)
y_val = to_categorical(y_val)
y_test = to_categorical(y_test)

In [17]:
def create_embedding_matrix(filepath,word_index,embedding_dim):
    vocab_size=len(word_index)+1
    embedding_matrix=np.zeros((vocab_size,embedding_dim))
    with open(filepath, 'rb') as f:
        for line in f:
            word,*vector=line.split()
            if word in word_index:
                idx=word_index[word]
                embedding_matrix[idx] = np.array(vector,dtype=np.float32)[:embedding_dim]
    return embedding_matrix

fname='./embeddings/wiki-news-300d-1M.vec'
embedd_matrix=create_embedding_matrix(fname,index_of_words,embed_num_dims)

In [18]:
embedd_layer=Embedding(vocab_size,embed_num_dims,input_length=max_len,weights=[embedd_matrix],trainable=False)

gru_output_size=128

bidirectional=True

model=Sequential()
model.add(embedd_layer)
model.add(Bidirectional(GRU(units=gru_output_size,dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(128, activation='relu'))
model.add(Dense(no_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])



In [19]:
batch_size = 128
epochs = 2
hist = model.fit(X_train_paded, y_train, batch_size=batch_size,
               epochs=epochs,
               validation_data=(X_val_paded,y_val))

Epoch 1/2
Epoch 2/2


In [20]:
model.evaluate(X_test_paded, y_test)



[2.38437557220459, 0.21199999749660492]

In [21]:
pred = model.predict(X_test_paded)

In [22]:
print('Prediction is complete...')

Prediction is complete...


In [23]:
pred_idx = []
target_idx = []

for i in tqdm(range(len(pred))):
    pred_idx.append(np.argmax(pred[i]))
    target_idx.append(np.argmax(y_test[i]))

100%|████████████████████████████████████████████████████████████████████████| 12000/12000 [00:00<00:00, 173927.45it/s]


In [24]:
print(classification_report(pred_idx, target_idx, target_names=class_names))

              precision    recall  f1-score   support

      relief       0.00      0.00      0.00         0
        hate       0.00      0.00      0.00         0
  enthusiasm       0.00      0.00      0.00         0
    surprise       0.00      0.00      0.00         0
   happiness       0.00      0.00      0.00         0
     sadness       0.00      0.00      0.00         0
        love       0.00      0.00      0.00         0
     neutral       1.00      0.21      0.35     12000
       anger       0.00      0.00      0.00         0
         fun       0.00      0.00      0.00         0
     boredom       0.00      0.00      0.00         0
       empty       0.00      0.00      0.00         0
       worry       0.00      0.00      0.00         0

    accuracy                           0.21     12000
   macro avg       0.08      0.02      0.03     12000
weighted avg       1.00      0.21      0.35     12000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
confusion_matrix(pred_idx, target_idx)

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [ 471,  370,  224,  659, 1539, 1552, 1138, 2544,   32,  557,   69,
         247, 2598],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 