In [1]:
import pandas as pd
from tqdm import tqdm


import numpy as np

In [2]:
df = pd.read_csv('../Data/text_emotion.csv')

## Classes

In [3]:
classes = sorted(list(set(df['sentiment'])))
no_classes = len(classes)

## Pre-processing

In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import re

stop = stopwords.words('english')
#df['text'] = df['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [6]:
def clean_text(data):
    data=re.sub(r"(#[\d\w\.]+)", '', data)
    data = data.lower()
    data=word_tokenize(data)
    data = ' '.join(data)
    return data

In [7]:
df['text'] = df['content'].apply(clean_text)

In [8]:
class_mapping = {}
for idx, name in enumerate(classes):
    class_mapping[name] = idx 

In [9]:
_ = df.pop('tweet_id')
_ = df.pop('author')

In [10]:
df['label'] = df['sentiment'].apply(lambda x : class_mapping[x])

In [11]:
class_weight = {}
for i in range(no_classes):
    num = len(df[df['label'] == i])
    class_weight[i] = num / len(df) 

### Train - Test Split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
def build_dataset(df, feature='text', target='label'):
    dataset = (
                tf.data.Dataset.from_tensor_slices(
                    (
                        tf.cast(df[feature].values, tf.string),
                        tf.cast(df[target].values, tf.int32)
                    )
                )
            )
    return dataset

In [14]:
df_train, df_test = train_test_split(df, test_size = 0.2)
df_train, df_val = train_test_split(df_train, test_size = 0.2)

train_dataset = build_dataset(df_train)
val_dataset = build_dataset(df_val)
test_dataset = build_dataset(df_test)

In [15]:
BUFFER_SIZE = 10000
BATCH_SIZE = 128

In [16]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [17]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

In [18]:
model = tf.keras.Sequential([
    encoder,
    
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()),
                              output_dim=64, mask_zero=True),
    
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences = True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    
    tf.keras.layers.Dense(64, activation='relu'),
    #tf.keras.layers.Dropout(0.5), 
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(no_classes)
])

In [19]:
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [20]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                                  verbose=1,patience=10,
                                                  mode='max', restore_best_weights=True)

In [21]:
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [22]:
history = model.fit(train_dataset, epochs = 50,
                    validation_data=val_dataset,
                    validation_steps=30,
                    class_weight = class_weight)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
 32/200 [===>..........................] - ETA: 21s - loss: 0.1963 - accuracy: 0.3914

KeyboardInterrupt: 

In [23]:
import matplotlib.pyplot as plt

plt.figure(1)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='val')
plt.legend()
plt.show()

plt.figure(2)
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='val')
plt.legend()
plt.show()

NameError: name 'history' is not defined

<Figure size 432x288 with 0 Axes>

## Evaluate it against test

In [24]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [25]:
model.evaluate(test_dataset)



[2.167066812515259, 0.3319999873638153]

In [26]:
pred = model.predict(test_dataset)
y_test = list(df_test.label)

In [27]:
import numpy as np

pred_idx = []
target_idx = []

for i in tqdm(range(len(pred))):
    pred_idx.append(np.argmax(pred[i]))
    target_idx.append(y_test[i])

100%|██████████████████████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 296315.16it/s]


In [28]:
print(classification_report(target_idx, pred_idx, target_names=classes))

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        21
     boredom       0.00      0.00      0.00        36
       empty       0.00      0.00      0.00       165
  enthusiasm       0.00      0.00      0.00       141
         fun       0.00      0.00      0.00       381
   happiness       0.28      0.43      0.34      1041
        hate       0.00      0.00      0.00       260
        love       0.47      0.27      0.35       765
     neutral       0.36      0.56      0.44      1739
      relief       0.00      0.00      0.00       330
     sadness       0.00      0.00      0.00      1035
    surprise       0.00      0.00      0.00       454
       worry       0.31      0.63      0.42      1632

    accuracy                           0.33      8000
   macro avg       0.11      0.15      0.12      8000
weighted avg       0.22      0.33      0.26      8000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
print(confusion_matrix(target_idx, pred_idx))

[[   0    0    0    0    0    2    0    0    7    0    0    0   12]
 [   0    0    0    0    0    0    0    0   11    0    0    0   25]
 [   0    0    0    0    0   14    0    1   90    0    0    0   60]
 [   0    0    0    0    0   39    0    3   61    0    0    0   38]
 [   0    0    0    0    0  129    0   24  124    0    0    0  104]
 [   0    0    0    0    0  448    0   79  336    0    0    0  178]
 [   0    0    0    0    0   18    0    4   56    0    0    0  182]
 [   0    0    0    0    0  290    0  208  133    0    0    0  134]
 [   0    0    0    0    0  229    0   34  976    0    0    0  500]
 [   0    0    0    0    0   89    0    9  130    0    0    0  102]
 [   0    0    0    0    0   71    0   19  213    0    0    0  732]
 [   0    0    0    0    0  102    0   22  160    0    0    0  170]
 [   0    0    0    0    0  149    0   35  424    0    0    0 1024]]
