In [1]:
import re

import io
from tqdm import tqdm

import pandas as pd
import numpy as np

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

In [2]:
from tensorflow.keras.layers import TextVectorization

In [3]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in tqdm(fin):
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
    return data

In [4]:
def load_glove(fname):
    embeddings_index = {}
    with open(fname, encoding="utf8") as f:
        for line in tqdm(f):
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings_index[word] = coefs
    
    return embeddings_index

In [5]:
parameters = {}
parameters['embedding_name'] = './embeddings/glove.6B.100d.txt'

In [6]:
embedding_index = load_glove(parameters['embedding_name'])
print('Number of word vectors : ', len(embedding_index))

400000it [00:35, 11160.81it/s]

Number of word vectors :  400000





## Reading and Pre-processing

In [33]:
df = pd.read_csv('../Data/text_emotion.csv')

In [34]:
def clean_text(data):
    data=re.sub(r"(#[\d\w\.]+)", '', data)
    data = data.lower()
    return data

In [35]:
df['text'] = df['content'].apply(clean_text)

In [36]:
classes = sorted(list(set(df['sentiment'])))
no_classes = len(classes)

class_mapping = {}
for idx, name in enumerate(classes):
    class_mapping[name] = idx 

In [37]:
_ = df.pop('tweet_id')
_ = df.pop('author')

In [38]:
df['label'] = df['sentiment'].apply(lambda x : class_mapping[x])

In [39]:
def build_dataset(df, feature='text', target='label'):
    dataset = (
                tf.data.Dataset.from_tensor_slices(
                    (
                        tf.cast(df[feature].values, tf.string),
                        tf.cast(df[target].values, tf.int32)
                    )
                )
            )
    return dataset

In [40]:
df_train, df_test = train_test_split(df, test_size = 0.2)
df_train, df_val = train_test_split(df_train, test_size = 0.2)

train_dataset = build_dataset(df_train)
val_dataset = build_dataset(df_val)
test_dataset = build_dataset(df_test)

In [41]:
BUFFER_SIZE = 10000
BATCH_SIZE = 128

In [42]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [43]:
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
vectorizer.adapt(train_dataset.map(lambda text, label: text))

In [44]:
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'i', 'to', 'the']

In [45]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [46]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 11940 words (8060 misses)


In [47]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [49]:
x_train = vectorizer(np.array([[s] for s in list(df_train.text)])).numpy()
x_val = vectorizer(np.array([[s] for s in list(df_val.text)])).numpy()

y_train = np.array(list(df_train.label))
y_val = np.array(list(df_val.label))

In [65]:
from tensorflow.keras import layers

int_sequences_input = tf.keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
embedded_sequences.trainable = False 
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(no_classes, activation="softmax")(x)
model = tf.keras.Model(int_sequences_input, preds)
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 100)         2000200   
_________________________________________________________________
conv1d_9 (Conv1D)            (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, None, 128)         8204

In [66]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["acc"])
model.fit(x_train, y_train, batch_size=128, epochs=5, validation_data=(x_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2579b84af10>

In [67]:
x_test = vectorizer(np.array([[s] for s in list(df_test.text)])).numpy()
y_test = np.array(list(df_test.label))

In [68]:
model.evaluate(x_test, y_test)



[1.977430820465088, 0.3296250104904175]

In [69]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [70]:
pred = model.predict(x_test)

pred_idx = []

for i in tqdm(range(len(pred))):
    pred_idx.append(np.argmax(pred[i]))

100%|██████████████████████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 176921.66it/s]


In [71]:
print(classification_report(y_test, pred_idx, target_names=classes))

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        21
     boredom       0.00      0.00      0.00        27
       empty       0.00      0.00      0.00       172
  enthusiasm       0.00      0.00      0.00       143
         fun       0.00      0.00      0.00       377
   happiness       0.31      0.40      0.35      1058
        hate       0.00      0.00      0.00       272
        love       0.38      0.41      0.40       726
     neutral       0.38      0.43      0.40      1726
      relief       0.00      0.00      0.00       305
     sadness       0.43      0.09      0.14      1054
    surprise       0.00      0.00      0.00       399
       worry       0.30      0.63      0.40      1720

    accuracy                           0.33      8000
   macro avg       0.14      0.15      0.13      8000
weighted avg       0.28      0.33      0.27      8000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
