In [1]:
# importing necessary libraries
import csv
import tensorflow as tf
import numpy as np
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

print(tf.__version__)

2.8.2


In [2]:
# downloading stopwords for preprocessing
import nltk
nltk.download('stopwords')

STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
vocab_size = 5000
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

In [4]:
# Loading data
df_train = pd.read_csv('/content/train_ekmann.csv')
df_val = pd.read_csv('/content/val_ekmann.csv')
df_test = pd.read_csv('/content/test_ekmann.csv')

num_classes = len(df_train["Emotion"].value_counts())

In [5]:
df_train.head()

Unnamed: 0,Text,Emotion,Id
0,My favourite food is anything I didn't have to...,neutral,eebbqej
1,"Now if he does off himself, everyone will thin...",neutral,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,anger,eezlygj
3,To make her feel threatened,fear,ed7ypvh
4,Dirty Southern Wankers,anger,ed0bdzj


In [6]:
df_train.describe()

Unnamed: 0,Text,Emotion,Id
count,43410,43410,43410
unique,43227,7,43410
top,Thank you.,joy,eebbqej
freq,13,16217,1


In [7]:
# separating train, test and val features and labels
x_train = df_train["Text"] 
y_train = df_train["Emotion"]
x_val = df_val["Text"] 
y_val = df_val["Emotion"]
x_test = df_test["Text"]
y_test = df_test["Emotion"]

In [8]:
# Tokenizing x_train
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

In [9]:
# turning training tokens into lists of sequence.
train_sequences = tokenizer.texts_to_sequences(x_train)

# Padding sequences according to max_length
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [10]:
# tuning validaion tokens into lists of sequence.
validation_sequences = tokenizer.texts_to_sequences(x_val)

# Padding sequences according to max_length
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [11]:
# encoding output
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(y_train)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(y_train))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(y_val))

**The model consists of an embedding layer followed by a bi-directional lstm and followed by two-dense layers.**

In [12]:
# model architecture and summary
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(num_classes + 1, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          320000    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 8)                 520       
                                                                 
Total params: 394,824
Trainable params: 394,824
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 10
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Epoch 1/10
1357/1357 - 292s - loss: 1.2248 - accuracy: 0.5524 - val_loss: 1.0839 - val_accuracy: 0.6121 - 292s/epoch - 215ms/step
Epoch 2/10
1357/1357 - 287s - loss: 0.9887 - accuracy: 0.6427 - val_loss: 1.0369 - val_accuracy: 0.6237 - 287s/epoch - 211ms/step
Epoch 3/10
1357/1357 - 286s - loss: 0.9086 - accuracy: 0.6693 - val_loss: 1.0134 - val_accuracy: 0.6292 - 286s/epoch - 211ms/step
Epoch 4/10
1357/1357 - 286s - loss: 0.8444 - accuracy: 0.6919 - val_loss: 1.0272 - val_accuracy: 0.6233 - 286s/epoch - 211ms/step
Epoch 5/10
1357/1357 - 285s - loss: 0.7825 - accuracy: 0.7146 - val_loss: 1.0445 - val_accuracy: 0.6215 - 285s/epoch - 210ms/step
Epoch 6/10
1357/1357 - 282s - loss: 0.7188 - accuracy: 0.7362 - val_loss: 1.0882 - val_accuracy: 0.6207 - 282s/epoch - 208ms/step
Epoch 7/10
1357/1357 - 283s - loss: 0.6557 - accuracy: 0.7609 - val_loss: 1.1656 - val_accuracy: 0.6122 - 283s/epoch - 208ms/step
Epoch 8/10
1357/1357 - 283s - loss: 0.5933 - accuracy: 0.7850 - val_loss: 1.2990 - val_acc

In [None]:
# Function to predict
def predict_class(texts):
  return [np.argmax(pred) for pred in model.predict(texts)]

In [None]:
# turning test tokens into lists of sequence.
test_sequences = tokenizer.texts_to_sequences(x_test)

# Padding sequences according to max_length
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# encoding output
test_label_seq = np.array(label_tokenizer.texts_to_sequences(y_test))

In [None]:
# printign classification report
from sklearn.metrics import classification_report


y_pred = predict_class(test_padded)
print(classification_report(test_label_seq, y_pred))

              precision    recall  f1-score   support

           1       0.72      0.73      0.73      1978
           2       0.54      0.61      0.57      1648
           3       0.45      0.40      0.42       677
           4       0.44      0.33      0.37       572
           5       0.44      0.50      0.47       355
           6       0.48      0.33      0.39       116
           7       0.62      0.46      0.52        81

    accuracy                           0.58      5427
   macro avg       0.53      0.48      0.50      5427
weighted avg       0.58      0.58      0.58      5427



**RESULT** - As seen from the above classification report, the f1-macro average is 0.5

**MODEL DESCRIPTION** - The model consists of an embedding layer followed by a bi-directional lstm and followed by two-dense layers.

**PREPROCESSING** - The train, test and validation data were explored during EDA and were then preprocessed. A vocab size of 5000 was chosen and the maximum sentence length was 200.

Pretrained embeddings were not used in the final model. In earlier models, pre-trained embeddings of 100 and 200 dimensions (glove.6B.100d.txt       glove.6B.200d.txt) were used but they resulted in overfitting despite large number of parameters. 

**The better result was achieved from this smaller model having only 394,824 params.**