In [None]:
try:
  import google.colab
  IN_COLAB= True
except:
  IN_COLAB = False

In [None]:
if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
!pip install numba

In [None]:
import numpy as np
import pandas as pd
from numba import cuda

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from sklearn.metrics import classification_report

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Load and Preprocess datasets

In [None]:
train_set = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/DPL_Assignment3/train.csv')

test_set = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/DPL_Assignment3/test.csv')

validation_set = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/DPL_Assignment3/validation.csv')

Remove empty values and NaNs

In [None]:
filter = train_set["Tweet"] != ""
train_set = train_set[filter]
train_set = train_set.dropna()


filter = test_set["Tweet"] != ""
test_set = test_set[filter]
test_set = test_set.dropna()


filter = validation_set["Tweet"] != ""
validation_set = validation_set[filter]
validation_set = validation_set.dropna()

Convert Boolean values to binary values

In [None]:
train_set = train_set.replace(True, 1)
train_set = train_set.replace(False, 0)

test_set = test_set.replace(True, 1)
test_set = test_set.replace(False, 0)

validation_set = validation_set.replace(True, 1)
validation_set = validation_set.replace(False, 0)

In [None]:
# Convert to lowercase.
train_set['Tweet'] = train_set["Tweet"].str.lower()
test_set['Tweet'] = test_set["Tweet"].str.lower()
validation_set['Tweet'] = validation_set["Tweet"].str.lower()

# Remove special characters and punctuation
train_set['Tweet'] = train_set["Tweet"].replace('[^\sa-zA-Z]+', '', regex=True)
test_set['Tweet'] = test_set["Tweet"].replace('[^\sa-zA-Z]+', '', regex=True)
validation_set['Tweet'] = validation_set["Tweet"].replace('[^\sa-zA-Z]+', '', regex=True)

In [None]:
stop_words = set(stopwords.words('english'))

re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)

def remove_stopwords(sentence):
  global re_stop_words
  return re_stop_words.sub(" ", sentence)


train_set["Tweet"] = train_set["Tweet"].apply(remove_stopwords)
test_set["Tweet"] = test_set["Tweet"].apply(remove_stopwords)
validation_set["Tweet"] = validation_set["Tweet"].apply(remove_stopwords)

In [None]:
stemmer = SnowballStemmer('english')

def stemming(sentence):
  stemSentence = ""
  for word in sentence.split():
    stem = stemmer.stem(word)
    stemSentence += stem + " "
  stemSentence = stemSentence.strip()
  return stemSentence

train_set["Tweet"] = train_set["Tweet"].apply(stemming)
test_set["Tweet"] = test_set["Tweet"].apply(stemming)
validation_set["Tweet"] = validation_set["Tweet"].apply(stemming)

## Making Dataset

In [None]:
def make_labels_list(dataframe):
  labels = []
  labels_list = pd.DataFrame(columns=['Tweet','Terms'])

  for i in range(len(dataframe)):
    for j in range(2, 13):
      if j == 1 and dataframe.iat[i,j] == 1:
        labels.append('anger')
      if j == 2 and dataframe.iat[i,j] == 1:
        labels.append('anticipation')
      if j == 3 and dataframe.iat[i,j] == 1:
        labels.append('disgust')
      if j == 4 and dataframe.iat[i,j] == 1:
        labels.append('fear')
      if j == 5 and dataframe.iat[i,j] == 1:
        labels.append('joy')
      if j == 6 and dataframe.iat[i,j] == 1:
        labels.append('love')
      if j == 7 and dataframe.iat[i,j] == 1:
        labels.append('optimism')
      if j == 8 and dataframe.iat[i,j] == 1:
        labels.append('pessimism')
      if j == 9 and dataframe.iat[i,j] == 1:
        labels.append('sadness')
      if j == 10 and dataframe.iat[i,j] == 1:
        labels.append('surprise')
      if j == 11 and dataframe.iat[i,j] == 1:
        labels.append('trust')

    labels_list.at[i, 'Tweet'] = dataframe.at[i, 'Tweet']
    labels_list.at[i, 'Terms'] = labels
    labels = []

  return labels_list

In [None]:
train_dataset = make_labels_list(train_set)
test_dataset = make_labels_list(test_set)
validation_dataset = make_labels_list(validation_set)

In [None]:
terms = tf.ragged.constant(train_dataset["Terms"].values)
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
lookup.adapt(terms)
vocab = lookup.get_vocabulary()


def invert_multi_hot(encoded_labels):
    """Reverse a single multi-hot encoded label to a tuple of vocab terms."""
    hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0]
    return np.take(vocab, hot_indices)


print("Vocabulary:\n")
print(vocab)

Vocabulary:

['[UNK]', 'fear', 'anticipation', 'love', 'surprise', 'pessimism', 'joy', 'disgust', 'sadness', 'optimism', 'trust']


In [None]:
max_seqlen = 15
batch_size = 64
padding_token = "<pad>"
auto = tf.data.AUTOTUNE

def make_dataset(dataframe, is_train=True):

    labels = tf.ragged.constant(dataframe["Terms"].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["Tweet"].values, label_binarized)
    )
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)

In [None]:
labels = tf.ragged.constant(test_dataset["Terms"].values)
binarized_test_labels = lookup(labels)

In [None]:
train_dataset = make_dataset(train_dataset, is_train=True)
validation_dataset = make_dataset(validation_dataset, is_train=False)

In [None]:
vocabulary = set()
train_set["Tweet"].str.lower().str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)
print(vocabulary_size)

13312


In [None]:
text_vectorizer = layers.TextVectorization(
    max_tokens=vocabulary_size,
    ngrams=2,
    output_mode="tf_idf"
)

text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

train_dataset = train_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)

validation_dataset = validation_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)

test_dataset = text_vectorizer(test_dataset["Tweet"])

## Build Model

In [None]:
model = Sequential([
    tf.keras.layers.Reshape((2, -1), input_shape=(13312,)),
    layers.Bidirectional(layers.LSTM(3328, input_shape=(6656, 2), return_sequences=True)),
    layers.Flatten(),
    layers.Dense(1664, activation="relu"),
    layers.Dense(lookup.vocabulary_size(), activation="sigmoid")
])

In [None]:
model.compile(
  optimizer=tf.keras.optimizers.Adam(1e-4),
  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
  metrics=[tf.keras.metrics.Recall()])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 2, 6656)           0         
                                                                 
 bidirectional (Bidirection  (None, 2, 6656)           265840640 
 al)                                                             
                                                                 
 flatten (Flatten)           (None, 13312)             0         
                                                                 
 dense (Dense)               (None, 1664)              22152832  
                                                                 
 dense_1 (Dense)             (None, 11)                18315     
                                                                 
Total params: 288011787 (1.07 GB)
Trainable params: 288011787 (1.07 GB)
Non-trainable params: 0 (0.00 Byte)
______________

## Fit and predict

In [None]:
history = model.fit(
    train_dataset,
    epochs=53,
    validation_data=(validation_dataset)
    )

Epoch 1/53
Epoch 2/53
Epoch 3/53
Epoch 4/53
Epoch 5/53
Epoch 6/53
Epoch 7/53
Epoch 8/53
Epoch 9/53
Epoch 10/53
Epoch 11/53
Epoch 12/53
Epoch 13/53
Epoch 14/53
Epoch 15/53
Epoch 16/53
Epoch 17/53
Epoch 18/53
Epoch 19/53
Epoch 20/53
Epoch 21/53
Epoch 22/53
Epoch 23/53
Epoch 24/53
Epoch 25/53
Epoch 26/53
Epoch 27/53
Epoch 28/53
Epoch 29/53
Epoch 30/53
Epoch 31/53
Epoch 32/53
Epoch 33/53
Epoch 34/53
Epoch 35/53
Epoch 36/53
Epoch 37/53
Epoch 38/53
Epoch 39/53
Epoch 40/53
Epoch 41/53
Epoch 42/53
Epoch 43/53
Epoch 44/53
Epoch 45/53
Epoch 46/53
Epoch 47/53
Epoch 48/53
Epoch 49/53
Epoch 50/53
Epoch 51/53
Epoch 52/53
Epoch 53/53


In [None]:
test_loss, test_acc = model.evaluate(test_dataset, binarized_test_labels)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

Test Loss: 1.8444955348968506
Test Accuracy: 0.5659668445587158


In [None]:
predicted_probabilities = model.predict(test_dataset)



In [None]:
labels = ['anger','anticipation','disgust','fear','joy','love','optimism','pessimism','sadness','surprise','trust']

print(classification_report(binarized_test_labels.numpy(), predicted_probabilities.round(), target_names=labels))

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00         0
anticipation       0.60      0.55      0.58      1099
     disgust       0.70      0.60      0.65      1101
        fear       0.73      0.78      0.75      1442
         joy       0.55      0.55      0.55       960
        love       0.58      0.56      0.57      1143
    optimism       0.55      0.66      0.60       485
   pessimism       0.21      0.22      0.21       425
     sadness       0.29      0.24      0.26       375
    surprise       0.52      0.50      0.51       516
       trust       0.19      0.22      0.21       170

   micro avg       0.57      0.57      0.57      7716
   macro avg       0.45      0.44      0.44      7716
weighted avg       0.57      0.57      0.57      7716
 samples avg       0.58      0.58      0.55      7716



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Reset Device RAM

In [None]:
device = cuda.get_current_device()
device.reset()