### Imports

In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
import csv, json, time
import pandas as pd

### Mounting the google drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
data_path = '/content/drive/MyDrive/NLP Data/corona-tweets'
os.path.exists(data_path)

True

In [4]:
data_frame = pd.read_csv(os.path.join(data_path, 'Corona_NLP_test.csv'))
sentiments = data_frame.Sentiment.values
tweets = data_frame.OriginalTweet.values

### Data preparation.
We are going to prepare our data so that labels will be one hot encoded vectors and we will remove some punctuation marks hashtags for all the tweets.

In [27]:
tweets[2]

'Find out how you can protect yourself and loved ones from #coronavirus. ?'

In [8]:
import re

In [91]:
def process_clean_text(text:str)->str:
  text = text.lower()
  text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', text)
  text = re.sub(r"[^a-z0-9.,?;']", ' ', text)
  text = re.sub(r'\s+', ' ', text)
  return text
process_clean_text(tweets[1])

"when i couldn't find hand sanitizer at fred meyer, i turned to amazon. but 114.97 for a 2 pack of purell?? check out how coronavirus concerns are driving up prices. "

### Label Processing.

The following are labels that we have in our dataset.

In [92]:
from collections import Counter
counts = Counter(sentiments)
counts

Counter({'Extremely Negative': 592,
         'Extremely Positive': 599,
         'Negative': 1041,
         'Neutral': 619,
         'Positive': 947})

### Visualizing labels using PrettyTable.

In [93]:
from prettytable import PrettyTable
def tabulate(column_names, data, title):
  table = PrettyTable(column_names)
  table.title = title
  for row in data:
    table.add_row(row)
  print(table)

In [94]:
data_rows = []
for label, count in counts.items():
  data_rows.append([label.upper(), count])
data_columns = ["LABEL", "COUNTS"]
title = "LABELS COUNTS"
tabulate(data_columns, data_rows, title )

+-----------------------------+
|        LABELS COUNTS        |
+--------------------+--------+
|       LABEL        | COUNTS |
+--------------------+--------+
| EXTREMELY NEGATIVE |  592   |
|      POSITIVE      |  947   |
| EXTREMELY POSITIVE |  599   |
|      NEGATIVE      |  1041  |
|      NEUTRAL       |  619   |
+--------------------+--------+


### Let's process the labels.
* We are going to convert labels to numeric
* We are also going to `one_hot` encode labels using `scikit-learn`.

In [95]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [96]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(sentiments)

### Now the labels are looking as follows:

```
['Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', "Positive"] == [0, 1, 2, 3, 4]

```

In [97]:
def one_hot_encode(index, depth=5):
  return np.eye(depth, dtype="float32")[index]

In [98]:
labels_one_hot = np.array(list(map(one_hot_encode, encoded_labels)))

### Text (tweets).
Now let's map for all the features and get the cleaned version of data.

In [99]:
tweets_cleaned = list(map(process_clean_text, tweets))

### Spliting datasets.

We are going to split the data into 3 sets:
* train `90%` (validation 10% + training 80%)
* test `10%`
* validation (validation during training) using the `validation_split`.

In [100]:
test_size = int(.1 * len(tweets_cleaned))
test_features = tweets_cleaned[:test_size]
test_labels = labels_one_hot[:test_size]

train_features = tweets_cleaned[test_size:]
train_labels = labels_one_hot[test_size:]


data_columns = ["SET", "EXAMPLE(s)"]
title = "LABELS COUNTS"
data_rows = ["TESTING", len(test_labels)], ["TRAINING", len(train_labels)]
tabulate(data_columns, data_rows, title )


+-----------------------+
|     LABELS COUNTS     |
+----------+------------+
|   SET    | EXAMPLE(s) |
+----------+------------+
| TESTING  |    379     |
| TRAINING |    3419    |
+----------+------------+


### Processing the text (features).
* Create a word vocabulary.
* Create `stoi` from each sentence.
* pad the sentences so that they will have the same size.

* We are going to join the `train` and `validation` features and labels, and then we will split them during training.

**We are not going to touch the test data.**

In [101]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [102]:
counter = Counter()
for sent in train_features:
  words = word_tokenize(sent)
  for word in words:
    counter[word] += 1

counter.most_common(9)

[('.', 4231),
 ('the', 3854),
 ('to', 3367),
 (',', 3128),
 ('?', 2314),
 ('covid', 2231),
 ('and', 2191),
 ('19', 2131),
 ('of', 1861)]

### Vocabulary size (aka) number of unique words.

In [103]:
vocab_size = len(counter)
print(f"Vocabulary size: {vocab_size:,}")

Vocabulary size: 11,350


### Creating word vectors.

In [104]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [105]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_features)

In [106]:
word_indices = tokenizer.word_index
word_indices_reversed = dict([(v, k) for (k, v) in word_indices.items()])

### Helper functions.

We are going to create two helper function. One will convert the text given to sequences and the other will take sequences and convert them to text.


In [107]:
def sequence_to_text(sequences):
    return " ".join(word_indices_reversed[i] for i in sequences)
def text_to_sequence(sent):
  words = word_tokenize(sent.lower())
  sequences = []
  for word in words:
    try:
      sequences.append(word_indices[word])
    except:
      sequences.append(0)
  return sequences

### Loading pretrainned weights glove.6B.
We are going to load this pretrained weights from our google drive. I've uploaded them on my google drive.

In [108]:
embedding_path = "/content/drive/MyDrive/NLP Data/glove.6B/glove.6B.100d.txt"

In [110]:
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

In [111]:
embeddings_dictionary = dict()
start = time.time()
with open(embedding_path, encoding='utf8') as glove_file:
    for line in glove_file:
        records = line.split()
        word  = records[0]
        vectors = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vectors

print(f"ETA: {hms_string(time.time() - start)}")

ETA: 0:00:09.23


> Creating an `embedding matrix` that suits our data.

In [112]:
start = time.time()
embedding_matrix = np.zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    vector = embeddings_dictionary.get(word)
    if vector is not None:
      try:
        embedding_matrix[index] = vector
      except:
        pass
print(f"ETA: {hms_string(time.time() - start)}")

ETA: 0:00:00.02


### Creating sequences.

In [113]:
sequence_tokens = tokenizer.texts_to_sequences(train_features)

In [114]:
sequence_to_text(sequence_tokens[0])

"why is toilet paper so important for coronavirus if i'm stuck at home i'm going to stock up on food so i can fuckin eat"

### Padding sequences.
We now want our sequences to have the same size.

In [115]:
max_words = 100
tokens_sequence_padded = pad_sequences(sequence_tokens, maxlen=max_words, padding="post", truncating="post")

### Building the model.

### Model achitecture.

```
                [ Embedding Layer]
                        |
                        |
[ LSTM ] <---- [Bidirectional Layer] ----> [GRU] (forward_layer)
 (backward_layer)       |
                        |
        [  Gated Recurrent Unit  (GRU)  ]
                        |
                        |
        [ Long Short Term Memory (LSTM) ]
                        |
                        |
                [ Flatten Layer]
                        |
                        |
                 [Dense Layer 1]
                        |
                        | 
                   [ Dropout ]
                        |
                        |   
                 [Dense Layer 2]
                        |
                        |
                 [Dense Layer 3] (output [6 classes])
```

In [151]:
forward_layer = keras.layers.GRU(128, return_sequences=True, dropout=.5 )
backward_layer = keras.layers.LSTM(128, activation='tanh', return_sequences=True,
                       go_backwards=True, dropout=.5)
input_layer = keras.layers.Input(shape=(100, ), name="input_layer")

embedding_layer = keras.layers.Embedding(
      vocab_size, 
      100, 
      input_length=max_words,
      weights=[embedding_matrix], 
      trainable=True,
      name = "embedding_layer"
)(input_layer)
bidirectional_layer = keras.layers.Bidirectional(
    forward_layer,
    backward_layer = backward_layer,
    name= "bidirectional_layer"
)(embedding_layer)

gru_layer = keras.layers.GRU(
    512, return_sequences=True,
   dropout=.5,
    name= "gru_layer"
)(bidirectional_layer)
lstm_layer = keras.layers.LSTM(
    512, return_sequences=True,
    dropout=.5,
    name="lstm_layer"
)(gru_layer)
conv_layer_1 = keras.layers.Conv1D(64, 3, activation='relu')(gru_layer)
conv_layer_2 = keras.layers.Conv1D(512, 3, activation='relu')(conv_layer_1)
flatten_layer = keras.layers.Flatten(name="flatten_layer")(conv_layer_2)
fc_1 = keras.layers.Dense(64, activation='relu', name="dense_layer_1")(flatten_layer)
dropout_layer = keras.layers.Dropout(rate=0.5, name="dropout_layer")(fc_1)
fc_2 = keras.layers.Dense(512, activation='relu', name="dense_layer_2")(dropout_layer)
output_layer = keras.layers.Dense(5, activation='softmax')(fc_2)
covid_tweets_model = keras.Model(inputs=input_layer, outputs=output_layer, name="covid_tweets_model")
covid_tweets_model.summary()

Model: "covid_tweets_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     [(None, 100)]             0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 100, 100)          1135000   
_________________________________________________________________
bidirectional_layer (Bidirec (None, 100, 256)          205568    
_________________________________________________________________
gru_layer (GRU)              (None, 100, 512)          1182720   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 98, 64)            98368     
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 96, 512)           98816     
_________________________________________________________________
flatten_layer (Flatten)      (None, 49152)      

### Compiling and training the model.

In [152]:
early_stoping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
)
covid_tweets_model.compile(
    loss = keras.losses.CategoricalCrossentropy(from_logits=False),
    optimizer = keras.optimizers.Adam(1e-3, 0.5),
    metrics = ['accuracy']
)

In [153]:
covid_tweets_model.fit(
    tokens_sequence_padded,
    train_labels,
    epochs = 10,
    verbose = 1,
    validation_split = .2,
    shuffle=True,
    batch_size= 32,
    validation_batch_size = 16,
    callbacks = [early_stoping]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff899d7c990>

### BERT - TEXT CLASSIFICATION
As we can see that our model is not performing well, it is not improving from the achitecture that works perfectly from the `emotionals-nlp-notebook`. We are going to use transfare learning to get reasonable accuracy for this task. Specifically we are going to use the `BERT` model.


### Evaluating the model.


In [None]:
def text_to_padded_sequences(sent):
  tokens = text_to_sequence(sent)
  padded_tokens = pad_sequences([tokens], maxlen=max_words, padding="post", truncating="post")
  return tf.squeeze(padded_tokens)

X_test = np.array(list(map(text_to_padded_sequences, X_test_values)))
emotion_model.evaluate(X_test, y_test_labels_one_hot, verbose=1, batch_size=32)




[0.13748641312122345, 0.9334999918937683]

### Inference.

In [None]:
def tabulate(column_names, data):
  table = PrettyTable(column_names)
  table.align[column_names[0]] = "l"
  table.align[column_names[1]] = "l"
  for row in data:
    table.add_row(row)

  print(table.get_string(title="EMOTION PREDICTIONS TABLE"))

In [None]:
def predict(model, sent):
    classes = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise' ]
    tokens = text_to_sequence(sent)
    padded_tokens = pad_sequences([tokens], maxlen=max_words, padding="post", truncating="post")
    probabilities = model.predict(padded_tokens)
    prediction = tf.argmax(probabilities, axis=1).numpy()[0]
    class_name = classes[prediction]
    emoji_text = emoji.emojize(emotions_emojis[class_name], language='en', use_aliases=True)
    
    table_headers =["KEY", "VALUE"]
    table_data = [
        ["PREDICTED CLASS",  prediction],
        ["PREDICTED CLASS NAME",  class_name],
        ["PREDICTED CLASS EMOJI",  emoji_text],
        ["CONFIDENCE OVER OTHER CLASSES", f'{probabilities[0][prediction] * 100:.2f}%']       
    ]
    tabulate(table_headers, table_data)

### Sadness

In [None]:
predict(emotion_model, "im updating my blog because i feel shitty.")

+-----------------------------------------+
|        EMOTION PREDICTIONS TABLE        |
+-------------------------------+---------+
| KEY                           | VALUE   |
+-------------------------------+---------+
| PREDICTED CLASS               | 4       |
| PREDICTED CLASS NAME          | sadness |
| PREDICTED CLASS EMOJI         | 😞      |
| CONFIDENCE OVER OTHER CLASSES | 99.65%  |
+-------------------------------+---------+


### Fear

In [None]:
predict(emotion_model, "i am feeling apprehensive about it but also wildly excited")

+-----------------------------------------+
|        EMOTION PREDICTIONS TABLE        |
+-------------------------------+---------+
| KEY                           | VALUE   |
+-------------------------------+---------+
| PREDICTED CLASS               | 1       |
| PREDICTED CLASS NAME          | fear    |
| PREDICTED CLASS EMOJI         | 😨      |
| CONFIDENCE OVER OTHER CLASSES | 100.00% |
+-------------------------------+---------+


### Joy

In [None]:
predict(emotion_model, "i feel a little mellow today.")

+-----------------------------------------+
|        EMOTION PREDICTIONS TABLE        |
+-------------------------------+---------+
| KEY                           | VALUE   |
+-------------------------------+---------+
| PREDICTED CLASS               | 2       |
| PREDICTED CLASS NAME          | joy     |
| PREDICTED CLASS EMOJI         | 😄      |
| CONFIDENCE OVER OTHER CLASSES | 100.00% |
+-------------------------------+---------+


### Surprise

In [None]:
predict(emotion_model, "i feel shocked and sad at the fact that there are so many sick people.")

+------------------------------------------+
|        EMOTION PREDICTIONS TABLE         |
+-------------------------------+----------+
| KEY                           | VALUE    |
+-------------------------------+----------+
| PREDICTED CLASS               | 5        |
| PREDICTED CLASS NAME          | surprise |
| PREDICTED CLASS EMOJI         | 😮       |
| CONFIDENCE OVER OTHER CLASSES | 99.97%   |
+-------------------------------+----------+


### Love

In [None]:
predict(emotion_model, "i want each of you to feel my gentle embrace.")

+----------------------------------------+
|       EMOTION PREDICTIONS TABLE        |
+-------------------------------+--------+
| KEY                           | VALUE  |
+-------------------------------+--------+
| PREDICTED CLASS               | 3      |
| PREDICTED CLASS NAME          | love   |
| PREDICTED CLASS EMOJI         | 😍     |
| CONFIDENCE OVER OTHER CLASSES | 97.07% |
+-------------------------------+--------+


### Anger.

In [None]:
predict(emotion_model, "i feel like my irritable sensitive combination skin has finally met it s match.")

+-----------------------------------------+
|        EMOTION PREDICTIONS TABLE        |
+-------------------------------+---------+
| KEY                           | VALUE   |
+-------------------------------+---------+
| PREDICTED CLASS               | 0       |
| PREDICTED CLASS NAME          | anger   |
| PREDICTED CLASS EMOJI         | 😠      |
| CONFIDENCE OVER OTHER CLASSES | 100.00% |
+-------------------------------+---------+


### Saving the model.

In [None]:
emotion_model.save(os.path.join(data_path, "emotional_model.h5"))
print("Model Saved!!")

Model Saved!!
