### Imports

In [28]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
import csv, json, time
import pandas as pd

### Mounting the google drive

In [4]:
# Colab-only: expose the user's Google Drive inside the runtime's filesystem.
from google.colab import drive
# Every Drive path used in this notebook lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Drive folder that holds the raw train.txt / test.txt / val.txt files.
data_path = '/content/drive/MyDrive/NLP Data/emotions-nlp'
# Sanity check that the folder is reachable before reading from it.
os.path.exists(data_path)

True

### So here we have three files which are:
1. test.txt
2. train.txt
3. val.txt

Each of these files contains lines of text with their respective label. The text in these files looks as follows:

```txt
im feeling quite sad and sorry for myself but ill snap out of it soon;sadness
i feel like i am still looking at a blank canvas blank pieces of paper;sadness
i feel like a faithful servant;love
```

### Data Processing.

I want to create csv files from these text files:
1. train.csv
2. test.csv
3. valid.csv

In [11]:
def _read_lines(file_name):
  """Return the lines of `file_name` (located in `data_path`) without line endings."""
  # Explicit encoding so the result does not depend on the platform's default locale.
  with open(os.path.join(data_path, file_name), 'r', encoding='utf-8') as reader:
    return reader.read().splitlines()

# One raw "<text>;<label>" line per example, for each of the three splits.
test_data = _read_lines('test.txt')
train_data = _read_lines('train.txt')
valid_data = _read_lines('val.txt')

In [12]:
from prettytable import PrettyTable
def tabulate(column_names, data):
  """Print `data` (an iterable of rows) as an ASCII table headed by `column_names`."""
  table = PrettyTable(field_names=column_names)
  # Rows are appended one by one, in the order they are given.
  for entry in data:
    table.add_row(entry)
  print(table)

### Checking how many examples we have in each set.

In [13]:
# Summarise how many raw lines each split contains.
column_names = ["SET", "NUM EXAMPLE(S)"]
data_table = [
  [split_name, len(split_lines)]
  for split_name, split_lines in (
    ("TESTING", test_data),
    ("TRAINING", train_data),
    ("VALIDATING", valid_data),
  )
]
tabulate(column_names, data_table)

+------------+----------------+
|    SET     | NUM EXAMPLE(S) |
+------------+----------------+
|  TESTING   |      2000      |
|  TRAINING  |     16000      |
| VALIDATING |      2000      |
+------------+----------------+


### Creating Labels.
Each line in these sets contains a text and its respective label, separated by a semicolon `;`.

In [15]:
# Peek at one raw line: splitting on ';' yields [text, label].
test_data[0].split(';')

['im feeling rather rotten so im not very ambitious right now', 'sadness']

### A time formatter function

In [18]:
def hms_string(sec_elapsed):
    """Format a duration given in seconds as "H:MM:SS.ss".

    Args:
        sec_elapsed: elapsed time in seconds (int or float).

    Returns:
        A string such as "1:01:01.50".
    """
    # Round to the displayed precision BEFORE splitting into fields; otherwise a
    # value like 59.999 would render its seconds field as "60.00".
    sec_elapsed = round(sec_elapsed, 2)
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

### A function that creates a csv file from a list of data.

In [27]:
def create_csv_data(data_set, file_name):
  """Write `data_set` to `file_name` (inside `data_path`) as a two-column CSV.

  Args:
    data_set: iterable of raw lines shaped like "<text>;<emotion>".
    file_name: name of the CSV file to create under `data_path`.

  The file starts with a "text,emotion" header row. Once the file has been
  written, the file name and the elapsed time are printed.
  """
  start = time.time()
  row_list = [
      ["text", "emotion"]
  ]
  for line in data_set:
    # Split on the LAST ';' only: the label is the final field, and a ';'
    # inside the text would otherwise spill into extra CSV columns.
    text_emotion = line.rsplit(';', 1)
    row_list.append(text_emotion)

  # Explicit encoding so the written file does not depend on the platform default.
  with open(os.path.join(data_path, file_name), 'w', newline='', encoding='utf-8') as file:
      writer = csv.writer(file)
      writer.writerows(row_list)
  print(
    f"Created file: {file_name},  ETA: {hms_string(time.time()- start)}"
  )

"""
CREATING CSV FILES FOR ALL THE SETS.
"""

# Same three exports, in the same order: train, validation, test.
for split_lines, csv_name in (
  (train_data, 'train.csv'),
  (valid_data, 'valid.csv'),
  (test_data, 'test.csv'),
):
  create_csv_data(split_lines, csv_name)

Created file: train.csv,  ETA: 0:00:00.07
Created file: valid.csv,  ETA: 0:00:00.01
Created file: test.csv,  ETA: 0:00:00.01


### Testing if we loaded the data correctly.

In [34]:
# Reload the generated CSVs and list the distinct labels found in each split.
train_df, test_df, valid_df = (
  pd.read_csv(os.path.join(data_path, csv_name))
  for csv_name in ('train.csv', 'test.csv', 'valid.csv')
)
train_df.emotion.unique(), test_df.emotion.unique(), valid_df.emotion.unique()

(array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
       dtype=object),
 array(['sadness', 'joy', 'fear', 'anger', 'love', 'surprise'],
       dtype=object),
 array(['sadness', 'love', 'anger', 'joy', 'fear', 'surprise'],
       dtype=object))

As we can see, we have `6` emotions. Next, we prepare the data for the model.

### Preparing data for the model.

In [36]:
# Raw sentences (features) and string labels of every split, as NumPy arrays.
X_train_values, y_train_values = train_df["text"].values, train_df["emotion"].values
X_valid_values, y_valid_values = valid_df["text"].values, valid_df["emotion"].values
X_test_values, y_test_values = test_df["text"].values, test_df["emotion"].values


In [38]:
from sklearn.preprocessing import LabelEncoder

### We want to preprocess the labels first
* Convert them to digits
```
['anger', 'fear', 'joy', 'love', 'sadness', 'surprise' ]
[0, 1, 2, 3, 4, 5]
```
* For the label encoding we are going to use the `scikit-learn` `LabelEncoder()` class.

* We `one_hot_encode` them.

In [39]:
# Learn the label -> integer id mapping from the training labels only.
encoder = LabelEncoder()
encoder.fit(y_train_values)

LabelEncoder()

In [43]:
# Map the string labels of every split to integer class ids with the fitted encoder.
y_train_labels, y_test_labels, y_valid_label = (
  encoder.transform(split_labels)
  for split_labels in (y_train_values, y_test_values, y_valid_values)
)

### Now we can convert our labels to `one_hot` encoded vectors. 
There are a lot of ways of doing this we can use:

1. `tf.one_hot()`
2. sklearn `OneHotEncoder()` class.
3. numpy `eye()` function.

We are going to use numpy `eye()` function.



In [46]:
def one_hot_encode(index, depth=6):
  """Return the one-hot vector of length `depth` that has its 1 at position `index`."""
  # Row `index` of the identity matrix is exactly that one-hot vector.
  identity = np.eye(depth)
  return identity[index]
one_hot_encode(3)

array([0., 0., 0., 1., 0., 0.])

In [62]:
# `one_hot_encode` indexes an identity matrix, so it accepts a whole array of class
# ids at once; this avoids rebuilding that matrix for every single label.
y_train_labels_one_hot = one_hot_encode(y_train_labels).astype('float32')
y_test_labels_one_hot = one_hot_encode(y_test_labels).astype('float32')
y_valid_labels_one_hot = one_hot_encode(y_valid_label).astype('float32')

### Processing the text (features).
* Create a word vocabulary.
* Create `stoi` from each sentence.
* pad the sentences so that they will have the same size.

* We are going to join the `train` and `validation` features and labels, and then we will split them during training.

**We are not going to touch the test data.**

In [63]:
# Stack the train and validation examples into single feature / label arrays.
features = np.concatenate((X_train_values, X_valid_values), axis=0)
labels = np.concatenate((y_train_labels_one_hot, y_valid_labels_one_hot), axis=0)
features.shape, labels.shape

((18000,), (18000, 6))

In [53]:
from nltk.tokenize import word_tokenize
from collections import Counter

In [64]:
# Token frequencies over all train + validation sentences (NLTK tokenisation).
counter = Counter()
for sentence in features:
  counter.update(word_tokenize(sentence))

counter.most_common(9)

[('i', 29044),
 ('feel', 12544),
 ('and', 10766),
 ('to', 10086),
 ('the', 9383),
 ('a', 6982),
 ('feeling', 5785),
 ('that', 5701),
 ('of', 5587)]

### Vocabulary size (aka) number of unique words.

In [65]:
# One entry per distinct token counted above.
vocab_size = len(counter)
print("Vocabulary size: {}".format(vocab_size))

Vocabulary size: 16194


### Creating word vectors.

In [101]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [81]:
# Keras tokenizer capped at `vocab_size` word indices, fitted on train + validation text.
# NOTE(review): `vocab_size` was counted with NLTK's word_tokenize, which may not split
# text the same way this Keras tokenizer does — confirm the two vocabularies agree.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(features)

In [82]:
# word -> index mapping learned by the tokenizer, plus its inverse (index -> word).
word_indices = tokenizer.word_index
word_indices_reversed = {index: word for word, index in word_indices.items()}

### Helper functions.

We are going to create two helper functions. One converts a given text to a sequence of word indices, and the other takes a sequence of word indices and converts it back to text.


In [83]:
def sequence_to_text(sequences):
    """Convert a sequence of word indices back into a space-separated sentence.

    Index 0 is skipped: `text_to_sequence` emits it for unknown words and padded
    sequences are filled with it, but it is not expected to have an entry in
    `word_indices_reversed` (Keras reserves index 0), so looking it up would
    raise KeyError.
    """
    return " ".join(word_indices_reversed[i] for i in sequences if i != 0)
def text_to_sequence(sent):
  """Convert a sentence into a list of word indices.

  The sentence is lower-cased and tokenised with NLTK's `word_tokenize`; every
  token is looked up in `word_indices`, and tokens missing from it map to 0.
  """
  # dict.get replaces the former bare `except:`, which also turned unrelated
  # errors (even KeyboardInterrupt) into a silent 0.
  # NOTE(review): training sequences come from the Keras tokenizer while this uses
  # NLTK tokenisation — confirm both split text the same way for this data.
  return [word_indices.get(word, 0) for word in word_tokenize(sent.lower())]

### Loading pretrained weights glove.6B.
We are going to load these pretrained weights from Google Drive, where I have uploaded them.

In [84]:
# Location of the 100-dimensional GloVe 6B vectors on the mounted Drive.
embedding_path = "/content/drive/MyDrive/NLP Data/glove.6B/glove.6B.100d.txt"

In [85]:
# Parse the GloVe text file into {word: float32 vector}.
embeddings_dictionary = {}
with open(embedding_path, encoding='utf8') as glove_file:
    for line in glove_file:
        # Whitespace-separated fields: the word first, then its vector components.
        fields = line.split()
        embeddings_dictionary[fields[0]] = np.asarray(fields[1:], dtype='float32')

> Creating an `embedding matrix` that suits our data.

In [95]:
# One row per tokenizer index; words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    vector = embeddings_dictionary.get(word)
    if vector is None:
        continue
    # Explicit guards replace the former bare `except: pass`: skip indices that fall
    # outside the matrix and vectors of the wrong length, without hiding other errors.
    if index < vocab_size and vector.shape == embedding_matrix.shape[1:]:
        embedding_matrix[index] = vector

### Creating sequences.

In [97]:
# Integer-encode every train + validation sentence with the fitted tokenizer.
sequence_tokens = tokenizer.texts_to_sequences(features)

In [99]:
sequence_to_text(sequence_tokens[0])

'i didnt feel humiliated'

### Padding sequences.
We now want our sequences to have the same size.

In [102]:
# Fixed sequence length fed to the model: shorter sentences are padded at the end
# ("post") and longer ones are cut at the end ("post").
max_words = 100
tokens_sequence_padded = pad_sequences(sequence_tokens, maxlen=max_words, padding="post", truncating="post")

### Building the model.

### Model architecture.

```
                [ Embedding Layer]
                        |
                        |
[ LSTM ] <---- [Bidirectional Layer] ----> [GRU] (forward_layer)
 (backward_layer)       |
                        |
        [  Gated Recurrent Unit  (GRU)  ]
                        |
                        |
        [ Long Short Term Memory (LSTM) ]
                        |
                        |
                [ Flatten Layer]
                        |
                        |
                 [Dense Layer 1]
                        |
                        | 
                   [ Dropout ]
                        |
                        |   
                 [Dense Layer 2]
                        |
                        |
                 [Dense Layer 3] (output [6 classes])
```

In [132]:
# Two different recurrent cells are paired inside one Bidirectional wrapper:
# a GRU reads the sequence forwards and an LSTM (go_backwards=True) reads it backwards.
forward_layer = keras.layers.GRU(128, return_sequences=True, dropout=.25 )
backward_layer = keras.layers.LSTM(128, activation='tanh', return_sequences=True,
                       go_backwards=True, dropout=.25)
# The model consumes padded index sequences of length 100 (the value of max_words).
input_layer = keras.layers.Input(shape=(100, ), name="input_layer")

# Embedding initialised from the GloVe-based `embedding_matrix`; trainable=True lets
# the vectors be updated during training.
embedding_layer = keras.layers.Embedding(
      vocab_size, 
      100, 
      input_length=max_words,
      weights=[embedding_matrix], 
      trainable=True,
      name = "embedding_layer"
)(input_layer)
# Output is (None, 100, 256) per the summary below: 128 units per direction.
bidirectional_layer = keras.layers.Bidirectional(
    forward_layer,
    backward_layer = backward_layer,
    name= "bidirectional_layer"
)(embedding_layer)

# Stacked recurrent layers; both keep the full sequence (return_sequences=True).
gru_layer = keras.layers.GRU(
    512, return_sequences=True,
   dropout=.5,
    name= "gru_layer"
)(bidirectional_layer)

lstm_layer = keras.layers.LSTM(
    512, return_sequences=True,
   dropout=.5,
    name="lstm_layer"
)(gru_layer)
# Flatten the (100, 512) sequence output into 51200 features for the dense head.
flatten_layer = keras.layers.Flatten(name="flatten_layer")(lstm_layer)
fc_1 = keras.layers.Dense(64, activation='relu', name="dense_1")(flatten_layer)
dropout_layer = keras.layers.Dropout(rate=0.5, name="dropout_layer")(fc_1)
fc_2 = keras.layers.Dense(512, activation='relu', name="dense_2")(dropout_layer)
# Softmax over the 6 emotion classes.
output_layer = keras.layers.Dense(6, activation='softmax')(fc_2)
emotion_model = keras.Model(inputs=input_layer, outputs=output_layer, name="emotional_model")
emotion_model.summary()

Model: "emotional_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     [(None, 100)]             0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 100, 100)          1619400   
_________________________________________________________________
bidirectional_layer (Bidirec (None, 100, 256)          205568    
_________________________________________________________________
gru_layer (GRU)              (None, 100, 512)          1182720   
_________________________________________________________________
lstm_layer (LSTM)            (None, 100, 512)          2099200   
_________________________________________________________________
flatten_layer (Flatten)      (None, 51200)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)            

### Compiling and training the model.

In [133]:
# Stop training once the validation loss has not improved for 5 consecutive epochs.
early_stoping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=1,
    mode='auto',
    baseline=None,
    # Roll back to the best epoch when stopping early; otherwise the model keeps the
    # weights from `patience` epochs after its best validation loss.
    restore_best_weights=True,
)

emotion_model.compile(
    # The output layer already applies softmax, hence from_logits=False.
    loss = keras.losses.CategoricalCrossentropy(from_logits=False),
    # Keyword arguments make it explicit that 0.5 is Adam's beta_1.
    optimizer = keras.optimizers.Adam(learning_rate=1e-3, beta_1=0.5),
    metrics = ['accuracy']
)

In [134]:
# Train on the combined train + validation arrays built earlier.
emotion_model.fit(
    tokens_sequence_padded,
    labels,
    epochs = 10,
    verbose = 1,
    # Hold out 20% of the arrays for validation.
    # NOTE(review): Keras documents validation_split as taking the LAST fraction of the
    # arrays, before shuffling — here that would be the tail of the train+valid concat; confirm.
    validation_split = .2,
    shuffle=True,
    batch_size= 32,
    validation_batch_size = 16,
    callbacks = [early_stoping]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe3743ee6d0>

### Evaluating the model.


In [135]:
def text_to_padded_sequences(sent):
  """Encode `sent` with `text_to_sequence`, pad/truncate it to `max_words`, and
  return the result with its size-1 dimensions squeezed out."""
  batch_of_one = pad_sequences(
      [text_to_sequence(sent)],
      maxlen=max_words,
      padding="post",
      truncating="post",
  )
  return tf.squeeze(batch_of_one)

# Encode the held-out test sentences and score the trained model on them.
X_test = np.array([text_to_padded_sequences(sentence) for sentence in X_test_values])
emotion_model.evaluate(X_test, y_test_labels_one_hot, verbose=1, batch_size=32)




[0.13748641312122345, 0.9334999918937683]

### Inference.

In [136]:
def predict(model, sent):
    """Print the predicted class id and emotion name for a single sentence."""
    # Names listed in the order of the integer class ids used for the labels.
    classes = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise' ]
    # Encode and pad the sentence the same way as in `text_to_padded_sequences`,
    # keeping the leading batch dimension expected by `model.predict`.
    batch_of_one = pad_sequences(
        [text_to_sequence(sent)],
        maxlen=max_words,
        padding="post",
        truncating="post",
    )
    probabilities = model.predict(batch_of_one)
    prediction = tf.argmax(probabilities, axis=1).numpy()[0]
    class_name = classes[prediction]
    print(f'Predicted Class:\t {prediction}\nPredicted Category:\t{class_name}')

### Sadness

In [137]:
predict(emotion_model, "im updating my blog because i feel shitty")

Predicted Class:	 4
Predicted Category:	sadness


### Fear

In [138]:
predict(emotion_model, "i am feeling apprehensive about it but also wildly excited")

Predicted Class:	 1
Predicted Category:	fear


### Joy

In [139]:
predict(emotion_model, "i feel a little mellow today.")

Predicted Class:	 2
Predicted Category:	joy


### Surprise

In [140]:
predict(emotion_model, "i feel shocked and sad at the fact that there are so many sick people.")

Predicted Class:	 5
Predicted Category:	surprise


### Love

In [141]:
predict(emotion_model, "i want each of you to feel my gentle embrace.")

Predicted Class:	 3
Predicted Category:	love


### Anger.

In [142]:
predict(emotion_model, "i feel like my irritable sensitive combination skin has finally met it s match.")

Predicted Class:	 0
Predicted Category:	anger


### Saving the model.

In [143]:
# Persist the trained model in the data folder as an .h5 file.
emotion_model.save(os.path.join(data_path, "emotional_model.h5"))
print("Model Saved!!")

Model Saved!!
