<a href="https://colab.research.google.com/github/GabboM/NNDS/blob/master/S_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Networks for Data Science Applications

The code and work in this notebook are based on this [paper](https://arxiv.org/pdf/1805.02474.pdf),
and some code is adapted from [here](https://keras.io/examples/nlp/pretrained_word_embeddings/) and [here](https://medium.com/softmax/tensorflow-keras-lstm-source-code-line-by-line-explained-125a6dae0622).

In [1]:
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import numpy as np
import os

# Data

### Loading IMDB_reviews and splitting into Train/Test

In [None]:
# Fetch the plain-text IMDB reviews as (text, label) pairs together with the
# dataset metadata, then unpack the two requested splits.
imdb_splits, ds_info = tfds.load(
    'imdb_reviews/plain_text',
    split=['train', 'test'],
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
)
ds_train, ds_test = imdb_splits

Create a list `ds` containing the plain text of every training review.

In [None]:
# Materialize the training split, then keep only the decoded review text
# (the first element of each pair).
it = list(ds_train)
ds = [example[0].numpy().decode() for example in it]

Let's download the GloVe word embeddings. We will use the 100-dimensional vectors instead of the 300-dimensional ones to speed up training.

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2020-09-05 12:49:50--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-09-05 12:49:50--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-09-05 12:49:50--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.2’


2020

Create a mapping from each word to its GloVe vector representation.

In [None]:
# The archive is unzipped into the working directory, so a bare file name
# is enough.
path_to_glove_file = "glove.6B.100d.txt"

# word -> 1-D float32 array of GloVe coefficients.
embeddings_index = {}
# Read as UTF-8 explicitly: without an encoding, open() falls back to the
# platform locale, which can fail on non-ASCII tokens in the file.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <c1> <c2> ... <cN>"; split off the word only.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


Now we need a tokenizer for the raw text. We fit a word-level tokenizer on the training corpus and build a dictionary from the resulting vocabulary.

In [None]:
# Tokenizer configuration.
max_features = 20000  # Maximum vocab size.
max_len = 200  # Sequence length to pad the outputs to.

# One string element per training review.
text_dataset = tf.data.Dataset.from_tensor_slices(ds)

# Layer that maps raw text to fixed-length sequences of integer token ids.
vectorize = TextVectorization(
    output_sequence_length=max_len,
    output_mode='int',
    max_tokens=max_features,
)

# Build the vocabulary from the text-only dataset. Batching is optional here;
# it just feeds the layer in chunks of 64 reviews.
review_batches = text_dataset.batch(64)
vectorize.adapt(review_batches)

In [None]:
# Vocabulary learned by the vectorization layer, and the reverse lookup
# token -> integer id (the token's position in the vocabulary).
voc = vectorize.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

Create the embedding matrix: row `i` holds the GloVe vector of the vocabulary token with index `i`.

In [None]:
embedding_dim = 100
num_tokens = len(voc) + 2
hits, misses = 0, 0

# Prepare the embedding matrix. Every row starts as zeros, so tokens that have
# no GloVe vector (including the "padding" and "OOV" entries) stay all-zero.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18723 words (1277 misses)


In [None]:
# Number of distinct words that have a GloVe vector.
len(embeddings_index)

400000

In [None]:
# Sanity check: the matrix was allocated as (num_tokens, embedding_dim).
embedding_matrix.shape

(20002, 100)

# Model

In [None]:
class SLSTMcell(keras.layers.Layer):
    """Work-in-progress sentence-state LSTM (S-LSTM) cell.

    Only the input-gate parameters are created so far: three kernels
    (``Wi``, ``Ui``, ``Vi``) and one bias (``bi``). ``call`` currently
    applies just the ``Wi``/``bi`` affine map; ``Ui`` and ``Vi`` are
    allocated but not used yet.

    Args:
        units: size of the last dimension of the output.
        **kwargs: forwarded to ``keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, units=32, **kwargs):
        # Zero-argument super() avoids repeating (and mistyping) the class
        # name; the previous spelling `SLTMcell` raised NameError.
        super().__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        """Create the weights once the input feature size is known.

        Args:
            input_shape: shape of the layer input; only its last entry
                (the input feature dimension) is used.
        """
        input_dim = input_shape[-1]
        kernel_shape = (input_dim, self.units)

        self.Wi = self.add_weight(
            name="Wi",
            shape=kernel_shape,
            initializer="random_normal",
            trainable=True,
        )
        # NOTE(review): Ui and Vi share Wi's (input_dim, units) shape for now;
        # confirm the intended shapes once the remaining gate terms are added.
        self.Ui = self.add_weight(
            name="Ui",
            shape=kernel_shape,
            initializer="random_normal",
            trainable=True,
        )
        self.Vi = self.add_weight(
            name="Vi",
            shape=kernel_shape,
            initializer="random_normal",
            trainable=True,
        )
        self.bi = self.add_weight(
            name="bi",
            shape=(self.units,),
            initializer="random_normal",
            trainable=True,
        )

    def call(self, inputs):
        """Return ``inputs @ Wi + bi``.

        The previous body read ``self.w`` / ``self.b``, which ``build`` never
        creates, so every call failed with AttributeError.
        """
        return tf.matmul(inputs, self.Wi) + self.bi

/content


In [29]:
from tensorflow.python.keras import backend as K

In [33]:
# Small integer tensors for trying out backend ops:
# x is a 3x1 column, W is a 2x3 matrix.
x = tf.constant([[1], [2], [3]])
W = tf.constant([
    [1, 1, 1],
    [2, 2, 2],
])

In [34]:
# Matrix product of W (2x3) and x (3x1) through the Keras backend.
K.dot(W,x)

<tf.Tensor: shape=(2, 1), dtype=int32, numpy=
array([[ 6],
       [12]], dtype=int32)>

In [36]:
# Shape of W as reported by the Keras backend.
K.int_shape(W)

(2, 3)

In [None]:
def build(self, input_shape):
  """Create the LSTM cell weights and expose per-gate slices of them.

  Adapted from the Keras ``LSTMCell.build`` source. The input kernel,
  recurrent kernel and bias each hold four gate blocks side by side, in the
  order i, f, c, o; the ``*_i`` / ``*_f`` / ``*_c`` / ``*_o`` attributes are
  the matching slices.

  Args:
    input_shape: shape of the cell input; only its last entry (the input
      feature dimension) is used.
  """
  input_dim = input_shape[-1]
  units = self.units

  # The slicing below reads `self.kernel`, so the weight has to be stored
  # under that name; previously it was only assigned to `self.U`.
  self.kernel = self.add_weight(shape=(input_dim, units * 4),
                                name='kernel',
                                initializer=self.kernel_initializer,
                                regularizer=self.kernel_regularizer,
                                constraint=self.kernel_constraint)
  # Alias kept so code that refers to the input kernel as `U` still works.
  self.U = self.kernel
  self.recurrent_kernel = self.add_weight(
      shape=(units, units * 4),
      name='recurrent_kernel',
      initializer=self.recurrent_initializer,
      regularizer=self.recurrent_regularizer,
      constraint=self.recurrent_constraint)

  if self.use_bias:
    if self.unit_forget_bias:
      # The second block (forget gate) is initialised to ones; the other
      # three blocks use the configured bias initializer.
      def bias_initializer(_, *args, **kwargs):
        return K.concatenate([
            self.bias_initializer((units,), *args, **kwargs),
            keras.initializers.Ones()((units,), *args, **kwargs),
            self.bias_initializer((units * 2,), *args, **kwargs),
        ])
    else:
      bias_initializer = self.bias_initializer
    self.bias = self.add_weight(shape=(units * 4,),
                                name='bias',
                                initializer=bias_initializer,
                                regularizer=self.bias_regularizer,
                                constraint=self.bias_constraint)
  else:
    self.bias = None

  # NOTE(review): these slices are taken once, here in build(); tf.keras's own
  # LSTMCell slices inside call() instead. Confirm that the slices still
  # follow the variables during training before relying on them.
  self.kernel_i = self.kernel[:, :units]
  self.kernel_f = self.kernel[:, units: units * 2]
  self.kernel_c = self.kernel[:, units * 2: units * 3]
  self.kernel_o = self.kernel[:, units * 3:]

  self.recurrent_kernel_i = self.recurrent_kernel[:, :units]
  self.recurrent_kernel_f = self.recurrent_kernel[:, units: units * 2]
  self.recurrent_kernel_c = self.recurrent_kernel[:, units * 2: units * 3]
  self.recurrent_kernel_o = self.recurrent_kernel[:, units * 3:]

  if self.use_bias:
    self.bias_i = self.bias[:units]
    self.bias_f = self.bias[units: units * 2]
    self.bias_c = self.bias[units * 2: units * 3]
    self.bias_o = self.bias[units * 3:]
  else:
    self.bias_i = None
    self.bias_f = None
    self.bias_c = None
    self.bias_o = None
  self.built = True