<a href="https://colab.research.google.com/github/GabboM/NNDS/blob/master/S_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Networks for Data Science Applications

Code and work is related to this [Paper](https://arxiv.org/pdf/1805.02474.pdf)
and some code is adapted from [here](https://keras.io/examples/nlp/pretrained_word_embeddings/) and [here](https://medium.com/softmax/tensorflow-keras-lstm-source-code-line-by-line-explained-125a6dae0622)

In [49]:
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.python.keras import backend as K
from tensorflow.keras.activations import sigmoid, tanh
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Sequential, losses, optimizers, metrics


import numpy as np
import os

# Data

### Loading IMDB_reviews and splitting in Train/Test

In [50]:
(ds_train, ds_test), ds_info = tfds.load('imdb_reviews/plain_text',
                                          split=['train', 'test'],
                                          shuffle_files=True,
                                          as_supervised=True,
                                          with_info=True)

creating a list `ds` of all the reviews in plain text

In [51]:
it = list(ds_train)
ds = []
for _ in range(10000): #manual tokens
  ds.append('startofsentence endofsentence')
for i in it:
  ds.append(i[0].numpy().decode())

Let's download the GloVe word embeddings. We will use dim=100 instead of 300 to speed up the training

In [52]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip

creating a mapping of the words to their vector representation by GloVe

In [53]:
path_to_glove_file = os.path.join(
    "drive/My Drive/NNDS/glove.6B.100d.txt"
)

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


Now we should have a tokenizer for the raw text. We train a word tokenizer on the training corpus and create a dictionary with the corresponding vocabulary

In [54]:
text_dataset = tf.data.Dataset.from_tensor_slices(ds)
max_features = 20000  # Maximum vocab size.
max_len = 200  # Sequence length to pad the outputs to.

# Create the layer.
vectorize = TextVectorization(
 max_tokens=max_features,
 output_mode='int',
 output_sequence_length=max_len)

# Now that the vocab layer has been created, call `adapt` on the text-only
# dataset to create the vocabulary. You don't have to batch, but for large
# datasets this means we're not keeping spare copies of the dataset.
vectorize.adapt(text_dataset.batch(64))

In [55]:
voc = vectorize.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [56]:
print('startofsentence' in voc)
print('endofsentence' in voc)

True
True


creating an embedding matrix

In [57]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18721 words (1279 misses)


In [58]:
len(embeddings_index.keys())

400000

In [59]:
embedding_matrix.shape

(20002, 100)

In [60]:
embedding_matrix[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

# Model

In [61]:
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=True,
)

In [62]:
def preprocess_sentence(x):
  return('startofsentence ' + x + ' endofsentence')

In [63]:
sent = preprocess_sentence("the cat sat on the mat")
output = vectorize([sent])
output

<tf.Tensor: shape=(1, 200), dtype=int64, numpy=
array([[   71,     2,  1149,  1753,    21,     2, 12530,    72,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [64]:
output = embedding_layer(output)

In [65]:
train_samples = []
train_labels = []
for i in ds_train:
  train_samples.append(i[0].numpy().decode('utf-8'))
  train_labels.append(i[1].numpy())



In [66]:
x_train = vectorize(np.array([[preprocess_sentence(sent)] for sent in train_samples])).numpy()
y_train = np.array(train_labels)

In [67]:
class SLSTMcell(keras.layers.Layer):
  def __init__(self, units=100, window=3, use_bias=True, recurrent_fn=sigmoid, activation_fn=tanh, seq_len=max_len, 
               kernel_initializer='uniform', kernel_regularizer=None, kernel_constraint=None,
               bias_initializer='zeros', bias_regularizer=None, bias_constraint=None):
    super(SLSTMcell, self).__init__()
    self.units = units
    self.use_bias = use_bias
    self.recurrent_function = recurrent_fn
    self.activation_fn = activation_fn
    self.seq_len = seq_len
    self.window = window
    self.kernel_initializer = kernel_initializer
    self.kernel_regularizer = kernel_regularizer
    self.kernel_constraint = kernel_constraint
    self.bias_initializer = bias_initializer
    self.bias_regularizer = bias_regularizer
    self.bias_constraint = bias_constraint
    
  def build(self, input_shape):
    input_dim = input_shape[-1]
    self.W = self.add_weight(shape=(input_dim * self.window, self.units * 10),
                                  name='W',
                                  initializer=self.kernel_initializer,
                                  regularizer=self.kernel_regularizer,
                                  constraint=self.kernel_constraint)
    
    self.W2 = self.add_weight(shape=(input_dim, self.units * 3),
                                  name='W2',
                                  initializer=self.kernel_initializer,
                                  regularizer=self.kernel_regularizer,
                                  constraint=self.kernel_constraint)
    
    self.U = self.add_weight(shape=(input_dim, self.units * 10),
                                  name='U',
                                  initializer=self.kernel_initializer,
                                  regularizer=self.kernel_regularizer,
                                  constraint=self.kernel_constraint)
    
    self.V = self.add_weight(shape=(input_dim, self.units * 7),
                                  name='V',
                                  initializer=self.kernel_initializer,
                                  regularizer=self.kernel_regularizer,
                                  constraint=self.kernel_constraint)

    if self.use_bias:
        # bias_initializer = self.bias_initializer
        self.bias = self.add_weight(shape=(self.units * 10,),
                                    name='bias',
                                    initializer=self.bias_initializer,
                                    regularizer=self.bias_regularizer,
                                    constraint=self.bias_constraint)
    else:
        self.bias = None

    self.W_i = self.W[:, :self.units]
    self.W_l = self.W[:, self.units * 1: self.units * 2]
    self.W_r = self.W[:, self.units * 2: self.units * 3]
    self.W_f = self.W[:, self.units * 3: self.units * 4]
    self.W_s = self.W[:, self.units * 4: self.units * 5]
    self.W_o = self.W[:, self.units * 5: self.units * 6]
    self.W_u = self.W[:, self.units * 6: self.units * 7]
    self.W_g = self.W2[:, :self.units]
    self.W_f2 = self.W2[:, self.units * 1: self.units * 2]
    self.W_o2 = self.W2[:, self.units * 2:]

    self.U_i = self.U[:, :self.units]
    self.U_l = self.U[:, self.units * 1: self.units * 2]
    self.U_r = self.U[:, self.units * 2: self.units * 3]
    self.U_f = self.U[:, self.units * 3: self.units * 4]
    self.U_s = self.U[:, self.units * 4: self.units * 5]
    self.U_o = self.U[:, self.units * 5: self.units * 6]
    self.U_u = self.U[:, self.units * 6: self.units * 7]
    self.U_g = self.U[:, self.units * 7: self.units * 8]
    self.U_f2 = self.U[:, self.units * 8: self.units * 9]
    self.U_o2 = self.U[:, self.units * 9:]
    
    self.V_i = self.V[:, :self.units]
    self.V_l = self.V[:, self.units * 1: self.units * 2]
    self.V_r = self.V[:, self.units * 2: self.units * 3]
    self.V_f = self.V[:, self.units * 3: self.units * 4]
    self.V_s = self.V[:, self.units * 4: self.units * 5]
    self.V_o = self.V[:, self.units * 5: self.units * 6]
    self.V_u = self.V[:, self.units * 6:]

    if self.use_bias:
        self.bias_i = self.bias[:self.units]
        self.bias_l = self.bias[self.units * 1: self.units * 2]
        self.bias_r = self.bias[self.units * 2: self.units * 3]
        self.bias_f = self.bias[self.units * 3: self.units * 4]
        self.bias_s = self.bias[self.units * 4: self.units * 5]
        self.bias_o = self.bias[self.units * 5: self.units * 6]
        self.bias_u = self.bias[self.units * 6: self.units * 7]
        self.bias_g = self.bias[self.units * 7: self.units * 8]
        self.bias_f2 = self.bias[self.units * 8: self.units * 9]
        self.bias_o2 = self.bias[self.units * 9:]
    else:
        self.bias_i = None
        self.bias_l = None
        self.bias_r = None
        self.bias_f = None
        self.bias_s = None
        self.bias_o = None
        self.bias_u = None
        self.bias_g = None
        self.bias_f2 = None
        self.bias_o2 = None
    self.built = True

  def call(self, inputs, states, training=None):

    H_tm1 = states[0]
    c_tm1 = states[1]

    H = tf.identity(H_tm1[0:1, :])
    c = tf.identity(c_tm1[0:1, :])

    for i in range(1, self.seq_len - 1):
      x_i = inputs[:, i-1]
      x_l = inputs[:, i-1]
      x_r = inputs[:, i-1]
      x_f = inputs[:, i-1]
      x_s = inputs[:, i-1]
      x_o = inputs[:, i-1]
      x_u = inputs[:, i-1]
      Ux_i = K.dot(x_i, self.U_i)
      Ux_l = K.dot(x_l, self.U_l)
      Ux_r = K.dot(x_r, self.U_r)
      Ux_f = K.dot(x_f, self.U_f)
      Ux_s = K.dot(x_s, self.U_s)
      Ux_o = K.dot(x_o, self.U_o)
      Ux_u = K.dot(x_u, self.U_u)
      if self.use_bias:
        Ux_i = K.bias_add(Ux_i, self.bias_i)
        Ux_l = K.bias_add(Ux_l, self.bias_l)
        Ux_r = K.bias_add(Ux_r, self.bias_r)
        Ux_f = K.bias_add(Ux_f, self.bias_f)
        Ux_s = K.bias_add(Ux_s, self.bias_s)
        Ux_o = K.bias_add(Ux_o, self.bias_o)
        Ux_u = K.bias_add(Ux_u, self.bias_u)

      csi_tm1_i = tf.concat((H_tm1[i-1:i, :], H_tm1[i:i+1, :], H_tm1[i+1:i+2, :]), axis=1)
      csi_tm1_l = tf.concat((H_tm1[i-1:i, :], H_tm1[i:i+1, :], H_tm1[i+1:i+2, :]), axis=1)
      csi_tm1_r = tf.concat((H_tm1[i-1:i, :], H_tm1[i:i+1, :], H_tm1[i+1:i+2, :]), axis=1)
      csi_tm1_f = tf.concat((H_tm1[i-1:i, :], H_tm1[i:i+1, :], H_tm1[i+1:i+2, :]), axis=1)
      csi_tm1_s = tf.concat((H_tm1[i-1:i, :], H_tm1[i:i+1, :], H_tm1[i+1:i+2, :]), axis=1)
      csi_tm1_o = tf.concat((H_tm1[i-1:i, :], H_tm1[i:i+1, :], H_tm1[i+1:i+2, :]), axis=1)
      csi_tm1_u = tf.concat((H_tm1[i-1:i, :], H_tm1[i:i+1, :], H_tm1[i+1:i+2, :]), axis=1)

      Wcsi_i = K.dot(csi_tm1_i, self.W_i)
      Wcsi_l = K.dot(csi_tm1_l, self.W_l)
      Wcsi_r = K.dot(csi_tm1_r, self.W_r)
      Wcsi_f = K.dot(csi_tm1_f, self.W_f)
      Wcsi_s = K.dot(csi_tm1_s, self.W_s)
      Wcsi_o = K.dot(csi_tm1_o, self.W_o)
      Wcsi_u = K.dot(csi_tm1_u, self.W_u)
      
      g_tm1_i = H_tm1[self.seq_len: self.seq_len + 1, :]
      g_tm1_l = H_tm1[self.seq_len: self.seq_len + 1, :]
      g_tm1_r = H_tm1[self.seq_len: self.seq_len + 1, :]
      g_tm1_f = H_tm1[self.seq_len: self.seq_len + 1, :]
      g_tm1_s = H_tm1[self.seq_len: self.seq_len + 1, :]
      g_tm1_o = H_tm1[self.seq_len: self.seq_len + 1, :]
      g_tm1_u = H_tm1[self.seq_len: self.seq_len + 1, :]
      Vg_i = K.dot(g_tm1_i, self.V_i)
      Vg_l = K.dot(g_tm1_l, self.V_l)
      Vg_r = K.dot(g_tm1_r, self.V_r)
      Vg_f = K.dot(g_tm1_f, self.V_f)
      Vg_s = K.dot(g_tm1_s, self.V_s)
      Vg_o = K.dot(g_tm1_o, self.V_o)
      Vg_u = K.dot(g_tm1_u, self.V_u)

      i_hat = self.recurrent_function(Wcsi_i + Ux_i + Vg_i)
      l_hat = self.recurrent_function(Wcsi_l + Ux_l + Vg_l)
      r_hat = self.recurrent_function(Wcsi_r + Ux_r + Vg_r)
      f_hat = self.recurrent_function(Wcsi_f + Ux_f + Vg_f)
      s_hat = self.recurrent_function(Wcsi_s + Ux_s + Vg_s)
      o_ = self.recurrent_function(Wcsi_o + Ux_o + Vg_o)
      u_ = self.activation_fn(Wcsi_u + Ux_u + Vg_u)
      
      i_, l_, r_, f_, s_ = [tf.keras.activations.softmax(t, axis=0) for t in [i_hat, l_hat, r_hat, f_hat, s_hat]]

      c_ = l_ * c_tm1[i-1:i, :] + f_ * c_tm1[i:i+1, :] + r_ * c_tm1[i+1:i+2, :] + \
                                s_ * c_tm1[self.seq_len:, :] + i_ * u_

      h_ = o_ * self.activation_fn(c_)

      H = tf.concat((H, h_), axis=0)
      c = tf.concat((c, c_), axis=0)                                           
    
    H = tf.concat((H, H_tm1[self.seq_len - 1 : self.seq_len, :]), axis=0)
    c = tf.concat((c, c_tm1[self.seq_len - 1 : self.seq_len, :]), axis=0) 

    # now the calculation to update g

    h_bar = tf.reduce_mean(H_tm1[:-1,:], axis=0, keepdims=True)
    g_tm1 = H_tm1[self.seq_len : self.seq_len + 1, :]
    c_tm1_g = c_tm1[self.seq_len : self.seq_len + 1, :]
          
    Wg_g = K.dot(g_tm1, self.W_g)
    Wg_f2 = K.dot(g_tm1, self.W_f2)
    Wg_o2 = K.dot(g_tm1, self.W_o2)
    if self.use_bias:
      Wg_g = K.bias_add(Wg_g, self.bias_g)
      Wg_f2 = K.bias_add(Wg_f2, self.bias_f2)
      Wg_o2 = K.bias_add(Wg_o2, self.bias_o2)
    Uh_g = K.dot(h_bar, self.U_g)
    # this U is different for each i -> so it's in the for loop
    Uh_o2 = K.dot(h_bar, self.U_o2)

    f_g = self.recurrent_function(Wg_g + Uh_g)
    
    Uh_f2 = K.dot(H[0:1, :], self.U_f2)
    F_ = tf.keras.activations.softmax(self.recurrent_function(Wg_f2 + Uh_f2), axis=0)
    for i in range(1, self.seq_len + 1):
      Uh_f2 = K.dot(H_tm1[i:i+1, :], self.U_f2)
      f_i = tf.keras.activations.softmax(self.recurrent_function(Wg_f2 + Uh_f2), axis=0)
      F_ = tf.concat((F_, f_i), axis=0)
    o_t = self.recurrent_function(Wg_o2 + Uh_o2)

    c_g = F_[self.seq_len : self.seq_len + 1, :] * c_tm1_g + tf.math.reduce_sum((F_[:self.seq_len, :] * c_tm1[:self.seq_len, :]), axis=0, keepdims=True)
    g_t = o_t * self.activation_fn(c_g)

    H = tf.concat((H, g_t,), axis=0)
    c = tf.concat((c, c_g,), axis=0)
      
    return inputs, [H, c], g_t

In [68]:
# units = 100

# slstm = SLSTMcell(units=units)
# H = tf.zeros(shape=(max_len+1, embedding_dim))
# c = tf.zeros(shape=(max_len+1, units))
# states = [tf.identity(H), tf.identity(c)]
# for t in range(9):
#   output, states, g_ = slstm(output, states)
# print(g_)

In [69]:
class SLSTM(keras.layers.Layer):
  def __init__(self, cell, n_cells,
               kernel_initializer='uniform', kernel_regularizer=None, kernel_constraint=None):
    super(SLSTM, self).__init__()
    self.cell = cell
    self.n_cells = n_cells
    self.units = self.cell.units
    self.kernel_initializer = kernel_initializer
    self.kernel_regularizer = kernel_regularizer
    self.kernel_constraint = kernel_constraint

  def build(self, input_shape, seq_len=200):
    if not seq_len:
      seq_len = input_shape[-2]
    self.H = self.add_weight(shape=(seq_len + 1, self.units),
                                  name='H',
                                  initializer=self.kernel_initializer,
                                  regularizer=self.kernel_regularizer,
                                  constraint=self.kernel_constraint)
    self.c = self.add_weight(shape=(seq_len + 1, self.units),
                                  name='c',
                                  initializer=self.kernel_initializer,
                                  regularizer=self.kernel_regularizer,
                                  constraint=self.kernel_constraint)
    
  def call(self, inputs, training=None):
    states = [tf.identity(self.H), tf.identity(self.c)]
    for _ in range(self.n_cells):
      inputs, states, g_ = self.cell(inputs, states)
    return g_
    

In [70]:
# units = 100

# slstm = SLSTM(SLSTMcell(units=units), 9)
# g_ = slstm(output, states)

In [71]:
units = 100
model = Sequential([Embedding(num_tokens, embedding_dim, embeddings_initializer=keras.initializers.Constant(embedding_matrix), trainable=True,),
                    SLSTM(SLSTMcell(units=units), 3),
                    keras.layers.Dense(2, activation='softmax')])

In [72]:
loss = losses.BinaryCrossentropy()
optimizer = optimizers.Adam()
acc = metrics.BinaryAccuracy()

In [73]:
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))

In [74]:
model.compile(loss=loss, optimizer=optimizer, metrics=[acc])
print(model.summary())

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 100)         2000200   
_________________________________________________________________
slstm_2 (SLSTM)              (None, 100)               541200    
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 202       
Total params: 2,541,602
Trainable params: 2,541,602
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
history = model.fit(train_data.shuffle(1000).batch(32), batch_size=32, epochs=20, verbose=2)

Epoch 1/20








In [None]:
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorize(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

In [None]:
end_to_end_model.summary()

In [None]:
end_to_end_model.predict('ciao')