In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

# Data Processing

1. Split dataset into sentences
2. Remove punctuation
3. All letters lowercase
4. Split sentences into words
5. Create a vocabulary (all known words of the ML model)
6. Tokenize words
7. Add and tags in vocabulary ?

In [2]:
questions = []
answers = []

with open('data/input/input.txt') as f:
    lines = f.readlines()
    for line in lines:
        if line[0] == 'I':
            questions.append(line.strip())
        else:
            answers.append(line.strip())

answers = list(filter(None, answers))
len(questions), len(answers)

(78, 78)

In [3]:
# remove 'Intrebarea' si 'Raspunsul'
import re 

def remove_preposition(text):
    q_regex = 'Intrebarea\ [0-9]+\:\ '
    a_regex = 'Raspuns\ [0-9]+\:\ '
    
    if 'Intrebarea' in text:
        text = re.sub(q_regex, '', text)
    else:
        text = re.sub(a_regex, '', text)
    return text

In [4]:
# remove punctuation
import string

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

In [5]:
#remove stopwords
def remove_stopwords(text, path_to_stopwords):
    stopwords = open(path_to_stopwords).readline().split(',')
    return [x for x in text if x not in stopwords]

In [6]:
# all lowercase
def to_lower(text):
    return text.lower()

In [7]:
# split into words
def split_words(text):
    tokens = re.split(' ',text)
    return tokens

In [8]:
# stemming
import nltk
from nltk.stem.snowball import RomanianStemmer

def stemming(text):
    snow_stemmer = RomanianStemmer()
    return [snow_stemmer.stem(word) for word in text]


# Creating the model architecture

In [9]:
import tensorflow as tf
import tensorflow_addons as tfa

from sklearn.model_selection import train_test_split

I've decided to create a Seq-to-Seq model. This type of model requires an Encoder and a Decoder. The example is from the [TensorFlow official documentation](https://www.tensorflow.org/addons/tutorials/networks_seq2seq_nmt)

In [10]:
class QADataset:
    def __init__(self):
        self.tokenizer = None


    def preprocess_sentence(self, sentence):
        text = remove_preposition(sentence)
        text = remove_punctuation(text)
        text = split_words(text)
        text = remove_stopwords(text, 'data/input/stopwords-ro.txt')
        text = list(filter(None, text))
        text = [to_lower(x) for x in text]
        text = stemming(text)
        text = ['<start>'] + text 
        text = text + ['<end>']
        
        return text
        
        
    def preprocess_dataset(self, _list):
        return [self.preprocess_sentence(text) for text in _list]

    def tokenize(self, lang):
        # lang = list of sentences in a language

        # print(len(lang), "example sentence: {}".format(lang[0]))
        lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<oov>')
        lang_tokenizer.fit_on_texts(lang)

        ## tf.keras.preprocessing.text.Tokenizer.texts_to_sequences converts string (w1, w2, w3, ......, wn) 
        ## to a list of correspoding integer ids of words (id_w1, id_w2, id_w3, ...., id_wn)
        tensor = lang_tokenizer.texts_to_sequences(lang) 

        ## tf.keras.preprocessing.sequence.pad_sequences takes argument a list of integer id sequences 
        ## and pads the sequences to match the longest sequences in the given input
        tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

        return tensor, lang_tokenizer

    def load_dataset(self):
        # creating cleaned input, output pairs
        questions = []
        answers = []

        with open('data/input/input.txt') as f:
            lines = f.readlines()
            for line in lines:
                if line[0] == 'I':
                    questions.append(line.strip())
                else:
                    answers.append(line.strip())

        answers = list(filter(None, answers))
        
        questions = self.preprocess_dataset(questions)
        answers = self.preprocess_dataset(answers)
        

        input_tensor, inp_lang_tokenizer = self.tokenize(questions)
        target_tensor, target_lang_tokenizer = self.tokenize(answers)

        return input_tensor, target_tensor, inp_lang_tokenizer, target_lang_tokenizer

    def call(self, BUFFER_SIZE, BATCH_SIZE):
        input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = self.load_dataset()

        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

        train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
        train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

        val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
        val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

        return train_dataset, val_dataset, inp_lang_tokenizer, targ_lang_tokenizer

In [11]:
BUFFER_SIZE = 32000
BATCH_SIZE = 8
# Let's limit the #training examples for faster training

dataset_creator = QADataset()
train_dataset, val_dataset, inp_lang, targ_lang = dataset_creator.call(BUFFER_SIZE, BATCH_SIZE)

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-04-04 18:13:18.010286: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-04-04 18:13:18.010414: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [12]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([8, 45]), TensorShape([8, 49]))

In [13]:
# Parameters

vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
max_length_input = example_input_batch.shape[1]
max_length_output = example_target_batch.shape[1]
num_examples = 3000 # an arbitrary value that can be changed later

embedding_dim = 256
units = 1024
steps_per_epoch = num_examples//BATCH_SIZE

In [14]:
print("max_length_inp, max_length_out, vocab_size_questions, vocab_size_answers")
max_length_input, max_length_output, vocab_inp_size, vocab_tar_size

max_length_inp, max_length_out, vocab_size_questions, vocab_size_answers


(45, 49, 278, 353)

# Encoder

In [15]:
##### 

class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    ##-------- LSTM layer in Encoder ------- ##
    self.lstm_layer = tf.keras.layers.LSTM(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')



  def call(self, x, hidden):
    x = self.embedding(x)
    output, h, c = self.lstm_layer(x, initial_state = hidden)
    return output, h, c

  def initialize_hidden_state(self):
    return [tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))]

In [16]:
## Test Encoder Stack

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)


# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder h vector shape: (batch size, units) {}'.format(sample_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (8, 45, 1024)
Encoder h vector shape: (batch size, units) (8, 1024)
Encoder c vector shape: (batch size, units) (8, 1024)


# Decoder

In [17]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='luong'):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.attention_type = attention_type

    # Embedding Layer
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    #Final Dense layer on which softmax will be applied
    self.fc = tf.keras.layers.Dense(vocab_size)

    # Define the fundamental cell for decoder recurrent structure
    self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)



    # Sampler
    self.sampler = tfa.seq2seq.sampler.TrainingSampler()

    # Create attention mechanism with memory = None
    self.attention_mechanism = self.build_attention_mechanism(self.dec_units, 
                                                              None, self.batch_sz*[max_length_input], self.attention_type)

    # Wrap attention mechanism with the fundamental rnn cell of decoder
    self.rnn_cell = self.build_rnn_cell(batch_sz)

    # Define the decoder with respect to fundamental rnn cell
    self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)


  def build_rnn_cell(self, batch_sz):
    rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, 
                                  self.attention_mechanism, attention_layer_size=self.dec_units)
    return rnn_cell

  def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
    # ------------- #
    # typ: Which sort of attention (Bahdanau, Luong)
    # dec_units: final dimension of attention outputs 
    # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
    # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)

    if(attention_type=='bahdanau'):
      return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
    else:
      return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

  def build_initial_state(self, batch_sz, encoder_state, Dtype):
    decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
    return decoder_initial_state


  def call(self, inputs, initial_state):
    x = self.embedding(inputs)
    outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_output-1])
    return outputs

In [18]:
# Test decoder stack

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE, 'luong')
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))
decoder.attention_mechanism.setup_memory(sample_output)
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c], tf.float32)


sample_decoder_outputs = decoder(sample_x, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

Decoder Outputs Shape:  (8, 48, 353)


In [19]:
optimizer = tf.keras.optimizers.Adam()


def loss_function(real, pred):
  # real shape = (BATCH_SIZE, max_length_output)
  # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
  loss = cross_entropy(y_true=real, y_pred=pred)
  mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
  mask = tf.cast(mask, dtype=loss.dtype)  
  loss = mask* loss
  loss = tf.reduce_mean(loss)
  return loss

In [20]:
import os

checkpoint_dir = './data/output'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [21]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_h, enc_c = encoder(inp, enc_hidden)


    dec_input = targ[ : , :-1 ] # Ignore <end> token
    real = targ[ : , 1: ]         # ignore <start> token

    # Set the AttentionMechanism object with encoder_outputs
    decoder.attention_mechanism.setup_memory(enc_output)

    # Create AttentionWrapperState as initial_state for decoder
    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
    pred = decoder(dec_input, decoder_initial_state)
    logits = pred.rnn_output
    loss = loss_function(real, logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [25]:
EPOCHS = 10
import time

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  # print(enc_hidden[0].shape, enc_hidden[1].shape)

  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 2 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

2022-04-04 18:14:20.864325: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-04-04 18:14:20.866769: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-04-04 18:14:20.980540: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2022-04-04 18:14:21.171140: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-04-04 18:14:27.807506: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 1 Batch 0 Loss 1.7876
Epoch 1 Batch 2 Loss 1.6109
Epoch 1 Batch 4 Loss 1.6625
Epoch 1 Batch 6 Loss 1.3005
Epoch 1 Loss 0.0312
Time taken for 1 epoch 49.6650869846344 sec

Epoch 2 Batch 0 Loss 1.9546
Epoch 2 Batch 2 Loss 1.7259
Epoch 2 Batch 4 Loss 1.2999
Epoch 2 Batch 6 Loss 1.1079
Epoch 2 Loss 0.0319
Time taken for 1 epoch 46.49965739250183 sec

Epoch 3 Batch 0 Loss 1.3743
Epoch 3 Batch 2 Loss 1.4879
Epoch 3 Batch 4 Loss 1.7528
Epoch 3 Batch 6 Loss 0.8044
Epoch 3 Loss 0.0288
Time taken for 1 epoch 47.074408769607544 sec

Epoch 4 Batch 0 Loss 1.4260
Epoch 4 Batch 2 Loss 0.9186
Epoch 4 Batch 4 Loss 1.2607
Epoch 4 Batch 6 Loss 1.4765
Epoch 4 Loss 0.0272
Time taken for 1 epoch 47.43459463119507 sec

Epoch 5 Batch 0 Loss 1.6010
Epoch 5 Batch 2 Loss 1.3728
Epoch 5 Batch 4 Loss 1.1104
Epoch 5 Batch 6 Loss 1.8492
Epoch 5 Loss 0.0249
Time taken for 1 epoch 46.95644426345825 sec

Epoch 6 Batch 0 Loss 1.2551
Epoch 6 Batch 2 Loss 1.0489
Epoch 6 Batch 4 Loss 1.6414
Epoch 6 Batch 6 Loss 1.147

In [32]:
def evaluate_sentence(sentence):
  sentence = dataset_creator.preprocess_sentence(sentence)
    
  inputs = [inp_lang.word_index[i] for i in sentence]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_length_input,
                                                          padding='post')
  inputs = tf.convert_to_tensor(inputs)
  inference_batch_size = inputs.shape[0]
  result = ''

  enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]
  enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

  dec_h = enc_h
  dec_c = enc_c

  start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<start>'])
  end_token = targ_lang.word_index['<end>']

  greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

  # Instantiate BasicDecoder object
  decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
  # Setup Memory in decoder stack
  decoder.attention_mechanism.setup_memory(enc_out)

  # set decoder_initial_state
  decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)


  ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder 
  ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. 
  ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function

  decoder_embedding_matrix = decoder.embedding.variables[0]

  outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_tokens, end_token= end_token, initial_state=decoder_initial_state)
  return outputs.sample_id.numpy()

def predict_sentence(sentence):
  result = evaluate_sentence(sentence)
  print(result)
  result = targ_lang.sequences_to_texts(result)
  print('Input: %s' % (sentence))
  print('Predicted output: {}'.format(result))

In [33]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2b6d22340>

In [34]:
predict_sentence('Este internship-ul gratis?')

[[21  4  4  6  3]]
Input: Este internship-ul gratis?
Predicted output: ['nu internship internship assist <end>']
