In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

# Data Processing

1. Split dataset into sentences
2. Remove punctuation
3. All letters lowercase
4. Split sentences into words
5. Create a vocabulary (all known words of the ML model)
6. Tokenize words
7. Add and tags in vocabulary ?

In [2]:
questions = []
answers = []

with open('data/input/input.txt') as f:
    lines = f.readlines()
    for line in lines:
        if line[0] == 'I':
            questions.append(line.strip())
        else:
            answers.append(line.strip())

answers = list(filter(None, answers))
len(questions), len(answers)

(78, 78)

In [3]:
# remove 'Intrebarea' si 'Raspunsul'
import re 

def remove_preposition(text):
    q_regex = 'Intrebarea\ [0-9]+\:\ '
    a_regex = 'Raspuns\ [0-9]+\:\ '
    
    if 'Intrebarea' in text:
        text = re.sub(q_regex, '', text)
    else:
        text = re.sub(a_regex, '', text)
    return text

In [4]:
# remove punctuation
import string

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

In [5]:
#remove stopwords

def remove_stopwords(text, path_to_stopwords):
    stopwords = open(path_to_stopwords).readline().split(',')
    return [x for x in text if x not in stopwords]

In [6]:
# all lowercase
def to_lower(text):
    return text.lower()

In [7]:
# split into words
def split_words(text):
    tokens = re.split(' ',text)
    return tokens

In [8]:
# stemming

import nltk
from nltk.stem.snowball import RomanianStemmer

def stemming(text):
    snow_stemmer = RomanianStemmer()
    return [snow_stemmer.stem(word) for word in text]


In [9]:
# putting together the pipeline

def preprocessing_pipeline(_list):
    new_list = []
    for text in _list:
        text = remove_preposition(text)
        text = remove_punctuation(text)
        text = split_words(text)
        text = remove_stopwords(text, 'data/input/stopwords-ro.txt')
        text = list(filter(None, text))
        text = [to_lower(x) for x in text]
        text = stemming(text)
        text = ['<start>'] + text 
        text.append('<end>')
        new_list.append(text)
    return new_list

In [10]:
preprocessing_pipeline(questions)

[['<START>', 'se', 'fac', 'internshipur', 'individual', '<END>'],
 ['<START>', 'pot', 'evalu', 'internship', 'onlin', '<END>'],
 ['<START>', 'este', 'internship', 'gratis', '<END>'],
 ['<START>', 'cin', 'aplic', '<END>'],
 ['<START>', 'car', 'indatorir', 'respons', 'accept', '<END>'],
 ['<START>',
  'fapt',
  'moment',
  'aflu',
  'republ',
  'moldov',
  'influent',
  'etap',
  'select',
  'interviur',
  'hr',
  'tehnic',
  '<END>'],
 ['<START>', 'exist', 'opțiun', 'intership', 'devrem', 'apriliem', '<END>'],
 ['<START>', 'internship', 'loc', 'onlin', 'sed', 'compan', '<END>'],
 ['<START>', 'cum', 'internship', 'onlineofflin', '<END>'],
 ['<START>', 'car', 'progr', 'zilnic', 'internship', '<END>'],
 ['<START>',
  'e',
  'posibil',
  'incep',
  'internship',
  'devrem',
  'iulieaugust',
  '<END>'],
 ['<START>',
  'am',
  'vazut',
  'intership',
  'student',
  'persoan',
  'sas',
  'schimb',
  'carier',
  'caz',
  'vreau',
  'sam',
  'schimb',
  'carier',
  'treb',
  'astept',
  'pan',
 

In [98]:
# creating the Tokenizer and the vocabulary

# ----this is deprecated for now

# inp_lang_tokenizer =  tf.keras.preprocessing.text.Tokenizer()
# targ_lang_tokenizer =  tf.keras.preprocessing.text.Tokenizer()

# lang = questions + answers
# tokenizer.fit_on_texts(text_corpus)

# new_texts = ['cine poate participa la internship?']
# print(tokenizer.texts_to_sequences(new_texts))
# print(tokenizer.word_index) # the vocabulary

NameError: name 'tokenizer' is not defined

# Creating the model architecture

In [13]:
import tensorflow as tf
import tensorflow_addons as tfa

from sklearn.model_selection import train_test_split

I've decided to create a Seq-to-Seq model. This type of model requires an Encoder and a Decoder. The example is from the [TensorFlow official documentation](https://www.tensorflow.org/addons/tutorials/networks_seq2seq_nmt)

In [74]:
class QADataset:
    def __init__(self):
        self.inp_lang_tokenizer = None
        self.targ_lang_tokenizer = None
        
    def preprocess_sentence(self, sentence):
        text = remove_preposition(sentence)
        text = remove_punctuation(text)
        text = split_words(text)
        text = remove_stopwords(text, 'data/input/stopwords-ro.txt')
        text = list(filter(None, text))
        text = [to_lower(x) for x in text]
        text = stemming(text)
        text = ['<START>'] + text 
        text.append('<END>')

    def tokenize(self, lang):
        # lang = list of sentences in a language

        # print(len(lang), "example sentence: {}".format(lang[0]))
        lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
        lang_tokenizer.fit_on_texts(lang)

        ## tf.keras.preprocessing.text.Tokenizer.texts_to_sequences converts string (w1, w2, w3, ......, wn) 
        ## to a list of correspoding integer ids of words (id_w1, id_w2, id_w3, ...., id_wn)
        tensor = lang_tokenizer.texts_to_sequences(lang) 

        ## tf.keras.preprocessing.sequence.pad_sequences takes argument a list of integer id sequences 
        ## and pads the sequences to match the longest sequences in the given input
        tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

        return tensor, lang_tokenizer

    def load_dataset(self):
        # creating cleaned input, output pairs
        targ_lang, inp_lang = questions, answers

        input_tensor, inp_lang_tokenizer = self.tokenize(inp_lang)
        target_tensor, targ_lang_tokenizer = self.tokenize(targ_lang)

        return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

    def call(self, BUFFER_SIZE, BATCH_SIZE):
        input_tensor, target_tensor, self.inp_lang_tokenizer, self.targ_lang_tokenizer = self.load_dataset()

        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

        train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
        train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

        val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
        val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

        return train_dataset, val_dataset, self.inp_lang_tokenizer, self.targ_lang_tokenizer

In [40]:
BUFFER_SIZE = 32000
BATCH_SIZE = 5
# Let's limit the #training examples for faster training

dataset_creator = QADataset()
train_dataset, val_dataset, inp_lang, targ_lang = dataset_creator.call(BUFFER_SIZE, BATCH_SIZE)

In [43]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

2022-03-30 10:38:12.000663: W tensorflow/core/data/root_dataset.cc:200] Optimization loop failed: CANCELLED: Operation was cancelled


(TensorShape([5, 108]), TensorShape([5, 81]))

In [45]:
# Parameters

vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
max_length_input = example_input_batch.shape[1]
max_length_output = example_target_batch.shape[1]
num_examples = 100 # an arbitrary value that can be changed later

embedding_dim = 256
units = 1024
steps_per_epoch = num_examples//BATCH_SIZE

In [47]:
print("max_length_inp, max_length_out, vocab_size_questions, vocab_size_answers")
max_length_input, max_length_output, vocab_inp_size, vocab_tar_size

max_length_inp, max_length_out, vocab_size_questions, vocab_size_answers


(108, 81, 683, 507)

# Encoder

In [48]:
##### 

class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    ##-------- LSTM layer in Encoder ------- ##
    self.lstm_layer = tf.keras.layers.LSTM(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')



  def call(self, x, hidden):
    x = self.embedding(x)
    output, h, c = self.lstm_layer(x, initial_state = hidden)
    return output, h, c

  def initialize_hidden_state(self):
    return [tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))]

In [49]:
## Test Encoder Stack

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)


# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder h vector shape: (batch size, units) {}'.format(sample_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (5, 108, 1024)
Encoder h vecotr shape: (batch size, units) (5, 1024)
Encoder c vector shape: (batch size, units) (5, 1024)


# Decoder

In [50]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='luong'):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.attention_type = attention_type

    # Embedding Layer
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    #Final Dense layer on which softmax will be applied
    self.fc = tf.keras.layers.Dense(vocab_size)

    # Define the fundamental cell for decoder recurrent structure
    self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)



    # Sampler
    self.sampler = tfa.seq2seq.sampler.TrainingSampler()

    # Create attention mechanism with memory = None
    self.attention_mechanism = self.build_attention_mechanism(self.dec_units, 
                                                              None, self.batch_sz*[max_length_input], self.attention_type)

    # Wrap attention mechanism with the fundamental rnn cell of decoder
    self.rnn_cell = self.build_rnn_cell(batch_sz)

    # Define the decoder with respect to fundamental rnn cell
    self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)


  def build_rnn_cell(self, batch_sz):
    rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, 
                                  self.attention_mechanism, attention_layer_size=self.dec_units)
    return rnn_cell

  def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
    # ------------- #
    # typ: Which sort of attention (Bahdanau, Luong)
    # dec_units: final dimension of attention outputs 
    # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
    # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)

    if(attention_type=='bahdanau'):
      return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
    else:
      return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

  def build_initial_state(self, batch_sz, encoder_state, Dtype):
    decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
    return decoder_initial_state


  def call(self, inputs, initial_state):
    x = self.embedding(inputs)
    outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_output-1])
    return outputs

In [51]:
# Test decoder stack

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE, 'luong')
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))
decoder.attention_mechanism.setup_memory(sample_output)
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c], tf.float32)


sample_decoder_outputs = decoder(sample_x, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

Decoder Outputs Shape:  (5, 80, 507)


In [52]:
optimizer = tf.keras.optimizers.Adam()


def loss_function(real, pred):
  # real shape = (BATCH_SIZE, max_length_output)
  # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
  loss = cross_entropy(y_true=real, y_pred=pred)
  mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
  mask = tf.cast(mask, dtype=loss.dtype)  
  loss = mask* loss
  loss = tf.reduce_mean(loss)
  return loss

In [54]:
import os

checkpoint_dir = './data/output'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [55]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_h, enc_c = encoder(inp, enc_hidden)


    dec_input = targ[ : , :-1 ] # Ignore <end> token
    real = targ[ : , 1: ]         # ignore <start> token

    # Set the AttentionMechanism object with encoder_outputs
    decoder.attention_mechanism.setup_memory(enc_output)

    # Create AttentionWrapperState as initial_state for decoder
    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
    pred = decoder(dec_input, decoder_initial_state)
    logits = pred.rnn_output
    loss = loss_function(real, logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [57]:
EPOCHS = 10
import time

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  # print(enc_hidden[0].shape, enc_hidden[1].shape)

  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

2022-03-30 10:41:09.626536: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-03-30 10:41:09.629146: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-30 10:41:09.741700: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2022-03-30 10:41:09.919733: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-30 10:41:41.541545: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 1 Batch 0 Loss 1.7137
Epoch 1 Loss 0.6537
Time taken for 1 epoch 307.77152395248413 sec

Epoch 2 Batch 0 Loss 1.6936
Epoch 2 Loss 0.5870
Time taken for 1 epoch 303.0016119480133 sec

Epoch 3 Batch 0 Loss 0.7554
Epoch 3 Loss 0.5622
Time taken for 1 epoch 308.2094042301178 sec

Epoch 4 Batch 0 Loss 0.6721
Epoch 4 Loss 0.5478
Time taken for 1 epoch 306.85611510276794 sec

Epoch 5 Batch 0 Loss 0.5752
Epoch 5 Loss 0.5190
Time taken for 1 epoch 314.1287651062012 sec

Epoch 6 Batch 0 Loss 0.8160
Epoch 6 Loss 0.4990
Time taken for 1 epoch 301.7817573547363 sec

Epoch 7 Batch 0 Loss 0.6610
Epoch 7 Loss 0.4898
Time taken for 1 epoch 307.19200015068054 sec

Epoch 8 Batch 0 Loss 1.2488
Epoch 8 Loss 0.4716
Time taken for 1 epoch 264.7693381309509 sec

Epoch 9 Batch 0 Loss 0.6134
Epoch 9 Loss 0.4607
Time taken for 1 epoch 281.0602502822876 sec

Epoch 10 Batch 0 Loss 0.6829
Epoch 10 Loss 0.4439
Time taken for 1 epoch 307.8385169506073 sec



In [100]:
def evaluate_sentence(sentence):
#   sentence = dataset_creator.preprocess_sentence(sentence)

  text = remove_preposition(sentence)
  text = remove_punctuation(text)
  text = split_words(text)
  text = remove_stopwords(text, 'data/input/stopwords-ro.txt')
  text = list(filter(None, text))
  text = [to_lower(x) for x in text]
  text = stemming(text)
    
  print(inp_lang.word_index)

  inputs = [inp_lang.word_index[i] for i in text]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_length_input,
                                                          padding='post')
  inputs = tf.convert_to_tensor(inputs)
  inference_batch_size = inputs.shape[0]
  result = ''

  enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]
  enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

  dec_h = enc_h
  dec_c = enc_c

  start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<start>'])
  end_token = targ_lang.word_index['<end>']

  greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

  # Instantiate BasicDecoder object
  decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
  # Setup Memory in decoder stack
  decoder.attention_mechanism.setup_memory(enc_out)

  # set decoder_initial_state
  decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)


  ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder 
  ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. 
  ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function

  decoder_embedding_matrix = decoder.embedding.variables[0]

  outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_tokens, end_token= end_token, initial_state=decoder_initial_state)
  return outputs.sample_id.numpy()

def predict(sentence):
  result = evaluate_sentence(sentence)
  print(result)
  result = targ_lang.sequences_to_texts(result)
  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

In [101]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x169455f10>

In [102]:
predict('Este internship-ul gratis? ')

{'<OOV>': 1, 'de': 2, 'raspuns': 3, 'la': 4, 'in': 5, 'sa': 6, 'si': 7, 'un': 8, 'pe': 9, 'se': 10, 'pentru': 11, 'vei': 12, 'care': 13, 'a': 14, 'internship': 15, 'o': 16, 'din': 17, 'daca': 18, 'fi': 19, 'cu': 20, 'ar': 21, 'nu': 22, 'mai': 23, 'vor': 24, 'este': 25, 'internship.': 26, 'da,': 27, 'internship-urile': 28, 'sediul': 29, 'te': 30, 'poti': 31, 'sau': 32, 'ai': 33, 'cat': 34, 'trebui': 35, 'despre': 36, 'au': 37, 'loc': 38, 'internship-ul': 39, 'aplica': 40, 'putea': 41, 'doua': 42, 'unui': 43, 'limbaj': 44, 'parcursul': 45, 'assist.': 46, 'ce': 47, 'exista': 48, 'desfasoara': 49, 'practica': 50, 'lucra': 51, 'echipa,': 52, 'sunt': 53, 'prin': 54, 'pot': 55, 'concluzie': 56, 'urma': 57, 'ca': 58, 'are': 59, 'cadrul': 60, 'il': 61, 'iti': 62, 'cunostinte': 63, 'cel': 64, 'experienta': 65, 'vom': 66, 'assist': 67, 'cum': 68, 'avea': 69, 'dar': 70, 'studiu': 71, 'va': 72, 'individual,': 73, 'grup': 74, 'ideal': 75, 'proiect': 76, 'internship-ului': 77, 'proiecte': 78, 'posibi

KeyError: 'gratis'

In [97]:
sentence = 'ana are mrere'
for i in sentence.split(' '):
    print(i)

ana
are
mrere
