<a href="https://colab.research.google.com/github/AdicherlaVenkataSai/NLP-Zero-to-Hero/blob/master/2.%20Sequencing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sequencing - Turning sentences into data

In [0]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer



In [0]:
sentences = [
             'i love my dog',
             'i love my cat ',
             'you love my dog',
             'do you think my dog is amazing?'
]

In [27]:
#creating an instance of it
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
#tokenizer gets the word index from the sentences and create it as sequence
word_index = tokenizer.word_index
word_index

{'amazing': 10,
 'cat': 6,
 'do': 7,
 'dog': 3,
 'i': 4,
 'is': 9,
 'love': 2,
 'my': 1,
 'think': 8,
 'you': 5}

In [28]:
#sentences are converted into sequence of numbers
# tokemizer has a method texts_to_sequence 
# it creates sequences of tokens representing each sentence
sequences = tokenizer.texts_to_sequences(sentences)
sequences


[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]

## Testing

In [0]:
test_data = [
        'i really love my dog',
        'my dog loves my manatee'
]
#these test data contains words which doesnt have tokens
#and aren't present in word index

In [30]:
test_seq = tokenizer.texts_to_sequences(test_data)
test_seq

[[4, 2, 1, 3], [1, 3, 1]]

In [0]:
#the sequences are of diff length when compared it with the sentence
#that becauses words like (really, loves, manatee) is not avail in word_index
# we really need big word_index to handle the word

#OOV

In [32]:
#in order to not to lose the length of sequence,
#using OOV token property and setting it as something
#that you would not expect to see in the corpus like <OOV>
#the tokenizer will create a token for that and
#replace work which is doesnt recoginze with the Out Of Vocabulary token instead


tokenizer1 = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer1.fit_on_texts(sentences)
word_index = tokenizer1.word_index
#print(word_index)
sequences1 = tokenizer1.texts_to_sequences(sentences)

test_data = [
        'i really love my dog',
        'my dog loves my manatee'
]
test_seq = tokenizer1.texts_to_sequences(test_data)
print(test_seq)

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


In [0]:
# we have achieved the length , but still lost some meaning

In [0]:
# in neural networks to handle the sizes of different lengths
# RaggedTensor
#simpler sol is padding

#Padding


In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [38]:
# sentences
# tokenizer 1 instance
# test_data
# test_seq

padded = pad_sequences(sequences)
print(word_index, '\n')
print(sequences, '\n')
print(padded)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11} 

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]] 

[[ 0  0  0  4  2  1  3]
 [ 0  0  0  4  2  1  6]
 [ 0  0  0  5  2  1  3]
 [ 7  5  8  1  3  9 10]]


In [40]:
# the overall length will be equal to longest sentence available  here =7
#  note: <OOV> : 1 not 0, only padded bits are 0
# if we want to pad at the end
padded = pad_sequences(sequences, padding = 'post')
print(padded)

[[ 4  2  1  3  0  0  0]
 [ 4  2  1  6  0  0  0]
 [ 5  2  1  3  0  0  0]
 [ 7  5  8  1  3  9 10]]


In [44]:
# if we dont want the padded sequneces with equal length to longest sentence, ex as len = 5
# truncating = pre/ post it coulld either cut off from begining or end
padded = pad_sequences(sequences, padding = 'post', maxlen = 5, truncating = 'post')
print(padded)

[[4 2 1 3 0]
 [4 2 1 6 0]
 [5 2 1 3 0]
 [7 5 8 1 3]]
