In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [7]:
# Toy corpus for the tokenizer / padding demo: four short sentences of
# different lengths, so that padding and truncation are both visible later.
sentences = [
    "I like dogs.",
    "My name is Dhruv",
    "I am a student at KMV.",
    "I hate onions.",
]

In [8]:
# Upper bound on the vocabulary used when encoding text. The toy corpus has
# only 14 distinct words, so nothing is dropped by this limit here.
MAX_VOCAB_SIZE = 50_000

# Learn the word -> index vocabulary from the corpus ...
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)

# ... then encode every sentence as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(sentences)

In [9]:
# Show the encoded corpus: one list of integer word indices per input sentence.
# The lists have different lengths because the sentences do.
print(sequences)

[[1, 2, 3], [4, 5, 6, 7], [1, 8, 9, 10, 11, 12], [1, 13, 14]]


Word-to-index mapping

In [10]:
# Vocabulary learned by fit_on_texts: lowercased, punctuation-stripped word -> integer index.
# Indices start at 1; 0 never appears here and is what pad_sequences uses as filler below.
tokenizer.word_index

{'i': 1,
 'like': 2,
 'dogs': 3,
 'my': 4,
 'name': 5,
 'is': 6,
 'dhruv': 7,
 'am': 8,
 'a': 9,
 'student': 10,
 'at': 11,
 'kmv': 12,
 'hate': 13,
 'onions': 14}

In [11]:
# Default behaviour, spelled out: with no maxlen every row is padded to the
# length of the longest sequence (6 tokens here), and the zeros are added at
# the front of each row ('pre' padding).
padded_seq1 = pad_sequences(sequences, padding='pre', truncating='pre')
print(padded_seq1)

[[ 0  0  0  1  2  3]
 [ 0  0  4  5  6  7]
 [ 1  8  9 10 11 12]
 [ 0  0  0  1 13 14]]


In [12]:
# Fixed target length, shorter than the longest sentence (6 tokens).
MAX_SEQ_LEN = 5

# Rows longer than MAX_SEQ_LEN are truncated from the beginning ('pre'), so the
# third row loses its first token; shorter rows are zero-padded at the front.
padded_seq2 = pad_sequences(
    sequences,
    maxlen=MAX_SEQ_LEN,
    padding='pre',
    truncating='pre',
)
print(padded_seq2)

[[ 0  0  1  2  3]
 [ 0  4  5  6  7]
 [ 8  9 10 11 12]
 [ 0  0  1 13 14]]


In [13]:
# Post padding: zeros are appended after the tokens instead of before them.
# Truncation is still 'pre', so the over-long third row keeps its last
# MAX_SEQ_LEN tokens.
padded_seq3 = pad_sequences(
    sequences,
    maxlen=MAX_SEQ_LEN,
    padding='post',
    truncating='pre',
)
print(padded_seq3)

[[ 1  2  3  0  0]
 [ 4  5  6  7  0]
 [ 8  9 10 11 12]
 [ 1 13 14  0  0]]


In [14]:
# Over-padding: a maxlen larger than every sequence truncates nothing and
# simply fills the rest of each row with trailing zeros.
EXTRA_PADDED_LEN = 10
padded_seq4 = pad_sequences(
    sequences,
    maxlen=EXTRA_PADDED_LEN,
    padding='post',
    truncating='pre',
)
print(padded_seq4)

[[ 1  2  3  0  0  0  0  0  0  0]
 [ 4  5  6  7  0  0  0  0  0  0]
 [ 1  8  9 10 11 12  0  0  0  0]
 [ 1 13 14  0  0  0  0  0  0  0]]


In [15]:
# Post truncation: over-long rows are cut at the end, so the third row keeps
# its first MAX_SEQ_LEN tokens. Padding stays 'pre' (zeros at the front).
padded_seq5 = pad_sequences(
    sequences,
    maxlen=MAX_SEQ_LEN,
    padding='pre',
    truncating='post',
)
print(padded_seq5)

[[ 0  0  1  2  3]
 [ 0  4  5  6  7]
 [ 1  8  9 10 11]
 [ 0  0  1 13 14]]
