<a href="https://colab.research.google.com/github/Dansah2/Udacity_Tutorials/blob/main/Udacity_NLP_Padding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Toy corpus used to fit the tokenizer. The sentences deliberately differ in
# length so that padding has something to do later on.
# NOTE: the wording (including misspellings) is kept exactly as-is because the
# recorded outputs in this notebook depend on it.
sentences = [
    "My favorite food is ice cream",
    "do you like ice cream too?",
    "My dog like ice cream!",
    "your favorite flavor of icecream is chocolate",
    "chocolate isn't goog for dogs",
    "your dog, your cat, and your parrot prefer broccoli",
]
print(sentences)

['My favorite food is ice cream', 'do you like ice cream too?', 'My dog like ice cream!', 'your favorite flavor of icecream is chocolate', "chocolate isn't goog for dogs", 'your dog, your cat, and your parrot prefer broccoli']


In [5]:
# Create the tokenizer. `oov_token` is the placeholder that stands in for any
# word that is not in the fitted vocabulary; `num_words` caps the vocabulary
# size used for encoding (see the Keras Tokenizer docs for the exact rule).
tokenizer = Tokenizer(
    oov_token='<OOV>',
    num_words=100,
)

In [6]:
# Build the vocabulary from the example sentences, then show the resulting
# word -> integer mapping. As the printed index shows, words are lowercased,
# punctuation such as '?' and '!' is dropped, and '<OOV>' gets index 1.
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'your': 2, 'ice': 3, 'cream': 4, 'my': 5, 'favorite': 6, 'is': 7, 'like': 8, 'dog': 9, 'chocolate': 10, 'food': 11, 'do': 12, 'you': 13, 'too': 14, 'flavor': 15, 'of': 16, 'icecream': 17, "isn't": 18, 'goog': 19, 'for': 20, 'dogs': 21, 'cat': 22, 'and': 23, 'parrot': 24, 'prefer': 25, 'broccoli': 26}


In [7]:
# Encode every sentence as a list of integers by replacing each word with its
# index from the fitted vocabulary. The resulting lists have different lengths.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[5, 6, 11, 7, 3, 4], [12, 13, 8, 3, 4, 14], [5, 9, 8, 3, 4], [2, 6, 15, 16, 17, 7, 10], [10, 18, 19, 20, 21], [2, 9, 2, 22, 23, 2, 24, 25, 26]]


In [8]:
# Recap the inputs before padding them.
print('\nWord Index =', word_index)
print('\nSequences = ', sequences)

# Make all sequences the same length. With the default arguments the result is
# as wide as the longest sequence and shorter ones get zeros added at the front.
padded = pad_sequences(sequences)
print('\nPadded Sequences:')
print(padded)


Word Index = {'<OOV>': 1, 'your': 2, 'ice': 3, 'cream': 4, 'my': 5, 'favorite': 6, 'is': 7, 'like': 8, 'dog': 9, 'chocolate': 10, 'food': 11, 'do': 12, 'you': 13, 'too': 14, 'flavor': 15, 'of': 16, 'icecream': 17, "isn't": 18, 'goog': 19, 'for': 20, 'dogs': 21, 'cat': 22, 'and': 23, 'parrot': 24, 'prefer': 25, 'broccoli': 26}

Sequences =  [[5, 6, 11, 7, 3, 4], [12, 13, 8, 3, 4, 14], [5, 9, 8, 3, 4], [2, 6, 15, 16, 17, 7, 10], [10, 18, 19, 20, 21], [2, 9, 2, 22, 23, 2, 24, 25, 26]]

Padded Sequences:
[[ 0  0  0  5  6 11  7  3  4]
 [ 0  0  0 12 13  8  3  4 14]
 [ 0  0  0  0  5  9  8  3  4]
 [ 0  0  2  6 15 16 17  7 10]
 [ 0  0  0  0 10 18 19 20 21]
 [ 2  9  2 22 23  2 24 25 26]]


In [9]:
# Experiment with `maxlen`: a value larger than the longest sequence simply
# adds more leading zeros, so every row becomes 18 entries wide.
padded = pad_sequences(
    sequences,
    maxlen=18,
)
print(padded)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  5  6 11  7  3  4]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 12 13  8  3  4 14]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  5  9  8  3  4]
 [ 0  0  0  0  0  0  0  0  0  0  0  2  6 15 16 17  7 10]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 10 18 19 20 21]
 [ 0  0  0  0  0  0  0  0  0  2  9  2 22 23  2 24 25 26]]


In [11]:
# Experiment with `padding`: 'post' moves the zeros to the end of each
# sequence instead of the beginning.
padded = pad_sequences(
    sequences,
    padding='post',
    maxlen=18,
)
print(padded)

[[ 5  6 11  7  3  4  0  0  0  0  0  0  0  0  0  0  0  0]
 [12 13  8  3  4 14  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 5  9  8  3  4  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  6 15 16 17  7 10  0  0  0  0  0  0  0  0  0  0  0]
 [10 18 19 20 21  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  9  2 22 23  2 24 25 26  0  0  0  0  0  0  0  0  0]]


In [13]:
# Experiment with `maxlen` again, this time making it shorter than some of the
# sequences. Longer sequences are truncated to 5 entries; as the output shows,
# tokens are dropped from the front and the last 5 are kept.
padded = pad_sequences(
    sequences,
    maxlen=5,
)
print(padded)

[[ 6 11  7  3  4]
 [13  8  3  4 14]
 [ 5  9  8  3  4]
 [15 16 17  7 10]
 [10 18 19 20 21]
 [23  2 24 25 26]]


In [14]:
# Handling out-of-vocabulary words with the out-of-vocab token.
# These sentences contain words the tokenizer did not see while fitting.
test_data = [
    "my best friend's favorite ice cream flavor is strawberry",
    "my dog's best friend is a manatee",
]
print(test_data)

# Show which index the out-of-vocab token was assigned.
print("<OOV> has the", word_index['<OOV>'], 'in the word index')

# Encode the test sentences, then pad them to 10 entries each.
test_seq = tokenizer.texts_to_sequences(test_data)
padded = pad_sequences(test_seq, maxlen=10)

print("\nTest Sequence = ", test_seq)
print('\nPadded Test Sequences: ')

# Note that 1 (the <OOV> index) appears wherever a word is not in the index.
print(padded)

["my best friend's favorite ice cream flavor is strawberry", "my dog's best friend is a manatee"]
<OOV> has the 1 in the word index

Test Sequence =  [[5, 1, 1, 6, 3, 4, 15, 7, 1], [5, 1, 1, 1, 7, 1, 1]]

Padded Test Sequences: 
[[ 0  5  1  1  6  3  4 15  7  1]
 [ 0  0  0  5  1  1  1  7  1  1]]
