In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Toy corpus: two four-word sentences that differ only in their last word.
sentences = ["I love my dog", "I love my cat"]

sentences

['I love my dog', 'I love my cat']

In [3]:
# Learn a word -> integer-id vocabulary from the toy corpus.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts=sentences)

# The fitted mapping; the printed result shows lower-cased words with ids
# starting at 1.
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}


In [4]:
# Encode every sentence as the list of vocabulary ids of its words, in order.
sequences = tokenizer.texts_to_sequences(texts=sentences)
sequences

[[1, 2, 3, 4], [1, 2, 3, 5]]

In [5]:
# Larger corpus with sentences of different lengths (4, 5, 6 and 9 words).
# Rebinds `sentences`, so the cells below work on this corpus from here on.
sentences = [
    "I love my dog",
    "I love my white cat",
    "you love my big black dog",
    "do you think my dog is very much amazing",
]
sentences

['I love my dog',
 'I love my white cat',
 'you love my big black dog',
 'do you think my dog is very much amazing']

In [6]:
# Fit a fresh tokenizer on the larger corpus (the previous vocabulary is
# discarded because `tokenizer` is rebound).
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts=sentences)

# In the printed mapping, 'my' (present in every sentence) receives id 1.
word_index = tokenizer.word_index
print(word_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'white': 6, 'cat': 7, 'big': 8, 'black': 9, 'do': 10, 'think': 11, 'is': 12, 'very': 13, 'much': 14, 'amazing': 15}


In [7]:
# Encode the larger corpus; the resulting lists have different lengths because
# the sentences do.
sequences = tokenizer.texts_to_sequences(texts=sentences)
sequences

[[4, 2, 1, 3],
 [4, 2, 1, 6, 7],
 [5, 2, 1, 8, 9, 3],
 [10, 5, 11, 1, 3, 12, 13, 14, 15]]

In [8]:
# Held-out sentences containing words the tokenizer was never fitted on
# ("really", "brother", "also", "like").
test = ["i really love my dog", "my brother also like my dog"]
test

['i really love my dog', 'my brother also like my dog']

In [9]:
# Encode the held-out sentences with a tokenizer that has no OOV token.
# In the recorded output the unseen words are simply missing from the
# sequences, so the encoded sentences come out shorter than the originals.
test_seq = tokenizer.texts_to_sequences(texts=test)
test_seq

[[4, 2, 1, 3], [1, 1, 3]]

In [10]:
# Out-of-vocabulary handling: rebuild the tokenizer with a placeholder token.
# In the recorded vocabulary "<oov>" takes id 1 and every real word moves up
# by one compared with the tokenizer fitted without it.
tokenizer = Tokenizer(num_words=100, oov_token="<oov>")
tokenizer.fit_on_texts(texts=sentences)

word_index = tokenizer.word_index
word_index

{'<oov>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'white': 7,
 'cat': 8,
 'big': 9,
 'black': 10,
 'do': 11,
 'think': 12,
 'is': 13,
 'very': 14,
 'much': 15,
 'amazing': 16}

In [11]:
# Re-encode the training corpus with the OOV-aware tokenizer; the ids are
# shifted by one relative to the earlier encoding.
sequences = tokenizer.texts_to_sequences(texts=sentences)
sequences

[[5, 3, 2, 4],
 [5, 3, 2, 7, 8],
 [6, 3, 2, 9, 10, 4],
 [11, 6, 12, 2, 4, 13, 14, 15, 16]]

In [12]:
# Encode the held-out sentences again. In the recorded output each unseen word
# now appears as id 1 (the "<oov>" entry), so sentence length is preserved.
test_seq = tokenizer.texts_to_sequences(texts=test)
test_seq

[[5, 1, 3, 2, 4], [2, 1, 1, 1, 2, 4]]

In [13]:
# Tokenizer for the padding examples that follow. It is built with the same
# arguments as the out-of-vocabulary tokenizer above and refitted on the same
# corpus, so this section can be re-run on its own once `sentences` is defined.
tokenizer = Tokenizer(num_words=100, oov_token="<oov>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
word_index

{'<oov>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'white': 7,
 'cat': 8,
 'big': 9,
 'black': 10,
 'do': 11,
 'think': 12,
 'is': 13,
 'very': 14,
 'much': 15,
 'amazing': 16}

In [14]:
# Variable-length id sequences (4, 5, 6 and 9 ids) that the padding cells
# below turn into a rectangular array.
sequences = tokenizer.texts_to_sequences(texts=sentences)
sequences

[[5, 3, 2, 4],
 [5, 3, 2, 7, 8],
 [6, 3, 2, 9, 10, 4],
 [11, 6, 12, 2, 4, 13, 14, 15, 16]]

In [15]:
# Pad with the default mode, spelled out here as padding="pre". In the recorded
# output zeros are added at the front, up to the longest sequence (9 ids).
padded = pad_sequences(sequences, padding="pre")
padded

array([[ 0,  0,  0,  0,  0,  5,  3,  2,  4],
       [ 0,  0,  0,  0,  5,  3,  2,  7,  8],
       [ 0,  0,  0,  6,  3,  2,  9, 10,  4],
       [11,  6, 12,  2,  4, 13, 14, 15, 16]])

In [16]:
# Same padding, but with the zeros appended after the real ids instead of
# placed before them.
padded = pad_sequences(sequences, padding="post")
padded

array([[ 5,  3,  2,  4,  0,  0,  0,  0,  0],
       [ 5,  3,  2,  7,  8,  0,  0,  0,  0],
       [ 6,  3,  2,  9, 10,  4,  0,  0,  0],
       [11,  6, 12,  2,  4, 13, 14, 15, 16]])

In [17]:
# Force every row to length 5. Shorter rows are zero-padded at the end; in the
# recorded output longer rows keep their LAST five ids, i.e. they are cut from
# the front because no truncating mode is given.
padded = pad_sequences(sequences, maxlen=5, padding="post")
padded

array([[ 5,  3,  2,  4,  0],
       [ 5,  3,  2,  7,  8],
       [ 3,  2,  9, 10,  4],
       [ 4, 13, 14, 15, 16]])

In [18]:
# Length 5 again, but truncating="post" cuts from the end, so in the recorded
# output longer rows keep their FIRST five ids.
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding="post",
    truncating="post",
)
padded

array([[ 5,  3,  2,  4,  0],
       [ 5,  3,  2,  7,  8],
       [ 6,  3,  2,  9, 10],
       [11,  6, 12,  2,  4]])