In [25]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [1]:
# Toy corpus: three short sentences that both sections of the notebook split and encode.
text = "I like my brother. Go home!. Do you like my brother?"

## 1- Manual Method

### 1-1- Standardization

In [2]:
# Lowercase the whole text so that e.g. "Like" and "like" count as the same word.
text_standard = text.lower()
# Cut into sentences on the literal ". " separator; other punctuation ("!", "?")
# is left inside the sentences at this stage.
sentences = text_standard.split('. ')
print(sentences)

['i like my brother', 'go home!', 'do you like my brother?']


### 1-2- Tokenization + Indexing

In [22]:
# Fit the vocabulary on the sentences. Words that were not seen while fitting
# are later encoded with the "<OOV>" placeholder token.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

print(tokenizer.word_index)

# Reverse lookup table: integer id -> word.
word_index_inverse = {index: word for word, index in tokenizer.word_index.items()}
print(word_index_inverse)

{'<OOV>': 1, 'like': 2, 'my': 3, 'brother': 4, 'i': 5, 'go': 6, 'home': 7, 'do': 8, 'you': 9}
{1: '<OOV>', 2: 'like', 3: 'my', 4: 'brother', 5: 'i', 6: 'go', 7: 'home', 8: 'do', 9: 'you'}


### 1-3- Sequencing

In [16]:
# Replace every word of each sentence with its integer id from tokenizer.word_index.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[5, 2, 3, 4], [6, 7], [8, 9, 2, 3, 4]]


In [35]:
# Token count of each encoded sentence; the padding cell takes the maximum of these.
# NOTE: the name is a misspelling of "lengths" but is kept because the padding cell reads it.
lenghts = list(map(len, sequences))
print(lenghts)

[4, 2, 5]


#### 1-3-1- Out-of-Vocabulary Words

In [18]:
# "sister" is not in the fitted vocabulary, so it is encoded with the <OOV> id (1).
new_sentence = ["I like my sister"]
new_sequence = tokenizer.texts_to_sequences(new_sentence)
print(new_sequence)

# Decode the ids back to words to make the <OOV> placeholder visible in the text.
print(tokenizer.sequences_to_texts(new_sequence))

[[5, 2, 3, 1]]
['i like my <OOV>']


### 1-4- Padding

In [36]:
# Right-pad ('post') every sequence with zeros up to the length of the longest
# one so the batch becomes a rectangular array.
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max(lenghts))

# pad_sequences already returns an int32 NumPy array by default, so asarray
# hands back that same array instead of allocating a second copy of it.
input_data = np.asarray(padded_sequences, dtype="int32")
print(input_data)

[[5 2 3 4 0]
 [6 7 0 0 0]
 [8 9 2 3 4]]


## 2- Using TextVectorization Layer

In [28]:
from tensorflow.keras.layers import TextVectorization

In [29]:
# Split the raw (not lowercased) text this time.
# NOTE: this rebinds `sentences`, replacing the lowercased list built in section 1,
# so re-running section 1 cells after this one would see the new value.
sentences = text.split('. ')
print(sentences)

['I like my brother', 'Go home!', 'Do you like my brother?']


In [30]:
# Create the layer with its default settings and build its vocabulary from the sentences.
text_vectorization = TextVectorization()
text_vectorization.adapt(sentences)

In [33]:
# Vocabulary learned by the layer; a token's position in the list is its id.
vocab = text_vectorization.get_vocabulary()
print(vocab)

# Lookup table: token id -> token.
vocab_inverse = {token_id: token for token_id, token in enumerate(vocab)}
print(vocab_inverse)

['', '[UNK]', 'my', 'like', 'brother', 'you', 'i', 'home', 'go', 'do']
{0: '', 1: '[UNK]', 2: 'my', 3: 'like', 4: 'brother', 5: 'you', 6: 'i', 7: 'home', 8: 'go', 9: 'do'}


In [34]:
# Calling the layer on the raw strings returns one row of token ids per sentence;
# in the printed result, shorter sentences are filled with 0 at the end.
encoded_sentences = text_vectorization(sentences)
print(encoded_sentences)

tf.Tensor(
[[6 3 2 4 0]
 [8 7 0 0 0]
 [9 5 3 2 4]], shape=(3, 5), dtype=int64)
