# NLP Text Pre-processing

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import TextVectorization

In [None]:
# Toy corpus of three short sentences. The ". " that follows "Go home!"
# is intentional: the sentence split in the next cell cuts on ". ".
text = (
    "I like my brother. "
    "Go home!. "
    "Do you like my brother?"
)

In [None]:
# Naive sentence segmentation: cut the corpus at every ". " separator.
# The separator is discarded, so only punctuation that is not part of a
# ". " pair (the "!" and the final "?") survives in the pieces.
sentences = text.split(sep='. ')
print(sentences)

['I like my brother', 'Go home!', 'Do you like my brother?']


## Method 1: Keras `Tokenizer` + `pad_sequences`

### 1- Tokenization + Indexing

In [None]:
# Fit a word-level vocabulary on the sentences. '<OOV>' is the reserved
# token that stands in for any word that is not in the fitted vocabulary.
tokenizer = Tokenizer(
    num_words=100,
    oov_token='<OOV>',
)
tokenizer.fit_on_texts(sentences)

# word_index maps each word to its integer id.
print(tokenizer.word_index)

{'<OOV>': 1, 'like': 2, 'my': 3, 'brother': 4, 'i': 5, 'go': 6, 'home': 7, 'do': 8, 'you': 9}


### 2- Sequencing

In [None]:
# Encode each sentence as a list of integer word ids using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[5, 2, 3, 4], [6, 7], [8, 9, 2, 3, 4]]


In [None]:
# Encode a sentence containing a word ("sister") that was not in the fitted
# vocabulary; in the output below it is encoded as 1, the id of '<OOV>'.
new_sentence = ["I like my sister"]
new_sequence = tokenizer.texts_to_sequences(new_sentence)
print(new_sequence)

[[5, 2, 3, 1]]


### 3- Padding

In [None]:
# Bring every sequence to a common length of 10; padding='post' appends
# the zeros after the word ids (see the output below).
padded_sequences = pad_sequences(
    sequences,
    padding='post',
    maxlen=10,
)
# Show the padded matrix and, on the next line, its Python type.
print(padded_sequences, type(padded_sequences), sep='\n')

[[5 2 3 4 0 0 0 0 0 0]
 [6 7 0 0 0 0 0 0 0 0]
 [8 9 2 3 4 0 0 0 0 0]]
<class 'numpy.ndarray'>


## Method 2

### Using the `TextVectorization` Layer

In [None]:
# Create the layer with its default settings, then build its vocabulary
# from the sentences.
text_vectorization = TextVectorization()
text_vectorization.adapt(sentences)

# List the learned vocabulary; in the output below, index 0 is '' and
# index 1 is '[UNK]'.
print(text_vectorization.get_vocabulary())

['', '[UNK]', 'my', 'like', 'brother', 'you', 'i', 'home', 'go', 'do']


In [None]:
# Calling the adapted layer on the sentences encodes them as a single
# integer tensor; shorter rows are zero-filled (see the output below).
encoded_sequences = text_vectorization(sentences)
# Show the tensor and, on the next line, its Python type.
print(encoded_sequences, type(encoded_sequences), sep='\n')

tf.Tensor(
[[6 3 2 4 0]
 [8 7 0 0 0]
 [9 5 3 2 4]], shape=(3, 5), dtype=int64)
<class 'tensorflow.python.framework.ops.EagerTensor'>
