In [5]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import keras

## Tokenize

In [2]:
# Sample document for the tokenization demo. It is a two-line string: it
# starts with a space, and the first line ends with ", " before the newline.
text = (
    " Natural language processing (NLP) is a subfield of linguistics, \n"
    "computer science, information engineering, and artificial intelligence..."
)

In [3]:
# tokenize the document: split it into a list of word tokens; as the printed
# result shows, tokens come back lowercased and with punctuation removed
words = text_to_word_sequence(text)
print(words)

['natural', 'language', 'processing', 'nlp', 'is', 'a', 'subfield', 'of', 'linguistics', 'computer', 'science', 'information', 'engineering', 'and', 'artificial', 'intelligence']


## One-hot (hashing-based integer encoding)

In [7]:
from tensorflow.keras.preprocessing.text import one_hot

In [9]:
# Imported in this cell so it still runs on its own after a kernel restart.
from tensorflow.keras.preprocessing.text import hashing_trick

text = 'The cat sat on the mat. The dog ate my homework.'
words = text_to_word_sequence(text)
vocabs = set(words)
print('words:\n', words)
print('len of words:', len(words))
print('vocabs:\n', vocabs)
vocab_size = len(vocabs)
print('len of vocabs:', vocab_size)

# Integer encode the document by hashing each word into a fixed-size space.
# The space is ~30% larger than the vocabulary to reduce hash collisions;
# collisions are still possible, so two different words may share an id.
hash_space = round(vocab_size * 1.3)

# `one_hot` is a thin wrapper around `hashing_trick` that uses Python's
# built-in `hash`, which is salted per process: the ids would change on every
# kernel restart. 'md5' is a stable hash, so the encoding is reproducible.
result = hashing_trick(text, hash_space, hash_function='md5')
print(result)

words:
 ['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'dog', 'ate', 'my', 'homework']
len of words: 11
vocabs:
 {'on', 'dog', 'homework', 'mat', 'cat', 'the', 'ate', 'my', 'sat'}
len of vocabs: 9
[4, 8, 3, 7, 4, 10, 4, 8, 3, 6, 2]


## Tokenizer

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [11]:
# Five short documents used to fit the tokenizer.
docs = [
    "Well done!",
    "Good job",
    "Great effort",
    "Nice job",
    "Excellent!",
]

In [12]:
# create the tokenizer (default settings)
t = Tokenizer()
# fit the tokenizer on the documents; this populates the vocabulary
# statistics (word counts, word index, ...) inspected in the next cell
t.fit_on_texts(docs)

In [13]:
# Summarize what the tokenizer learned, printed in this order:
#   word_counts    - how many times each word occurred across all documents
#   document_count - how many documents the tokenizer was fit on
#   word_index     - the integer index assigned to each word
#   word_docs      - in how many documents each word appeared
for learned in (t.word_counts, t.document_count, t.word_index, t.word_docs):
    print(learned)

OrderedDict([('well', 1), ('done', 1), ('good', 1), ('job', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])
5
{'job': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8}
defaultdict(<class 'int'>, {'done': 1, 'well': 1, 'job': 2, 'good': 1, 'effort': 1, 'great': 1, 'nice': 1, 'excellent': 1})


In [14]:
# encode the documents as a count matrix: one row per document, one column
# per word index; column 0 stays empty because word indices start at 1
encoded_docs = t.texts_to_matrix(docs, mode = 'count')
print(encoded_docs)

[[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]


## Pad sequences

In [15]:
# Short review snippets of varying length (1 to 4 words) used to
# demonstrate sequence encoding and padding.
reviews = [
    "movie was great",
    "really bad",
    "i loved it",
    "actor is handsome",
    "s2",
    "the best movie ever",
    "could be better",
    "mer",
]

In [16]:
# a separate tokenizer for the review corpus, so its vocabulary is
# independent of the tokenizer `t` fitted earlier
t1 = Tokenizer()
# fit the tokenizer on the documents
t1.fit_on_texts(reviews)

In [17]:
# convert each review into a list of word indices from t1's vocabulary;
# the lists have different lengths because the reviews do
x = t1.texts_to_sequences(reviews)

In [18]:
# display the encoded reviews: one variable-length list of indices per review
x

[[1, 2, 3],
 [4, 5],
 [6, 7, 8],
 [9, 10, 11],
 [12],
 [13, 14, 1, 15],
 [16, 17, 18],
 [19]]

In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
# Force every sequence to exactly 3 entries:
#   - shorter sequences are padded with 0 at the front (padding='pre' is the
#     library default, spelled out here for clarity);
#   - longer sequences lose their trailing entries (truncating='post').
X = pad_sequences(x, maxlen=3, padding='pre', truncating='post')

In [21]:
# display the padded result: a 2-D integer array with one length-3 row per review
X

array([[ 1,  2,  3],
       [ 0,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [ 0,  0, 12],
       [13, 14,  1],
       [16, 17, 18],
       [ 0,  0, 19]], dtype=int32)