In [11]:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.corpora.dictionary import Dictionary

In [2]:
# Toy corpus: two short English sentences that the cells below tokenize and encode.
samples = [
    'The cat sat on the mat.',
    'The dog ate my homework.',
]

In [19]:
# Lowercase every sentence and split it into tokens with NLTK.
tokenized_samples = [word_tokenize(sent.lower()) for sent in samples]

# Build the stop-word lookup once. stopwords.words() returns a list, so calling
# it inside the comprehension rebuilt that list and scanned it linearly for
# every single token; a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# Tokens with English stop words and non-alphabetic tokens (e.g. '.') removed.
no_stop_no_punk = [
    [word for word in sent if word not in english_stopwords and word.isalpha()]
    for sent in tokenized_samples
]

# NOTE(review): the dictionary and corpus are built from the *unfiltered*
# tokens, so punctuation and stop words receive ids too; `no_stop_no_punk`
# is not used here. Confirm whether the filtered tokens were intended.
dictionary = Dictionary(tokenized_samples)

# One bag-of-words per sample, produced by Dictionary.doc2bow.
corpus = [dictionary.doc2bow(sample) for sample in tokenized_samples]

In [20]:
# Inspect the token -> integer id mapping held by the gensim Dictionary.
dictionary.token2id

{'.': 0,
 'cat': 1,
 'mat': 2,
 'on': 3,
 'sat': 4,
 'the': 5,
 'ate': 6,
 'dog': 7,
 'homework': 8,
 'my': 9}

In [21]:
# Show the doc2bow result for each sample: a list of (token_id, count) pairs.
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)],
 [(0, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]]

In [32]:
# Hand-rolled vocabulary: map each distinct whitespace-separated token to an
# integer id. Ids start at 1, so id 0 is never assigned to any word.
# The split is purely on whitespace, so case and attached punctuation are
# kept as-is ('The' and 'the' are different entries, and so is 'mat.').
token_index = {}
for sample in samples:
    for word in sample.split():
        # setdefault leaves words that are already present untouched.
        token_index.setdefault(word, len(token_index) + 1)

total_words = len(token_index)
print("There are totally {} words".format(total_words))

There are totally 10 words


In [52]:
# Only consider the first max_length words in each sample.
max_length = 10

# One-hot array indexed as (sample, position in sample, token id):
# results[i, j, k] == 1 means the j-th word of sample i has id k in token_index.
# The last axis has max id + 1 slots because token_index ids start at 1;
# slot 0 is therefore never set.
results = np.zeros(
    shape=(len(samples), max_length, max(token_index.values()) + 1)
)

for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        # Direct lookup on purpose: token_index.get(word) returns None for an
        # unknown word, and NumPy treats a None index as np.newaxis, so the
        # assignment would silently set the entire vocabulary row to 1.
        # A KeyError is the safer failure mode.
        index = token_index[word]
        results[i, j, index] = 1

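`results` is a one-hot array indexed as `[document, time_step, word_index]`: entry `[i, j, k]` is 1 when the `j`-th word of document `i` has index `k` in `token_index`.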

In [53]:
# Display the full one-hot array (small enough to print for this toy corpus).
results

array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0

In [54]:
from keras.preprocessing.text import Tokenizer

In [61]:
# Keras Tokenizer; num_words caps how many words are used when vectorizing
# (see the Keras docs for the exact cut-off rule).
tokenizer = Tokenizer(num_words=1000)

# Learn the word index from the samples, then turn each sample into a list
# of integer word ids.
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)

# Print the encoded sequences next to the raw text for comparison.
print(
    "Sequences: {}".format(sequences),
    "Samples:  {}".format(samples),
    sep="\n",
)

Sequences: [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
Samples:  ['The cat sat on the mat.', 'The dog ate my homework.']


In [67]:
# Vectorize the samples with the fitted tokenizer in 'binary' mode
# (per the Keras docs: one row per sample, 1 where a word occurs, else 0).
onehot_results = tokenizer.texts_to_matrix(
    samples,
    mode='binary',
)

In [74]:
# Word -> integer id mapping exposed by the tokenizer after fit_on_texts.
word_index = tokenizer.word_index
# Bare expression so the notebook displays the mapping.
word_index

{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'ate': 7,
 'my': 8,
 'homework': 9}

In [73]:
# Number of entries in the tokenizer's word index.
num_unique_tokens = len(word_index)
print("Found {} unique tokens.".format(num_unique_tokens))

Found 9 unique tokens.
