In [1]:
import numpy as np
import pickle
from collections import Counter
from mltools.preprocessing import Tokenizer, Indexer, Pipeline, LabelIndexer
from keras.preprocessing import sequence

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read, instead of being left to the garbage collector.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
train_sents   = load_pickle('../00_data/snips/train_sents.pkl')
train_tags    = load_pickle('../00_data/snips/train_tags.pkl')
train_intents = load_pickle('../00_data/snips/train_intents.pkl')

# NOTE(review): the "test" variables are read from the val_* files — confirm this is intended.
test_sents    = load_pickle('../00_data/snips/val_sents.pkl')
test_tags     = load_pickle('../00_data/snips/val_tags.pkl')
test_intents  = load_pickle('../00_data/snips/val_intents.pkl')

In [3]:
# Distinct tokens that occur anywhere in the training sentences.
vocab = list({w for s in train_sents for w in s})
len(vocab)

10305

In [4]:
# Vocabulary cap passed to the word Indexer as max_vocab. The training data
# contains 10305 distinct tokens, so the cap is slightly below the full vocabulary.
# NOTE(review): how Indexer treats tokens beyond the cap is not visible here — confirm.
VOCABSIZE = 10000

In [5]:
# Token count of every training sentence, then its mean and standard deviation.
lens = list(map(len, train_sents))
np.mean(lens), np.std(lens)

(9.243978525827046, 3.1407135430676423)

In [6]:
# Fixed sequence length: used as maxlen when padding/truncating the tag sequences
# and passed to the word Indexer as max_len. Training sentences average ~9.2 tokens
# with a standard deviation of ~3.1, so 15 is roughly mean + 2 std.
MAXLEN = 15

### label, sentence encoding

In [7]:
# Indexer for the sentence-level intent labels.
# NOTE(review): zero_pad=False presumably means index 0 is a real class rather than a
# reserved padding id — confirm against mltools.preprocessing.LabelIndexer.
intent_indexer = LabelIndexer(zero_pad=False)

In [8]:
# Fit the intent indexer on the training intents only, then encode both splits.
# Each encoded vector gets a trailing axis so it becomes a column of shape (n, 1).
intent_indexer.fit(train_intents)
int_trn_idxes, int_tst_idxes = (
    np.expand_dims(np.array(intent_indexer.transform(split)), axis=1)
    for split in (train_intents, test_intents)
)
int_trn_idxes.shape

(13784, 1)

In [9]:
# Indexer for the per-token slot tags.
# NOTE(review): zero_pad=True presumably reserves index 0 for padding, which would line
# up with the zero padding applied to the tag sequences later — confirm against
# mltools.preprocessing.LabelIndexer.
label_indexer = LabelIndexer(zero_pad=True)

In [10]:
%%time
all_tags = train_tags + test_tags
label_indexer.fit(all_tags)
tags_trn_idxes = label_indexer.transform(train_tags)
tags_tst_idxes = label_indexer.transform(test_tags)
cx = label_indexer.inverse_transform(tags_tst_idxes[0:2])
print(cx[0])

['NONE', 'NONE', 'NONE', 'NONE', 'object_type', 'NONE', 'object_name', 'object_name']
CPU times: user 56.6 ms, sys: 1 µs, total: 56.6 ms
Wall time: 56.2 ms


In [11]:
# Bring every tag sequence to exactly MAXLEN entries: shorter ones are padded at the
# end, longer ones are cut at the end.
tags_trn_idxes, tags_tst_idxes = (
    sequence.pad_sequences(seqs, maxlen=MAXLEN, padding='post', truncating='post')
    for seqs in (tags_trn_idxes, tags_tst_idxes)
)
tags_trn_idxes.shape

(13784, 15)

In [12]:
# Give the padded tag arrays a trailing singleton axis: (n, MAXLEN) -> (n, MAXLEN, 1).
# Reshaping to "first two dims + (1,)" produces the same result as [:, :, np.newaxis]
# on the first run, but keeps the cell idempotent: re-running it leaves an
# (n, MAXLEN, 1) array unchanged instead of growing it to (n, MAXLEN, 1, 1).
# NOTE(review): the extra axis is presumably what the downstream loss expects — confirm.
tags_trn_idxes = tags_trn_idxes.reshape(tags_trn_idxes.shape[:2] + (1,))
tags_tst_idxes = tags_tst_idxes.reshape(tags_tst_idxes.shape[:2] + (1,))

In [13]:
# Word-level indexing pipeline: a case-preserving tokenizer followed by an indexer
# configured with the MAXLEN sequence length and the VOCABSIZE vocabulary cap.
word_steps = [
    ('tknzr', Tokenizer(min_count=1, lower=False)),
    ('idxer', Indexer(max_vocab=VOCABSIZE, max_len=MAXLEN)),
]
word_idxpipe = Pipeline(word_steps)

In [14]:
%%time
sent_trn_idxes = word_idxpipe.fit_transform([' '.join(s) for s in train_sents])
sent_tst_idxes = word_idxpipe.transform([' '.join(s) for s in test_sents])
cx = word_idxpipe.inverse_transform(sent_tst_idxes[0:2])
print(cx[0])

['wish', 'to', 'find', 'the', 'movie', 'the', 'heart', 'beat']
CPU times: user 312 ms, sys: 3.29 ms, total: 315 ms
Wall time: 315 ms


In [15]:
# Sanity check: dimensions of the encoded training sentences.
np.asarray(sent_trn_idxes).shape

(13784, 15)

In [16]:
# Sanity check: dimensions of the encoded training tags.
tags_trn_idxes.shape

(13784, 15, 1)

In [17]:
# Sanity check: dimensions of the encoded training intents.
int_trn_idxes.shape

(13784, 1)

In [18]:
# The intent ids present in each split; both splits should show the same set of classes.
tuple(np.unique(ids) for ids in (int_trn_idxes, int_tst_idxes))

(array([0, 1, 2, 3, 4, 5, 6]), array([0, 1, 2, 3, 4, 5, 6]))

In [19]:
# Write every encoded array to its .npy file (sentences, tags, then intents; train
# before test within each pair).
for npy_path, encoded in (
    ('../00_data/encoded/snips_x_train.npy', sent_trn_idxes),
    ('../00_data/encoded/snips_x_test.npy', sent_tst_idxes),
    ('../00_data/encoded/snips_y_tags_train.npy', tags_trn_idxes),
    ('../00_data/encoded/snips_y_tags_test.npy', tags_tst_idxes),
    ('../00_data/encoded/snips_y_int_train.npy', int_trn_idxes),
    ('../00_data/encoded/snips_y_int_test.npy', int_tst_idxes),
):
    np.save(npy_path, encoded)

In [20]:
# Persist the fitted indexers so the same encodings can be reused later.
# Each file is opened in a `with` block so the handle is flushed and closed right
# after the dump instead of relying on garbage collection.
for pkl_path, fitted in (
    ("../00_data/encoded/snips_intent_indexer.pkl", intent_indexer),
    ("../00_data/encoded/snips_label_indexer.pkl", label_indexer),
    ("../00_data/encoded/snips_sent_indexer.pkl", word_idxpipe),
):
    with open(pkl_path, "wb") as fh:
        pickle.dump(fitted, fh)