In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from faker import Factory
# use example here:
from keras.preprocessing import sequence, text
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

from model_data import create_data_sample

In [8]:
def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))
 
def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.
    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for i in range(len(new_list) - ngram_range + 1):
            for ngram_value in range(2, ngram_range + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

In [9]:
data, label = create_data_sample()
max_len = max([len(x.split()) for x in data])

In [10]:
x_train_text, x_test_text, y_train,  y_test = train_test_split(data, label, test_size=0.3, random_state=0)

In [11]:
# use keras tokenizer to convert sentence to sequence?
sent_to_seq = text.Tokenizer()
sent_to_seq.fit_on_texts(x_train_text)
x_train = sequence.pad_sequences(sent_to_seq.texts_to_sequences(x_train_text), maxlen=max_len)
x_test = sequence.pad_sequences(sent_to_seq.texts_to_sequences(x_test_text), maxlen=max_len)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

1400 train sequences
600 test sequences
Average train sequence length: 28
Average test sequence length: 28


In [42]:
# see what each thing does...
print("Input: {}".format(x_train_text[0]))
print("Output: {}".format(sent_to_seq.texts_to_sequences(x_train_text[0])))

Input: About slave? furniture perfect at it sleep furniture empty by in colored turn human food food swat on off in.
Output: [[40], [], [55], [], [], [], [], [], [40], [], [], [], [], [], [], [], [], [106], [], [], [], [], [], [], [], [], [], [], [], [], [], [40], [], [], [106], [], [], [], [], [], [], [], [], [], [], [], [], [106], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [106], [], [], [], [55], [], [55], [], [], [], [], [], [], [], [], [], [], [], [], [40], [], [], [], [55], [55], [], [], [], [55], [55], [], [], [], [], [40], [], [], [55], [], [], [55], [], [], [], [106], [], []]


In [12]:
# config for model
ngram_range = 1
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 5
if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)
     
    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}
     
    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1
     
    # Augmenting x_train and x_test with n-grams features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

In [13]:
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('Build model...')

Pad sequences (samples x time)
x_train shape: (1400, 400)
x_test shape: (600, 400)
Build model...


In [26]:
model = Sequential()
# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D())
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(embedding_dims, activation='sigmoid', name='wordembedding')) # this line is not in the original fasttext but is here for transfer learning purposes
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [27]:
x_train.shape

(1400, 400)

In [28]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 400, 50)           1000000   
_________________________________________________________________
global_average_pooling1d_3 ( (None, 50)                0         
_________________________________________________________________
wordembedding (Dense)        (None, 50)                2550      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 1,002,601
Trainable params: 1,002,601
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Train on 1400 samples, validate on 600 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1ec8b9c9f60>

In [29]:
word_embedding = Model(inputs=model.input,
                       outputs=model.get_layer(name='wordembedding').output)

In [33]:
word_train = word_embedding.predict(x_train[0].reshape(1, -1))
word_train

array([[ 0.48725   ,  0.50409222,  0.49800101,  0.5019564 ,  0.50054312,
         0.49356806,  0.50185543,  0.49676517,  0.50155139,  0.51086712,
         0.50085634,  0.5041427 ,  0.48948705,  0.50370318,  0.49648932,
         0.50162876,  0.49838334,  0.4910228 ,  0.50204504,  0.49694443,
         0.50739795,  0.5007835 ,  0.50935251,  0.50635475,  0.49994463,
         0.49171507,  0.50275052,  0.49015096,  0.50876051,  0.50151145,
         0.50567245,  0.49855009,  0.4858366 ,  0.50136203,  0.48755947,
         0.48962745,  0.49617743,  0.49564189,  0.50012571,  0.50763088,
         0.50040954,  0.49668911,  0.50344276,  0.4970898 ,  0.5002985 ,
         0.50465459,  0.50815284,  0.49907047,  0.49568477,  0.49652776]], dtype=float32)