In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from faker import Factory
# use example here:
from keras.preprocessing import sequence, text
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

from model_data import create_data_sample

Using TensorFlow backend.


In [2]:
# Load the raw sentences and their labels from the project helper.
data, label = create_data_sample()
# Longest sentence measured in whitespace-separated tokens; used later as the
# fixed sequence length for padding and for the embedding layer.
max_len = max(len(sentence.split()) for sentence in data)

In [3]:
# Hold out 30% of the samples for evaluation (scikit-learn split).
# The fixed random_state makes the split repeatable between runs.
x_train_text, x_test_text, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.3,
    random_state=0,
)

In [4]:
# Build the word -> index vocabulary from the training sentences only.
sent_to_seq = text.Tokenizer()
sent_to_seq.fit_on_texts(x_train_text)

# Encode each split as integer sequences and bring every sequence to max_len
# so both splits can be fed to the network as fixed-width arrays.
x_train, x_test = (
    sequence.pad_sequences(sent_to_seq.texts_to_sequences(split_text), maxlen=max_len)
    for split_text in (x_train_text, x_test_text)
)

In [5]:
# --- CNN hyper-parameters -------------------------------------------------
# Embedding layer: number of rows in the embedding table and vector width.
max_features = 5000
embedding_dims = 50

# Convolution layer: number of filters and width of each filter (in tokens).
filters = 250
kernel_size = 3

# Width of the dense hidden layer that follows the pooling step.
hidden_dims = 250

# Training loop: mini-batch size and number of epochs.
batch_size = 32
epochs = 2


In [6]:

# Build, compile and train a small 1D-CNN text classifier:
# Embedding -> Conv1D -> GlobalMaxPooling1D -> Dense(relu) -> Dense(2) + softmax.

# The tokenizer was fitted without a `num_words` cap, so the largest index it
# can emit is len(word_index). The embedding table must have at least
# (max index + 1) rows, otherwise a vocabulary larger than max_features would
# produce out-of-range lookups. For small vocabularies this stays max_features.
vocab_size = max(max_features, len(sent_to_seq.word_index) + 1)

model = Sequential()
# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(vocab_size,
                    embedding_dims,
                    input_length=max_len))
# model.add(Dropout(0.2)) # optional regularization layer
# we add a Conv1D layer, which learns `filters` word-group filters,
# each spanning `kernel_size` consecutive tokens:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling over the sequence axis (one value per filter):
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer. It is named so that its activations can be
# extracted later as a fixed-size feature vector for each input sequence.
model.add(Dense(hidden_dims, activation='relu', name='wordembedding'))
# model.add(Dropout(0.2)) # optional regularization layer
# We project onto one output unit per class and normalise with softmax
# (multinomial logistic regression). Two classes here; for k classes this
# would be `Dense(k)`.
model.add(Dense(2))
model.add(Activation('softmax'))
# Sparse categorical cross-entropy pairs with the 2-unit softmax output;
# assumes the labels are integer class ids - TODO confirm against
# create_data_sample.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# NOTE: the held-out split is used as validation data during training, so the
# test accuracy reported later is not measured on data unseen by this loop.
# `epochs` comes from the settings cell instead of a hard-coded literal.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Train on 1400 samples, validate on 600 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1cd2ef41ef0>

In [8]:
# Print the layer-by-layer architecture (output shapes and parameter counts)
# of the model built above.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 50)            250000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 26, 250)           37750     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
wordembedding (Dense)        (None, 250)               62750     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 502       
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
Total params: 351,002
Trainable params: 351,002
Non-trainable params: 0
_________________________________________________________________


In [7]:

# Evaluate the classifier. Taking the arg-max of the two softmax outputs is a
# crude decision rule (normally a threshold would be chosen), but it is
# sufficient in this instance.
yh_train = model.predict(x_train).argmax(axis=1)
yh_test = model.predict(x_test).argmax(axis=1)

# Both accuracies are expected to be very high.
print("Train accuracy: {}".format(accuracy_score(y_true=y_train, y_pred=yh_train)))
print("Test accuracy: {}".format(accuracy_score(y_true=y_test, y_pred=yh_test)))

# Feature extractor: re-uses the trained model's input and exposes the output
# of the dense layer named 'wordembedding' as a vector.
word_embedding = Model(
    inputs=model.input,
    outputs=model.get_layer(name='wordembedding').output,
)
# One 250-dimensional vector per input sequence for each split;
# verify with `word_train.shape` or `word_test.shape`.
word_train, word_test = (
    word_embedding.predict(split_sequences)
    for split_sequences in (x_train, x_test)
)

Train accuracy: 1.0
Test accuracy: 1.0
