In [1]:
%matplotlib inline

# CNN text

In [2]:
import keras
import tensorflow as tf

Using TensorFlow backend.


In [3]:
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Dense, Flatten, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.utils.np_utils import to_categorical

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

## Globals setup

In [None]:
import string

In [None]:
# Labelled text corpus read by load_and_process()
DATA_PATH = '/data/fashion/txt/fashion.dedup.txt'

# GloVe 6B files
GLOVE_6B_VOCAB_PATH = '/data/models/glove.6B.vocab'
GLOVE_6B_PATH = '/data/models/glove.6B.300d.txt'

# GloVe 840B files
GLOVE_840B_VOCAB_PATH = '/data/models/glove.840B.vocab'
GLOVE_840B_PATH = '/data/models/glove.840B.300d.txt'

# Characters removed by normalize(): every ASCII punctuation mark
# except '%' and '-', plus the newline character.
BLACK_LIST = ''.join(ch for ch in string.punctuation if ch not in '%-') + '\n'

In [None]:
def normalize(text, black_list = BLACK_LIST, vocab=None, lowercase =  True, tokenize = False):
    '''
    Cleans up a piece of text
    Arguments:
        text        - string to normalize
        black_list  - characters to delete from the text; a falsy value
                      (None, '') disables the step
        vocab       - container of allowed words; when truthy, every
                      whitespace-separated word not in it is dropped
        lowercase   - lower-case the text (done after character removal,
                      before the vocabulary filter)
        tokenize    - return a list of whitespace-separated tokens instead
                      of a single string
    '''
    if black_list:
        # Honour the caller's black_list (the global constant is only the
        # default). Filtering character by character works on both Python 2
        # and Python 3 strings, unlike the Python-2-only
        # str.translate(None, chars) form.
        banned = set(black_list)
        text = ''.join(ch for ch in text if ch not in banned)
    if lowercase:
        text = text.lower()
    if vocab:
        text = ' '.join([word for word in text.split() if word in vocab])
    if tokenize:
        return text.split()
    return text

## Word-CNN

In [None]:
import logging
# Root logger (no name given). Its level is lowered to DEBUG so that the
# logger.debug(...) calls in the helper functions below pass the level filter.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
import os
import numpy as np

from collections import Counter

from keras.utils.np_utils import to_categorical
from keras.layers import Embedding

def load_and_process(data_path, num_words, maxlen):
    '''
    Reads a labelled text file and converts it to padded index sequences
    and one-hot labels.

    Every non-blank line must have the form "<label> <text>". The first
    9 characters of the label token are dropped (presumably a fastText-style
    "__label__" prefix - TODO confirm against the data file).

    Arguments:
        data_path  - path to the labelled text file
        num_words  - forwarded to the Keras Tokenizer as num_words
        maxlen     - length every sequence is padded / truncated to
    Returns:
        data       - padded sequences (post-padding, post-truncation)
        ids        - one-hot labels; class ids follow descending class frequency
        tokenizer  - the fitted Keras Tokenizer
    Raises:
        ValueError - if a non-blank line has no space separator or the file
                     contains no examples
    '''
    classes, texts = [], []
    with open(data_path, 'rt') as f:
        for line_no, line in enumerate(f, 1):
            # blank lines (e.g. a trailing newline) carry no example
            if not line.strip():
                continue

            parts = line.split(" ", 1)
            if len(parts) != 2:
                raise ValueError(
                    'Line %d of %s has no "<label> <text>" separator'
                    % (line_no, data_path))

            # class preprocessing: drop the 9-character label prefix
            classes.append(parts[0][9:])
            texts.append(parts[1])

    if not classes:
        raise ValueError('No labelled examples found in %s' % data_path)

    # most frequent class gets id 0, the next one id 1, ...
    class_to_id = {
        key: index for (index, (key, _)) in enumerate(Counter(classes).most_common())
    }
    ids = to_categorical([class_to_id[cls] for cls in classes])

    # Setting up keras tokenizer
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    logger.debug('Found %s unique tokens', len(word_index))

    # Padding data
    data = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')

    logger.debug('Shape of data tensor: %s', data.shape)
    logger.debug('Shape of label tensor: %s', ids.shape)

    return data, ids, tokenizer


def load_glove_embeddings(embedding_path, word_index, max_sequence, trainable=True):
    '''
    Loads Glove word vectors and wraps them in a Keras Embedding layer
    Arguments:
        embedding_path  - path to GloVe word embeddings (text format, one
                          "<word> <v1> <v2> ..." entry per line)
        word_index      - dictionary mapping words to their integer index
                          (used as the row of the embedding matrix)
        max_sequence    - passed to the Embedding layer as input_length
        trainable       - whether the embedding weights are trainable
    Returns:
        Embedding layer with len(word_index) + 1 rows; rows of words
        missing from the GloVe file are all zeros
    Raises:
        ValueError      - if the file contains no word vectors
    '''
    logger = logging.getLogger(__name__)

    # create dictionary with embeddings
    embeddings_index = {}
    with open(embedding_path) as f:
        for line in f:
            # tolerate blank lines (e.g. a trailing newline)
            if not line.strip():
                continue
            word, coefs = line.split(" ", 1)
            coefs = np.asarray(coefs.split(), dtype='float32')
            embeddings_index[word] = coefs

    if not embeddings_index:
        # without this guard the dimension lookup below fails with a NameError
        raise ValueError('No word vectors found in %s' % embedding_path)

    logger.debug('Found %s word vectors', len(embeddings_index))

    # for convenience; the vector size is taken from the last vector read
    nrows, ncols = len(word_index) + 1, coefs.shape[0]
    logger.debug("rows %s, columns %s", nrows, ncols)

    # words not found in embedding index will be all-zeros
    embedding_matrix = np.zeros((nrows, ncols))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(nrows,
                                ncols,
                                weights=[embedding_matrix],
                                input_length=max_sequence,
                                trainable=trainable)
    return embedding_layer


def train_val_split(data, labels, split_ratio, seed=0):
    '''
    Shuffles data and labels together and splits them into a training
    and a validation set
    Arguments:
        data         - array of samples, indexed along the first axis
        labels       - array of labels, aligned with data along the first axis
        split_ratio  - fraction of the samples that goes to the validation set
        seed         - seed for the shuffle; note that it is applied to
                       numpy's global random state
    Returns:
        x_train, y_train, x_val, y_val
    '''
    # set seed
    np.random.seed(seed)

    # shuffle indices
    nb_samples = data.shape[0]
    indices = np.arange(nb_samples)
    np.random.shuffle(indices)

    data = data[indices]
    labels = labels[indices]

    nb_validation_samples = int(split_ratio * nb_samples)
    nb_validation_samples = min(max(nb_validation_samples, 0), nb_samples)

    # Split on an explicit index. Slicing with a negative count breaks when
    # the count is 0: data[:-0] is empty and data[-0:] is the whole array,
    # which would swap the two sets.
    split_at = nb_samples - nb_validation_samples

    x_train = data[:split_at]
    y_train = labels[:split_at]

    x_val = data[split_at:]
    y_val = labels[split_at:]

    return x_train, y_train, x_val, y_val

In [None]:
# Vocabulary size handed to the Keras Tokenizer (its num_words argument)
MAX_WORDS = 7000
# Every sequence is padded / truncated to this many tokens
MAX_SENT_LENGTH = 400

In [None]:
# Read the corpus: padded index sequences, one-hot labels and the fitted tokenizer
data, labels, tokenizer = load_and_process(DATA_PATH, MAX_WORDS, MAX_SENT_LENGTH)

In [None]:
# split into train and validation sets (10% of the samples go to validation)
x_train, y_train, x_val, y_val = train_val_split(data, labels, 0.1)

In [None]:
# Embedding layer initialised from the GloVe 6B vectors; trainable by default
embedding_layer = load_glove_embeddings(GLOVE_6B_PATH, tokenizer.word_index, MAX_SENT_LENGTH)

In [None]:
# Hyperparameters
dropout_rate = 0.2
# number of output classes = width of the one-hot label matrix
nb_classes = labels.shape[1]
weight_decay = 0.01
# This value was originally assigned to `weight_decay` a second time, which
# overwrote the 0.01 above. NOTE(review): presumably it is the number of
# convolution filters - confirm the intended name.
nb_filters = 128
filter_length = 5

In [None]:
# Layer stack: embedding -> two Conv1D / MaxPooling1D stages -> flatten ->
# dropout -> l2-regularized dense layer (named "embedding") -> softmax classifier
model = Sequential([
    embedding_layer,
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(3),
    Flatten(),
    Dropout(dropout_rate),
    Dense(1024, name="embedding", activation='relu', kernel_regularizer=l2(.01)),
    Dense(nb_classes, activation='softmax'),
])

# Optimizer
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

# Compile with categorical cross-entropy, tracking accuracy
model.compile(optimizer=adam,
              loss='categorical_crossentropy',
              metrics=['acc'])

In [None]:
# Print the layer-by-layer overview of the compiled model
model.summary()

In [None]:
# Train for 10 epochs, reporting metrics on the validation split after each epoch
model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=128,
    verbose=1)

In [None]:
# evaluate model on training data
logger.info("Train data")
model.evaluate(x_train, y_train, batch_size=128)

In [None]:
# evaluate model on validation data
logger.info("Validation data")
model.evaluate(x_val, y_val, batch_size=128)

## Inspect model

In [None]:
# The Keras backend is only imported further down (in the export section).
# Import it here as well so this cell runs on a fresh kernel.
from keras import backend as K

# TensorFlow session used by the Keras backend
sess = K.get_session()

In [None]:
# This is the exported input tensor (placeholder)

In [None]:
# Look up the input tensor in the session graph by name.
# NOTE(review): 'embedding_1_input' looks like an auto-generated Keras name -
# confirm it still matches if the embedding layer is rebuilt in the same kernel.
input_ = sess.graph.get_tensor_by_name('embedding_1_input:0')

In [None]:
# This is the ReLU output of the dense layer named "embedding"

In [None]:
# Look up a tensor by name; presumably the ReLU output of the Dense layer
# that was created with name="embedding" - verify against model.summary()
features = sess.graph.get_tensor_by_name("embedding/Relu:0")

### Export model

In [None]:
# Target directory for exported artifacts; the tokenizer pickle is written here
export_path = '/data/models/cnn-word-fashion/' # where to save the exported graph

In [None]:
from keras import backend as K

# Fix the Keras learning phase to 0 (test mode):
# all new operations will be in test mode from now on (dropout, etc.)
K.set_learning_phase(0) 

In [None]:
# serialize the model and get its weights, for quick re-building
# (architecture as a config object, parameters as a list of arrays)
config = model.get_config()
weights = model.get_weights()

# re-build a model where the learning phase is now hard-coded to 0:
# same architecture from the config, then copy the trained weights over
production_model = Sequential.from_config(config)
production_model.set_weights(weights)

In [None]:
### export tokenizer to ensure consistency between training and production
import pickle

# make sure the export directory exists, otherwise open() below fails
if not os.path.isdir(export_path):
    os.makedirs(export_path)

# pickle protocol 2 can also be read from Python 2
with open(os.path.join(export_path, 'tokenizer.pickle'), 'wb') as f_:
    pickle.dump(tokenizer, f_, protocol=2)