In [1]:
%matplotlib inline

# CNN text

In [2]:
import keras
import tensorflow as tf

Using TensorFlow backend.


In [4]:
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Dense, Flatten, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.utils.np_utils import to_categorical

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

## Globals setup

In [5]:
import string

In [6]:
DATA_PATH = '/data/fashion/txt/fashion.dedup.txt'

GLOVE_6B_VOCAB_PATH = '/data/glove.6B.vocab'
GLOVE_6B_PATH = '/data/glove.6B.300d.txt'

GLOVE_840B_VOCAB_PATH = '/data/glove.840B.vocab'
GLOVE_840B_PATH = '/data/glove.840B.300d.txt'

BLACK_LIST = string.punctuation.replace('%', '').replace('-','') + '\n'

In [7]:
def normalize(text, black_list = BLACK_LIST, vocab=None, lowercase =  True, tokenize = False):
    if black_list:
        text = text.translate(None, BLACK_LIST)
    if lowercase:
        text = text.lower()
    if vocab:
        text = ' '.join([word for word in text.split() if word in vocab])
    if tokenize:
        return text.split()
    return text

## Word-CNN

In [8]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [15]:
import os
import numpy as np

from collections import Counter

from keras.utils.np_utils import to_categorical
from keras.layers import Embedding

def load_and_process(data_path, num_words, maxlen):
    with open(data_path, 'rt') as f:
        classes, texts =  zip(*[line.split(" ", 1) for line in f.readlines()])
        
        # class preprocessing
        classes = [cls[9:] for cls in classes]
        class_to_id = { 
            key: index for (index, (key, value)) in enumerate(Counter(classes).most_common())
        }
        ids = to_categorical([class_to_id[cls] for cls in classes])
    
    # Setting up keras tokenzer
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    
    word_index = tokenizer.word_index
    logger.debug('Found %s unique tokens', len(word_index))

    # Padding data
    data = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')

    logger.debug('Shape of data tensor: %s', data.shape)
    logger.debug('Shape of label tensor: %s', ids.shape)

    return data, ids, word_index


def load_glove_embeddings(embedding_path, word_index, max_sequence, trainable=True):
    '''
    Loads Glove word vectors
    Arguments:
        embedding_path  - path to GloVe word embeddings
        word_index      - dictionary mapping words to their rank
    '''
    logger = logging.getLogger(__name__)

    # create dictionary with embeddings
    embeddings_index = {}
    with open(embedding_path) as f:
        for line in f:
            word, coefs = line.split(" ", 1)
            coefs = np.asarray(coefs.split(), dtype='float32')
            embeddings_index[word] = coefs

    logger.debug('Found %s word vectors with shape', len(embeddings_index))

    # for convenience
    nrows, ncols = len(word_index) + 1, coefs.shape[0]
    logger.debug("rows %s, columns %s", nrows, ncols)

    # words not found in embedding index will be all-zeros
    embedding_matrix = np.zeros((nrows, ncols))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(nrows,
                                ncols,
                                weights=[embedding_matrix],
                                input_length=max_sequence,
                                trainable=trainable)
    return embedding_layer


def train_val_split(data, labels, split_ratio, seed=0):
    '''
    Splits data and lables into training and validation set
    '''
    # set seed
    np.random.seed(seed)

    # shuffle indices
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)

    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(split_ratio * data.shape[0])

    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]

    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    return x_train, y_train, x_val, y_val

In [None]:
MAX_WORDS = 7000
MAX_SENT_LENGTH = 400

In [10]:
data, labels, word_index = load_and_process(DATA_PATH, MAX_SENT_LENGTH, MAX_SENT_LENGTH)

No handlers could be found for logger "root"


In [20]:
# split test, val
x_train, y_train, x_val, y_val = train_val_split(data, labels, 0.1)

In [11]:
embedding_layer = load_glove_embeddings(GLOVE_6B_PATH, word_index, MAX_SENT_LENGTH)

In [12]:
nb_filter = 128
filter_length = 5
dropout_rate = 0.1
nb_classes = labels.shape[1]

In [14]:
model = Sequential()
model.add(embedding_layer)

model.add(Conv1D(nb_filter, filter_length, activation='relu'))
model.add(MaxPooling1D(filter_length))

model.add(Conv1D(nb_filter, filter_length, activation='relu'))
model.add(MaxPooling1D(filter_length))
model.add(Flatten())

# add dropout
model.add(Dropout(dropout_rate))

# add l2 regularization
model.add(Dense(nb_filter, activation='relu', kernel_regularizer=l2(.01)))
model.add(Dense(nb_classes, activation='softmax'))

# Setup optimizer
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

# Compile model
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])

In [None]:
model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=128,
    verbose=1)

Train on 36891 samples, validate on 4098 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

In [None]:
# evalute model on validation data
logger.info("Validation data")
logger.info(model.evaluate(x_val, y_val, batch_size=args.batch_size))

## Char-CNN

# Dataset

In [None]:
with open(DATA_PATH, 'rt') as f:
    classes, texts =  zip(*[line.split(" ", 1) for line in f.readlines()])
    classes = [cls[9:] for cls in classes]

In [None]:
classes[:5]

In [None]:
texts[:5]

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter


labels, values = zip(*Counter(classes).most_common())

# get the EDF
perc = [0.9, 0.95, 0.99]
cumsum = np.cumsum(values) / float(np.sum(values))
idxs = [np.argmax(cumsum >= idx) for idx in perc]

indexes = np.arange(len(labels))
width = 1

plt.figure(figsize=(20, 2))
plt.title("Class frequency (%i classes)" % len(values))
plt.bar(indexes, values, width)
plt.xticks(indexes + width * 0.5, labels)
plt.xticks(rotation=90)
for idx, text in zip(idxs, perc):
    plt.axvline(x=idx, color='r')
    plt.text(idx+1, 3500, "%0.2f at %i" % (text, idx) , color='r')
plt.show()

## Tokens (GloVe comparision)

In [None]:
import itertools

In [None]:
vocab_fasion = Counter(itertools.chain(*[normalize(text, tokenize=True) for text in texts]))
top_words = [k for k,v in vocab_fasion.most_common()]

In [None]:
print("There are %i elements in the dataset" % len(vocab_fasion))
vocab_fasion.most_common(10)

## Sentence length

In [None]:
lengths = [len(text) for text in texts]

In [None]:
plt.style.use('ggplot')

In [None]:
plt.title("Distribution of sentence length")
plt.hist(lengths, bins=20);

In [None]:
long_sent = [text for text in texts if len(text) >= 400]

In [None]:
long_sent[:5]

In [None]:
with open(GLOVE_6B_VOCAB_PATH, 'rt') as f:
    vocab_6b = set([line[:-1] for line in f.readlines()])

In [None]:
with open(GLOVE_840B_VOCAB_PATH, 'rt') as f:
    vocab_840b = set([line[:-1] for line in f.readlines()])

In [None]:
missing_from_6B = Counter({k: vocab_fasion[k] for k in vocab_fasion if k not in vocab_6b})
missing_from_840B = Counter({k: vocab_fasion[k] for k in vocab_fasion if k not in vocab_840b})

In [None]:
print("There are %i elements missing from 840b out of %i" % (len(missing_from_6B), len(vocab_fasion)))
for k, v in missing_from_6B.most_common(10):
    print("- %s with %i occurances, %i most common word" % (k, missing_from_6B[k], top_words.index(k)))

In [None]:
print("There are %i elements missing from 840b out of %i" % (len(missing_from_840B), len(vocab_fasion)))
for k, v in missing_from_840B.most_common(10):
    print("- %s with %i occurances, %i most common word" % (k, v, top_words.index(k)))