# Example notebook 2: visualize word embeddings after fine-tuning
In this example, the embeddings are fine-tuned on a classification task using the 20 Newsgroups dataset.
The classifier is a TextCNN, i.e. a 1D convolutional network used for text classification.

In [1]:
import os

import numpy as np
from zeugma import GloVeTransformer

# Download the pre-trained GloVe embeddings first. If they are already
# present, the download is skipped automatically.
GloVeTransformer.download_embeddings()  

# Instantiate the embedding transformer; it is used further down to look up
# the vector of every vocabulary word when the embedding matrix is built.
embedding_transformer = GloVeTransformer()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Preprocessing

In [2]:
from keras.utils import np_utils
from sklearn.datasets import fetch_20newsgroups

# Load the training split of the 20 newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
# The documents themselves; they are the input of the preprocessing pipeline.
corpus = newsgroups_train.data
# Convert the integer class labels to the categorical matrix expected by the
# 'categorical_crossentropy' loss the model is compiled with.
y_train = np_utils.to_categorical(newsgroups_train.target)

In [3]:
from zeugma import TextsToSequences, Padder
from sklearn.pipeline import make_pipeline

# Sequence length handed to the Padder and, later, to the model's input layer.
max_length = 200
# Vocabulary size limit handed to TextsToSequences; also used below to filter
# the word index.
num_words = 10000

# The two preprocessing steps: texts -> integer sequences -> padded sequences.
sequencer = TextsToSequences(num_words=num_words)
padder = Padder(max_length=max_length)

# Build the Scikit-learn pipeline
pipeline = make_pipeline(sequencer, padder)
# Fit both steps on the corpus and transform it in a single call.
x_train = pipeline.fit_transform(corpus)

In [7]:
# Keep only the words whose index does not exceed num_words, so that the
# embedding matrix (one row per kept word, plus row 0) stays bounded by the
# vocabulary limit given to the sequencer.
word_index = {k: v for k, v in sequencer.word_index.items() if v <= num_words}

def create_embedding_matrix(word_index, embedding_transformer):
    """Build the initial weight matrix of the embedding layer.

    Row ``i`` of the result holds the vector that ``embedding_transformer``
    returns for the word whose index is ``i``. Rows that correspond to no
    word (row 0 in particular) are left at zero.

    Args:
        word_index: dict mapping each word to its non-negative integer index.
        embedding_transformer: object whose ``transform(list_of_texts)``
            returns a 2-D array with one row per text.

    Returns:
        A numpy array of shape ``(n_rows, embedding_dim)``. ``n_rows`` is
        ``len(word_index) + 1``, or ``max(index) + 1`` when the indices are
        not contiguous and that value is larger.
    """
    # Probe the transformer with a single word to discover the vector size.
    embedding_dim = embedding_transformer.transform(['the']).shape[1]
    # Size the matrix by the largest index as well as by the word count:
    # a filtered or otherwise non-contiguous word_index may contain indices
    # greater than len(word_index), which would overflow a matrix sized by
    # the count alone.
    highest_index = max(word_index.values(), default=0)
    n_rows = max(len(word_index), highest_index) + 1
    embedding_matrix = np.zeros((n_rows, embedding_dim))
    for word, i in word_index.items():
        embedding_matrix[i] = embedding_transformer.transform([word])[0]
    return embedding_matrix

# Initial embedding weights: one row per word index, filled from GloVe.
embedding_matrix = create_embedding_matrix(word_index, embedding_transformer)

# Log directory: a 'logs' folder inside the current working directory.
# os.getcwd() is used rather than os.environ['PWD'] because PWD is not set on
# every platform (e.g. Windows) and may differ from the kernel's actual
# working directory, which is where the relative 'logs/' path given to the
# tensorboard command resolves.
LOG_DIR = os.path.join(os.getcwd(), 'logs')

# exist_ok makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs(LOG_DIR, exist_ok=True)

# Write word names in a file for tensorboard display: a first line 'Word'
# followed by one word per line, in the iteration order of word_index.
# The encoding is explicit so that non-ASCII tokens are written the same way
# regardless of the platform's default locale.
with open(os.path.join(LOG_DIR, 'metadata.tsv'), 'w', encoding='utf-8') as f:
    f.write('Word\n' + '\n'.join(word_index.keys()))

## Model definition and training

In [9]:
from keras.models import Model
from keras.layers import concatenate
from keras.layers import Dense, Input, Dropout
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D
from keras.wrappers.scikit_learn import KerasClassifier

def create_model(max_length, embedding_matrix, num_classes=20, filter_sizes=(3,)):
    """Model creation function: returns a compiled TextCNN.

    The network is: trainable embedding layer -> one Conv1D + global max
    pooling branch per filter size -> (concatenation when there are several
    branches) -> dense ReLU layer -> softmax output.

    Args:
        max_length: length of the (padded) integer input sequences.
        embedding_matrix: 2-D array of shape (vocabulary size, embedding
            dimension) used as the initial weights of the embedding layer.
        num_classes: number of output classes (size of the softmax layer).
            Defaults to 20, the value that was previously hard-coded.
        filter_sizes: iterable of convolution kernel sizes, one convolutional
            branch each. Defaults to a single branch of size 3, the value
            that was previously hard-coded.

    Returns:
        A Keras ``Model`` compiled with the categorical cross-entropy loss,
        the Adam optimizer and the accuracy metric.

    Raises:
        ValueError: if ``filter_sizes`` is empty.
    """
    filter_sizes = list(filter_sizes)
    if not filter_sizes:
        raise ValueError('filter_sizes must contain at least one filter size')

    vocabulary_size, embedding_dim = embedding_matrix.shape
    # The layer is named 'embeddings' so that callbacks can refer to it by
    # name; trainable=True is what allows the embeddings to be fine-tuned.
    embedding_layer = Embedding(vocabulary_size,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_length,
                                trainable=True,
                                name='embeddings')

    input_layer = Input(shape=(max_length,), dtype='int32', name='input')
    embedded_sequences = embedding_layer(input_layer)

    # One convolution + global max pooling branch per filter size.
    conv_blocks = []
    for filter_size in filter_sizes:
        conv = Conv1D(128, filter_size, activation='relu',
                      name='conv_' + str(filter_size))(embedded_sequences)
        pool = GlobalMaxPooling1D(name='global_pool_cv_' + str(filter_size))(conv)
        conv_blocks.append(pool)
    # concatenate needs at least two tensors, so a single branch is used as is.
    merge = concatenate(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
    dense = Dense(128, activation='relu', name='dense')(merge)
    preds = Dense(num_classes, activation='softmax', name='softmax')(dense)

    model = Model(inputs=input_layer, outputs=preds)
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

In [11]:
from keras.callbacks import TensorBoard
    
# TensorBoard callback configured to export the embedding layer so that it
# can be explored in the Projector tab.
# NOTE(review): no log_dir is passed, so the callback's default location is
# used - presumably ./logs, which is what the tensorboard command at the end
# of the notebook reads; confirm against the installed Keras version.
tensorboard = TensorBoard(
    histogram_freq=0,
    write_graph=True,
    write_images=True,
    # Frequency (in epochs) at which the embedding layer is saved.
    embeddings_freq=1,
    # Must match the name given to the Embedding layer in create_model.
    embeddings_layer_names=['embeddings'],
    # Maps the layer name to the word-label file written earlier in LOG_DIR.
    embeddings_metadata={'embeddings': os.path.join(LOG_DIR, 'metadata.tsv')},
)

# Use Keras Scikit-learn wrapper to instantiate a TextCNN with all methods
# required by Scikit-learn for the last step of a Pipeline.
# max_length and embedding_matrix are the arguments of create_model; epochs,
# batch_size, verbose and callbacks configure the training run itself.
sklearn_textcnn = KerasClassifier(build_fn=create_model, epochs=2, batch_size=128, 
                                  max_length=max_length, embedding_matrix=embedding_matrix,
                                  verbose=2, callbacks=[tensorboard])

# Train on the padded sequences and the categorical labels; the TensorBoard
# callback writes its logs during this call.
sklearn_textcnn.fit(x_train, y_train)

Epoch 1/2
 - 47s - loss: 2.1000 - acc: 0.4555
Epoch 2/2
 - 50s - loss: 0.7371 - acc: 0.8056


<keras.callbacks.History at 0x1239ebf98>

In [None]:
# Run tensorboard to visualize the embeddings in your
# browser on port 6006: http://localhost:6006
# Navigate to the 'Projector' tab to visualize the embeddings.
# Note: this command keeps running until it is interrupted, so the cell does
# not finish on its own (see the 'Press CTRL+C to quit' message it prints).
!tensorboard --logdir logs/ --host 127.0.0.1

  from ._conv import register_converters as _register_converters
TensorBoard 1.6.0 at http://127.0.0.1:6006 (Press CTRL+C to quit)
