In [1]:
'''This script loads pre-trained word embeddings (GloVe embeddings)
into a frozen Keras Embedding layer, and uses it to
train a text classification model on the 20 Newsgroup dataset
(classification of newsgroup messages into 20 different categories).

GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)

20 Newsgroup data can be found at:
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
'''

from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model


# Length that every tokenized document is brought to by pad_sequences
# (maxlen); also the input length of the model.
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap passed to the Tokenizer and used to size the embedding matrix.
MAX_NUM_WORDS = 20000
# Width of each word vector; must match the GloVe file that is loaded (100d).
EMBEDDING_DIM = 100
# Fraction of the shuffled samples held out as the validation set.
VALIDATION_SPLIT = 0.2

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [3]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

glove_path = '/media/jma/DATA/glove.6B/glove.6B.100d.txt'

# On Python 3, decode explicitly as UTF-8 instead of relying on the locale's
# default encoding; Python 2's built-in open() has no `encoding` argument.
glove_open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'utf-8'}

embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(glove_path, **glove_open_kwargs) as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        # each line is: <word> <coef_1> ... <coef_n>
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# show the embedding vector stored for the word 'home'
embeddings_index['home']

Indexing word vectors.
Found 400000 word vectors.


array([-0.092998,  0.16297 ,  0.73724 , -0.37971 , -0.077342,  0.76823 ,
       -0.074471,  0.2472  , -0.29568 ,  0.26877 ,  0.16257 , -0.57607 ,
        0.14767 ,  0.17896 ,  0.12081 , -0.051907,  0.43795 ,  0.25028 ,
       -0.62697 ,  0.6331  ,  0.43648 ,  0.43814 ,  0.75078 ,  0.42543 ,
        0.084075, -0.31261 , -0.49792 , -0.60304 , -0.32523 ,  0.1209  ,
       -0.077789,  0.26272 ,  0.29894 ,  0.85102 ,  0.13084 ,  0.47982 ,
       -0.43993 ,  0.47408 , -0.06119 , -0.48955 ,  0.3537  , -0.50504 ,
        0.12695 , -0.6797  ,  0.54241 ,  0.13003 ,  0.19291 , -0.17411 ,
        0.85916 , -0.26002 , -0.30243 , -0.51926 ,  0.10875 ,  0.71831 ,
        0.019399, -2.7363  , -0.54512 , -0.34702 ,  1.6927  ,  0.58821 ,
       -0.24435 ,  0.59183 ,  0.13423 , -0.16698 ,  0.25298 ,  0.26853 ,
        0.63114 ,  0.046757, -0.12598 ,  0.045965, -0.3553  , -0.020918,
       -0.14581 , -1.2608  ,  0.16349 ,  0.62281 , -0.12171 , -0.030637,
       -0.89189 ,  0.26495 ,  0.50661 ,  0.70529 , 

In [11]:
# second, prepare text samples and their labels
print('Processing text dataset')

# root directory holding one sub-directory per newsgroup
text_data_dir = '/media/jma/DATA/20_newsgroup/'

# Python 2's built-in open() has no `encoding` argument; on Python 3 read the
# posts as latin-1. Evaluated once here instead of once per file.
text_open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(text_data_dir)):
    path = os.path.join(text_data_dir, name)
    if os.path.isdir(path):
        # ids are assigned in sorted directory order: 0, 1, 2, ...
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            # only files whose name is purely numeric are treated as posts
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                # `with` closes the file even if read() raises
                with open(fpath, **text_open_kwargs) as f:
                    t = f.read()
                i = t.find('\n\n')  # skip header
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                labels.append(label_id)

print('Found %s texts.' % len(texts))

Processing text dataset
Found 19997 texts.


In [14]:
# Spot-check one loaded sample. The text starts at the first blank line of
# the file because the loading loop drops everything before it (the header).
texts[1]

'\n\nArchive-name: atheism/introduction\nAlt-atheism-archive-name: introduction\nLast-modified: 5 April 1993\nVersion: 1.2\n\n-----BEGIN PGP SIGNED MESSAGE-----\n\n                          An Introduction to Atheism\n                       by mathew <mathew@mantis.co.uk>\n\nThis article attempts to provide a general introduction to atheism.  Whilst I\nhave tried to be as neutral as possible regarding contentious issues, you\nshould always remember that this document represents only one viewpoint.  I\nwould encourage you to read widely and draw your own conclusions; some\nrelevant books are listed in a companion article.\n\nTo provide a sense of cohesion and progression, I have presented this article\nas an imaginary conversation between an atheist and a theist.  All the\nquestions asked by the imaginary theist are questions which have been cropped\nup repeatedly on alt.atheism since the newsgroup was created.  Some other\nfrequently asked questions are answered in a companion article.

In [15]:
# Fit a tokenizer on the corpus; num_words limits the vocabulary to
# MAX_NUM_WORDS (see the Keras Tokenizer documentation for the exact rule).
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)

# word -> integer id mapping learned from the corpus
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# encode every document as a list of integer word ids
sequences = tokenizer.texts_to_sequences(texts)

Found 174074 unique tokens.


In [16]:
# Show the first document as the list of integer word ids returned by
# tokenizer.texts_to_sequences.
sequences[0]

[1237,
 273,
 1213,
 1439,
 1071,
 1213,
 1237,
 273,
 1439,
 192,
 2515,
 348,
 2964,
 779,
 332,
 28,
 45,
 1628,
 1439,
 2516,
 3,
 1628,
 2144,
 780,
 937,
 29,
 441,
 2770,
 8854,
 4601,
 7969,
 11979,
 5,
 12806,
 75,
 1628,
 19,
 229,
 29,
 1,
 937,
 29,
 441,
 2770,
 6,
 1,
 118,
 558,
 2,
 90,
 106,
 482,
 3979,
 6602,
 5375,
 1871,
 12260,
 1632,
 17687,
 1828,
 5101,
 1828,
 5101,
 788,
 1,
 8854,
 4601,
 96,
 4,
 4601,
 5455,
 64,
 1,
 751,
 563,
 1716,
 15,
 71,
 844,
 24,
 20,
 1971,
 5,
 1,
 389,
 8854,
 744,
 1023,
 1,
 7762,
 1300,
 2912,
 4601,
 8,
 73,
 1698,
 6,
 1,
 118,
 558,
 2,
 1828,
 5101,
 16500,
 13447,
 73,
 1261,
 10982,
 170,
 66,
 6,
 1,
 869,
 2235,
 2544,
 534,
 34,
 79,
 8854,
 4601,
 29,
 6603,
 3388,
 264,
 1505,
 535,
 49,
 12,
 343,
 66,
 60,
 155,
 2,
 6603,
 1043,
 1,
 427,
 8,
 73,
 1698,
 618,
 4601,
 417,
 1628,
 632,
 11716,
 4602,
 814,
 1628,
 691,
 3,
 1,
 467,
 2163,
 3,
 2266,
 7491,
 5,
 48,
 15,
 40,
 135,
 378,
 8,
 1,
 467,
 6359,
 

In [17]:
# bring every id sequence to exactly MAX_SEQUENCE_LENGTH entries so the
# samples stack into a single 2D integer array
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# NOTE: `labels` is rebound here from a list of label ids to a one-hot matrix,
# so this cell must not be re-run without first re-running the cell that
# builds `labels`.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
# NOTE: the shuffle is unseeded, so the split differs from run to run.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice from an explicit boundary rather than with a negative offset:
# data[:-0] is empty and data[-0:] is the whole array, so a validation size
# of 0 would otherwise swap the two sets.
num_train_samples = data.shape[0] - num_validation_samples

x_train = data[:num_train_samples]
y_train = labels[:num_train_samples]
x_val = data[num_train_samples:]
y_val = labels[num_train_samples:]

print('Preparing embedding matrix.')

# prepare embedding matrix
# Tokenizer word ids start at 1 (0 is the padding value), so the matrix needs
# len(word_index) + 1 rows to hold the largest id when the vocabulary is
# smaller than MAX_NUM_WORDS.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        # id falls outside the retained vocabulary
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Training model.')

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# one softmax output per newsgroup label
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(x_val, y_val))

Shape of data tensor: (19997, 1000)
Shape of label tensor: (19997, 20)
Preparing embedding matrix.
Training model.
Train on 15998 samples, validate on 3999 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff7cf748940>