In [1]:
import keras
from keras.datasets import reuters
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

Using TensorFlow backend.


In [2]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)

In [3]:
word_index = reuters.get_word_index()

In [4]:
                                 # this shows how many classes of news reports we have in this dataset
print(len(x_train), len(x_test), max(y_train))
num_classes = max(y_train)+1 # the numbering for the classes begins from 0

8982 2246 45


In [5]:
print(x_train[0])
print(y_train[0])

[1, 27595, 28842, 8, 43, 10, 447, 5, 25, 207, 270, 5, 3095, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 4579, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]
3


In [6]:
index_to_word = {}
for k,v in word_index.items():
    index_to_word[v] = k

In [7]:
print(' '.join([index_to_word[x] for x in x_train[0]]))

the wattie nondiscriminatory mln loss for plc said at only ended said commonwealth could 1 traders now april 0 a after said from 1985 and from foreign 000 april 0 prices its account year a but in this mln home an states earlier and rise and revs vs 000 its 16 vs 000 a but 3 psbr oils several and shareholders and dividend vs 000 its all 4 vs 000 1 mln agreed largely april 0 are 2 states will billion total and against 000 pct dlrs


*** why do we use tokenizer.sequences_to_matrix() function? ***


So we have sequences as the original format of the dataset, and typically we add padding to make all the sequences<br>
of the same length. The reason why we are using sequences_to_matrix here though is that we are trying to transform<br> 
the data into a particular encoding. We don't want a sequence of indices of words that were in the sentence, but<br> 
rather we want a matrix to represent a sequence.<br>

To kind've explain more visually, if we have the sentence "the cat is black" and that is represented by the integer<br> 
sequence [1 4 2 3], then we might want to encode that sequence into a matrix for better learning. We might do this<br> 
(like in the video) by using the binary format. That would create a 4x5 matrix that would look<br> 
like this [[0 1 0 0 0], [0 0 0 0 1], [0 0 1 0 0], [0 0 0 1 0]] where the 1 denotes if a word is present in<br> 
the sentence (this is an example where our only vocabulary is these 4 words; if it was more, then we would have<br> 
a much larger matrix with a lot more zeros in it, if that makes sense). 

Really, we using this convert sequences to matrices functionality to take these dense vectors and transform them<br>  into some more sparse, encoded matrix that has some representation that makes sense in a NLP context. Binary<br>  represents if a words is present, count counts the number of time a word appears, TF-IDF uses a specially<br>  statistical scoring method across sequences to weight words by the term frequency multiplied by their inverse<br>  document frequency, etc<br> 


In [8]:
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
# we text this function that take top 10000 words, then turn each example (training[i]) into a one-hot vector 
# and if the words in article are included in the 10000 then give 1 , otherwise give 0
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

In [9]:
# one hot encoding for the classes
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [10]:
print(x_train.shape)
print(x_train[0])

print(y_train.shape)
print(y_train[0])

(8982, 10000)
[0. 1. 0. ... 0. 0. 0.]
(8982, 46)
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [35]:
model = Sequential()
#   When input data is one-dimensional, such as the MLP, the shape 
#   must explicitly leave room for the shape of the mini-batch size 
#   used when splitting the data when training the network. Hence, 
#   the shape tuple is always defined with a hanging last dimension.
#   For instance, "(2,)", as in the example below:
model.add(Dense(512, input_shape=(max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [36]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               5120512   
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 46)                23598     
_________________________________________________________________
activation_2 (Activation)    (None, 46)                0         
Total params: 5,144,110
Trainable params: 5,144,110
Non-trainable params: 0
_________________________________________________________________


In [37]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.metrics_names)

['loss', 'acc']


In [38]:
batch_size = 32
epochs = 2
# verbose: Integer. 0, 1, or 2. Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch.
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

Instructions for updating:
Use tf.cast instead.
Train on 8083 samples, validate on 899 samples
Epoch 1/2
Epoch 2/2


In [39]:
print(score[0], score[1])

0.8414560103140446 0.8049866429472862


In [12]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
# this way if a word occurs more than one time in the text its count will increase in our one-hot vector
x_train = tokenizer.sequences_to_matrix(x_train, mode='count')
x_test = tokenizer.sequences_to_matrix(x_test, mode='count')

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print(x_train.shape)
print(x_train[0])
print(y_train[0])
print(max(x_train[0]))

(8982, 10000)
[0. 1. 0. ... 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
6.0


In [14]:
model = Sequential()
model.add(Dense(512, input_shape=(max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
batch_size = 32
epochs = 2
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print(score[0], score[1])

Instructions for updating:
Use tf.cast instead.
Train on 8083 samples, validate on 899 samples
Epoch 1/2
Epoch 2/2
0.8778713806760385 0.8103294746480876


In [16]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
# need to fit tokenizer on train data before applying tfidf to it.
tokenizer.fit_on_sequences(x_train)

## TF-IDF : TF(w) * IDF(w)     
#  tf(w) = (Number of times the word appears in a document) / (Total number of words in the document)
#  idf(w) = log(Number of documents / Number of documents that contain word w )
# this way unique words get more attension than words that appear all the time
x_train = tokenizer.sequences_to_matrix(x_train, mode='tfidf')
x_test = tokenizer.sequences_to_matrix(x_test, mode='tfidf')

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print(x_train.shape)
print(x_train[0])
print(y_train[0])
print(max(x_train[0]))

(8982, 10000)
[0.         0.69309152 0.         ... 0.         0.         0.        ]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
6.214608098422191


In [17]:
model = Sequential()
model.add(Dense(512, input_shape=(max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
batch_size = 32
epochs = 2
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print(score[0], score[1])

Train on 8083 samples, validate on 899 samples
Epoch 1/2
Epoch 2/2
1.0234859694568366 0.8018699911218187
