# Lesson 10
### Author: Emily McAfee
### New Topic Identification

In [None]:
# Load basic packages
import numpy as np
import pandas as pd

### 1. Read Reuters dataset into training and testing

In [None]:
# import NN packages
import tensorflow as tf
from tensorflow import keras

In [None]:
# Vocabulary cap handed to the loader as `num_words`
num_of_words = 10000

# Reuters newswire dataset that ships with Keras; load_data() returns the
# (features, labels) pairs for the training and the test split.
data = tf.keras.datasets.reuters
(x_train, y_train), (x_test, y_test) = data.load_data(num_words=num_of_words)

In [None]:
# Explore data: print the first training sample exactly as it was loaded.
# NOTE(review): presumably a list of integer word indices, since the samples
# are later decoded through reverse_word_index — confirm from the output.
print(x_train[0])

### 2. Prepare dataset

In [None]:
# Build lookup tables so the integer-encoded articles can be read as text.
# get_word_index() gives a dictionary mapping words to an integer index;
# every index is shifted up by 3 because the first indices are reserved
# for the special tokens registered right after.
word_index = {
    word: rank + 3
    for word, rank in tf.keras.datasets.reuters.get_word_index().items()
}
word_index.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNK>": 2,  # unknown
    "<UNUSED>": 3,
})

# Inverse table: integer index -> word
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Turn a sequence of integer word indices back into readable text.

    Each index is looked up in the module-level ``reverse_word_index`` table;
    an index without an entry is rendered as '?'. The resulting words are
    joined with single spaces and returned as one string.
    """
    words = (reverse_word_index.get(index, '?') for index in text)
    return ' '.join(words)

In [None]:
# Decode the first training sample back into words. As the last expression of
# the cell, the returned string is what the notebook displays.
decode_review(x_train[0])

In [None]:
# The model needs all input sequences to have the same length, so run both
# splits through the Keras padding helper with maxlen = max_review_length.
# NOTE(review): pad_sequences is called with its defaults, which per the Keras
# documentation pad AND truncate at the front ('pre'). Articles longer than
# 400 tokens therefore keep their LAST 400 tokens, not the first 400 — pass
# truncating='post' if the start of each article is what should be kept.
max_review_length = 400
x_train, x_test = (
    keras.preprocessing.sequence.pad_sequences(split, maxlen=max_review_length)
    for split in (x_train, x_test)
)

### 3. Build and compile 3 different models using Keras LSTM, ideally improving the model at each iteration

#### Model 1 (sigmoid)

In [None]:
# Construct 1st model: Embedding -> LSTM -> Dense classifier
embedding_vecor_length = 32  # size of each learned word vector
model = keras.models.Sequential()
model.add(keras.layers.Embedding(num_of_words, embedding_vecor_length, input_length=max_review_length))
model.add(keras.layers.LSTM(100))
# One output unit per topic: the dataset has 46 topics (integer labels 0-45).
# NOTE(review): 'sigmoid' is kept because this model is the "sigmoid" leg of
# the activation comparison, but sparse_categorical_crossentropy expects the
# outputs to be a probability distribution over the classes, for which
# 'softmax' is the standard output activation. Consider keeping softmax here
# and varying a hidden-layer activation instead.
model.add(keras.layers.Dense(46, activation='sigmoid'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that adds a stray "None" line to the output).
model.summary()

# NOTE: the test split is also used as validation data, so the per-epoch
# validation metrics and the later test evaluation are computed on the same
# samples.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=3, batch_size=64)

In [None]:
# Score the trained model on the test split (verbose=0: no progress output)
scores = model.evaluate(x_test, y_test, verbose=0)
# evaluate() returns the loss followed by the compiled metrics; index 1 is the
# accuracy when the model was compiled with metrics=['accuracy'].
print("Accuracy (sigmoid): %.2f%%" % (100 * scores[1]))

#### Model 2 (tanh)

In [None]:
# Construct 2nd model: Embedding -> LSTM -> Dense classifier
embedding_vecor_length = 32  # size of each learned word vector

model = keras.models.Sequential()

model.add(keras.layers.Embedding(num_of_words, embedding_vecor_length, input_length=max_review_length))

model.add(keras.layers.LSTM(100))

# One output unit per topic: the dataset has 46 topics (integer labels 0-45).
# NOTE(review): 'tanh' is kept because this model is the "tanh" leg of the
# activation comparison, but its outputs lie in (-1, 1) and are therefore not
# class probabilities, which is what sparse_categorical_crossentropy expects.
# 'softmax' is the standard output activation for this loss. Consider keeping
# softmax here and varying a hidden-layer activation instead.
model.add(keras.layers.Dense(46, activation='tanh'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that adds a stray "None" line to the output).
model.summary()

# NOTE: the test split is also used as validation data, so the per-epoch
# validation metrics and the later test evaluation are computed on the same
# samples.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=3, batch_size=64)

In [None]:
# Score the trained model on the test split (verbose=0: no progress output)
scores = model.evaluate(x_test, y_test, verbose=0)
# evaluate() returns the loss followed by the compiled metrics; index 1 is the
# accuracy when the model was compiled with metrics=['accuracy'].
print("Accuracy (tanh): %.2f%%" % (100 * scores[1]))

In [None]:
# Model 3 (relu)
# Construct 3rd model: Embedding -> LSTM -> Dense classifier
embedding_vecor_length = 32  # size of each learned word vector

model = keras.models.Sequential()

model.add(keras.layers.Embedding(num_of_words, embedding_vecor_length, input_length=max_review_length))

model.add(keras.layers.LSTM(100))

# One output unit per topic: the dataset has 46 topics (integer labels 0-45).
# NOTE(review): 'relu' is kept because this model is the "relu" leg of the
# activation comparison, but its outputs are unbounded non-negative values and
# are therefore not class probabilities, which is what
# sparse_categorical_crossentropy expects. 'softmax' is the standard output
# activation for this loss. Consider keeping softmax here and varying a
# hidden-layer activation instead.
model.add(keras.layers.Dense(46, activation='relu'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that adds a stray "None" line to the output).
model.summary()

# NOTE: the test split is also used as validation data, so the per-epoch
# validation metrics and the later test evaluation are computed on the same
# samples.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=3, batch_size=64)

In [None]:
# Score the trained model on the test split (verbose=0: no progress output)
scores = model.evaluate(x_test, y_test, verbose=0)
# evaluate() returns the loss followed by the compiled metrics; index 1 is the
# accuracy when the model was compiled with metrics=['accuracy'].
print("Accuracy (relu): %.2f%%" % (100 * scores[1]))

#### Let's try  <font color = "blue"> binary </font> classification instead of multi-class to see if we can improve the accuracy

In [None]:
# Reload the dataset without a vocabulary cap for the bag-of-words experiment.
# Use the same tf.keras namespace as the rest of the notebook rather than
# importing the standalone `keras` package in the middle of the notebook.
# NOTE: this rebinds x_train / y_train / x_test / y_test and word_index, so
# the LSTM cells earlier in the notebook must not be re-run after this cell
# without reloading their data first.
reuters = tf.keras.datasets.reuters
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
word_index = reuters.get_word_index(path="reuters_word_index.json")

print('# of Training Samples: {}'.format(len(x_train)))
print('# of Test Samples: {}'.format(len(x_test)))

# Labels are 0-based integers, so the class count is the largest label + 1
num_classes = max(y_train) + 1
print('# of Classes: {}'.format(num_classes))

In [None]:
# Explore indices as actual words
# load_data() was called with its default index_from=3, so every word id in
# x_train is the word's rank in word_index plus 3; the lowest ids are reserved
# markers (padding / start / out-of-vocabulary) rather than words. Undo that
# shift before the lookup, otherwise every token decodes to the wrong word.
# Ids without a matching word, such as the start marker, are shown as '?'.
index_to_word = {index: word for word, index in word_index.items()}
print(' '.join(index_to_word.get(x - 3, '?') for x in x_train[0]))
print(y_train[0])

In [None]:
# Not every word in the articles is helpful, so re-encode the data:
# each article becomes one fixed-length row built by the Keras Tokenizer in
# 'binary' mode, limited to max_words entries, and each integer label becomes
# a one-hot row with num_classes entries.
from keras.preprocessing.text import Tokenizer

max_words = 10000
tokenizer = Tokenizer(num_words=max_words)

# Features: index sequences -> binary matrix
x_train, x_test = (
    tokenizer.sequences_to_matrix(sequences, mode='binary')
    for sequences in (x_train, x_test)
)

# Labels: integer class ids -> one-hot rows
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)

# Show the first encoded article and its row length
print(x_train[0])
print(len(x_train[0]))

# Show the first encoded label and its row length
print(y_train[0])
print(len(y_train[0]))

In [None]:
# Build model: a feed-forward classifier over the fixed-length article vectors
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM  # imported but not used in this cell

model = Sequential()

# Hidden layer: 512 units with ReLU, followed by 50% dropout
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Output layer: one unit per class, normalised with softmax so it matches the
# categorical_crossentropy loss and the one-hot labels
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print the metric names reported by this model; no hard-coded copy of the
# expected output is kept in the cell, so what is displayed is always real.
print(model.metrics_names)

In [None]:
# Fit model
batch_size = 32
epochs = 3

# Train with progress output, holding out 10% of the training data for
# validation
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)

# Evaluate on the test split; the result holds the loss first and the
# accuracy second (the model is compiled with metrics=['accuracy'])
score = model.evaluate(
    x_test,
    y_test,
    batch_size=batch_size,
    verbose=1,
)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
# Score the trained model on the test split (verbose=0: no progress output)
scores = model.evaluate(x_test, y_test, verbose=0)
# Index 1 is the accuracy when the model was compiled with
# metrics=['accuracy']; report it as a percentage
print("Accuracy (binary classification): %.2f%%" % (100 * scores[1]))

### 4. Describe and explain your findings.

In our first three models we used different activation functions (sigmoid, tanh, and relu). Instead of adding additional layers with different activations, I wanted to see how each activation performed independently on different models. Above, we see that relu on our news dataset performed at about 53%. This is not a good accuracy rate, yet it is the best of the three. After looking online, I see this problem being approached as a binary classification, which initially did not make sense, as there are 46 topics. However, I see that by *tokenizing* the data, we are able to treat it like a binary classification instead of a multiclass problem. To see how well the model would perform as a binary classification, I ran one of those, too (with tokenization). We can see that the accuracy is much higher than the multiclass classification (around 80%). Initially I did not have softmax as a layer; after adding it to the initial model, I see it performed much better (~20% to 80%). I am unsure if the original data was meant to be a multiclass or binary classification problem, as I am having issues implementing the LSTM into the binary model, but I feel confident moving forward as I learn about new layers/approaches and in my decision to only implement them once I am confident in their purposes.