## RNN Classifier Setup

In [1]:
import re

def replace(string, substitutions):
    """Replace every occurrence of each key of ``substitutions`` in ``string``.

    Args:
        string: Text to process.
        substitutions: Mapping from literal substring to its replacement.

    Returns:
        A new string with all substitutions applied in a single regex pass.
        ``string`` is returned unchanged when ``substitutions`` is empty.
    """
    # An empty mapping would compile to the empty pattern, which matches at
    # every position and then fails the lookup of '' in the callback below.
    if not substitutions:
        return string
    # Longest keys first so a key that is a prefix of another key does not
    # shadow the longer one inside the alternation.
    substrings = sorted(substitutions, key=len, reverse=True)
    regex = re.compile('|'.join(map(re.escape, substrings)))
    return regex.sub(lambda match: substitutions[match.group(0)], string)

In [2]:
from nltk.corpus import reuters
from sklearn.preprocessing import MultiLabelBinarizer
import keras
from nltk.corpus import reuters
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, SpatialDropout1D, SimpleRNN
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint
import os
import numpy as np
from sklearn.metrics import roc_auc_score
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
 
# Every document id known to the NLTK Reuters corpus reader
all_docs_id = reuters.fileids()

# Partition the ids by their "train" / "test" prefix
train_docs_id = [doc_id for doc_id in all_docs_id if doc_id.startswith("train")]
test_docs_id = [doc_id for doc_id in all_docs_id if doc_id.startswith("test")]

# Raw text of each document, in the same order as the matching id list
train_docs = list(map(reuters.raw, train_docs_id))
test_docs = list(map(reuters.raw, test_docs_id))
all_docs = list(map(reuters.raw, all_docs_id))

# Binarize the multilabel targets: the binarizer is fitted on the training
# categories and then reused to encode the test categories
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(list(map(reuters.categories, train_docs_id)))
test_labels = mlb.transform(list(map(reuters.categories, test_docs_id)))
 


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Strip every "\n" from the text of all three document lists.
# NOTE(review): the newline is replaced by the empty string, so two words
# separated only by a line break end up joined — confirm this is intended.
subst = {"\n": ""}

# Slice assignment updates each existing list object in place.
for docs in (test_docs, train_docs, all_docs):
    docs[:] = [replace(doc, subst) for doc in docs]

In [4]:
# Word-level tokenizer configured with num_words=5000 and fitted on all_docs.
# The backslash in the filter string is written as "\\" so the literal holds
# the same characters without relying on the invalid escape sequence "\]".
t = keras.preprocessing.text.Tokenizer(
    num_words=5000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
)
t.fit_on_texts(all_docs)

In [5]:
# Printing the complete word index floods the cell output; report its size
# and show only the first 20 (word, index) entries instead.
print(len(t.word_index))
print(list(t.word_index.items())[:20])



In [6]:
# Encode each document as a sequence with the fitted tokenizer
# (training documents first, then test documents).
encoded_train_docs, encoded_test_docs = (
    t.texts_to_sequences(docs) for docs in (train_docs, test_docs)
)

## RNN Classifier

### Set hyperparameters

In [7]:
# Directory for the per-epoch weight checkpoints. Built with os.path.join so
# the separator matches the host OS instead of hard-coding a backslash.
output_dir = os.path.join('model_output', 'rnn_classifier')

# Training
epochs = 32
batch_size = 128

# Embedding / input
n_dim = 64  # second argument of the Embedding layer
# NOTE(review): the tokenizer is created with num_words=5000 while the
# embedding is sized with 10000 — confirm the mismatch is intended.
n_unique_words = 10000
max_review_length = 100  # used as maxlen for padding and as input_length
pad_type = trunc_type = 'pre'
drop_embed = 0.2

# Recurrent layer
n_rnn = 256
dropout_rnn = 0.2

# A dense layer could be added after the RNN, but that is not common
# practice; worth trying out when time permits.

### Preprocess data

In [8]:
# Pad / truncate every encoded document to max_review_length entries;
# the padding value is 0 and both sides follow pad_type / trunc_type.
encoded_train_docs = pad_sequences(
    encoded_train_docs,
    maxlen=max_review_length,
    padding=pad_type,
    truncating=trunc_type,
    value=0,
)
encoded_test_docs = pad_sequences(
    encoded_test_docs,
    maxlen=max_review_length,
    padding=pad_type,
    truncating=trunc_type,
    value=0,
)

### NN Architecture

In [9]:
# Embedding -> spatial dropout -> SimpleRNN -> sigmoid output layer.
# The output layer has 90 units — presumably one per category known to
# `mlb`; confirm against mlb.classes_.
model = Sequential([
    Embedding(n_unique_words, n_dim, input_length=max_review_length),
    SpatialDropout1D(drop_embed),
    SimpleRNN(n_rnn, dropout=dropout_rnn),
    Dense(90, activation='sigmoid'),
])

In [10]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 64)           640000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 100, 64)           0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 256)               82176     
_________________________________________________________________
dense_1 (Dense)              (None, 90)                23130     
Total params: 745,306
Trainable params: 745,306
Non-trainable params: 0
_________________________________________________________________


In [11]:
from keras import backend as K

def sum_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy summed over the last axis.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Predicted tensor.

    Returns:
        The element-wise binary cross-entropy of ``y_true`` and ``y_pred``
        reduced with a sum along ``axis=-1``.
    """
    elementwise_loss = K.binary_crossentropy(y_true, y_pred)
    return K.sum(elementwise_loss, axis=-1)

See https://github.com/keras-team/keras/issues/2275 for the intuition behind `categorical_accuracy`.

In [12]:
# Compile with the summed binary cross-entropy loss and the Adam optimizer;
# categorical accuracy is the tracked metric.
model.compile(
    loss=sum_binary_crossentropy,
    optimizer='adam',
    metrics=['categorical_accuracy'],
)

In [13]:
# Checkpoint callback; the {epoch:02d} placeholder gives each epoch its own
# file. os.path.join supplies the platform separator and avoids the invalid
# "\w" escape sequence of a hand-written backslash.
modelcheckpoint = ModelCheckpoint(
    filepath=os.path.join(output_dir, "weights.{epoch:02d}.hdf5")
)

In [14]:
# Create the checkpoint directory (and any missing parents). exist_ok=True
# makes the call a no-op when the directory already exists, without the
# check-then-create race of a separate os.path.exists test.
os.makedirs(output_dir, exist_ok=True)

### Train 

In [15]:
# Train the model, passing `modelcheckpoint` as the only callback.
# NOTE(review): the test split doubles as validation data here — confirm
# that is intended.
model.fit(
    encoded_train_docs,
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(encoded_test_docs, test_labels),
    callbacks=[modelcheckpoint],
)

Train on 7769 samples, validate on 3019 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<keras.callbacks.History at 0x28e201be2b0>

### Load the best weights based on the results above

In [16]:
# Restore the weights saved for epoch 20; the path is assembled with
# os.path.join rather than a hard-coded '/' separator.
model.load_weights(os.path.join(output_dir, 'weights.20.hdf5'))

In [17]:
# Model outputs for the padded test documents, using the weights loaded above.
y_hat = model.predict(encoded_test_docs)

### ROC AUC Score
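Macro averaging computes the AUC separately for each category and takes their unweighted mean, while micro averaging pools every (sample, category) decision and computes a single global AUC.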

In [18]:
# ROC AUC expressed as a percentage, under macro and micro averaging
pct_auc_macro, pct_auc_micro = (
    roc_auc_score(test_labels, y_hat, average=mode) * 100
    for mode in ('macro', 'micro')
)
pct_auc_macro, pct_auc_micro

(64.81237484658035, 90.88022591294111)

### Random Check

In [19]:
# Spot-check a single entry: the model output at row 7, column 26 next to
# the binarized ground-truth value at the same position.
print(y_hat[7][26])
print(test_labels[7][26])

0.021985628
1


### Label ranking average precision
Label ranking average precision (LRAP) is the average, over each ground-truth label assigned to each sample, of the ratio of true vs. total labels with lower score. This metric is used in multilabel ranking problems, where the goal is to give a better rank to the labels associated with each sample.

The obtained score is always strictly greater than 0 and the best value is 1.

In [20]:
# LRAP of the model outputs against the binarized test labels.
from sklearn.metrics import label_ranking_average_precision_score
label_ranking_average_precision_score(test_labels, y_hat)

0.5926923860770521

### Label Ranking Loss
The label_ranking_loss function computes the ranking loss which averages over the samples the number of label pairs that are incorrectly ordered, i.e. true labels have a lower score than false labels, weighted by the inverse number of false and true labels. The lowest achievable ranking loss is zero.

In [21]:
# Ranking loss of the model outputs against the binarized test labels.
from sklearn.metrics import label_ranking_loss
label_ranking_loss(test_labels, y_hat)

0.07620343478619537