## Step 0 :- Importing necessary libraries 

In [1]:
import os
import random
import re
import string

from keras import Model, Input
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, CuDNNLSTM, Dropout, TimeDistributed, Reshape, Activation, Dot, RepeatVector
from keras.layers.wrappers import Bidirectional
from keras.callbacks import *
from keras.utils import to_categorical
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt; plt.rcdefaults()
from IPython.core.display import display, HTML
%matplotlib inline

Using TensorFlow backend.


## Step 1 :- Reading the dataset and performing elementary preprocessing on texts.
<div class="alert alert-block alert-success">
<b>Elementary preprocessing done :</b> Punctuation is removed and all uppercase characters are converted to lowercase. <br><br>
    <b> The data is loaded from https://www.kaggle.com/c/learn-ai-bbc/data</b>
</div>

In [2]:
# Load the BBC news dataset (pandas, re and string are imported in the first cell).
df = pd.read_csv(r'bbc.csv')

# string.punctuation contains regex metacharacters ("\", "]", "^", "-"), so it
# must be escaped before being placed inside a character class: unescaped, the
# "\]" pair is read as an escaped bracket and backslashes are never removed.
# regex=True is passed explicitly because newer pandas versions treat the
# pattern as a literal string by default, which would strip nothing at all.
punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
df['text'] = (
    df['text']
    .str.replace(punctuation_pattern, '', regex=True)
    .str.lower()
)
df.head(10)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
5,politics,howard hits back at mongrel jibe michael howar...
6,politics,blair prepares to name poll date tony blair is...
7,sport,henman hopes ended in dubai third seed tim hen...
8,sport,wilkinson fit to face edinburgh england captai...
9,entertainment,last star wars not for children the sixth an...


## Step 2 : Text Preprocessing :-
<div class="alert alert-block alert-success">
<b>text_tokens :</b> A list of lists; each inner list holds the whitespace-separated tokens of one article (one row of the 'text' column).<br>
    <b>text_news :</b> A list of all article strings from the 'text' column of the dataframe.<br>
    <b>labels :</b> A list of the corresponding values from the 'category' column, in the same row order.<br>
</div>

In [3]:
# Pull the two columns of interest out of the dataframe as plain Python lists.
text_news = df["text"].values.tolist()
labels = df["category"].values.tolist()

# Whitespace-tokenised view of every article, in the same row order as text_news.
text_tokens = [article.split() for article in df["text"].values.tolist()]

In [9]:
# Build the vocabulary from the cleaned articles.
tokenizer     = Tokenizer()
tokenizer.fit_on_texts(text_news)
word2id       = tokenizer.word_index
id2word       = {idx: word for word, idx in word2id.items()}
# +1 leaves room for id 0, which pad_sequences uses as the padding value
# (word_index ids start at 1 per the Keras Tokenizer docs).
vocab_size    = len(word2id) + 1
embedding_dim = 150
max_len       = 200

# Encode with the tokenizer itself rather than looking up str.split() tokens in
# word2id: the tokenizer applies its own filtering/splitting rules when building
# the vocabulary, so a manual lookup raises KeyError whenever the two
# tokenisations disagree (e.g. on a character the tokenizer filters out).
X             = tokenizer.texts_to_sequences(text_news)
# Pad/truncate at the end so every row keeps the first max_len tokens.
X_pad         = pad_sequences(X, maxlen=max_len, padding='post', truncating='post')


In [10]:
# Sort the distinct categories before numbering them so the label <-> id mapping
# is the same on every run. Enumerating a bare set of strings can yield a
# different order in each interpreter session, which silently changes which
# output unit means which class (and invalidates previously saved weights).
label2id      = {label: idx for idx, label in enumerate(sorted(set(labels)))}
id2label      = {idx: label for label, idx in label2id.items()}

# One-hot encode: column i of y corresponds to the class with id i.
y             = to_categorical([label2id[label] for label in labels],
                               num_classes=len(label2id), dtype='float32')
print("X (shape): {}".format(X_pad.shape))
print("y (shape): {}".format(y.shape))

X (shape): (2225, 200)
y (shape): (2225, 5)


## Step 3 : Model Building
<div class="alert alert-block alert-success">
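<b>Encoder-Decoder with Attention:</b> A bidirectional LSTM encoder produces one hidden state per token. A softmax over per-token scores gives the attention weights, and the attention-weighted sum of the encoder states forms a context vector. The context vector is repeated across the time steps and fed into a second (decoder) bidirectional LSTM, whose output passes through a dense softmax layer to predict the class.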
</div>

In [14]:
# --- Encoder: token ids -> embeddings -> bidirectional LSTM (one state per token) ---
seq_input    = Input(shape=(max_len,), dtype='int32')
embedded     = Embedding(vocab_size,
                         embedding_dim,
                         input_length=max_len)(seq_input)
embedded     = Dropout(0.2)(embedded)
lstm_encoder = Bidirectional(CuDNNLSTM(embedding_dim, return_sequences=True))(embedded)
lstm_encoder = Dropout(0.2)(lstm_encoder)

# --- Attention: one score per time step, normalised with a softmax ---
attn_vector  = TimeDistributed(Dense(1))(lstm_encoder)
attn_vector  = Reshape((max_len,))(attn_vector)
# The layer name is looked up later to expose the attention weights.
attn_vector  = Activation('softmax', name='attention_vec')(attn_vector)
# Weighted sum of encoder states over the time axis -> a single context vector.
attn_output  = Dot(axes=1)([lstm_encoder, attn_vector])

# --- Decoder: repeat the context vector across max_len steps (previously a
# hard-coded 200 that had to be kept in sync with max_len by hand) ---
context      = RepeatVector(max_len)(attn_output)
lstm_decoder = Bidirectional(CuDNNLSTM(embedding_dim,return_sequences=False))(context)

# --- Classifier head: one softmax unit per category ---
output       = Dense(len(label2id), activation='softmax')(lstm_decoder)
model        = Model(inputs = [seq_input],outputs = output)
model.compile(loss="categorical_crossentropy", metrics=["accuracy"], optimizer='adam')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 200, 150)     4965750     input_3[0][0]                    
__________________________________________________________________________________________________
dropout_5 (Dropout)             (None, 200, 150)     0           embedding_3[0][0]                
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) (None, 200, 300)     362400      dropout_5[0][0]                  
__________________________________________________________________________________________________
dropout_6 

## Step 4 : Creating callbacks 

In [15]:
# Lower the learning rate when validation accuracy stops improving for 5 epochs.
reduce_lr  = ReduceLROnPlateau(monitor='val_acc', factor=0.02,verbose=1,
                               patience=5, min_lr=0.0001)
# Stop after 15 epochs without improvement and roll back to the best weights.
es         = EarlyStopping(monitor='val_acc', patience=15, verbose=1, mode='auto', baseline=None,
                           restore_best_weights=True)

# Build the checkpoint path with os.path.join instead of hard-coded '\\'
# separators so it is valid on every platform, and make sure the target
# directory exists before training tries to write the first checkpoint.
chkpt_dir  = os.path.join(os.getcwd(), 'chkpts')
os.makedirs(chkpt_dir, exist_ok=True)
# {epoch} and {loss} are filled in by ModelCheckpoint at save time.
filepath   = os.path.join(chkpt_dir, "weights-improvement-{epoch:02d}-{loss:.2f}.hdf5")
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='auto')

## Step 5 : Running the model to get output

In [16]:
# Train for at most 30 epochs with 20% of the samples held out for validation;
# the callbacks handle early stopping, learning-rate decay and checkpointing.
training_callbacks = [es, reduce_lr, checkpoint]
model.fit(
    X_pad,
    y,
    batch_size=128,
    epochs=30,
    shuffle=True,
    validation_split=0.2,
    callbacks=training_callbacks,
)

Train on 1780 samples, validate on 445 samples
Epoch 1/30

Epoch 00001: val_acc improved from -inf to 0.41798, saving model to C:\Users\Batfleck\APB_DL_EXERCISES\News Classification using attention\chkpts\weights-improvement-01-1.49.hdf5
Epoch 2/30

Epoch 00002: val_acc improved from 0.41798 to 0.62697, saving model to C:\Users\Batfleck\APB_DL_EXERCISES\News Classification using attention\chkpts\weights-improvement-02-1.20.hdf5
Epoch 3/30

Epoch 00003: val_acc improved from 0.62697 to 0.75506, saving model to C:\Users\Batfleck\APB_DL_EXERCISES\News Classification using attention\chkpts\weights-improvement-03-0.61.hdf5
Epoch 4/30

Epoch 00004: val_acc improved from 0.75506 to 0.85843, saving model to C:\Users\Batfleck\APB_DL_EXERCISES\News Classification using attention\chkpts\weights-improvement-04-0.27.hdf5
Epoch 5/30

Epoch 00005: val_acc improved from 0.85843 to 0.89438, saving model to C:\Users\Batfleck\APB_DL_EXERCISES\News Classification using attention\chkpts\weights-improvement

<keras.callbacks.History at 0x2493f295e80>

## Step 6 : Validating the model to get results

In [17]:
# Second model over the same trained layers that returns, for each input, both
# the class probabilities and the output of the 'attention_vec' layer.
attention_weights = model.get_layer('attention_vec').output
model_att = Model(inputs=model.input, outputs=[model.output, attention_weights])

In [40]:
# Pick one row position uniformly at random. random.randint is inclusive at
# both ends, so randint(0, len(df)) could return len(df) and make df.iloc raise
# IndexError; randrange(len(df)) only yields valid positions 0 .. len(df) - 1.
random_value        = random.randrange(len(df))
sample_row          = df.iloc[random_value]
sample_text         = sample_row['text']
associated_category = sample_row['category']

In [41]:
# Show the drawn article, leave a gap of blank lines, then show its true category.
print('The sample text is: \n\n', sample_text, end='\n\n\n')
print('The associated category is: ', associated_category)

The sample text is: 

 playstation 3 processor unveiled the cell processor  which will drive sony s playstation 3  will run 10times faster than current pc chips  its designers have said  sony  ibm and toshiba  who have been working on the cell processor for three years  unveiled the chip on monday it is being designed for use in graphics workstations  the new playstation console  and has been described as a supercomputer on a chip the chip will run at speeds of greater than 4 ghz  the firms said by comparison  rival chip maker intel s fastest processor runs at 38 ghz details of the chip were released at the international solid state circuits conference in san francisco the new processor is set to ignite a fresh battle between intel and the cell consortium over which processor sits at the centre of digital products the playstation 3 is expected in 2006  while toshiba plans to incorporate it into highend televisions next year ibm has said it will sell a workstation with the chip starting

In [42]:
# Turn the sample into a batch containing one sequence of word ids.
tokenized_sample = sample_text.split()
encoded_samples = [[word2id[token] for token in tokenized_sample]]
# Pad/truncate exactly as the training data was prepared ('post' for both).
# The pad_sequences defaults are 'pre', which would feed the model the *last*
# max_len tokens, left-padded - a layout it never saw during training.
encoded_samples = pad_sequences(encoded_samples, maxlen=max_len,
                                padding='post', truncating='post')
# model_att returns the class probabilities and the attention weights.
label_probs, attentions = model_att.predict(encoded_samples)

In [43]:
# Position i of the softmax output is the class with id i (labels were one-hot
# encoded by id), so key each probability by id2label[i] directly instead of
# relying on label2id's iteration order happening to match the id order.
# NOTE: this rebinds label_probs from the prediction array to a dict, so the
# prediction cell must be re-run before this cell can be executed again.
label_probs = {id2label[class_id]: prob for class_id, prob in enumerate(label_probs[0])}
print(label_probs)

{'politics': 9.615334e-05, 'entertainment': 0.00056732853, 'business': 0.0016372548, 'sport': 8.549465e-06, 'tech': 0.9976907}


In [48]:
# The prediction is the category with the highest probability.
predicted_category = max(label_probs, key=label_probs.get)
print('The predicted class will be ', predicted_category)

The predicted class will be  tech
