In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer,  text_to_word_sequence
from keras.engine.topology import Layer
from keras import initializers as initializers, regularizers, constraints
from keras.callbacks import Callback
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, Input, Dense, LSTM, GRU, Bidirectional, TimeDistributed
from keras import backend as K
from keras import optimizers
from keras.models import Model
import nltk
import re
import matplotlib.pyplot as plt
import sys
from sklearn.metrics import roc_auc_score
from nltk import tokenize

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Shell escape: list the files in the notebook's working directory
!ls

__notebook_source__.ipynb


### Attention Layer

In [3]:
def dot_product(x, kernel):
    """
    Backend-agnostic dot product of `x` with `kernel`.

    On the TensorFlow backend a trailing axis is appended to the kernel
    before calling `K.dot`, and that axis is squeezed off the result.
    On every other backend `K.dot` is called on the arguments directly.
    Args:
        x (): input
        kernel (): weights
    Returns:
        The backend tensor produced by the dot product.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    expanded_kernel = K.expand_dims(kernel)
    product = K.dot(x, expanded_kernel)
    return K.squeeze(product, axis=-1)

class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    # Arguments
        W_regularizer, u_regularizer, b_regularizer: regularizer identifiers
            (resolved with `regularizers.get`) for the projection matrix `W`,
            the context vector `u` and the bias `b`.
        W_constraint, u_constraint, b_constraint: constraint identifiers
            (resolved with `constraints.get`) for the same three weights.
        bias: whether to add the bias `b` before the tanh.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W (features x features), optional b (features,) and u (features,)."""
        assert len(input_shape) == 3
        feature_dim = input_shape[-1]

        # name/shape are passed by keyword: the first positional parameter of
        # add_weight is the weight name, so a positional shape only works
        # through Keras's legacy-argument compatibility shim.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(feature_dim, feature_dim),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(feature_dim,),
                                     initializer='zeros',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(name='{}_u'.format(self.name),
                                 shape=(feature_dim,),
                                 initializer=self.init,
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1 (the steps axis)."""
        # hidden representation of every step: tanh(x . W + b)
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        # similarity of every step with the context vector u
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # broadcast the per-step weights over the feature axis
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Serialise the constructor arguments so a saved model can rebuild the layer."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

### Config

In [4]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the upper bound on word ids
max_features=200000
# Number of word slots per sentence (last axis of the `data` tensor)
max_senten_len=500
# Number of sentence slots per document (second axis of the `data` tensor)
max_senten_num=15
# Width of each word-embedding vector; the GloVe file loaded later is the 200d variant
embed_size=200
# Fraction of the samples held out for validation
VALIDATION_SPLIT = 0.2

### Data

In [5]:
import json
from sklearn.utils import shuffle

In [6]:
# Read the line-delimited JSON dataset, shuffle the rows, then keep the first
# 5000 of them. reset_index() keeps the pre-shuffle row labels as an `index` column.
df = pd.read_json('../input/news-category-dataset/News_Category_Dataset.json', lines=True)
df = shuffle(df)
df = df.iloc[:5000].reset_index()

In [7]:
# Number of distinct category labels in the sampled rows
len(df.category.unique())

31

In [8]:
# Preview the first rows of the sampled frame
df.head()

Unnamed: 0,index,authors,category,date,headline,link,short_description
0,81195,Steven Hoffer,SCIENCE,2015-09-03,Dione Crosses Saturn's Disk In Spectacular New...,https://www.huffingtonpost.com/entry/saturn-mo...,The photo was captured from approximately 1.4 ...
1,113250,Jason Linkins,POLITICS,2014-08-31,"Barack Obama And The Misery Of His Augusts, Ra...",https://www.huffingtonpost.com/entry/barack-ob...,There was definitely a time in President Barac...
2,103752,Christopher Rosen,ENTERTAINMENT,2014-12-18,Aaron Sorkin Partly Blames Media For 'The Inte...,https://www.huffingtonpost.com/entry/aaron-sor...,
3,61862,,ENTERTAINMENT,2016-04-09,Paul Walker's Daughter Meadow Reaches $10 Mill...,https://www.huffingtonpost.com/entry/meadow-wa...,The money will go into a trust for the 17-year...
4,37774,Todd Van Luling,ENTERTAINMENT,2017-01-09,Golden Globes Air Special In Memoriam Segment ...,https://www.huffingtonpost.com/entry/golden-gl...,The award show made an exception for the mothe...


In [9]:
# Model input text: the headline, then '. ', then the short description
df['text'] = df['headline'] +'. ' +df['short_description']

In [10]:
# Preview again to check the new `text` column
df.head()

Unnamed: 0,index,authors,category,date,headline,link,short_description,text
0,81195,Steven Hoffer,SCIENCE,2015-09-03,Dione Crosses Saturn's Disk In Spectacular New...,https://www.huffingtonpost.com/entry/saturn-mo...,The photo was captured from approximately 1.4 ...,Dione Crosses Saturn's Disk In Spectacular New...
1,113250,Jason Linkins,POLITICS,2014-08-31,"Barack Obama And The Misery Of His Augusts, Ra...",https://www.huffingtonpost.com/entry/barack-ob...,There was definitely a time in President Barac...,"Barack Obama And The Misery Of His Augusts, Ra..."
2,103752,Christopher Rosen,ENTERTAINMENT,2014-12-18,Aaron Sorkin Partly Blames Media For 'The Inte...,https://www.huffingtonpost.com/entry/aaron-sor...,,Aaron Sorkin Partly Blames Media For 'The Inte...
3,61862,,ENTERTAINMENT,2016-04-09,Paul Walker's Daughter Meadow Reaches $10 Mill...,https://www.huffingtonpost.com/entry/meadow-wa...,The money will go into a trust for the 17-year...,Paul Walker's Daughter Meadow Reaches $10 Mill...
4,37774,Todd Van Luling,ENTERTAINMENT,2017-01-09,Golden Globes Air Special In Memoriam Segment ...,https://www.huffingtonpost.com/entry/golden-gl...,The award show made an exception for the mothe...,Golden Globes Air Special In Memoriam Segment ...


In [11]:
# Keep only the model input (`text`) and the target (`category`)
df = df[['text', 'category']]

In [12]:
# Check row count, dtypes and non-null counts of the two remaining columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
text        5000 non-null object
category    5000 non-null object
dtypes: object(2)
memory usage: 78.2+ KB


In [13]:
# Convenience handles on the label column and the text column
categories = df['category']
text = df['text']

In [14]:
# Accumulators used by the following cells
paras = []   # receives lists of sentences
labels = []  # placeholder; later rebound to the one-hot label frame
texts = []   # receives the raw text of each document

In [15]:
# Split every document into sentences and record corpus-wide maxima.
# `texts` and `paras` are rebuilt here so that re-running the cell does not
# append duplicate entries to lists created in another cell.
texts = []
paras = []
max_sent_len_exist = 0  # longest sentence, measured in characters (len of the sentence string)
max_sent_num_exist = 0  # largest number of sentences found in one document
for doc_text in df.text:
    texts.append(doc_text)
    sentences = tokenize.sent_tokenize(doc_text)
    # This append must be inside the loop: `paras` needs one sentence list per
    # document, aligned with `texts`. Appending after the loop stores only the
    # last document's sentences, leaving every other row of the input tensor empty.
    paras.append(sentences)
    max_sent_num_exist = max(max_sent_num_exist, len(sentences))
    for sent in sentences:
        max_sent_len_exist = max(max_sent_len_exist, len(sent))

print('Max existing sentence len:',max_sent_len_exist )
print('Max existant sentence num:',max_sent_num_exist )

Max existing sentence len: 377
Max existant sentence num: 14


In [16]:
# Fit the vocabulary on the raw documents, keeping at most `max_features` words.
# oov_token is the placeholder *string* registered in word_index for
# out-of-vocabulary words. Passing True would put a bool key into word_index.
# '<' and '>' are stripped by the Tokenizer's default filters, so this
# placeholder cannot collide with a real token.
tokenizer = Tokenizer(num_words=max_features, oov_token='<UNK>')
tokenizer.fit_on_texts(texts)

In [17]:
# Integer input tensor of shape (documents, sentences, words); 0 means "no word".
data = np.zeros((len(texts), max_senten_num, max_senten_len), dtype='int32')
for i, sentences in enumerate(paras):
    # sentences beyond max_senten_num are dropped
    for j, sent in enumerate(sentences[:max_senten_num]):
        k = 0  # next free word slot in this sentence
        for word in text_to_word_sequence(sent):
            if k >= max_senten_len:
                # sentence is full; remaining words are truncated
                break
            # .get() instead of [] so a token missing from the fitted
            # vocabulary is skipped rather than raising KeyError
            word_id = tokenizer.word_index.get(word)
            if word_id is not None and word_id < max_features:
                data[i, j, k] = word_id
                k += 1

In [18]:
# Expect (documents, max_senten_num, max_senten_len)
data.shape

(5000, 15, 500)

In [19]:
# Word -> integer id mapping learned by the tokenizer; used later to build the embedding matrix
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

Total 17796 unique tokens.


In [20]:
# One-hot encode the category labels: one indicator column per category
labels = pd.get_dummies(categories)

In [21]:
# Check that inputs and labels have the same number of rows
print('Shape of data tensor:', data.shape)
print('Shape of labels tensor:', labels.shape)

Shape of data tensor: (5000, 15, 500)
Shape of labels tensor: (5000, 31)


In [22]:
# Shuffle inputs and labels with the same permutation, then hold out the last
# VALIDATION_SPLIT fraction of rows for validation.
# NOTE: `data` and `labels` are rebound to their shuffled versions, so
# re-running this cell reshuffles again (inputs and labels stay aligned).
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels.iloc[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Split on a positive offset. Slicing with -nb_validation_samples breaks when
# the count is 0: [:-0] is empty and [-0:] is everything.
split_at = data.shape[0] - nb_validation_samples

x_train = data[:split_at]
y_train = labels.iloc[:split_at]
x_val = data[split_at:]
y_val = labels.iloc[split_at:]
# The lines below print per-category sample counts for the two splits; the
# header text is unchanged from the original.
print('Number of positive and negative reviews in traing and validation set')
print(y_train.columns.tolist())
print(y_train.sum(axis=0).tolist())
print(y_val.sum(axis=0).tolist())

Number of positive and negative reviews in traing and validation set
['ARTS', 'ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE', 'COMEDY', 'CRIME', 'EDUCATION', 'ENTERTAINMENT', 'FIFTY', 'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'IMPACT', 'LATINO VOICES', 'MEDIA', 'PARENTS', 'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS', 'STYLE', 'TASTE', 'TECH', 'THE WORLDPOST', 'TRAVEL', 'WEIRD NEWS', 'WOMEN', 'WORLD NEWS', 'WORLDPOST']
[59, 41, 124, 137, 35, 126, 82, 30, 444, 40, 45, 86, 232, 73, 36, 92, 119, 1063, 163, 82, 55, 141, 66, 56, 41, 108, 69, 61, 131, 81, 82]
[13, 12, 36, 38, 6, 30, 23, 9, 114, 5, 12, 24, 50, 26, 11, 18, 31, 280, 44, 19, 10, 27, 15, 9, 10, 35, 19, 18, 23, 17, 16]


### Model

In [23]:
# L2 penalty strength; the resulting regularizer is shared by both attention
# layers and the output Dense layer in the model cell
REG_PARAM = 1e-13
l2_reg = regularizers.l2(REG_PARAM)

In [24]:
import os

In [25]:
# Path of the GloVe vectors file (despite the name, this is a file, not a directory)
GLOVE_DIR = "../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt"
# word -> float32 vector, parsed from lines of the form "<word> <v1> <v2> ..."
embeddings_index = {}
# `with` closes the file even if parsing raises part-way through; the encoding
# is explicit so decoding does not depend on the platform default.
with open(GLOVE_DIR, encoding='utf8') as f:
    for line in f:
        values = line.split()
        try:
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
        except (IndexError, ValueError):
            # blank line (no word) or non-numeric coefficients: skip this
            # line, but let any other kind of error surface
            continue
        embeddings_index[word] = coefs

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [26]:
# One row per word id plus row 0, initialised with uniform random values.
# Rows for words that have a pretrained vector are overwritten with it;
# words missing from the embedding index keep their random initialisation.
embedding_matrix = np.random.random((1 + len(word_index), embed_size))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Word-level embedding layer, initialised from `embedding_matrix` and left
# trainable so the vectors are fine-tuned during training.
embedding_layer = Embedding(
    len(word_index) + 1,
    embed_size,
    weights=[embedding_matrix],
    input_length=max_senten_len,
    trainable=True,
)

In [None]:
# --- Word-level encoder: one sentence (sequence of word ids) -> one sentence vector
word_input = Input(shape=(max_senten_len,), dtype='int32')
word_sequences = embedding_layer(word_input)
word_lstm = Bidirectional(LSTM(200, return_sequences=True))(word_sequences)
word_dense = TimeDistributed(Dense(150))(word_lstm)
word_att = AttentionWithContext(W_regularizer=l2_reg)(word_dense)
wordEncoder = Model(word_input, word_att)

# --- Sentence-level encoder: the word encoder is applied to every sentence of
# a document, then the sentence vectors are attended over to get one document vector
sent_input = Input(shape=(max_senten_num, max_senten_len), dtype='int32')
sent_encoder = TimeDistributed(wordEncoder)(sent_input)
sent_lstm = Bidirectional(LSTM(200, return_sequences=True))(sent_encoder)
sent_dense = TimeDistributed(Dense(150))(sent_lstm)
sent_att = AttentionWithContext(W_regularizer=l2_reg)(sent_dense)

# Output width follows the one-hot label frame instead of a hard-coded 31, so
# the model stays correct if the sampled rows contain a different set of categories.
num_classes = y_train.shape[1]
# categorical_crossentropy expects a probability distribution over classes, so
# the output layer needs a softmax; a linear Dense here gives a meaningless loss.
preds = Dense(num_classes, activation='softmax', kernel_regularizer=l2_reg)(sent_att)
model = Model(sent_input, preds)
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
# Train for 5 epochs in mini-batches of 64, evaluating on the held-out split each epoch
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=5,
    batch_size=64,
)

## Plotting time

In [None]:
# Show which per-epoch series were recorded during training
print(history.history.keys())

In [None]:
# summarize history for accuracy
plt.plot(history.history['mean_absolute_error'])
plt.plot(history.history['val_mean_absolute_error'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Persist the trained model to a file in the working directory.
# NOTE(review): reloading presumably needs AttentionWithContext passed via
# custom_objects, since it is a user-defined layer — confirm when loading.
model.save('han_clap_predictor.h5')