## LSTM with validation set

### Imports

In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
import keras
import tensorflow as tf
from keras.layers import LSTM
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


Using TensorFlow backend.


### Load data

In [2]:
df1 = pd.read_json("./data/Sarcasm_Headlines_Dataset.json", lines=True)
df2 = pd.read_json("./data/Sarcasm_Headlines_Dataset_v2.json", lines=True)
# re-order attibute columns in df2
df2 = df2[['article_link','headline','is_sarcastic']]
df = pd.concat([df1, df2], axis=0)
df = df.drop(['article_link'], axis=1)
print(len(df))
df.head()

55328


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


Reset the index as we have merged two different indexes.

In [3]:
df.reset_index(inplace=True, drop=True)

In [4]:
df['len'] = df['headline'].apply(lambda x: len(x.split(" ")))
print(df['len'].mean())
print(min(df['len']))
print(max(df['len']))

9.953368999421631
2
151


### Split data in train, test and val
50% for train, 25% for test, 25% for validation.

In [5]:
train, test = train_test_split(df, test_size=0.25)
print(train.shape)
train, val = train_test_split(train, test_size=0.33333)

(41496, 3)


In [6]:
print('train:',train.shape)
print('test:',test.shape)
print('val:',val.shape)

train: (27664, 3)
test: (13832, 3)
val: (13832, 3)


### Tokenize
Fit the tokenizer only on the text of training data.
Then, we use that same tokenizer to transform the texts of train, val and test sets to sequences of integers.

It's possible to fit on the entire data. But it's probably a better idea to reserve a token for "unknown" words (oov_token=True), for the cases when you find new test data with words your model has never seen.

In [7]:
max_features = 10000 # max num words
maxlen = 25 # we could also try 151
embedding_size = 200

# create the tokenizer with the maximum number of words to keep, 
# based on word frequency. 
# Only the most common num_words-1 words will be kept.
tokenizer = Tokenizer(num_words=max_features, oov_token = True)

# fit the tokenizer on the headlines
tokenizer.fit_on_texts(list(train['headline']))

# Transforms each text in texts to a sequence of integers.
train_X = tokenizer.texts_to_sequences(train['headline'])
test_X = tokenizer.texts_to_sequences(test['headline'])
val_X = tokenizer.texts_to_sequences(val['headline'])

# transforms a list of num_samples sequences (lists of integers)
# into a 2D Numpy array of shape (num_samples, num_timesteps).
train_X = pad_sequences(train_X, maxlen = maxlen)
test_X = pad_sequences(test_X, maxlen = maxlen)
val_X = pad_sequences(val_X, maxlen = maxlen)

train_y = train['is_sarcastic']
test_y = test['is_sarcastic']
val_y = val['is_sarcastic']

Load glove embedding set, construct embedding matrix for words in word_index:


In [8]:
# load embeddings
EMBEDDING_FILE = './embeddings/glove.6B.200d.txt'

def get_coefs(word,*arr): 
    return word, np.asarray(arr, dtype='float32')

# first, build index mapping words in the embeddings set
# to their embedding vector
embeddings_index = {}
with open(EMBEDDING_FILE, encoding="utf8") as f:
    for line in f:
        word, coefs = get_coefs(*line.split(" "))
        embeddings_index[word] = coefs
            
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# Random embedding vector for unknown words.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
# prepare embedding matrix
for word, i in word_index.items():
    if i >= max_features: 
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        # words not found in embedding index will be random
        embedding_matrix[i] = embedding_vector

  if (await self.run_code(code, result,  async_=asy)):


### LSTM Model
Model Parameters:

- **Activation Function**: I have used ReLU as the activation function. ReLU is a non-linear activation function, which helps complex relationships in the data to be captured by the model.

- **Optimiser**: We use adam optimiser, which is an adaptive learning rate optimiser.

- **Loss function**: We will train a network to output a probability over the 2 classes using Sigmoid Loss.

In [9]:
# Create model structure

model = Sequential()
model.add(Embedding(max_features, embedding_size, weights = [embedding_matrix]))
model.add(Bidirectional(LSTM(128, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(40, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), keras.metrics.TruePositives()])

### Save the best model and early stopping
To prevent the model from overfitting I have enabled early stopping.

Early stopping is a method that allows us to specify an arbitrary large number of training epochs and stop training once the model performance stops improving on a hold out/validation dataset.


In [10]:
# Save the model after every epoch.
saveBestModel = keras.callbacks.ModelCheckpoint('./model/best_model.hdf5', monitor='val_acc', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
# Stop training when a monitored quantity has stopped improving.
earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

In [None]:
# Fit the model
batch_size = 100
epochs = 25
model.fit(train_X, train_y, batch_size=batch_size, epochs=epochs, validation_data=(val_X, val_y), callbacks=[saveBestModel, earlyStopping])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 27664 samples, validate on 13832 samples
Epoch 1/25
Epoch 2/25
  100/27664 [..............................] - ETA: 44s - loss: 0.4303 - accuracy: 0.8400 - precision_1: 0.7917 - recall_1: 0.8636 - true_positives_1: 38.0000



Epoch 3/25
Epoch 4/25

Stops in the Epoch 6 out of 25, this is thanks to the validation set, that prevents us to overfit the model.

### Evaluate model results with test data

In [None]:
model.metrics_names

In [None]:
loss, accuracy, precision, recall, true_positives = model.evaluate(test_X, test_y, batch_size=batch_size)

In [None]:
mult_pr=precision*recall
sum_pr=precision+recall
div=mult_pr/sum_pr
f1_score=2*div

#### Loss, Accuracy, Precision, Recall and F1

In [None]:
print('Loss:',loss)
print('Accuracy:',accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('f1 score:',f1_score)
print('True positives:',true_positives)

# Extract FalsePositives and FalseNegatives


We first get the list of predictions. In the confusion matrix it can be observed that the number of True Positives is the same ()

In [16]:
pred_y = model.predict_classes(test_X, batch_size=batch_size)

In [17]:
confusion_matrix(test_y, pred_y)

array([[7198,  386],
       [ 697, 5551]], dtype=int64)

In [18]:
def getFalsePositive(test_X, test_y, pred_y):
    FP_text = []
    FP_index = []
    for i in range(len(test_y)):
        if(pred_y[i]==1 and test_y[i]==0):
            FP_text.append(test_X[i])
            FP_index.append(test_y.index[i])

In [39]:
test

Unnamed: 0,headline,is_sarcastic,len
26007,a good news story about 'imperfect' pregnancy,0,7
11141,leaving your dream: tedx talk,0,5
28929,turkish soccer body penalizes kurdish club ami...,0,9
37367,"listen up, girlfriends: we need each other",0,7
32993,"u.s. reaches major milestone: 100,000 american...",0,10
...,...,...,...
47161,looking through the glass ceiling,0,5
14338,"move over 'hamilton,' d.c. just debuted 'trump'",0,7
22755,"kimmel, atop scorched earth, takes aim at trum...",0,12
50715,these newlyweds have god-like locks and the in...,0,12


In [40]:
print(test.loc[[28929]]['headline'].to_string())

28929    turkish soccer body penalizes kurdish club ami...


In [23]:
print(test.index)

Int64Index([26007, 11141, 28929, 37367, 32993, 15828, 28978,   949, 37359,
            27498,
            ...
            31382, 49606, 28532, 52197, 17760, 47161, 14338, 22755, 50715,
             8246],
           dtype='int64', length=13832)
