## LSTM with validation set

### Imports

In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
import keras
import tensorflow as tf
from keras.layers import LSTM
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


Using TensorFlow backend.


### Load data

In [2]:
# Colab-only step: expose the user's Google Drive inside the runtime's
# filesystem so the CSVs and the GloVe file can be read by path.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Read the three pre-split datasets (train / validation / test) from Drive.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/My Drive/TFMColab/train.csv",
        "/content/drive/My Drive/TFMColab/val.csv",
        "/content/drive/My Drive/TFMColab/test.csv",
    )
)

### Tokenize
Fit the tokenizer only on the text of training data.
Then, we use that same tokenizer to transform the texts of train, val and test sets to sequences of integers.

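It is also possible to fit the tokenizer on the entire data set. However, it is usually better to fit on the training data only and reserve a dedicated "unknown" token (the tokenizer's `oov_token` argument), so that words the model has never seen — in the validation set, the test set or any new data — are all mapped to that token instead of being silently dropped.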

In [0]:
max_features = 10000 # max num words
maxlen = 25 
embedding_size = 200

# create the tokenizer with the maximum number of words to keep, 
# based on word frequency. 
# Only the most common num_words-1 words will be kept.
# oov_token must be a *string*: Keras inserts it into word_index as a regular
# vocabulary entry. Passing True "works" by accident but leaves a boolean key in
# word_index, which breaks anything that treats the vocabulary as text
# (e.g. sequences_to_texts or exporting the tokenizer).
tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")

# fit the tokenizer on the training headlines only, so that validation and
# test vocabulary does not leak into the model's input representation
tokenizer.fit_on_texts(list(train['headline']))

# Transforms each text in texts to a sequence of integers.
# Words that were not seen during fitting are mapped to the OOV index.
train_X = tokenizer.texts_to_sequences(train['headline'])
test_X = tokenizer.texts_to_sequences(test['headline'])
val_X = tokenizer.texts_to_sequences(val['headline'])

# transforms a list of num_samples sequences (lists of integers)
# into a 2D Numpy array of shape (num_samples, num_timesteps).
train_X = pad_sequences(train_X, maxlen = maxlen)
test_X = pad_sequences(test_X, maxlen = maxlen)
val_X = pad_sequences(val_X, maxlen = maxlen)

# binary targets: 1 = sarcastic headline, 0 = not sarcastic
train_y = train['is_sarcastic']
test_y = test['is_sarcastic']
val_y = val['is_sarcastic']

Load the GloVe embedding set and construct the embedding matrix for the words in `word_index`:


In [5]:
# load embeddings
EMBEDDING_FILE = '/content/drive/My Drive/TFMColab/glove.6B.200d.txt'

def get_coefs(word,*arr): 
    """Turn the fields of one GloVe line into a (word, float32 vector) pair."""
    return word, np.asarray(arr, dtype='float32')

# first, build index mapping words in the embeddings set
# to their embedding vector
embeddings_index = {}
with open(EMBEDDING_FILE, encoding="utf8") as f:
    for line in f:
        # drop the trailing newline so it is not glued to the last coefficient
        word, coefs = get_coefs(*line.rstrip("\n").split(" "))
        embeddings_index[word] = coefs

# np.stack needs a real sequence; passing the dict view directly is deprecated
# and fails on recent NumPy versions.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
if embed_size != embedding_size:
    raise ValueError(
        "Embedding file provides %d-dimensional vectors but embedding_size is %d"
        % (embed_size, embedding_size)
    )

word_index = tokenizer.word_index
# Keras word indices start at 1 (row 0 is the padding index), so a vocabulary
# of N words needs N + 1 rows. Without the +1 the last word would index past
# the end of the matrix whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)

# Random embedding vector for unknown words, drawn from the same
# mean / standard deviation as the pre-trained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
# prepare embedding matrix
for word, i in word_index.items():
    # indices at or beyond the matrix size are never produced by the tokenizer
    # (num_words cut-off), so they need no row
    if i >= nb_words: 
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        # words not found in embedding index will be random
        embedding_matrix[i] = embedding_vector

  if self.run_code(code, result):


### LSTM Model
Model Parameters:

- **Activation Function**: I have used ReLU as the activation function. ReLU is a non-linear activation function, which helps complex relationships in the data to be captured by the model.

- **Optimiser**: We use adam optimiser, which is an adaptive learning rate optimiser.

- **Loss function**: The final layer uses a sigmoid activation to output the probability that a headline is sarcastic, and the network is trained with binary cross-entropy loss.

In [0]:
# Create model structure

# Take the embedding layer's dimensions from the matrix it is initialised with,
# so the layer and its pre-trained weights can never disagree in shape
# (hard-coding them separately fails as soon as the matrix has fewer rows than
# the configured vocabulary size).
vocab_rows, vector_dim = embedding_matrix.shape

model = Sequential()
# pre-trained word vectors, fine-tuned during training
model.add(Embedding(vocab_rows, vector_dim, weights = [embedding_matrix]))
# bidirectional LSTM returning the full sequence, so that the pooling layer
# below can pick the strongest activation over all time steps
model.add(Bidirectional(LSTM(128, return_sequences = True)))
model.add(GlobalMaxPool1D())
# small fully-connected head with dropout for regularisation
model.add(Dense(40, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.5))
# single sigmoid unit: probability of the positive class
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), keras.metrics.TruePositives()])

### Save the best model and early stopping
To prevent the model from overfitting I have enabled early stopping.

Early stopping is a method that allows us to specify an arbitrary large number of training epochs and stop training once the model performance stops improving on a hold out/validation dataset.


In [0]:
# Save the full model (not just the weights) whenever validation accuracy
# improves on the best value seen so far.
# The model is compiled with metrics=['accuracy', ...], so the validation
# metric is logged as 'val_accuracy'. Monitoring 'val_acc' would never match a
# logged key, and with save_best_only=True no checkpoint would ever be written.
saveBestModel = keras.callbacks.ModelCheckpoint(
    '/content/drive/My Drive/TFMColab/best_model.hdf5',
    monitor='val_accuracy',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode='max',  # accuracy: higher is better
    period=1,
)
# Stop training when the validation loss has not improved for 3 consecutive epochs.
earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

In [8]:
# Train on the training split; the validation split drives both the
# checkpointing and the early-stopping callbacks.
batch_size = 100
epochs = 25
model.fit(
    train_X,
    train_y,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(val_X, val_y),
    callbacks=[saveBestModel, earlyStopping],
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 19952 samples, validate on 2850 samples
Epoch 1/25
Epoch 2/25
  100/19952 [..............................] - ETA: 22s - loss: 0.3566 - accuracy: 0.8300 - precision_1: 0.8750 - recall_1: 0.7447 - true_positives_1: 35.0000



Epoch 3/25
Epoch 4/25
Epoch 5/25


<keras.callbacks.callbacks.History at 0x7f9101633d68>

Training stops at epoch 6 of the 25 allowed: early stopping monitors the loss on the validation set and halts training once it stops improving, which keeps the model from overfitting the training data.

### Evaluate model results with test data

In [9]:
# Show the order in which model.evaluate() reports its values; the unpacking
# of the evaluation result relies on this order.
model.metrics_names

['loss', 'accuracy', 'precision_1', 'recall_1', 'true_positives_1']

In [10]:
# Score the trained model on the held-out test split; the values come back
# in the same order as model.metrics_names.
test_scores = model.evaluate(test_X, test_y, batch_size=batch_size)
loss, accuracy, precision, recall, true_positives = test_scores



In [0]:
# F1 is the harmonic mean of precision and recall: 2 * P * R / (P + R).
mult_pr = precision * recall
sum_pr = precision + recall
# If the model produced no true positives, precision and recall are both 0 and
# the ratio is 0/0; F1 is defined as 0 in that case instead of failing.
div = mult_pr / sum_pr if sum_pr > 0 else 0.0
f1_score = 2 * div

#### Loss, Accuracy, Precision, Recall and F1

In [12]:
# Report every test-set metric, one per line.
metric_report = (
    ('Loss:', loss),
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('f1 score:', f1_score),
    ('True positives:', true_positives),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Loss: 0.4391108504660693
Accuracy: 0.8680933117866516
Precision: 0.8561046719551086
Recall: 0.8687315583229065
f1 score: 0.8623718967628665
True positives: 2356.0


# Extract FalsePositives and FalseNegatives


We first get the list of predictions for the test set. In the confusion matrix below, the number of true positives (bottom-right cell, 2356) is the same as the value reported by the Keras `TruePositives` metric above.

In [0]:
# Hard 0/1 predictions for the test set, shape (num_samples, 1).
# Sequential.predict_classes is deprecated and no longer exists in recent
# TensorFlow/Keras releases; for a single sigmoid output it is exactly a 0.5
# threshold on the predicted probability, which is spelled out here.
pred_y = (model.predict(test_X, batch_size=batch_size) > 0.5).astype('int32')

In [14]:
# Rows are the true labels and columns the predicted labels, so for this
# binary problem the layout is [[TN, FP], [FN, TP]].
confusion_matrix(test_y, pred_y)

array([[2593,  396],
       [ 356, 2356]])

We build a function to compare the predicted values to the actual values and extract the FalsePositives and FalseNegatives.

In [0]:
def getFP_FN_lists(test_X, test_y, pred_y, headlines=None):
    """Collect the headlines the model misclassified.

    Parameters
    ----------
    test_X : unused; kept so existing callers keep working.
    test_y : pandas Series of true labels (0/1). Its index labels identify
        the rows and are returned in the index lists.
    pred_y : predicted labels (0/1), one per entry of ``test_y``, in the same
        order. Either a flat sequence or an array of shape (n, 1).
    headlines : optional pandas Series of headline texts that can be looked
        up by the index labels of ``test_y``. Defaults to the notebook-level
        ``test['headline']``, which is only correct when ``test_y`` comes
        from ``test``; pass the matching column when scoring another split.

    Returns
    -------
    (FP_text, FP_index, FN_text, FN_index) : four lists holding the headline
        text and index label of every false positive (predicted 1, actual 0)
        and every false negative (predicted 0, actual 1).

    Raises
    ------
    ValueError
        If ``pred_y`` and ``test_y`` do not have the same number of entries.
    """
    if headlines is None:
        # backward-compatible default: texts come from the notebook's test frame
        headlines = test['headline']

    labels = pd.Series(test_y)
    actual = labels.to_numpy()
    # flatten so that (n, 1) model output and plain 1-D predictions both work
    predicted = np.asarray(pred_y).reshape(-1)
    if len(predicted) != len(actual):
        raise ValueError(
            "pred_y has %d entries but test_y has %d" % (len(predicted), len(actual))
        )

    fp_mask = (predicted == 1) & (actual == 0)
    fn_mask = (predicted == 0) & (actual == 1)

    FP_index = labels.index[fp_mask].tolist()
    FN_index = labels.index[fn_mask].tolist()
    # label-based lookup, so a shuffled / non-contiguous index still resolves
    # to the right headline
    FP_text = [headlines.loc[label] for label in FP_index]
    FN_text = [headlines.loc[label] for label in FN_index]

    return FP_text,FP_index,FN_text,FN_index

In [0]:
def getFP_FN(test_X, test_y, pred_y):
    '''Returns 2 dataframes, one with all the False Positives and one with all the False Negatives.

    The arguments are forwarded unchanged to getFP_FN_lists. The first frame
    has the columns FP_text / FP_index, the second FN_text / FN_index; each
    row is one misclassified headline and its index label.
    '''
    FP_text,FP_index,FN_text,FN_index = getFP_FN_lists(test_X, test_y, pred_y)
    df_FP = pd.DataFrame({'FP_text':FP_text,'FP_index':FP_index})
    df_FN = pd.DataFrame({'FN_text':FN_text,'FN_index':FN_index})
    
    return df_FP,df_FN

In [0]:
# Build the false-positive / false-negative frames and write each one to its
# own CSV file (row index included) in the current working directory.
df_FP,df_FN = getFP_FN(test_X, test_y, pred_y)
for error_frame, csv_name in ((df_FP, 'FP.csv'), (df_FN, 'FN.csv')):
    error_frame.to_csv(csv_name, index=True)