In [1]:
import numpy as np

import json
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

--2020-08-09 18:44:13--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.125.128, 74.125.23.128, 74.125.203.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.125.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2020-08-09 18:44:13 (96.4 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [3]:
# remove stop words
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]


In [4]:
# reading the JSON

with open("/tmp/sarcasm.json", 'r') as f:
  datastore = json.load(f)

plain_sentences = []
sentences = []
labels = []
urls = []
for item in datastore:
  labels.append(item['is_sarcastic'])
  sentence = item['headline'].lower()
  plain_sentences.append(sentence)
  for word in stopwords:
    token = " " + word + " "
    sentence = sentence.replace(token, " ")
  sentences.append(sentence)

print(len(labels))
print(len(sentences))
print(sentences[1])
print(plain_sentences[1])
print(labels[1])


26709
26709
the 'roseanne' revival catches thorny political mood, better worse
the 'roseanne' revival catches up to our thorny political mood, for better and worse
0


In [5]:
training_size = 20000

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [6]:
#params 

vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [7]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(sentences)

In [8]:
word_index = tokenizer.word_index
print(len(word_index))

29642


In [9]:
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [10]:
# need to convert them to np array

training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [11]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           16000     
_________________________________________________________________
bidirectional (Bidirectional (None, 120, 128)          41472     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 24)                1560      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 100,273
Trainable params: 100,273
Non-trainable params: 0
_________________________________________________________________


In [16]:
from keras.callbacks import ModelCheckpoint 

filepath = "sarcasm-model-{epoch:02d}-{val_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='auto') 

In [17]:
num_epochs = 15
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=1, callbacks=[checkpoint])

Epoch 1/15
Epoch 00001: val_accuracy improved from -inf to 0.76792, saving model to sarcasm-model-01-0.77.hdf5
Epoch 2/15
Epoch 00002: val_accuracy improved from 0.76792 to 0.77925, saving model to sarcasm-model-02-0.78.hdf5
Epoch 3/15
Epoch 00003: val_accuracy did not improve from 0.77925
Epoch 4/15
Epoch 00004: val_accuracy did not improve from 0.77925
Epoch 5/15
Epoch 00005: val_accuracy did not improve from 0.77925
Epoch 6/15
Epoch 00006: val_accuracy did not improve from 0.77925
Epoch 7/15
Epoch 00007: val_accuracy did not improve from 0.77925
Epoch 8/15
Epoch 00008: val_accuracy did not improve from 0.77925
Epoch 9/15
Epoch 00009: val_accuracy did not improve from 0.77925
Epoch 10/15
Epoch 00010: val_accuracy did not improve from 0.77925
Epoch 11/15
Epoch 00011: val_accuracy did not improve from 0.77925
Epoch 12/15
Epoch 00012: val_accuracy did not improve from 0.77925
Epoch 13/15
Epoch 00013: val_accuracy did not improve from 0.77925
Epoch 14/15
Epoch 00014: val_accuracy did not

**Model 2**

In [18]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [19]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           16000     
_________________________________________________________________
conv1d (Conv1D)              (None, 116, 64)           5184      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 29, 64)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 54,273
Trainable params: 54,273
Non-trainable params: 0
_________________________________________________________________


In [20]:
from keras.callbacks import ModelCheckpoint 

filepath = "sarcasm-model2-{epoch:02d}-{val_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='auto') 

In [21]:
num_epochs = 15
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=1, callbacks=[checkpoint])

Epoch 1/15
Epoch 00001: val_accuracy improved from -inf to 0.75242, saving model to sarcasm-model2-01-0.75.hdf5
Epoch 2/15
Epoch 00002: val_accuracy improved from 0.75242 to 0.76628, saving model to sarcasm-model2-02-0.77.hdf5
Epoch 3/15
Epoch 00003: val_accuracy improved from 0.76628 to 0.77761, saving model to sarcasm-model2-03-0.78.hdf5
Epoch 4/15
Epoch 00004: val_accuracy did not improve from 0.77761
Epoch 5/15
Epoch 00005: val_accuracy did not improve from 0.77761
Epoch 6/15
Epoch 00006: val_accuracy did not improve from 0.77761
Epoch 7/15
Epoch 00007: val_accuracy did not improve from 0.77761
Epoch 8/15
Epoch 00008: val_accuracy did not improve from 0.77761
Epoch 9/15
Epoch 00009: val_accuracy did not improve from 0.77761
Epoch 10/15
Epoch 00010: val_accuracy did not improve from 0.77761
Epoch 11/15
Epoch 00011: val_accuracy did not improve from 0.77761
Epoch 12/15
Epoch 00012: val_accuracy did not improve from 0.77761
Epoch 13/15
Epoch 00013: val_accuracy did not improve from 0.