In [1]:
import tensorflow as tf
# Print the installed TensorFlow version so the environment this notebook ran
# against is recorded alongside its outputs.
print(tf.__version__)

2.4.1


In [2]:
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Download the sarcasm-headlines dataset to /tmp/sarcasm.json.
# TLS certificate verification is deliberately left ON (no
# --no-check-certificate): skipping it would let a man-in-the-middle swap the
# dataset for arbitrary content without any error being raised.
!wget \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

--2021-03-03 09:22:15--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 2404:6800:4007:809::2010, 2404:6800:4007:803::2010, 2404:6800:4007:802::2010, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|2404:6800:4007:809::2010|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2021-03-03 09:22:37 (348 KB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [4]:
# Load the downloaded dataset. Only two keys are read from each record:
# 'headline' (the text) and 'is_sarcastic' (the label).
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode or fail on non-ASCII
# headlines on systems whose default is not UTF-8.
with open("/tmp/sarcasm.json", 'r', encoding='utf-8') as f:
    datastore = json.load(f)

# Parallel lists: sentences[i] is the headline whose label is labels[i].
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]

In [5]:
# The first `training_size` examples are used for training; everything after
# that index is held out for testing. No shuffling is applied here.
training_size = 20000

training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [6]:
# Tunable settings shared by the tokenizer, the padding step and the model.
vocab_size, embedding_dim, max_length = 10000, 16, 100

# The same mode is used for truncating and for padding sequences.
trunc_type = padding_type = 'post'

# Token passed to the Tokenizer as its out-of-vocabulary placeholder.
oov_tok = "<OOV>"

In [7]:
# Word indexing: the tokenizer is fitted on the training sentences only, so
# the testing sentences play no part in building the vocabulary.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(training_sentences)

# Keep a handle on the fitted tokenizer's word index for later use.
word_index = tokenizer.word_index


In [8]:
def _texts_to_padded(texts):
    """Encode `texts` with the fitted `tokenizer` and pad them to `max_length`.

    Args:
        texts: the sentences to encode.

    Returns:
        A `(sequences, padded)` pair: the output of
        `tokenizer.texts_to_sequences(texts)` and the result of running
        `pad_sequences` on it with the notebook's `max_length`,
        `padding_type` and `trunc_type` settings.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


# Sequence conversion for the training set
training_sequences, training_padded = _texts_to_padded(training_sentences)

# Sequence conversion for the testing set (encoded with the same tokenizer,
# which was fitted on the training sentences)
testing_sequences, testing_padded = _texts_to_padded(testing_sentences)

In [9]:
# NOTE(review): this import would normally live with the other imports at the
# top of the notebook; it is kept here so this cell still runs on its own.
import numpy as np

# Wrap all four model inputs in NumPy arrays. The label collections are plain
# Python lists up to this point (they are slices of `labels`).
(training_padded,
 training_labels,
 testing_padded,
 testing_labels) = map(
    np.array,
    (training_padded, training_labels, testing_padded, testing_labels),
)

In [10]:
# Binary classifier assembled layer by layer: token embedding, global average
# pooling, a 24-unit ReLU layer, then a single sigmoid output unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy loss with the Adam optimizer, reporting accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [11]:
# Fit for 30 epochs, using the testing split as validation data, and keep the
# returned history object.
# NOTE(review): in the log below, val_loss is lowest at epoch 4 and rises
# afterwards while training loss keeps falling — consider fewer epochs or
# early stopping.
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=30,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/30
625/625 - 5s - loss: 0.6572 - accuracy: 0.6047 - val_loss: 0.5480 - val_accuracy: 0.8007
Epoch 2/30
625/625 - 2s - loss: 0.4145 - accuracy: 0.8349 - val_loss: 0.3794 - val_accuracy: 0.8439
Epoch 3/30
625/625 - 2s - loss: 0.3051 - accuracy: 0.8799 - val_loss: 0.3523 - val_accuracy: 0.8520
Epoch 4/30
625/625 - 2s - loss: 0.2559 - accuracy: 0.8990 - val_loss: 0.3426 - val_accuracy: 0.8575
Epoch 5/30
625/625 - 2s - loss: 0.2198 - accuracy: 0.9146 - val_loss: 0.3556 - val_accuracy: 0.8506
Epoch 6/30
625/625 - 2s - loss: 0.1935 - accuracy: 0.9255 - val_loss: 0.3589 - val_accuracy: 0.8499
Epoch 7/30
625/625 - 3s - loss: 0.1714 - accuracy: 0.9355 - val_loss: 0.3664 - val_accuracy: 0.8544
Epoch 8/30
625/625 - 2s - loss: 0.1526 - accuracy: 0.9442 - val_loss: 0.3890 - val_accuracy: 0.8512
Epoch 9/30
625/625 - 2s - loss: 0.1372 - accuracy: 0.9502 - val_loss: 0.4136 - val_accuracy: 0.8448
Epoch 10/30
625/625 - 2s - loss: 0.1241 - accuracy: 0.9574 - val_loss: 0.4216 - val_accuracy: 0.8504