In [1]:
import json

# Read the entire sarcasm.json file into `datastore`.
# The encoding is passed explicitly: without it, open() decodes with the
# platform's locale default, which is not UTF-8 everywhere, while JSON text
# is expected to be UTF-8.
with open("sarcasm.json", "r", encoding="utf-8") as f:
    datastore = json.load(f)

In [2]:
# Extract the two fields used downstream from every record in `datastore`.
# Both lists are built in the same record order, so sentences[i] and
# labels[i] come from the same record.
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]

In [3]:
# Number of leading examples used for training; everything after that index
# is held out for testing. The split is positional (no shuffling here).
training_size = 20000

training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Text-encoding settings shared by the tokenizer and the padding step.
vocab_size, max_length = 10000, 100
oov_tok = "<OOV>"
padding_type = trunc_type = 'post'

# The tokenizer is fitted on the training sentences only, so the vocabulary
# is derived from training data and not from the held-out test sentences.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)

# Word -> integer index mapping produced by the fit above.
word_index = tokenizer.word_index




In [5]:
def _encode(texts):
    """Turn texts into token-id sequences plus a padded copy of them.

    Uses the notebook-level ``tokenizer`` and the ``max_length``,
    ``padding_type`` and ``trunc_type`` settings.

    Returns:
        A ``(sequences, padded)`` pair: the output of
        ``tokenizer.texts_to_sequences`` and the result of running
        ``pad_sequences`` over it.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


# Same encoding pipeline for both splits, training first.
training_sequences, training_padded = _encode(training_sentences)
testing_sequences, testing_padded = _encode(testing_sentences)

In [6]:
import numpy as np

# Wrap the padded inputs and the label lists with np.array so that all four
# objects passed to training are NumPy arrays.
training_padded, testing_padded = np.array(training_padded), np.array(testing_padded)
training_labels, testing_labels = np.array(training_labels), np.array(testing_labels)

In [7]:
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense

# Size of each learned word vector.
embedding_dim = 16

# Layer stack: embedding -> average over the sequence axis -> small hidden
# layer -> single sigmoid unit for the binary label.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(GlobalAveragePooling1D())
model.add(Dense(24, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output; accuracy is
# tracked as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)





In [8]:
# Train for 30 epochs, evaluating on the held-out split after each one.
# The returned object is kept in `history`.
history = model.fit(
    training_padded,
    training_labels,
    validation_data=(testing_padded, testing_labels),
    epochs=30,
    verbose=2,
)

Epoch 1/30


625/625 - 10s - loss: 0.6626 - accuracy: 0.5846 - val_loss: 0.5845 - val_accuracy: 0.7845 - 10s/epoch - 16ms/step
Epoch 2/30
625/625 - 4s - loss: 0.4386 - accuracy: 0.8296 - val_loss: 0.3887 - val_accuracy: 0.8407 - 4s/epoch - 7ms/step
Epoch 3/30
625/625 - 5s - loss: 0.3144 - accuracy: 0.8765 - val_loss: 0.3544 - val_accuracy: 0.8541 - 5s/epoch - 8ms/step
Epoch 4/30
625/625 - 5s - loss: 0.2615 - accuracy: 0.8982 - val_loss: 0.3461 - val_accuracy: 0.8557 - 5s/epoch - 8ms/step
Epoch 5/30
625/625 - 5s - loss: 0.2256 - accuracy: 0.9129 - val_loss: 0.3433 - val_accuracy: 0.8569 - 5s/epoch - 8ms/step
Epoch 6/30
625/625 - 3s - loss: 0.1976 - accuracy: 0.9259 - val_loss: 0.3492 - val_accuracy: 0.8544 - 3s/epoch - 5ms/step
Epoch 7/30
625/625 - 5s - loss: 0.1769 - accuracy: 0.9334 - val_loss: 0.3642 - val_accuracy: 0.8551 - 5s/epoch - 7ms/step
Epoch 8/30
625/625 - 4s - loss: 0.1585 - accuracy: 0.9411 - val_loss: 0.3886 - val_accuracy: 0.8459 - 4s/epoch - 7ms/step
Epoch 9/30
625/625 