In [8]:
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Tokenization

In [4]:
# Toy corpus used to demonstrate tokenization and padding below.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog",
    "Do you think my dog is amazing",
]

In [10]:
# Fit a tokenizer on the toy corpus. The resulting vocabulary reserves an
# '<OOV>' entry (index 1 in the printed word_index).
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode each sentence as a list of word ids, then pad to a common length
# (the printed matrix shows shorter rows filled with leading zeros).
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences)

# Show the vocabulary, the raw id sequences and the padded matrix in turn.
for shown in (word_index, sequences, padded):
    print(shown)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]


Training the model

In [12]:
import json

# Load the dataset as JSON Lines: each non-empty line of the file is parsed
# as one standalone JSON document and appended to `datastore`.
#
# The encoding is pinned to UTF-8 (the standard encoding for JSON text) so the
# result does not depend on the platform's locale default, and whitespace-only
# lines are skipped because json.loads() raises on an empty document.
datastore = []
with open("sarcasm.json", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        datastore.append(json.loads(line))

In [13]:
# Peek at the first five parsed records to check their fields
# ('article_link', 'headline', 'is_sarcastic').
datastore[:5]

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
  'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-w

In [27]:
# Split every record into three parallel lists (same order as `datastore`):
# headline text, 0/1 sarcasm label and source article URL.
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]
urls = [record['article_link'] for record in datastore]

In [28]:
# Spot-check the first five extracted headlines.
sentences[:5]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way']

In [48]:
# --- Hyperparameters and preprocessing settings ---

# Data split: number of leading examples used for training.
training_size = 20000

# Tokenizer / padding settings.
vocab_size = 10000      # Tokenizer num_words and Embedding input size
oov_tok = "<OOV>"       # token passed to the Tokenizer as oov_token
max_length = 100        # every sequence is padded/truncated to this length
padding_type = "post"
trunc_type = "post"

# Model settings.
embedding_dim = 16

In [49]:
# Positional hold-out split: the first `training_size` examples are used for
# training, everything after them is kept for testing.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [52]:
# The vocabulary is fitted on the training headlines only; the test
# headlines are encoded with that same fitted tokenizer.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Both splits share identical padding / truncation settings.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [53]:
import numpy as np

# Convert the features and labels of both splits to NumPy arrays before they
# are handed to model.fit (the labels were collected as plain Python lists).
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

In [54]:
# Small binary classifier, built layer by layer:
# embedding -> average over the sequence axis -> hidden dense -> sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [55]:
num_epochs = 30

# Train on the training split and evaluate on the held-out split after every
# epoch; verbose=2 logs one summary line per epoch.
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/30
625/625 - 1s - loss: 0.6626 - accuracy: 0.5964 - val_loss: 0.5757 - val_accuracy: 0.7733 - 1s/epoch - 2ms/step
Epoch 2/30
625/625 - 1s - loss: 0.4362 - accuracy: 0.8328 - val_loss: 0.3887 - val_accuracy: 0.8386 - 880ms/epoch - 1ms/step
Epoch 3/30
625/625 - 1s - loss: 0.3178 - accuracy: 0.8720 - val_loss: 0.3544 - val_accuracy: 0.8520 - 861ms/epoch - 1ms/step
Epoch 4/30
625/625 - 1s - loss: 0.2658 - accuracy: 0.8965 - val_loss: 0.3565 - val_accuracy: 0.8410 - 857ms/epoch - 1ms/step
Epoch 5/30
625/625 - 1s - loss: 0.2300 - accuracy: 0.9120 - val_loss: 0.3486 - val_accuracy: 0.8496 - 852ms/epoch - 1ms/step
Epoch 6/30
625/625 - 1s - loss: 0.2023 - accuracy: 0.9230 - val_loss: 0.3516 - val_accuracy: 0.8505 - 844ms/epoch - 1ms/step
Epoch 7/30
625/625 - 1s - loss: 0.1795 - accuracy: 0.9332 - val_loss: 0.3702 - val_accuracy: 0.8498 - 849ms/epoch - 1ms/step
Epoch 8/30
625/625 - 1s - loss: 0.1619 - accuracy: 0.9416 - val_loss: 0.3745 - val_accuracy: 0.8514 - 849ms/epoch - 1ms/step
Epo

In [56]:
# Two unseen headlines to score with the trained model.
sentence = [
    "granny starting to fear spiders in the garden might be real",
    "the weather today is bright and sunny",
]

# Reuse the fitted tokenizer and the same padding settings as for training,
# so the inputs have the format the model was trained on.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# One sigmoid score per input sentence (the model was trained on the
# 'is_sarcastic' labels, so higher means more likely sarcastic).
predictions = model.predict(padded)
print(predictions)

[[7.497187e-01]
 [6.173562e-07]]
