In [1]:
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

In [2]:
vocab_size = 10000
embedding_dim = 16
max_length = 32
trunc_type = 'post'
oov_tok = "<!@#>"
training_size = 20000 #27000 dataset size

In [7]:
with open('Datasets/sarcasm.json','r') as f:
    datastore = json.load(f)

In [8]:
sentences = []
labels = []

In [10]:
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])

In [11]:
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]

training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [37]:
len(testing_sentences)

6709

In [16]:
tokenizer = Tokenizer(num_words= vocab_size, oov_token= oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, truncating= trunc_type, maxlen = max_length)

In [19]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen = max_length, truncating = trunc_type)

In [25]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


In [30]:
import numpy as np
training_padded = np.array(padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [31]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [32]:
history = model.fit(
    padded,
    training_labels,
    epochs=30,
    validation_data=(testing_padded, testing_labels),
    verbose= 2
)

Epoch 1/30
625/625 - 2s - loss: 0.5813 - accuracy: 0.6773 - val_loss: 0.4102 - val_accuracy: 0.8295
Epoch 2/30
625/625 - 1s - loss: 0.3172 - accuracy: 0.8705 - val_loss: 0.3470 - val_accuracy: 0.8523
Epoch 3/30
625/625 - 1s - loss: 0.2373 - accuracy: 0.9068 - val_loss: 0.3436 - val_accuracy: 0.8551
Epoch 4/30
625/625 - 1s - loss: 0.1907 - accuracy: 0.9279 - val_loss: 0.3638 - val_accuracy: 0.8517
Epoch 5/30
625/625 - 2s - loss: 0.1577 - accuracy: 0.9419 - val_loss: 0.3874 - val_accuracy: 0.8539
Epoch 6/30
625/625 - 2s - loss: 0.1344 - accuracy: 0.9514 - val_loss: 0.4268 - val_accuracy: 0.8419
Epoch 7/30
625/625 - 2s - loss: 0.1142 - accuracy: 0.9604 - val_loss: 0.4568 - val_accuracy: 0.8463
Epoch 8/30
625/625 - 2s - loss: 0.0987 - accuracy: 0.9668 - val_loss: 0.5044 - val_accuracy: 0.8405
Epoch 9/30
625/625 - 2s - loss: 0.0852 - accuracy: 0.9715 - val_loss: 0.5456 - val_accuracy: 0.8357
Epoch 10/30
625/625 - 2s - loss: 0.0747 - accuracy: 0.9752 - val_loss: 0.6038 - val_accuracy: 0.8296

In [34]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_sentence(training_padded[0]))
print(training_sentences[2])
print(labels[2])

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? former <!@#> store clerk sues over secret 'black <!@#> for minority shoppers
mom starting to fear son's web series closest thing she will have to grandchild
1


In [35]:
print(word_index)

'osha': 24558, 'buzzed': 24559, 'tilsen': 24560, 'reservation': 24561, 'nspw2017': 24562, 'brownies': 24563, 'coogler': 24564, 'wakanda': 24565, "charlottesville's": 24566, 'surrender': 24567, '612th': 24568, 'bracelet': 24569, 'pharmacy': 24570, 'damore': 24571, 'abuzz': 24572, 'burma': 24573, 'mania': 24574, "marketer's": 24575, "'slangry'": 24576, 'filibusters': 24577, 'corrections': 24578, 'refrain': 24579, 'chants': 24580, "'fawlty": 24581, "towers'": 24582, 'deportations': 24583, "cup's": 24584, 'mooney': 24585, 'banter': 24586, 'snatching': 24587, 'boynton': 24588, 'robinson': 24589, '104': 24590, "'effing'": 24591, 'beacon': 24592, "seymour's": 24593, 'stormtroopers': 24594, "lord'": 24595, "cowell's": 24596, 'introverted': 24597, 'attorneys': 24598, 'cleansed': 24599, 'crisp': 24600, 'dapperly': 24601, 'cate': 24602, 'blanchett': 24603, 'grover': 24604, "'apartheid": 24605, "state'": 24606, 'relied': 24607, 'electronics': 24608, '1971': 24609, 'fests': 24610, 'halfpipe': 24611

In [36]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

(10000, 16)


In [39]:
sentence = ["granny starting to fear spiders in the garden might be real", "game of thrones season finale showing this sunday night"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
print(model.predict(padded))

[[9.8390031e-01]
 [2.9154987e-09]]
