In [1]:
import json
import math
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
# Initialize global variables to use.
oov_tok = '<OOV>'        # placeholder token handed to the Tokenizer as oov_token
vocab_size = 20000       # used as Tokenizer num_words and Embedding input_dim
embedding_dim = 16       # Embedding output_dim
trunc_type = 'post'      # side argument handed to pad_sequences
max_len = 120            # padded sequence length / Embedding input_length
callback = TensorBoard(log_dir = 'sarcasm-classifier')

# Seed the NumPy and TensorFlow global generators so weight initialisation and
# batch shuffling are repeatable across "Restart & Run All".
# (Some GPU kernels may still introduce small run-to-run differences.)
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

In [3]:
# Import dataset.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('../Notebook files/sarcasm.json', encoding='utf-8') as f:
    datastore = json.load(f)

# Each record supplies a headline string and its is_sarcastic label;
# the two lists stay index-aligned.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]

In [4]:
# Index separating the first 80% of the samples (training) from the remainder.
train_set = int(0.8 * len(sentences))
train_set

21367

In [5]:
# Cut headlines and labels at the same index so every sentence keeps its label.
train_sentences, test_sentences = sentences[:train_set], sentences[train_set:]

# Labels are turned into NumPy arrays right after slicing.
train_labels = np.array(labels[:train_set])
test_labels = np.array(labels[train_set:])

print('The length of the train set is:',len(train_sentences))
print('The length of the test set is:',len(test_sentences))

The length of the train set is: 21367
The length of the test set is: 5342


In [6]:
# Tokenize the sentences.
# The vocabulary is fitted on the training headlines only; the same fitted
# tokenizer then encodes both splits.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_sentences, test_sentences)
)

In [7]:
# Pad sentences.
# Sequences shorter than max_len are padded at the end ('post', as before).
# Longer ones are now cut according to trunc_type: previously trunc_type was
# only passed as `padding`, so truncation silently used the library default.
train_padded = pad_sequences(train_sequences, maxlen = max_len,
                             padding = 'post', truncating = trunc_type)
test_padded = pad_sequences(test_sequences, maxlen = max_len,
                            padding = 'post', truncating = trunc_type)

In [35]:
# Define an embedding model, stacking the layers one at a time.
model = tf.keras.Sequential()
# Token ids -> embedding_dim-sized vectors for each of the max_len positions.
model.add(tf.keras.layers.Embedding(input_dim = vocab_size,
                                    output_dim = embedding_dim,
                                    input_length = max_len))
# Average over the sequence axis so each headline becomes one vector.
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation = 'relu'))
# Single sigmoid unit for the binary label.
model.add(tf.keras.layers.Dense(1, activation = 'sigmoid'))

In [37]:
# Compile model: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [38]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 120, 16)           320000    
                                                                 
 global_average_pooling1d_2   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_10 (Dense)            (None, 24)                408       
                                                                 
 dense_11 (Dense)            (None, 1)                 25        
                                                                 
Total params: 320,433
Trainable params: 320,433
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Train for 30 epochs, evaluating on the held-out split after each epoch.
# `callbacks` is documented as a list of callbacks, so the TensorBoard
# callback is wrapped in one. The returned History is kept so the per-epoch
# metrics remain available after training.
history = model.fit(
    train_padded, train_labels,
    validation_data=(test_padded, test_labels),
    epochs=30, verbose=1, callbacks=[callback],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f9b34c4abb0>

In [40]:
# Load the TensorBoard notebook extension, which provides the %tensorboard magic.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [41]:
# Open TensorBoard on the directory the TensorBoard callback writes its logs to.
%tensorboard --logdir sarcasm-classifier/

In [25]:
# NOTE(review): hard-coded PID, presumably the TensorBoard server from an
# earlier session. The PID will differ between sessions, so confirm it (or
# drop this cell) before running; otherwise an unrelated process may be killed.
!kill 31361