<a href="https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%202%20-%20Lesson%202.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import json
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pathlib

In [6]:
vocab_size = 10000
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 20000


In [7]:
try:
  dataset_file_path = 'sarcasm.json'
  with open(dataset_file_path, 'r') as f:
    datastore = json.load(f)
except Exception:
  cache_dir = './tmp'
  dataset_file_name = 'sarcasm.txt'
  dataset_file_origin = 'https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json'

  dataset_file_path = tf.keras.utils.get_file(
      fname=dataset_file_name,
      origin=dataset_file_origin,
      cache_dir=pathlib.Path(cache_dir).absolute()
  )

print(dataset_file_path)

Downloading data from https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
/tmp/.keras/datasets/sarcasm.txt


In [8]:
with open(dataset_file_path, 'r') as f:
    datastore = json.load(f)

sentences = []
labels = []

for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])

In [9]:
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [10]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [11]:
# Need this block to get it to work with TensorFlow 2.x
import numpy as np
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

# RUN ONLY 1 OF THE NEXT 2 CELLS

In [16]:
embedding_dim = 64
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.LSTM(embedding_dim), 
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [23]:
embedding_dim = 64 #most importantly the number of output nodes for LSTM
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# RUN ONLY 1 OF THE 2 CELLS ABOVE AT A TIME

Other parameters to work with for LSTM are available [here](https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM).

In [17]:
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 64)           640000    
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense_2 (Dense)             (None, 24)                1560      
                                                                 
 dense_3 (Dense)             (None, 1)                 25        
                                                                 
Total params: 674,609
Trainable params: 674,609
Non-trainable params: 0
_________________________________________________________________


In [24]:
num_epochs = 10
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/10
625/625 - 11s - loss: 0.4157 - accuracy: 0.8013 - val_loss: 0.3374 - val_accuracy: 0.8532 - 11s/epoch - 17ms/step
Epoch 2/10
625/625 - 8s - loss: 0.2121 - accuracy: 0.9154 - val_loss: 0.3534 - val_accuracy: 0.8532 - 8s/epoch - 12ms/step
Epoch 3/10
625/625 - 7s - loss: 0.1296 - accuracy: 0.9505 - val_loss: 0.4069 - val_accuracy: 0.8520 - 7s/epoch - 12ms/step
Epoch 4/10
625/625 - 7s - loss: 0.0750 - accuracy: 0.9750 - val_loss: 0.4844 - val_accuracy: 0.8442 - 7s/epoch - 12ms/step
Epoch 5/10
625/625 - 8s - loss: 0.0476 - accuracy: 0.9846 - val_loss: 0.5769 - val_accuracy: 0.8405 - 8s/epoch - 12ms/step
Epoch 6/10
625/625 - 7s - loss: 0.0312 - accuracy: 0.9908 - val_loss: 0.6796 - val_accuracy: 0.8356 - 7s/epoch - 12ms/step
Epoch 7/10
625/625 - 7s - loss: 0.0228 - accuracy: 0.9932 - val_loss: 0.7650 - val_accuracy: 0.8360 - 7s/epoch - 12ms/step
Epoch 8/10
625/625 - 8s - loss: 0.0169 - accuracy: 0.9949 - val_loss: 0.9212 - val_accuracy: 0.8319 - 8s/epoch - 12ms/step
Epoch 9/10
625

In [26]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_sentence(training_padded[0]))
print(training_sentences[2])
print(labels[2])

former <OOV> store clerk sues over secret 'black <OOV> for minority shoppers ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
mom starting to fear son's web series closest thing she will have to grandchild
1


In [27]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)


(10000, 64)


In [28]:
sentence = [
            "you look beautiful",
            "you look extremely beautiful",
            "I couldn't like your dress any better",
            "your dress look good alright"
]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(padded))

[[9.7063355e-02]
 [9.9993455e-01]
 [3.5396765e-06]
 [7.4840283e-01]]
