Tokenizing the Sarcasm Dataset

In [1]:
#import required packages
import tensorflow as tf
import json
import tensorflow_datasets as tfds
from tensorflow.keras.utils import pad_sequences

In [2]:
# Download the sarcasm headlines dataset (JSON) into the working directory.
# wget's -nc (no-clobber) flag skips the download when the file already
# exists, so re-running this cell does not fetch it again.
!wget -nc https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

File ‘sarcasm.json’ already there; not retrieving.



In [3]:
# Parse the downloaded JSON file into `datastore`.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8 (the
# encoding JSON text is expected to use).
with open('sarcasm.json', 'r', encoding='utf-8') as f:
  datastore = json.load(f)

In [4]:
# Report how many records were loaded.
# The label text is kept verbatim so the recorded output still matches.
record_count = len(datastore)
print("Len of datasotre:", record_count)

Len of datasotre: 26709


In [5]:
# Inspect two records to see the structure of each entry.
# end="\n\n" emits the blank separator line between the two printouts.
print("datastore[0]:", datastore[0], end="\n\n")
print("datastore[2000]:", datastore[2000])


datastore[0]: {'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}

datastore[2000]: {'article_link': 'https://www.huffingtonpost.com/entry/mh370-theft_n_5684061.html', 'headline': 'couple stole $35,000 from missing plane victims, police say', 'is_sarcastic': 0}


In [6]:
# Collect the 'headline' text of every record, in dataset order
sentences = [record['headline'] for record in datastore]

In [7]:
# Sanity check: one headline should have been extracted per record
sentence_count = len(sentences)
print("Total Number of Sentences:", sentence_count)

Total Number of Sentences: 26709


In [8]:
# Preview the first few headlines.
# Iterating over a slice (instead of indexing with range(5)) is the idiomatic
# form and cannot raise IndexError if the list holds fewer than five entries.
for headline in sentences[:5]:
  print(headline)

former versace store clerk sues over secret 'black code' for minority shoppers
the 'roseanne' revival catches up to our thorny political mood, for better and worse
mom starting to fear son's web series closest thing she will have to grandchild
boehner just wants wife to listen, not come up with alternative debt-reduction ideas
j.k. rowling wishes snape happy birthday in the most magical way


In [9]:
# Instantiate the text-vectorization layer with its default settings
vectorize_layer = tf.keras.layers.TextVectorization()

# Build the vocabulary from the headlines
vectorize_layer.adapt(sentences)

# Apply the layer to turn every headline into a sequence of integer token ids.
# In the sample printed in the next cell, a 14-token headline comes back with
# length 39 and zeros filling the tail, i.e. the result is post-padded.
post_padded_sequences = vectorize_layer(sentences)

In [10]:
# Show one headline next to its post-padded token sequence
index = 2
sample_post_padded = post_padded_sequences[index]

# end="\n\n" leaves one blank line after each printout
print("Sample headline:", sentences[index], end="\n\n")
print("Sample post_padded_sequence:", sample_post_padded, end="\n\n")
print("Shape of Sample post_padded_sequence:", sample_post_padded.shape, end="\n\n")

Sample headline: mom starting to fear son's web series closest thing she will have to grandchild

Sample post_padded_sequence: tf.Tensor(
[  140   825     2   813  1100  2048   571  5057   199   139    39    46
     2 13050     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0], shape=(39,), dtype=int64)

Shape of Sample post_padded_sequence: (39,)



In [11]:
# Pre-padding, step 1: create a vectorizer with ragged=True so that each
# sequence keeps its own length; the padding is applied separately later.
# Note: this rebinds `vectorize_layer`, replacing the layer created earlier.
vectorize_layer = tf.keras.layers.TextVectorization(ragged=True)

# Learn the vocabulary from the headlines
vectorize_layer.adapt(sentences)

In [12]:
# Tokenize every headline with the ragged=True layer: each sequence keeps its
# own length (the sample printed in the next cell has 14 ids and no padding).
ragged_sequences = vectorize_layer(sentences)

In [13]:
# Show one headline next to its unpadded (ragged) token sequence
index = 2
sample_ragged = ragged_sequences[index]

# end="\n\n" leaves one blank line after each printout
print("Sample headline:", sentences[index], end="\n\n")
print("Sample ragged_sequence:", sample_ragged, end="\n\n")
print("Shape of Sample ragged_sequence:", sample_ragged.shape, end="\n\n")

Sample headline: mom starting to fear son's web series closest thing she will have to grandchild

Sample ragged_sequence: tf.Tensor(
[  140   825     2   813  1100  2048   571  5057   199   139    39    46
     2 13050], shape=(14,), dtype=int64)

Shape of Sample ragged_sequence: (14,)



In [14]:
# Pre-padding, step 2: left-pad every sequence with zeros up to a common length.
# 'pre' is already pad_sequences' default; it is spelled out here because
# pre-padding is the point of this example.
pre_padded_sequences = pad_sequences(ragged_sequences.numpy(), padding='pre')

In [15]:
# Show one headline next to its pre-padded token sequence
index = 2
sample_pre_padded = pre_padded_sequences[index]

# end="\n\n" leaves one blank line after each printout
print("Sample headline:", sentences[index], end="\n\n")
print("Sample pre_padded_sequences:", sample_pre_padded, end="\n\n")
print("Shape of pre_padded_sequences:", sample_pre_padded.shape, end="\n\n")

Sample headline: mom starting to fear son's web series closest thing she will have to grandchild

Sample pre_padded_sequences: [    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0   140   825     2   813  1100  2048   571  5057   199   139    39
    46     2 13050]

Shape of pre_padded_sequences: (39,)

