In [0]:
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer

In [0]:
# Toy corpus used to demonstrate the tokenizer.
sentences = [
    "I love my dog",
    "I love my cat",
    "Do you love my Dog?",
    "Do you think my Dog is amazing?",
]

In [0]:
# Word-level tokenizer; "XDX" is registered as the out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="XDX")
tokenizer.fit_on_texts(texts=sentences)

# Word -> integer index mapping learned from the corpus.
word_index = tokenizer.word_index
print(word_index)

{'XDX': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'do': 6, 'you': 7, 'cat': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [0]:
# Inspect the tokenizer's full configuration, returned as a dictionary.
# Being the last expression in the cell, the dictionary is displayed as the cell output.
tokenizer.get_config()

{'char_level': False,
 'document_count': 4,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'index_docs': '{"2": 4, "4": 3, "5": 2, "3": 3, "8": 1, "7": 2, "6": 2, "11": 1, "9": 1, "10": 1}',
 'index_word': '{"1": "XDX", "2": "my", "3": "love", "4": "dog", "5": "i", "6": "do", "7": "you", "8": "cat", "9": "think", "10": "is", "11": "amazing"}',
 'lower': True,
 'num_words': None,
 'oov_token': 'XDX',
 'split': ' ',
 'word_counts': '{"i": 2, "love": 3, "my": 4, "dog": 3, "cat": 1, "do": 2, "you": 2, "think": 1, "is": 1, "amazing": 1}',
 'word_docs': '{"my": 4, "dog": 3, "i": 2, "love": 3, "cat": 1, "you": 2, "do": 2, "amazing": 1, "think": 1, "is": 1}',
 'word_index': '{"XDX": 1, "my": 2, "love": 3, "dog": 4, "i": 5, "do": 6, "you": 7, "cat": 8, "think": 9, "is": 10, "amazing": 11}'}

In [0]:
# Encode every sentence as the list of its words' integer indices.
sequences = tokenizer.texts_to_sequences(texts=sentences)
print(sequences)

[[5, 3, 2, 4], [5, 3, 2, 8], [6, 7, 3, 2, 4], [6, 7, 9, 2, 4, 10, 11]]


In [0]:
# Demonstrate oov_token: encode sentences containing words that were
# not part of the corpus the tokenizer was fitted on.
out_of_vocab = [
    "do you fucking love my dog?",
    "I hate your cat",
]
out_sequences = tokenizer.texts_to_sequences(texts=out_of_vocab)
print(out_sequences)

[[6, 7, 1, 3, 2, 4], [5, 1, 1, 8]]


As expected, the out-of-vocabulary words "fucking", "hate", and "your" are each replaced by index 1, the index assigned to our oov_token, "XDX".

In [0]:
# Convert the integer sequences into a matrix with one row per sentence,
# using the 'binary' encoding mode.
matrix = tokenizer.sequences_to_matrix(sequences=sequences, mode='binary')

# Show the container type first, then the matrix itself on the following lines.
print(type(matrix), matrix, sep="\n")

<class 'numpy.ndarray'>
[[0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1.]]


In [0]:
from keras.preprocessing.sequence import pad_sequences

# Pad with the default settings so that all sequences end up the same length.
padded = pad_sequences(sequences=sequences)
print(padded)

[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  8]
 [ 0  0  6  7  3  2  4]
 [ 6  7  9  2  4 10 11]]


In [0]:
# Same padding call with extra options: cap each sequence at 3 tokens and
# apply both padding and truncation at the end ('post') of the sequence.
padded_more = pad_sequences(
    sequences,
    maxlen=3,
    padding='post',
    truncating='post',
)
print(padded_more)

[[5 3 2]
 [5 3 2]
 [6 7 3]
 [6 7 9]]


In [0]:
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

--2020-06-08 18:11:57--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.203.128, 2607:f8b0:400c:c02::80
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.203.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2020-06-08 18:11:57 (197 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [0]:
import pandas as pd
import numpy as np

In [0]:
# Load the downloaded JSON file into a DataFrame and preview its first five rows.
df = pd.read_json("/tmp/sarcasm.json")
print(df.head(n=5))

                                        article_link  ... is_sarcastic
0  https://www.huffingtonpost.com/entry/versace-b...  ...            0
1  https://www.huffingtonpost.com/entry/roseanne-...  ...            0
2  https://local.theonion.com/mom-starting-to-fea...  ...            1
3  https://politics.theonion.com/boehner-just-wan...  ...            1
4  https://www.huffingtonpost.com/entry/jk-rowlin...  ...            0

[5 rows x 3 columns]


In [0]:
# Pull the link, headline and label columns out of the DataFrame in one step.
sarcasm_url, sarcasm_article, sarcasm_labels = (
    df[column] for column in ('article_link', 'headline', 'is_sarcastic')
)

In [0]:
# Materialise each extracted column as a plain Python list.
url, article, labels = map(list, (sarcasm_url, sarcasm_article, sarcasm_labels))

In [0]:
# Spot-check one record (index 2): its link, headline and label on a single line.
print(*(column[2] for column in (url, article, labels)))

https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697 mom starting to fear son's web series closest thing she will have to grandchild 1


In [0]:
# Print the length of each of the three lists so they can be compared.
print(*map(len, (url, article, labels)))

26709 26709 26709


In [0]:
# Start from a fresh tokenizer instead of re-fitting the one used on the toy
# sentences: fit_on_texts() accumulates word and document counts, so re-using
# the old instance would mix the toy corpus into the headline vocabulary, and
# re-running this cell would keep inflating the counts. The name `tokenizer`
# is kept so the following cells use the headline-fitted instance.
tokenizer = Tokenizer(oov_token="XDX")
tokenizer.fit_on_texts(article)

# Word -> index mapping for the headlines; print its number of entries.
sarcasm_word_index = tokenizer.word_index
print(len(sarcasm_word_index))

29657


In [0]:
# Encode every headline as integer indices, then pad with the default
# settings so all rows share one length.
sarcasm_sequences = tokenizer.texts_to_sequences(texts=article)
padded = pad_sequences(sequences=sarcasm_sequences)

# Report the (rows, columns) shape of the padded array.
print(padded.shape)

(26709, 40)


In [0]:
# Show the padded encoding of the first headline.
print(padded[0, :])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0   308 15115   679  3337  2298    48   382  2576
 15116     6  2577  8434]
