In [8]:
# 1. Tokenization in TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
# Sentences come in different lengths, so they have to be padded to a common
# length before they can be used to train a neural network.
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog would win a contest?',
]

# num_words: maximum number of words to keep (the most common ones).
# oov_token: placeholder entry for out-of-vocabulary words.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

# Mapping from each word to the integer index assigned by the tokenizer.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'would': 10, 'win': 11, 'a': 12, 'contest': 13}


In [11]:
# 2. Representing sentences as ordered sequences of word indices

# Text the tokenizer has never seen is handled through the OOV token that was
# configured when the tokenizer was created.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

# Each call below reassigns `padded`; only the last variant is printed.

# Basic padding with the default settings.
padded = pad_sequences(sequences)

# padding='post' puts the padding zeros at the end of each sentence.
padded = pad_sequences(sequences, padding='post')

# maxlen fixes the padded length instead of using the longest sentence.
padded = pad_sequences(sequences, padding='post', maxlen=5)

# truncating selects which side is chopped off when a sentence is longer than
# maxlen; 'pre' drops tokens from the start of the sentence.
padded = pad_sequences(
    sequences,
    padding='post',
    truncating='pre',
    maxlen=5,
)

print(padded)


[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11, 12, 13]]
[[ 5  3  2  4  0]
 [ 5  3  2  7  0]
 [ 6  3  2  4  0]
 [ 4 10 11 12 13]]


In [13]:
# 3. Training a neural net to classify whether a headline is sarcastic or not

import json

# sarcasm.json is expected to hold one JSON object per line, which json.load()
# cannot read as a single document, so it is converted to one JSON array.
# The conversion only happens when it is actually needed: re-running this cell
# on an already-converted file must not wrap it in a second '[' ... ']' and
# corrupt the data on disk.
with open('sarcasm.json', 'r') as f:
    raw_text = f.read()

try:
    # Already a single valid JSON document (e.g. converted by an earlier run).
    datastore = json.loads(raw_text)
except json.JSONDecodeError:
    # One record per line. Whole lines are stripped (instead of cutting off the
    # last character) so the final record survives a missing trailing newline,
    # and blank lines are skipped.
    records = [line.strip() for line in raw_text.split('\n') if line.strip()]
    array_text = '[' + ',\n'.join(records) + ']'
    # Parse before writing so a malformed file is never overwritten.
    datastore = json.loads(array_text)
    with open('sarcasm.json', 'w') as f:
        f.write(array_text)

# A file containing exactly one record parses as a bare object; wrap it so the
# code below always iterates over a list of records.
if isinstance(datastore, dict):
    datastore = [datastore]

# Split the records into parallel lists: headline text, sarcasm label, and
# source article link.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = [item['article_link'] for item in datastore]

In [14]:
# Tokenize the headlines and turn them into padded integer sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# No num_words limit this time; '<OOV>' is the out-of-vocabulary placeholder.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode every headline, then pad with zeros at the end of each sequence.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

# Show the first encoded headline and the shape of the padded array.
print(padded[0])
print(padded.shape)

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


In [None]:
# Split the dataset into training and testing data: the first `training_size`
# examples are used for training, everything after them for testing.
training_size = 20000

training_sentences = sentences[:training_size]
training_labels = labels[:training_size]

testing_sentences = sentences[training_size:]
testing_labels = labels[training_size:]