In [1]:
# 1. tokenization in tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
# Text inputs come in different lengths; pad_sequences brings them to a uniform size so they can be used to train a neural network
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Small toy corpus used to demonstrate how the tokenizer builds its vocabulary.
sentences=[
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog would win a contest?',
]

# num_words: maximum number of words to keep (the most common ones).
# oov_token: placeholder entry reserved in the vocabulary for out-of-vocabulary words.
tokenizer=Tokenizer(oov_token="<OOV>",num_words=100)
tokenizer.fit_on_texts(sentences)

# Mapping from each word to its integer index.
word_index=tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'would': 10, 'win': 11, 'a': 12, 'contest': 13}


In [2]:
# 2. Represent each sentence as a sequence of word indices, in word order.
#    Words the tokenizer has never seen are mapped to the OOV token it was configured with.

sequences=tokenizer.texts_to_sequences(sentences)
# Pad the sequences so they all have the same length.
# Each call below overwrites `padded`; only the result of the last call is printed.
padded= pad_sequences(sequences)
# to have padding zeros at the end of the sentence:
padded=pad_sequences(sequences,padding='post')
# if you don't want the padded length to equal the length of the longest text, set it explicitly with maxlen
padded=pad_sequences(sequences,padding='post',maxlen=5)
# if a sentence is longer than maxlen, `truncating` chooses which side is chopped off;
# 'pre' drops tokens from the start, so the last maxlen tokens are kept
padded=pad_sequences(sequences,padding='post',truncating='pre',maxlen=5)

print(sequences)
print(padded)


[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11, 12, 13]]
[[ 5  3  2  4  0]
 [ 5  3  2  7  0]
 [ 6  3  2  4  0]
 [ 4 10 11 12 13]]


In [4]:
# 3. training a neural net to classify text as to whether the text is sarcastic or not

import json

# Load the sarcasm dataset WITHOUT modifying the file on disk.
#
# The file is expected to contain one JSON object per line (JSON Lines).
# A file that already holds a single JSON array (for example one that was
# converted by an earlier run of this notebook) is accepted as well.
# Parsing in memory keeps this cell idempotent: re-running it always yields
# the same `datastore`, and the input file is never rewritten.
with open('sarcasm.json','r') as f:
    raw_text = f.read()

try:
    # Succeeds when the whole file is one JSON document (array or single object).
    datastore = json.loads(raw_text)
except json.JSONDecodeError:
    # Several documents in one file: parse line by line, skipping blank lines
    # so a missing or extra trailing newline does not matter.
    datastore = [json.loads(line) for line in raw_text.splitlines() if line.strip()]
else:
    # A one-line file parses to a single object; keep `datastore` a list of records.
    if isinstance(datastore, dict):
        datastore = [datastore]

# Split the records into parallel lists (same order as in the file).
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = [item['article_link'] for item in datastore]

In [5]:
# adding the tokenizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# tokenizer=Tokenizer(oov_token='<OOV>')
# tokenizer.fit_on_texts(sentences)
# word_index=tokenizer.word_index

# sequences=tokenizer.texts_to_sequences(sentences)
# padded=pad_sequences(sequences,padding='post')
# print(padded[0])
# print(padded.shape)
# We need to make sure we only tokenize the training sentences

In [6]:
# Split the dataset: the first `training_size` examples are used for training,
# everything after them is held out for testing.
training_size=20000
training_sentences,testing_sentences=sentences[:training_size],sentences[training_size:]
training_labels,testing_labels=labels[:training_size],labels[training_size:]

In [7]:
# The tokenizer is fitted on the training sentences only, so the network
# never gets access to information from the test data.
vocab_size=10000
oov_tok='<OOV>'
max_length=100
padding_type='post'
trunc_type='post'

tokenizer=Tokenizer(oov_token=oov_tok,num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences) # training set only
word_index= tokenizer.word_index

# Both splits are padded/truncated with exactly the same settings.
pad_options=dict(maxlen=max_length,padding=padding_type,truncating=trunc_type)

# Training split: sentences -> index sequences -> fixed-length padded sequences
training_sequences=tokenizer.texts_to_sequences(training_sentences)
training_padding=pad_sequences(training_sequences,**pad_options)

# Testing split: same tokenizer, same padding options
testing_sequences=tokenizer.texts_to_sequences(testing_sentences)
testing_padding=pad_sequences(testing_sequences,**pad_options)

In [8]:
# Need this block to get it to work with TensorFlow 2.x
import numpy as np
# Convert the padded sequences and the label lists to NumPy arrays.
training_padded, testing_padded = (
    np.array(batch) for batch in (training_padding, testing_padding)
)
training_labels, testing_labels = (
    np.array(split) for split in (training_labels, testing_labels)
)

In [13]:
# Build the classifier: word embeddings -> bidirectional LSTM -> dense head.
# An LSTM replaces the plain dense network so that word order/context is taken into account.
# embedding_dim is the size of each word vector; it is 64 to match the architecture
# this notebook was actually run with (the embedding layer previously hard-coded 64
# while this constant was left at an unused 32).
embedding_dim=64
model=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim),
    # The numeric argument of LSTM is the number of hidden units, i.e. the output dimension
    # of one direction. Bidirectional runs one LSTM forwards and one backwards over the
    # sequence and merges their outputs, giving 2 * 64 = 128 features.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64,activation='relu'),
    # Single sigmoid unit: output is a score between 0 and 1 for the "sarcastic" label.
    tf.keras.layers.Dense(1,activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [14]:
model.summary() # the bidirectional layer outputs 128 features: 64 LSTM units for each of the two directions

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 64)          640000    
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               66048     
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 714369 (2.73 MB)
Trainable params: 714369 (2.73 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the model, validating on the held-out test split after every epoch.
# The inputs are the NumPy arrays produced by the conversion cell
# (training_padded / testing_padded), so that cell's output is actually used
# and inputs and labels go through the same conversion step.
num_epochs=30
history=model.fit(training_padded,training_labels,epochs=num_epochs,
                  validation_data=(testing_padded,testing_labels),verbose=2)

Epoch 1/30


In [None]:
# Try the trained model on two new headlines.
# They go through the same tokenizer and the same padding settings as the training data.
sentence=[
    "granny starting to fear spiders in the garden might be real",
    "the weather today is bright and sunny",
]
sequences=tokenizer.texts_to_sequences(sentence)
padded=pad_sequences(
    sequences,
    truncating=trunc_type,
    padding=padding_type,
    maxlen=max_length,
)
# One sigmoid score per input sentence.
predictions=model.predict(padded)
print(predictions)

[[8.4652942e-01]
 [3.0046047e-06]]
