In [14]:
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Load the dataset from a JSON Lines file: every non-empty line is parsed as
# one JSON object and collected into `datastore`.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing finishes, and the encoding is pinned to UTF-8 so decoding does not
# fall back to the platform's locale encoding.
with open('C:/Users/Omar-ElQady/Downloads/Compressed/Sarcasm_Headlines_Dataset_v2.json', 'r', encoding='utf-8') as dataset_file:
    # Blank lines (e.g. a trailing newline at end of file) are skipped
    # instead of crashing json.loads.
    datastore = [json.loads(line) for line in dataset_file if line.strip()]

In [16]:
# Pull the three fields of every record into parallel lists; position i in
# each list refers to the same record of `datastore`.
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]
urls = [record['article_link'] for record in datastore]

In [17]:
# Sanity check: show the first headline, then its label, then its article link.
for column in (sentences, labels, urls):
    print(column[0])

thirtysomething scientists unveil doomsday clock of hair loss
1
https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205


In [18]:
# Settings shared by the tokenizer, the padding step and the model.
vocab_size = 3000       # passed to the Tokenizer as num_words and to the Embedding layer as its input size
embedding_dim = 16      # length of the vector learned for each word
max_length = 100        # every sequence is padded or truncated to this many entries
trunc_type = 'post'     # value passed as `truncating` to pad_sequences
padding_type = 'post'   # value passed as `padding` to pad_sequences
oov_tok = "<OOV>"       # token handed to the Tokenizer as oov_token
training_size = 20000   # the first training_size records are used for training, the rest for testing


In [19]:
# Positional hold-out split: the first `training_size` records go to the
# training set, everything after them goes to the testing set.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]


In [20]:
# Fit the tokenizer on the training headlines only, then keep its
# word -> integer index for later inspection.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Both splits are padded/truncated with exactly the same settings.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Convert the text of each split into integer sequences...
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# ...and pad them into fixed-width arrays.
training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [21]:
# Show the first two headlines as raw integer sequences, then as padded rows,
# and finally the shape of the whole padded training matrix.
for idx in (0, 1):
    print(training_sequences[idx])
print()
for idx in (0, 1):
    print(training_padded[idx])
print()
print(training_padded.shape)

[1, 325, 1, 1, 2489, 3, 655, 993]
[1, 1723, 735, 2490, 47, 248, 11, 1824, 919, 8, 1825, 2032, 2297]

[   1  325    1    1 2489    3  655  993    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
[   1 1723  735 2490   47  248   11 1824  919    8 1825 2032 2297    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0

In [22]:
# Needed for TensorFlow 2.x: convert the inputs and the label lists to
# NumPy arrays before they are handed to model.fit.
import numpy as np

training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

# Report the shape of each array: training inputs, training labels,
# testing inputs, testing labels.
for split_array in (training_padded, training_labels, testing_padded, testing_labels):
    print(split_array.shape)

(20000, 100)
(20000,)
(8619, 100)
(8619,)


In [23]:
# An embedding assigns every word index a vector ("arrow") whose direction is
# learned during training, so that words with a similar effect on the label
# end up pointing in similar directions.
# embedding_dim is the number of dimensions of each of those vectors.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# Average the word vectors of a headline into a single vector.
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
# Single sigmoid unit: one value between 0 and 1 per headline.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [24]:
# Print the layers of the model with their output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 16)           48000     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 25        
Total params: 48,433
Trainable params: 48,433
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Train the model on the padded training headlines and their labels.
# The testing split is passed as validation data, so loss and accuracy on it
# are reported after every epoch; verbose=2 prints one line per epoch.
num_epochs = 30
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/30
625/625 - 3s - loss: 0.6623 - accuracy: 0.6102 - val_loss: 0.5608 - val_accuracy: 0.7760
Epoch 2/30
625/625 - 2s - loss: 0.4470 - accuracy: 0.8074 - val_loss: 0.3900 - val_accuracy: 0.8285
Epoch 3/30
625/625 - 2s - loss: 0.3562 - accuracy: 0.8467 - val_loss: 0.3625 - val_accuracy: 0.8366
Epoch 4/30
625/625 - 2s - loss: 0.3242 - accuracy: 0.8616 - val_loss: 0.3509 - val_accuracy: 0.8446
Epoch 5/30
625/625 - 2s - loss: 0.3048 - accuracy: 0.8709 - val_loss: 0.3488 - val_accuracy: 0.8438
Epoch 6/30
625/625 - 2s - loss: 0.2910 - accuracy: 0.8770 - val_loss: 0.3506 - val_accuracy: 0.8427
Epoch 7/30
625/625 - 3s - loss: 0.2821 - accuracy: 0.8796 - val_loss: 0.3527 - val_accuracy: 0.8427
Epoch 8/30
625/625 - 2s - loss: 0.2763 - accuracy: 0.8826 - val_loss: 0.3594 - val_accuracy: 0.8407
Epoch 9/30
625/625 - 3s - loss: 0.2687 - accuracy: 0.8864 - val_loss: 0.3648 - val_accuracy: 0.8407
Epoch 10/30
625/625 - 2s - loss: 0.2645 - accuracy: 0.8870 - val_loss: 0.3691 - val_accuracy: 0.8378

In [30]:
# Example lines to try the model on:
#   Ugliness can be fixed, stupidity is forever
#   Zombies eat brains. You’re safe
#   Aim at nothing–you’ll hit it every time

sentence = [
    "Zombies eat brains. You’re safe",
    "Aim at nothing–you’ll hit it every time",
    "my name is omar and i have 21 yeas ",
]
# New text must go through the same tokenizer and padding settings that were
# used for the training data.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
# One sigmoid output per input sentence.
predictions = model.predict(padded)
print(predictions)

[[0.7632456 ]
 [0.88321066]
 [0.0025298 ]]
