In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# lines=True makes pandas parse the file as one JSON object per line.
dataset_path = "Sarcasm_Headlines_Dataset.json"
df = pd.read_json(dataset_path, lines=True)

In [3]:
# Quick sanity check on the loaded frame: (number of rows, number of columns).
df.shape

(26709, 3)

In [4]:
# Preview the first rows to see the columns used later
# (`headline` is the text, `is_sarcastic` is the 0/1 label).
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
# --- Tokenizer settings ---
vocab_size = 20000        # passed as Tokenizer(num_words=...) and as the Embedding input size
oov_tok = "<OOV>"         # token the Tokenizer substitutes for out-of-vocabulary words

# --- Sequence shaping (forwarded to pad_sequences) ---
max_length = 120          # every headline is padded/truncated to this many tokens
padding_type = 'post'     # pad at the end of the sequence
trunc_type = 'post'       # truncate at the end of the sequence

# --- Model / data split ---
embedding_dim = 200       # width of each learned word vector
training_size = 20000     # first N rows are used for training, the rest for validation

In [21]:
# ---------------------------------------------------------------------------
# 1. Column extraction and train / validation split
# ---------------------------------------------------------------------------
sentences, labels, urls = df['headline'], df['is_sarcastic'], df['article_link']

# The first `training_size` rows train the model; everything after is held out.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

# ---------------------------------------------------------------------------
# 2. Tokenisation
# ---------------------------------------------------------------------------
# The vocabulary is fitted on the training headlines only; words the tokenizer
# does not keep are mapped to `oov_tok`.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Both splits are padded/truncated with exactly the same settings.
pad_kwargs = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

training_padded = pad_sequences(training_sequences, **pad_kwargs)
testing_padded = pad_sequences(testing_sequences, **pad_kwargs)

# ---------------------------------------------------------------------------
# 3. Model
# ---------------------------------------------------------------------------
# Disabled alternative: a stacked bidirectional-LSTM variant, kept for reference.
# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(32, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])

# Active model: embedding -> average over the token axis -> small dense head
# ending in a single sigmoid unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# ---------------------------------------------------------------------------
# 4. Training
# ---------------------------------------------------------------------------
num_epochs = 30

# Hand plain numpy arrays to fit(); the labels are pandas Series up to this point.
training_padded, testing_padded = np.array(training_padded), np.array(testing_padded)
training_labels, testing_labels = np.array(training_labels), np.array(testing_labels)

# The held-out rows are used as validation data after every epoch.
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=1,
)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 120, 200)          4000000   
_________________________________________________________________
global_average_pooling1d (Gl (None, 200)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 24)                4824      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 25        
Total params: 4,004,849
Trainable params: 4,004,849
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30

In [32]:
# Inferencing
# Each pair holds two hand-written example headlines to classify.
sentence1 = ["Sarcasm is like cheap wine - it leaves a terrible after taste", "game of thrones season finale showing this sunday night"]
sentence2 = ["I work 40 hours a week for me to be this poor", "Pakistan beat India in the World Cup"]
sentence3 = ["What a great rainy day to enjoy a cricket match", "The corona virus outbreak led to a pandemic"]
sentence4 = ["What a great day for Manchester United to have lost to Liverpool 5-0", "Liverpool defeated Manchester United" ]
sentence5 = ["inclement weather prevents liar from getting to work", "former versace store clerk sues over secret 'black code' for minority shoppers"]

# Build the flat batch from the pairs instead of repeating every string by hand,
# so the batch cannot drift out of sync with the pairs above.
sentence = sentence1 + sentence2 + sentence3 + sentence4 + sentence5

# Further example inputs (not currently scored):
#Thank you for explaining that my eye cancer isn't going to make me deaf. I feel so fortunate that an intellectual giant like yourself would deign to operate on me.
#granny starting to fear spiders in the garden might be real

# A headline is reported as sarcastic only when its score is strictly above this value.
threshold = 0.7

# Apply the same tokenizer and padding settings that were used for training.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

prediction = model.predict(padded)
prediction_rounded = np.round(prediction, decimals=4)

# ravel() flattens the per-row scores into one scalar per headline, so the
# comparison is a plain float test rather than the truthiness of a 1-element
# array. Iterating with zip ties the loop to the actual batch size instead of
# a hard-coded count.
for text, score in zip(sentence, prediction_rounded.ravel()):
    label = 'Sarcastic' if score > threshold else 'Non-Sarcastic'
    print(text + ': ' + label)

Sarcasm is like cheap wine - it leaves a terrible after taste: Non-Sarcastic
game of thrones season finale showing this sunday night: Non-Sarcastic
I work 40 hours a week for me to be this poor: Sarcastic
Pakistan beat India in the World Cup: Non-Sarcastic
What a great rainy day to enjoy a cricket match: Sarcastic
The corona virus outbreak led to a pandemic: Non-Sarcastic
What a great day for Manchester United to have lost to Liverpool 5-0: Non-Sarcastic
Liverpool defeated Manchester United: Non-Sarcastic
inclement weather prevents liar from getting to work: Sarcastic
former versace store clerk sues over secret 'black code' for minority shoppers: Non-Sarcastic
