In [1]:
import json
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"


In [3]:
train=pd.read_json("C:\\Users\\Aayush Kandpal\\Downloads\\archive (1)\\Sarcasm_Headlines_Dataset_v2.json",lines=True)

In [4]:
train.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [5]:
test=pd.read_json("C:\\Users\\Aayush Kandpal\\Downloads\\archive (1)\\Sarcasm_Headlines_Dataset.json",lines=True)

In [6]:
test.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [7]:
train=train.drop('article_link',axis=1)
test=test.drop('article_link',axis=1)

In [8]:
train.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [9]:
test.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [10]:
print(train['headline'])

0        thirtysomething scientists unveil doomsday clo...
1        dem rep. totally nails why congress is falling...
2        eat your veggies: 9 deliciously different recipes
3        inclement weather prevents liar from getting t...
4        mother comes pretty close to using word 'strea...
                               ...                        
28614         jews to celebrate rosh hashasha or something
28615    internal affairs investigator disappointed con...
28616    the most beautiful acceptance speech this week...
28617    mars probe destroyed by orbiting spielberg-gat...
28618                   dad clarifies this not a food stop
Name: headline, Length: 28619, dtype: object


In [11]:
training_sentences=train['headline']
testing_sentences=test['headline']
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [12]:
test

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
26704,american politics in moral free-fall,0
26705,america's best 20 hikes,0
26706,reparations and obama,0
26707,israeli ban targeting boycott supporters raise...,0


In [13]:
training_padded.shape,testing_padded.shape

((28619, 100), (26709, 100))

In [14]:
training_padded

array([[   1,  355, 3167, ...,    0,    0,    0],
       [7475, 1775,  758, ...,    0,    0,    0],
       [ 863,   33,    1, ...,    0,    0,    0],
       ...,
       [   4,  100,  629, ...,    0,    0,    0],
       [1870, 1313, 3317, ...,    0,    0,    0],
       [ 217, 3283,   21, ...,    0,    0,    0]])

In [15]:
testing_padded

array([[ 324,    1,  619, ...,    0,    0,    0],
       [   4, 8881, 3663, ...,    0,    0,    0],
       [ 142,  709,    2, ...,    0,    0,    0],
       ...,
       [9672,    9,   72, ...,    0,    0,    0],
       [1623,  406, 4185, ...,    0,    0,    0],
       [   1, 1795,    6, ...,    0,    0,    0]])

In [16]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(25, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [17]:
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 25)                425       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 26        
Total params: 160,451
Trainable params: 160,451
Non-trainable params: 0
_________________________________________________________________


In [18]:
num_epochs = 30
history = model.fit(training_padded, train['is_sarcastic'], epochs=num_epochs,verbose=2)


Epoch 1/30
895/895 - 1s - loss: 0.5998 - accuracy: 0.6816
Epoch 2/30
895/895 - 1s - loss: 0.3536 - accuracy: 0.8550
Epoch 3/30
895/895 - 1s - loss: 0.2829 - accuracy: 0.8857
Epoch 4/30
895/895 - 1s - loss: 0.2438 - accuracy: 0.9025
Epoch 5/30
895/895 - 1s - loss: 0.2179 - accuracy: 0.9146
Epoch 6/30
895/895 - 1s - loss: 0.1964 - accuracy: 0.9239
Epoch 7/30
895/895 - 1s - loss: 0.1802 - accuracy: 0.9317
Epoch 8/30
895/895 - 1s - loss: 0.1670 - accuracy: 0.9364
Epoch 9/30
895/895 - 1s - loss: 0.1559 - accuracy: 0.9421
Epoch 10/30
895/895 - 1s - loss: 0.1463 - accuracy: 0.9460
Epoch 11/30
895/895 - 1s - loss: 0.1380 - accuracy: 0.9496
Epoch 12/30
895/895 - 1s - loss: 0.1308 - accuracy: 0.9522
Epoch 13/30
895/895 - 1s - loss: 0.1245 - accuracy: 0.9557
Epoch 14/30
895/895 - 1s - loss: 0.1173 - accuracy: 0.9590
Epoch 15/30
895/895 - 1s - loss: 0.1137 - accuracy: 0.9595
Epoch 16/30
895/895 - 1s - loss: 0.1086 - accuracy: 0.9611
Epoch 17/30
895/895 - 1s - loss: 0.1036 - accuracy: 0.9628
Epoch 

In [19]:
pred=model.predict(testing_padded)
y=test.is_sarcastic

In [20]:
pred

array([[7.5192138e-06],
       [4.5753717e-03],
       [9.9948317e-01],
       ...,
       [7.0222697e-05],
       [3.0998617e-05],
       [6.3737105e-05]], dtype=float32)

In [21]:
pred1=pred>0.5
pred1
    

array([[False],
       [False],
       [ True],
       ...,
       [False],
       [False],
       [False]])

In [22]:
from sklearn.metrics import accuracy_score
print('Accuracy using our sarcasm predictor',accuracy_score(y,pred1))
from sklearn.metrics import classification_report
print(classification_report(y,pred1))

Accuracy using our sarcasm predictor 0.9877194953012093
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     14985
           1       0.99      0.98      0.99     11724

    accuracy                           0.99     26709
   macro avg       0.99      0.99      0.99     26709
weighted avg       0.99      0.99      0.99     26709



In [33]:
sentence = ["granny starting to fear spiders in the garden might be real", "game of thrones season finale showing this sunday night",
           "new parents wisely start college fund that will pay for 12 weeks of education","the 20 funniest tweets from women this week"
           ,"child's last steps captured on video","Thanks for helping me"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(padded))

[[9.6545494e-01]
 [8.9132488e-03]
 [9.8639518e-01]
 [2.9111105e-08]
 [8.9048821e-01]
 [7.1921945e-03]]


In [None]:
# The sarcasm predictor is doing a pretty good job