In [1]:
import csv

import numpy as np
import requests
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# from selenium import webdriver
# from selenium.webdriver.common.by import By 

### Data extraction 

In [2]:
url = 'https://storage.googleapis.com/learning-datasets/sarcasm.json'

# Download the sarcasm headlines dataset.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook right here on an HTTP error instead of
# leaving `data` undefined and failing later with a confusing NameError.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parsed JSON payload; later cells index it as a list of dict records.
data = response.json()

In [3]:
# Peek at the first record to see which fields each item carries.
data[0]

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
 'is_sarcastic': 0}

In [4]:
# Pull the headline text and the sarcasm flag out of every record as two
# parallel lists, then pair them up again as row dicts for the CSV export.
sent = [item['headline'] for item in data]
label = [item['is_sarcastic'] for item in data]
newsdata = [
    {'headline': text, 'label': flag}
    for text, flag in zip(sent, label)
]

# Persist the (headline, label) pairs to data.csv with a header row.
# newline="" lets the csv module control line endings itself.
fieldnames = ['headline', 'label']
with open("data.csv", mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(newsdata)

In [5]:
# Exploratory pass: tokenize the whole corpus to inspect what the padded
# sequences look like. `tokenizer` and `word_index` are re-bound later when the
# tokenizer is re-fitted on the training split only.
# The out-of-vocabulary marker is spelled with the letter O ("<OOV>"), matching
# the `oov_tok` constant defined further down, not with zeros.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sent)
word_index = tokenizer.word_index

# Convert every headline to a list of word ids and right-pad with zeros up to
# the length of the longest headline.
sequences = tokenizer.texts_to_sequences(sent)
padded = pad_sequences(sequences, padding='post')

print(padded[0])
print(padded.shape)

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


#### Split the data into training and testing sets

In [6]:

training_size = 26709*8//100

training_sent = sent[0:training_size]
testing_sent = sent[training_size:]
training_label = label[0:training_size]
testing_label = label[training_size:]



In [7]:
tokenizer = Tokenizer(oov_token="<00V>")
tokenizer.fit_on_texts(training_sent)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(training_sent)
train_padded = pad_sequences(train_sequences,padding='post')

test_sequences = tokenizer.texts_to_sequences(testing_sent)
test_padded = pad_sequences(test_sequences,padding='post')

In [8]:
# Hyperparameters for the model and preprocessing.
# vocab_size and embedding_dim are the two arguments given to the Embedding
# layer in the model-definition cell.
vocab_size = 10000
embedding_dim = 16
# NOTE(review): max_length, trunc_type, padding_type and oov_tok are not
# referenced by any Tokenizer / pad_sequences call in the visible part of this
# notebook — confirm whether they were meant to be wired into those calls.
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
# NOTE(review): this re-binds training_size after the train/test slices were
# already taken in the split cell, so it does not affect them. It is not read
# again in the visible part of the notebook — confirm whether it is still needed.
training_size = 20000

In [9]:
# Small bag-of-embeddings binary classifier, assembled layer by layer:
#   Embedding              -> one embedding_dim-sized vector per token id
#   GlobalAveragePooling1D -> average those vectors over the sequence axis
#   Dense(24, relu)        -> hidden layer
#   Dense(1, sigmoid)      -> single probability-style output
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(units=24, activation='relu'))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked as the reporting metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [10]:
# Print the layer-by-layer summary of the model defined and compiled above.
model.summary()

In [12]:
# Convert the Python label lists to NumPy arrays before they are handed to
# model.fit as training / validation targets.
# (numpy is imported with the other dependencies in the first cell.)
training_label = np.array(training_label)
testing_label = np.array(testing_label)


In [13]:
num_epochs = 30

# Train on the padded training sequences and evaluate on the held-out split
# after every epoch. verbose=2 prints one summary line per epoch.
# `history` keeps the returned History object.
history = model.fit(
    x=train_padded,
    y=training_label,
    epochs=num_epochs,
    validation_data=(test_padded, testing_label),
    verbose=2,
)


Epoch 1/30
67/67 - 5s - 77ms/step - accuracy: 0.5562 - loss: 0.6855 - val_accuracy: 0.5608 - val_loss: 0.6836
Epoch 2/30
67/67 - 2s - 25ms/step - accuracy: 0.5641 - loss: 0.6722 - val_accuracy: 0.5608 - val_loss: 0.6783
Epoch 3/30
67/67 - 2s - 24ms/step - accuracy: 0.6081 - loss: 0.6379 - val_accuracy: 0.5633 - val_loss: 0.6521
Epoch 4/30
67/67 - 2s - 24ms/step - accuracy: 0.7898 - loss: 0.5559 - val_accuracy: 0.7930 - val_loss: 0.6051
Epoch 5/30
67/67 - 2s - 24ms/step - accuracy: 0.9110 - loss: 0.4310 - val_accuracy: 0.7094 - val_loss: 0.5521
Epoch 6/30
67/67 - 3s - 44ms/step - accuracy: 0.9382 - loss: 0.3171 - val_accuracy: 0.7997 - val_loss: 0.5032
Epoch 7/30
67/67 - 2s - 26ms/step - accuracy: 0.9569 - loss: 0.2298 - val_accuracy: 0.8021 - val_loss: 0.4750
Epoch 8/30
67/67 - 2s - 26ms/step - accuracy: 0.9743 - loss: 0.1680 - val_accuracy: 0.7957 - val_loss: 0.4595
Epoch 9/30
67/67 - 2s - 26ms/step - accuracy: 0.9822 - loss: 0.1251 - val_accuracy: 0.7956 - val_loss: 0.4541
Epoch 10/3