In [6]:
import json

In [7]:
!wget --no-check-certificate \
    https://storage.googleapis.com/learning-datasets/sarcasm.json \
    -O /tmp/sarcasm.json

--2024-02-21 13:37:33--  https://storage.googleapis.com/learning-datasets/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.24.59, 2404:6800:4006:804::201b
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.24.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2024-02-21 13:37:35 (5.00 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [74]:
# Read the downloaded JSON file. The encoding is pinned to UTF-8 so parsing
# does not depend on the platform's locale default.
with open("/tmp/sarcasm.json", "r", encoding="utf-8") as f:
    ds = json.load(f)

In [75]:
# dataset: empty containers for headlines, sarcasm labels and article links
sentences, labels, urls = [], [], []

In [76]:
# Rebuild the three parallel lists from scratch on every run. Appending to
# lists created in an earlier cell would duplicate every entry whenever this
# cell is executed more than once.
sentences = [entry["headline"] for entry in ds]
labels = [entry["is_sarcastic"] for entry in ds]
urls = [entry["article_link"] for entry in ds]

In [77]:
# Sanity check: number of headlines collected from the dataset.
print(len(sentences))

26709


In [78]:
# Positional 70/30 split (no shuffling): the first 70% of the headlines form
# the training set, the remainder the test set.
split_index = int(len(sentences) * 0.7)

train_X, test_X = sentences[:split_index], sentences[split_index:]
train_y, test_y = labels[:split_index], labels[split_index:]

Modelling

In [79]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [80]:
# Build the vocabulary from the training headlines only, so the test split
# stays unseen during preprocessing.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=10000,
)
tokenizer.fit_on_texts(train_X)

In [81]:
# Size of the tokenizer's full index -> word mapping after fitting on train_X.
# NOTE(review): the value shown exceeds num_words=10000, so the cap is
# presumably applied only when texts are converted to sequences — confirm.
len(tokenizer.index_word)

24791

In [82]:
# training: encode each headline as word indices, then pad at the end so all
# rows share one length
training_sequences = tokenizer.texts_to_sequences(train_X)
training_padded_sequences = pad_sequences(
    sequences=training_sequences,
    padding="post",
)

In [83]:
# Sanity check: (number of training headlines, padded sequence length).
print(training_padded_sequences.shape)

(18696, 40)


In [84]:
# Sanity check: the label count should equal the number of rows in the padded
# training matrix.
print(len(train_y))

18696


In [94]:
# testing
# Pad/truncate to the width of the training matrix rather than a hard-coded
# 40, so the test input always matches the sequence length the model is built
# for, even if the training data (and its longest headline) changes.
testing_sequences = tokenizer.texts_to_sequences(test_X)
testing_padded_sequences = pad_sequences(
    testing_sequences,
    padding="post",
    maxlen=training_padded_sequences.shape[1],
)

In [86]:
# embedding and model
import tensorflow as tf
import numpy as np

In [87]:
# Embedding maps discrete categorical variables (here, word indices) to
# continuous vectors of fixed size.
# GlobalAveragePooling1D averages each embedding channel over the sequence
# dimension, producing one fixed-size vector per headline.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        tokenizer.num_words,  # vocabulary size, read from the tokenizer so the two cannot drift apart
        16,  # output size / embedding size
        input_length=training_padded_sequences.shape[1],  # padded sequence length
    ),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # single sigmoid unit for the binary label
])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 40, 16)            160000    
                                                                 
 global_average_pooling1d_5  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_10 (Dense)            (None, 24)                408       
                                                                 
 dense_11 (Dense)            (None, 1)                 25        
                                                                 
Total params: 160433 (626.69 KB)
Trainable params: 160433 (626.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [88]:
# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Train on the padded training headlines; `history` keeps the per-epoch metrics.
history = model.fit(
    x=np.array(training_padded_sequences),
    y=np.array(train_y),
    epochs=30,
    verbose=2,
)

Epoch 1/30
585/585 - 1s - loss: 0.6008 - accuracy: 0.6693 - 1s/epoch - 2ms/step
Epoch 2/30
585/585 - 1s - loss: 0.3377 - accuracy: 0.8641 - 866ms/epoch - 1ms/step
Epoch 3/30
585/585 - 1s - loss: 0.2519 - accuracy: 0.8981 - 864ms/epoch - 1ms/step
Epoch 4/30
585/585 - 1s - loss: 0.2027 - accuracy: 0.9216 - 858ms/epoch - 1ms/step
Epoch 5/30
585/585 - 1s - loss: 0.1677 - accuracy: 0.9392 - 964ms/epoch - 2ms/step
Epoch 6/30
585/585 - 1s - loss: 0.1423 - accuracy: 0.9487 - 863ms/epoch - 1ms/step
Epoch 7/30
585/585 - 1s - loss: 0.1205 - accuracy: 0.9572 - 856ms/epoch - 1ms/step
Epoch 8/30
585/585 - 1s - loss: 0.1037 - accuracy: 0.9645 - 853ms/epoch - 1ms/step
Epoch 9/30
585/585 - 1s - loss: 0.0906 - accuracy: 0.9708 - 852ms/epoch - 1ms/step
Epoch 10/30
585/585 - 1s - loss: 0.0789 - accuracy: 0.9743 - 855ms/epoch - 1ms/step
Epoch 11/30
585/585 - 1s - loss: 0.0689 - accuracy: 0.9786 - 855ms/epoch - 1ms/step
Epoch 12/30
585/585 - 1s - loss: 0.0592 - accuracy: 0.9821 - 862ms/epoch - 1ms/step
Epoc

In [96]:
# testing
# The network ends in a single sigmoid unit, so y_pred holds one continuous
# score per padded test headline.
y_pred = model.predict(testing_padded_sequences)



In [100]:
threshold = 0.5
# Threshold the scores inline with NumPy. `convert_to_binary` is only defined
# in a later cell, so calling it here raises NameError on a fresh
# Restart & Run All. The result is the same flat list of 0/1 ints.
binary_predictions = (np.asarray(y_pred).reshape(-1) >= threshold).astype(int).tolist()

In [101]:
from sklearn.metrics import accuracy_score

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is unchanged, but the conventional order keeps this
# call correct if it is later swapped for precision, recall or F1.
accuracy_score(test_y, binary_predictions)

0.7977037314364158

In [99]:
def convert_to_binary(y_pred, threshold=0.5):
    """
    Convert continuous predictions to binary labels using a threshold.

    The comparison is vectorised with NumPy instead of looping over the
    predictions and relying on the truth value of one-element arrays.

    Args:
    - y_pred (array-like): Continuous predictions, either a flat sequence of
      scores or an (n, 1) column such as the output of a single sigmoid unit.
    - threshold (float): Scores greater than or equal to this value map to 1,
      all others to 0.

    Returns:
    - list[int]: One 0/1 label per prediction.

    Raises:
    - TypeError: If y_pred is a scalar rather than a sequence.
    - ValueError: If a single prediction holds more than one value.
    """
    scores = np.asarray(y_pred)
    if scores.ndim == 0:
        raise TypeError("y_pred must be a sequence of predictions, not a scalar")
    # Only trailing singleton axes can be flattened without ambiguity; anything
    # else would mean several scores per prediction.
    if scores.size != scores.shape[0]:
        raise ValueError(
            f"expected one score per prediction, got array of shape {scores.shape}"
        )
    return (scores.reshape(-1) >= threshold).astype(int).tolist()