In [1]:
import json

In [2]:
!wget --no-check-certificate \
    https://storage.googleapis.com/learning-datasets/sarcasm.json \
    -O /tmp/sarcasm.json

--2024-02-25 11:45:46--  https://storage.googleapis.com/learning-datasets/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.24.59, 2404:6800:4006:804::201b
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.24.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2024-02-25 11:45:48 (4.79 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [3]:
# Parse the downloaded dataset into `ds`. The encoding is given explicitly because JSON
# text is UTF-8, while open() would otherwise fall back to the platform default encoding.
with open("/tmp/sarcasm.json", "r", encoding="utf-8") as f:
    ds = json.load(f)

In [4]:
# dataset: three parallel lists, one slot per dataset entry
sentences, labels, urls = [], [], []

In [5]:
# Rebuild the three parallel lists from `ds` in one pass each. Assigning fresh lists
# (instead of appending to existing ones) keeps this cell idempotent: running it twice
# no longer duplicates every headline, label and URL.
sentences = [entry["headline"] for entry in ds]
labels = [entry["is_sarcastic"] for entry in ds]
urls = [entry["article_link"] for entry in ds]

In [6]:
# Sanity check: how many headlines were collected from the dataset
print(len(sentences))

26709


In [7]:
# Split in dataset order (no shuffling): the first 70% of entries are used for training,
# the remainder for testing. The cut point is computed once and shared by X and y.
split_at = int(len(sentences) * 0.7)

train_X, test_X = sentences[:split_at], sentences[split_at:]
train_y, test_y = labels[:split_at], labels[split_at:]

Modelling

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-02-25 11:45:59.753979: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-25 11:46:01.900881: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-25 11:46:01.900983: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-25 11:46:02.220150: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-25 11:46:02.914945: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-25 11:46:02.916323: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [9]:
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=10000,
)
# Fit on the training headlines only, so the vocabulary contains no test-set words
tokenizer.fit_on_texts(train_X)

In [10]:
# Number of entries in the tokenizer's index-to-word mapping after fitting.
# NOTE(review): the displayed value (24791) is larger than num_words=10000, so the cap
# evidently is not applied to this mapping itself.
len(tokenizer.index_word)

24791

In [26]:
# exporting the tokenizer
tokenizer_json = tokenizer.to_json()
# Encode the value returned by to_json() with json.dumps.
# NOTE(review): Keras documents Tokenizer.to_json() as returning a JSON string rather than
# a dict, so this step presumably adds a second layer of JSON encoding; a loader would then
# need json.load() on the file before calling tokenizer_from_json() — confirm against the consumer.
tokenizer_json_str = json.dumps(tokenizer_json)

# Write the JSON-formatted string to "tokenizer.json" in the current working directory
with open("tokenizer.json", "w") as json_file:
    json_file.write(tokenizer_json_str)



In [11]:
# training: turn each headline into a list of word indices, then pad at the end
# ("post") so all rows share one length
training_sequences = tokenizer.texts_to_sequences(train_X)
training_padded_sequences = pad_sequences(
    training_sequences,
    padding="post",
)

In [12]:
# Shape of the padded training matrix: (number of headlines, padded sequence length)
print(training_padded_sequences.shape)

(18696, 40)


In [13]:
# Number of training labels; should equal the first dimension of the padded training matrix
print(len(train_y))

18696


In [14]:
# testing
testing_sequences = tokenizer.texts_to_sequences(test_X)
# Pad/truncate the test rows to exactly the width of the training matrix. Deriving the
# width from the training data (rather than hard-coding 40) keeps the test input in sync
# with what the model is built for if the training split or dataset changes.
testing_padded_sequences = pad_sequences(
    testing_sequences,
    padding="post",
    maxlen=training_padded_sequences.shape[1],
)

In [15]:
# embedding and model
import tensorflow as tf
import numpy as np

In [16]:
# The Embedding layer maps discrete word indices to trainable dense vectors of a fixed size.
# GlobalAveragePooling1D then averages those vectors over the sequence dimension, producing
# one fixed-size vector per headline.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        tokenizer.num_words,  # vocabulary size, read from the tokenizer so the two cannot drift apart
        16,  # embedding size (output dimension per word)
        input_length=training_padded_sequences.shape[1],  # padded sequence length
    ),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # single score in [0, 1] for the binary label
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 16)            160000    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 24)                408       
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160433 (626.69 KB)
Trainable params: 160433 (626.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy as the metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Train on the padded training sequences; `history` keeps the per-epoch metrics
history = model.fit(
    x=np.array(training_padded_sequences),
    y=np.array(train_y),
    epochs=30,
    verbose=2,
)

Epoch 1/30
585/585 - 2s - loss: 0.5862 - accuracy: 0.6753 - 2s/epoch - 3ms/step
Epoch 2/30
585/585 - 1s - loss: 0.3302 - accuracy: 0.8683 - 1s/epoch - 2ms/step
Epoch 3/30
585/585 - 1s - loss: 0.2455 - accuracy: 0.9042 - 1s/epoch - 2ms/step
Epoch 4/30
585/585 - 1s - loss: 0.1991 - accuracy: 0.9238 - 1s/epoch - 2ms/step
Epoch 5/30
585/585 - 1s - loss: 0.1639 - accuracy: 0.9404 - 1s/epoch - 2ms/step
Epoch 6/30
585/585 - 1s - loss: 0.1391 - accuracy: 0.9498 - 1s/epoch - 2ms/step
Epoch 7/30
585/585 - 1s - loss: 0.1193 - accuracy: 0.9590 - 1s/epoch - 2ms/step
Epoch 8/30
585/585 - 1s - loss: 0.1027 - accuracy: 0.9659 - 1s/epoch - 2ms/step
Epoch 9/30
585/585 - 1s - loss: 0.0894 - accuracy: 0.9703 - 1s/epoch - 2ms/step
Epoch 10/30
585/585 - 1s - loss: 0.0786 - accuracy: 0.9744 - 1s/epoch - 2ms/step
Epoch 11/30
585/585 - 1s - loss: 0.0699 - accuracy: 0.9790 - 1s/epoch - 2ms/step
Epoch 12/30
585/585 - 1s - loss: 0.0600 - accuracy: 0.9823 - 1s/epoch - 2ms/step
Epoch 13/30
585/585 - 1s - loss: 0.05

In [18]:
# saving the model
# Writes the trained model to "sarcasm_detection.h5" in the current working directory.
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format — confirm.
model.save("sarcasm_detection.h5")

  saving_api.save_model(


In [19]:
# testing
# Score the padded test sequences with the trained model; the final layer is a sigmoid,
# so each prediction is a continuous score rather than a hard 0/1 label.
y_pred = model.predict(testing_padded_sequences)



In [23]:
# Turn the continuous model scores into hard 0/1 labels: a score >= threshold becomes 1.
# The thresholding is done inline with NumPy so this cell runs on a fresh kernel without
# relying on a helper function that is only defined further down the notebook.
threshold = 0.5
# reshape(-1) flattens the (n, 1) prediction column into n scores; tolist() keeps the
# result a plain list of ints.
binary_predictions = (np.asarray(y_pred).reshape(-1) >= threshold).astype(int).tolist()

In [24]:
from sklearn.metrics import accuracy_score
# scikit-learn's metric signature is (y_true, y_pred). Accuracy happens to be symmetric,
# but passing the arguments in the documented order avoids silently wrong results if this
# call is later swapped for an asymmetric metric such as precision or recall.
accuracy_score(test_y, binary_predictions)

0.7982029202545863

In [21]:
def convert_to_binary(y_pred, threshold=0.5):
    """
    Map each continuous prediction to a hard 0/1 label.

    Args:
    - y_pred (iterable): Continuous prediction scores, visited in order.
    - threshold (float): Cut-off; a score greater than or equal to it is labelled 1.

    Returns:
    - list of int: One label (0 or 1) per element of y_pred, in the same order.
    """
    binary_labels = []
    for score in y_pred:
        if score >= threshold:
            binary_labels.append(1)
        else:
            binary_labels.append(0)
    return binary_labels