In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import requests
import re
import string

from io import StringIO
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set pandas option to display the full content of all columns, preventing truncation.
pd.set_option('display.max_colwidth', None)

In [7]:
# Load Sarcasm Dataset
url = "https://storage.googleapis.com/learning-datasets/sarcasm.json"
response = requests.get(url)
df = pd.read_json(StringIO(response.text))
print("Dataset loaded successfully!")

print(df.head())

Dataset loaded successfully!
                                                                                              article_link  \
0                      https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5   
1                 https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365   
2            https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697   
3          https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302   
4  https://www.huffingtonpost.com/entry/jk-rowling-wishes-snape-happy-birthday_us_569117c4e4b0cad15e64fdcb   

                                                                               headline  \
0        former versace store clerk sues over secret 'black code' for minority shoppers   
1  the 'roseanne' revival catches up to our thorny political mood, for better and worse   
2       mom starting to fear son's we

In [28]:
result = df[df['is_sarcastic'] == 1]
result

Unnamed: 0,headline,is_sarcastic
2,mom starting to fear son's web series closest thing she will have to grandchild,1
3,"boehner just wants wife to listen, not come up with alternative debt-reduction ideas",1
8,top snake handler leaves sinking huckabee campaign,1
15,nuclear bomb detonates during rehearsal for 'spider-man' musical,1
16,cosby lawyer asks why accusers didn't come forward to be smeared by legal team years ago,1
...,...,...
26693,new bailiff tired of hearing how old bailiff did things,1
26694,breaking: 'the onion' in kill range of boston bomber suspect,1
26695,seaworld crowd applauds for dolphin playfully spraying blood from blowhole,1
26702,pentagon to withhold budget figures out of respect for american families,1


In [10]:
df = df.drop('article_link', axis=1)
print(df.head())

                                                                               headline  \
0        former versace store clerk sues over secret 'black code' for minority shoppers   
1  the 'roseanne' revival catches up to our thorny political mood, for better and worse   
2       mom starting to fear son's web series closest thing she will have to grandchild   
3  boehner just wants wife to listen, not come up with alternative debt-reduction ideas   
4                      j.k. rowling wishes snape happy birthday in the most magical way   

   is_sarcastic  
0             0  
1             0  
2             1  
3             1  
4             0  


In [11]:
df.shape

(26709, 2)

In [12]:
# Define Hyperparameters
vocab_size = 10000    # Define the maximum number of unique words to consider in the vocabulary
max_length = 100      # Set the maximum length for each sentence
oov_tok = "<OOV>"     # Specify the token for out-of-vocabulary words
trunc_type = 'post'   # Determine where to truncate sequences if they exceed max_length ( 'post' means truncate from the end)
padding_type = 'post' # Determine where to add padding if sequences are shorter than max_length ('post' means add padding to the end)
embedding_dim = 16    # Set the dimensionality of the word embeddings

In [14]:
# Prepare data for splitting
sentences = df['headline'].tolist()
labels = df['is_sarcastic'].tolist()

In [15]:
# Split data into training and testing sets
traning_sentence, testing_sentence, traning_labels, testing_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42, stratify=labels
)

In [18]:
# Tokenization and Padding
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(traning_sentence) # Tokenizer learns vocabulary from training sentences

traning_sequences = tokenizer.texts_to_sequences(traning_sentence)
testing_sequences = tokenizer.texts_to_sequences(testing_sentence)

traning_padded = pad_sequences(traning_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(f"\nOriginal sentence example (index 12 from training set): '{traning_sentence[11]}'")
print(f"Padded sequence example (index 12 from training set): {traning_padded[11]}")



Original sentence example (index 12 from training set): 'what aziz ansari, and most straight men, don't get about consent.'
Padded sequence example (index 12 from training set): [  33 3282 3283    9   95  732  311  185   59   17 5967    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [19]:
# Convert to NumPy Arrays
traning_padded = np.array(traning_padded)
traning_labels = np.array(traning_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

print("\nData prepared for TensorFlow model training!")
print(f"Shape of traning_padded: {traning_padded.shape}")
print(f"Shape of traning_labels: {traning_labels.shape}")
print(f"Shape of testing_padded: {testing_padded.shape}")
print(f"Shape of testing_labels: {testing_labels.shape}")


Data prepared for TensorFlow model training!
Shape of traning_padded: (21367, 100)
Shape of traning_labels: (21367,)
Shape of testing_padded: (5342, 100)
Shape of testing_labels: (5342,)


In [22]:
# Define the Model Architecture
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Explicitly build the model for summary display
model.build(input_shape=(None, max_length))

# Compile the Model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Print Model Summary
print("\nModel Summary:")
model.summary()


Model Summary:


In [24]:
# Train the Model
history = model.fit(traning_padded, traning_labels, epochs=30, validation_data=(testing_padded, testing_labels), verbose=2)

# Evaluate the Model
loss, accuracy = model.evaluate(testing_padded, testing_labels, verbose=2)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/30
668/668 - 4s - 6ms/step - accuracy: 0.9199 - loss: 0.2047 - val_accuracy: 0.8480 - val_loss: 0.3500
Epoch 2/30
668/668 - 5s - 8ms/step - accuracy: 0.9235 - loss: 0.1981 - val_accuracy: 0.8478 - val_loss: 0.3702
Epoch 3/30
668/668 - 3s - 5ms/step - accuracy: 0.9286 - loss: 0.1855 - val_accuracy: 0.8480 - val_loss: 0.3720
Epoch 4/30
668/668 - 3s - 4ms/step - accuracy: 0.9276 - loss: 0.1792 - val_accuracy: 0.8521 - val_loss: 0.3535
Epoch 5/30
668/668 - 6s - 9ms/step - accuracy: 0.9376 - loss: 0.1659 - val_accuracy: 0.8282 - val_loss: 0.4338
Epoch 6/30
668/668 - 3s - 5ms/step - accuracy: 0.9403 - loss: 0.1608 - val_accuracy: 0.8540 - val_loss: 0.3675
Epoch 7/30
668/668 - 5s - 8ms/step - accuracy: 0.9361 - loss: 0.1607 - val_accuracy: 0.8514 - val_loss: 0.3844
Epoch 8/30
668/668 - 5s - 7ms/step - accuracy: 0.9429 - loss: 0.1501 - val_accuracy: 0.8448 - val_loss: 0.4139
Epoch 9/30
668/668 - 3s - 5ms/step - accuracy: 0.9438 - loss: 0.1497 - val_accuracy: 0.8428 - val_loss: 0.4045
E

In [29]:

# Function to interpret probability into descriptive categories
def interpret_sarcasm_probability(probability):
    if probability >= 0.9:
        return "Highly Sarcastic"
    elif probability >= 0.7:
        return "Moderately Sarcastic"
    elif probability >= 0.5:
        return "Potentially Sarcastic"
    elif probability >= 0.2:
        return "Low Sarcasm"
    else:
        return "Not Sarcastic"

# testing the model
new_sentences = [
    "I am myself a king of my state",
    "Code was shared in mail or in this chat?",
    "Data engineering is better then data science",
    "My Friend is going to loose",
    "game of thrones season finale showing this sunday",
    "top snake handler leaves sinking huckabee campaign	"
]

# Convert raw new sentences to sequences using the trained tokenizer
new_sequences = tokenizer.texts_to_sequences(new_sentences)

# Pad the new sequences
new_padded = pad_sequences(new_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Make predictions
raw_predictions = model.predict(new_padded)

for i, sentence_original in enumerate(new_sentences):
    probability = raw_predictions[i][0]
    descriptive_status = interpret_sarcasm_probability(probability)

    print(f"Original: '{sentence_original}'")
    print(f"Probability of Sarcasm: {probability:.4f}")
    print(f"Sarcasm Level: {descriptive_status}")
    print("-" * 40)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
Original: 'I am myself a king of my state'
Probability of Sarcasm: 0.0037
Sarcasm Level: Not Sarcastic
----------------------------------------
Original: 'Code was shared in mail or in this chat?'
Probability of Sarcasm: 0.0001
Sarcasm Level: Not Sarcastic
----------------------------------------
Original: 'Data engineering is better then data science'
Probability of Sarcasm: 0.0000
Sarcasm Level: Not Sarcastic
----------------------------------------
Original: 'My Friend is going to loose'
Probability of Sarcasm: 0.9616
Sarcasm Level: Highly Sarcastic
----------------------------------------
Original: 'game of thrones season finale showing this sunday'
Probability of Sarcasm: 0.0569
Sarcasm Level: Not Sarcastic
----------------------------------------
Original: 'top snake handler leaves sinking huckabee campaign	'
Probability of Sarcasm: 0.9950
Sarcasm Level: Highly Sarcastic
--------------------------------------