In [93]:
# %pip install opendatasets
# %pip install prophet

import opendatasets as od
import pandas as pd
import numpy as np
import prophet

od.download('https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection')


Skipping, found downloaded files in ".\news-headlines-dataset-for-sarcasm-detection" (use force=True to force download)


In [94]:
import pandas as pd

df = pd.read_json('news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json', lines=True)
df.head(5)

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [95]:
df.shape

(28619, 3)

In [96]:
labels = df['is_sarcastic']
sentences = df['headline']

In [97]:
sentences

0        thirtysomething scientists unveil doomsday clo...
1        dem rep. totally nails why congress is falling...
2        eat your veggies: 9 deliciously different recipes
3        inclement weather prevents liar from getting t...
4        mother comes pretty close to using word 'strea...
                               ...                        
28614         jews to celebrate rosh hashasha or something
28615    internal affairs investigator disappointed con...
28616    the most beautiful acceptance speech this week...
28617    mars probe destroyed by orbiting spielberg-gat...
28618                   dad clarifies this not a food stop
Name: headline, Length: 28619, dtype: object

In [98]:
from gensim.parsing import strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
from gensim.parsing import  preprocess_string
import re


# Custom filter method
transform_to_lower = lambda s: s.lower()

remove_single_char = lambda s: re.sub(r'\s+\w{1}\s+', '', s)

# Filters to be executed in pipeline
CLEAN_FILTERS = [strip_tags,
                strip_numeric,
                strip_punctuation, 
                strip_multiple_whitespaces, 
                transform_to_lower,
                # remove_stopwords,
                remove_single_char]

# Method does the filtering of all the unrelevant text elements
def cleaning_pipe(document):
    # Invoking gensim.parsing.preprocess_string method with set of filters
    processed_words = preprocess_string(document, CLEAN_FILTERS)
    
    return processed_words


In [99]:
sentences_processed = sentences.apply(cleaning_pipe)

In [100]:
print(sentences[0])
print(sentences_processed[0])

thirtysomething scientists unveil doomsday clock of hair loss
['thirtysomething', 'scientists', 'unveil', 'doomsday', 'clock', 'of', 'hair', 'loss']


In [101]:
# split data into training and testing sets
from sklearn.model_selection import train_test_split

train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences, labels, test_size=0.2, random_state=42)


In [102]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras import layers, models, Sequential

vocab_size = 50000
maxlen = 200

tokenizer = Tokenizer( num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

In [103]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=maxlen, truncating='post')
print(train_sentences[0])
print(train_sequences[0])

thirtysomething scientists unveil doomsday clock of hair loss
[27, 13, 109, 638, 17, 781, 67, 4774, 5, 43, 1939]


In [104]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=maxlen, truncating='post')

In [105]:
import tensorflow as tf

#  prefetch and cache data for faster training

train_dataset = tf.data.Dataset.from_tensor_slices((train_padded, train_labels))
train_dataset = train_dataset.batch(32, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((test_padded, test_labels))
test_dataset = test_dataset.batch(32, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)

In [106]:
model = Sequential([
    layers.Embedding(vocab_size, 16, input_length=maxlen),
    layers.GlobalAveragePooling1D(),
    layers.Dense(24, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

In [107]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [108]:
# model.fit(train_padded, train_labels, epochs=30, validation_data=(test_padded, test_labels), verbose=1)

In [109]:
model_lstm = Sequential([
    layers.Embedding(vocab_size, 64),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(32)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
model_lstm.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, None, 64)          3200000   
                                                                 
 bidirectional_12 (Bidirecti  (None, None, 128)        66048     
 onal)                                                           
                                                                 
 bidirectional_13 (Bidirecti  (None, 64)               41216     
 onal)                                                           
                                                                 
 dense_24 (Dense)            (None, 64)                4160      
                                                                 
 dense_25 (Dense)            (None, 1)                 65        
                                                                 
Total params: 3,311,489
Trainable params: 3,311,489
N

In [110]:
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [111]:
model_lstm.fit(train_padded, train_labels, epochs=50, validation_data=(test_padded, test_labels), verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2417ec3f970>

In [112]:
# KNN
# y_pred_knn = knn.predict(X_test)

# print("\nAccuracy-",accuracy_score(y_test, y_pred_knn),'\n')
# cm = confusion_matrix(y_test, y_pred_knn)
# sns.heatmap(cm, annot=True)
# print(classification_report(y_test,y_pred_knn))