In [None]:
import pandas as pd
import numpy as np
import string
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import accuracy_score, f1_score


In [None]:
import json

In [None]:
# Load the dataset: each line of the file is parsed as its own JSON document.
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding, and skip blank lines (e.g. a trailing empty line),
# which json.loads would otherwise reject.
with open('Sarcasm_Headlines_Dataset.json', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [None]:
# Build a DataFrame from the list of parsed JSON records
df = pd.DataFrame(data)

In [None]:
# Preview the first rows. `head` has to be *called*: a bare `df.head` only
# displays the bound-method repr rather than the preview table.
df.head()

<bound method NDFrame.head of                                             article_link  \
0      https://www.huffingtonpost.com/entry/versace-b...   
1      https://www.huffingtonpost.com/entry/roseanne-...   
2      https://local.theonion.com/mom-starting-to-fea...   
3      https://politics.theonion.com/boehner-just-wan...   
4      https://www.huffingtonpost.com/entry/jk-rowlin...   
...                                                  ...   
26704  https://www.huffingtonpost.com/entry/american-...   
26705  https://www.huffingtonpost.com/entry/americas-...   
26706  https://www.huffingtonpost.com/entry/reparatio...   
26707  https://www.huffingtonpost.com/entry/israeli-b...   
26708  https://www.huffingtonpost.com/entry/gourmet-g...   

                                                headline  is_sarcastic  
0      former versace store clerk sues over secret 'b...             0  
1      the 'roseanne' revival catches up to our thorn...             0  
2      mom starting to fear so

In [None]:
# Preprocess the text
def preprocess_text(text):
    """Normalize a piece of text.

    Steps, applied in this order: delete every character found in
    ``string.punctuation``, lowercase the result, delete all digit runs,
    and collapse any whitespace runs into single spaces (leading and
    trailing whitespace is dropped).

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Drop punctuation characters one by one
    without_punct = ''.join(ch for ch in text if ch not in string.punctuation)
    lowered = without_punct.lower()
    # Strip out digits
    without_digits = re.sub(r'\d+', '', lowered)
    # split() with no arguments discards all whitespace runs, so re-joining
    # with single spaces also trims both ends
    words = without_digits.split()
    return ' '.join(words)

In [None]:
# Rebuild the frame from the raw records, discard the article_link column,
# and run every headline through preprocess_text
df = (
    pd.DataFrame(data)
    .drop(columns=['article_link'])
    .assign(headline=lambda frame: frame['headline'].apply(preprocess_text))
)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    df['headline'],
    df['is_sarcastic'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Tokenize the text
# The Tokenizer is created with its default settings and its vocabulary is
# fitted on the training headlines only (X_train), not on the test split.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)


In [None]:
# Map each headline to its list of integer word indices, using the fitted
# tokenizer for both the training and the test split
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Bring every sequence to the same length, padding at the end
# (padding='post'), so both splits share one input shape
max_length = 100  # Maximum sequence length
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_length, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)

In [None]:
# Create the LSTM model
embedding_dim = 100  # Dimensionality of the word embeddings
# +1 because Keras Tokenizer word indices start at 1; index 0 is the pad value
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size

model = Sequential()
# mask_zero=True makes the LSTM skip the padded (index 0) timesteps. The inputs
# are post-padded up to max_length, so without masking the LSTM keeps stepping
# over the trailing zeros after the real words before it emits its final state.
# The recorded run (accuracy ~0.55, F1 = 0.0) shows the unmasked model
# predicting only the negative class.
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, mask_zero=True))
model.add(LSTM(128))
# Single sigmoid unit: outputs the probability of the positive (sarcastic) class
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as the tracked metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit for 10 epochs in mini-batches of 64; the test split is passed as
# validation_data, so the per-epoch validation metrics are computed on it
model.fit(
    X_train_pad,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test_pad, y_test),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6035edbd90>

In [None]:
# Score the trained model on the test split
y_pred_prob = model.predict(X_test_pad)
# Round the predicted probabilities to the nearest integer to get 0/1 labels
y_pred = np.rint(y_pred_prob).astype(int)
accuracy, f1 = (
    metric(y_test, y_pred) for metric in (accuracy_score, f1_score)
)

print('Accuracy:', accuracy)
print('F1-score:', f1)

Accuracy: 0.5545987769873955
F1-score: 0.0
