# Sarcasm Detection      (Total marks: 40)

### Load Data

In [None]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Flatten
from sklearn.model_selection import train_test_split

# Read the dataset, which is parsed as JSON Lines (one JSON object per line).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale, and blank lines are skipped because
# json.loads('') raises JSONDecodeError.
with open('Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Convert the list of parsed records to a DataFrame (one row per record)
df = pd.DataFrame(data)

# Display the shape and the first few rows
print(f"Dataset shape: {df.shape}")
df.head()

### Drop `article_link` from dataset

In [None]:
# Remove the article_link column; it is not used by the rest of the notebook
df = df.drop(columns=['article_link'])
df.head()

### Get length of each headline and add a column for that

In [None]:
# Store the number of whitespace-separated words of every headline
df['headline_length'] = [len(text.split()) for text in df['headline']]

# Summary statistics (count, mean, quartiles, ...) of the word counts
print("Headline length statistics:")
print(df['headline_length'].describe())

# Histogram of the word counts, using 30 bins
plt.figure(figsize=(10, 6))
plt.hist(df['headline_length'], bins=30)
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.title('Distribution of Headline Lengths')
plt.show()

### Initialize parameter values
- Set values for max_features, maxlen, & embedding_size
- max_features: Number of most frequent words to keep from the tokenizer
- maxlen: Maximum length of each sentence (limited to 25 words)
- embedding_size: size of embedding vector

In [None]:
# Upper bound on the tokenizer vocabulary (most frequent words are kept)
max_features = 10_000
# Every headline is padded/truncated to this many tokens
maxlen = 25
# Dimensionality of each word vector (using 50d GloVe embeddings)
embedding_size = 50

### Apply `tensorflow.keras` Tokenizer and get indices for words
- Initialize Tokenizer object with number of words as 10000
- Fit the tokenizer object on headline column
- Convert the text to sequence


In [None]:
# Create a tokenizer whose vocabulary is capped by max_features
tokenizer = Tokenizer(num_words=max_features)

# Learn the vocabulary from the headlines, then encode every headline as a
# list of integer word indices
headlines = df['headline']
tokenizer.fit_on_texts(headlines)
sequences = tokenizer.texts_to_sequences(headlines)

# Show one headline next to its encoded form
sample_idx = 0
print(f"Original headline: {df['headline'][sample_idx]}")
print(f"Tokenized sequence: {sequences[sample_idx]}")

### Pad sequence
- Pad each example with a maximum length
- Convert target column into numpy array

In [None]:
# Bring every encoded headline to exactly maxlen entries
X = pad_sequences(sequences, maxlen=maxlen)

# Labels as a standalone numpy array (copied out of the DataFrame)
y = df['is_sarcastic'].to_numpy(copy=True)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Show the padded form of the sample headline
print(f"\nPadded sequence for sample headline: {X[sample_idx]}")

### Vocab mapping
- No word is mapped to index 0

In [None]:
# Mapping from each word seen by the tokenizer to its integer index
word_index = tokenizer.word_index
print(f"Total unique words: {len(word_index)}")

# Print the first ten (word, index) pairs of the mapping
print("\nSample word-index mappings:")
items = list(word_index.items())
for word, idx in items[:10]:
    print(f"{word}: {idx}")

### Set number of words
- Since index 0 has no word assigned to it, add 1 to the length of the vocabulary

In [None]:
# Number of distinct indices: every known word plus index 0, which is
# reserved for padding
total_indices = len(word_index) + 1

# The vocabulary size never exceeds max_features
vocab_size = total_indices if total_indices < max_features else max_features
print(f"Vocabulary size: {vocab_size}")

### Load Glove Word Embeddings

In [None]:
# Parse the GloVe text file into a {word: vector} dictionary.
# Each line is split on whitespace: the first field is taken as the word and
# the remaining fields as its float32 coefficients.
embeddings_index = {}
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print(f"Found {len(embeddings_index)} word vectors in GloVe.")

### Create embedding matrix

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Row 0 (padding) and the rows of words that have no
# GloVe vector stay all-zeros.
embedding_matrix = np.zeros((vocab_size, embedding_size))

# Count hits while filling the matrix. Testing `np.sum(row) > 0` afterwards
# would miss every word whose vector components sum to zero or less and
# therefore under-report the coverage.
found_words = 0
for word, i in word_index.items():
    if i >= vocab_size:
        # Index lies outside the vocabulary kept for the model
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        found_words += 1

# Report how much of the vocabulary (excluding padding index 0) is covered
# by GloVe; guard against an empty vocabulary to avoid dividing by zero
total_words = vocab_size - 1
coverage = found_words / total_words * 100 if total_words > 0 else 0.0
print(f"Found embeddings for {found_words} words out of {vocab_size-1} words in vocabulary.")
print(f"Coverage: {coverage:.2f}%")

# Split the data into training (80%) and validation (20%) sets; the fixed
# random_state makes the split reproducible
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### Define model
- Hint: Use a Sequential model instance, then add an Embedding layer and a Bidirectional(LSTM) layer, flatten the output, and add dense and dropout layers as required.
Finally, add a dense output layer with sigmoid activation for binary classification.

In [None]:
# Define the model
model = Sequential()

# Declare the input shape explicitly (maxlen integer token ids per example)
# so the model is built right away and summary() can report output shapes.
model.add(tf.keras.Input(shape=(maxlen,)))

# Frozen embedding layer initialised from the pre-trained GloVe matrix.
# A Constant initializer is used instead of the legacy `weights=[...]` and
# `input_length=...` keyword arguments, which newer Keras releases
# deprecate or reject; the resulting layer weights are the same.
model.add(Embedding(vocab_size, embedding_size,
                    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                    trainable=False))

# First Bidirectional LSTM returns the full sequence so that it can feed
# the second recurrent layer
model.add(Bidirectional(LSTM(64, return_sequences=True)))

# Second Bidirectional LSTM returns only its final output
model.add(Bidirectional(LSTM(32)))

# Flatten the output (kept as requested by the exercise hint)
model.add(Flatten())

# Dense layer followed by dropout for regularization
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

# Output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Display model summary
model.summary()

### Compile the model

In [None]:
# Training configuration: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as the tracked metric
compile_options = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

### Fit the model

In [None]:
# Early stopping watches val_loss with a patience of 3 epochs and restores
# the best weights seen during training
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=3,
                                                  restore_best_weights=True)

# Train for at most 10 epochs in batches of 64, validating on the held-out
# split after every epoch
validation_set = (X_val, y_val)
history = model.fit(X_train, y_train,
                    validation_data=validation_set,
                    callbacks=[early_stopping],
                    batch_size=64,
                    epochs=10)

# Final loss and accuracy on the validation set
loss, accuracy = model.evaluate(*validation_set)
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

# Draw the training curves side by side: accuracy on the left, loss on the
# right, each with its training and validation series
plt.figure(figsize=(12, 5))

# (subplot position, training key, validation key, title, y-axis label)
panels = [
    (1, 'accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
    (2, 'loss', 'val_loss', 'Model loss', 'Loss'),
]
for position, train_key, val_key, panel_title, y_label in panels:
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

plt.tight_layout()
plt.show()