# Task for Today  

***

## News Headline Sarcasm Detection  
  
Given *news headlines*, let's try to predict whether a given headline contains **sarcasm**.  
  
We will use a TensorFlow/Keras text model with word embeddings to make our predictions.

# Getting Started

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import tensorflow as tf

from sklearn.metrics import confusion_matrix, classification_report

In [2]:
data = pd.read_json('NLP DataSet - Sarcasm Detection/Sarcasm_Headlines_Dataset (1).json', lines=True)

In [3]:
data

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


# Preprocessing

In [5]:
def get_sequences(texts, tokenizer, train=True, max_seq_length=None):
    sequences = tokenizer.texts_to_sequences(texts)
    
    if train == True:
        max_seq_length = np.max(list(map(len, sequences)))
    
    sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_seq_length, padding='post')
    
    return sequences

In [6]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop article_link column
    df = df.drop('article_link', axis=1)
    
    # Split df into X and y
    y = df['is_sarcastic']
    X = df['headline']
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    # Create and fit tokenizer
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(X_train)
    
    print("Vocab length:", len(tokenizer.word_index) + 1)
    
    # Get sequence data
    X_train = get_sequences(texts=X_train, tokenizer=tokenizer, train=True)
    X_test = get_sequences(texts=X_test, tokenizer=tokenizer, train=False, max_seq_length=X_train.shape[1])
    
    print("Sequence length:", X_train.shape[1])
    
    return X_train, X_test, y_train, y_test

In [7]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

Vocab length: 25849
Sequence length: 152


In [8]:
X_train

array([[   13,  1176,   126, ...,     0,     0,     0],
       [   90,  1413,   161, ...,     0,     0,     0],
       [   82,   160,   100, ...,     0,     0,     0],
       ...,
       [   27,   171, 12537, ...,     0,     0,     0],
       [ 1907,  3385,   294, ...,     0,     0,     0],
       [   54,     4,  1041, ...,     0,     0,     0]])

In [9]:
y_train.value_counts()

0    10533
1     9500
Name: is_sarcastic, dtype: int64

# Training

In [10]:
inputs = tf.keras.Input(shape=(152,))
x = tf.keras.layers.Embedding(input_dim=24846, output_dim=64)(inputs)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

print(model.summary())

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 152)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 152, 64)           1590144   
_________________________________________________________________
flatten (Flatten)            (None, 9728)              0         
_________________________________________________________________
dense (Dense)                (None, 128)               1245312   
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 2,852,097
Trainable params: 2,852,097
Non-trainable params: 0
___________________________________________________

In [11]:
history = model.fit(X_train, y_train, validation_split=0.2, batch_size=32, epochs=100, callbacks = 
                    [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)])

Epoch 1/100

InvalidArgumentError:  indices[29,2] = 24851 is not in [0, 24846)
	 [[node model/embedding/embedding_lookup (defined at <ipython-input-11-b7c54bdf50fb>:1) ]] [Op:__inference_test_function_3015]

Errors may have originated from an input operation.
Input Source operations connected to node model/embedding/embedding_lookup:
 model/embedding/embedding_lookup/2762 (defined at C:\Users\jaspr\anaconda3\lib\contextlib.py:113)

Function call stack:
test_function


# Results

In [None]:
results = model.evaluate(X_test, y_test, verbose=0)

print("Accuracy: {:.2f}%".format(results[1] * 100))
print("     AUC: {:.5f}".format(results[2]))

y_pred = np.squeeze(model.predict(X_test) >= 0.5).astype(np.int)
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
clr = classification_report(y_test, y_pred, labels=[0, 1], target_names=["No Sarcasm", "Sarcasm"])

plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='g', vmin=0, cmap='Blues', cbar=False)
plt.xticks(ticks=[0.5, 1.5], labels=["No Sarcasm", "Sarcasm"])
plt.yticks(ticks=[0.5, 1.5], labels=["No Sarcasm", "Sarcasm"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

print("Classification Report:\n----------------------\n", clr)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/fpvnx9pNRYc