Dataset: https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection?resource=download


In [61]:
# Imports
import json
import pandas as pd
import numpy as np
import nltk 
import torch
import gensim as gs
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, auc, roc_curve

In [2]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Mount the drive
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [6]:
# use GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
%cd /content/drive/'My Drive'/'Clubs'/'CAIS++'/'Winter Project'

/content/drive/My Drive/Clubs/CAIS++/Winter Project


Parse the JSON File


In [8]:
# Read in data as a list
def parse_data(file):
    for l in open(file,'r'):
        yield json.loads(l)

data = list(parse_data('./data/Sarcasm_Headlines_Dataset.json'))

In [9]:
# Display part of the data
data[0:3]

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1}]

Convert to dataframe format


In [10]:
# Create three lists

texts= []
labels = []
# urls = []

for entry in data:
  labels.append(entry['is_sarcastic'])
  texts.append(entry['headline'])
  # urls.append(entry['article_link'])

In [11]:
# Create dataframe
df = pd.DataFrame()
df['texts'] = texts
df['labels'] = labels

In [12]:
# Display dataframe
df.head()

Unnamed: 0,texts,labels
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [13]:
# Check how many headlines are sarcastic vs. not

yes = 0
no = 0

for num in df['labels']:
  if num == 0:
    no += 1
  else:
    yes += 1

print('Sarcastic Headlines: ' + str(yes))
print('Non-Sarcastic Headlines: ' + str(no))

Sarcastic Headlines: 11724
Non-Sarcastic Headlines: 14985


Preprocess the Texts


In [14]:
# Make everything lower case
for i in df.index:
  df.at[i, 'texts'] = df.at[i, 'texts'].lower()

In [15]:
# Remove stop words
stop = stopwords.words('english')

for i in df.index:
  old = df.at[i, 'texts'].split()
  new = []

  for w in old:
    if w not in stop:
      new.append(w)
  
  df.at[i, 'texts'] = ' '.join(new)


In [16]:
# Remove all punctuation and numbers

remove = ['`', '!', '@', '#', '$', '%', '^', '&' ,'*' ,'(',')','_','+','\\','-','=','[',']','{','}',';',':','"','|',',','.','<','>','/','?','~', '\'', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

for i in df.index:
  for char in remove:
    df.at[i, 'texts'] = df.at[i, 'texts'].replace(char, '')


In [17]:
# Create a preprocess function that does the above tasks on any string

def preprocess(s):
  # Lower case
  s = s.lower()

  # Remove stop words
  stop = stopwords.words('english')

  old = s.split()
  new = []

  for w in old:
    if w not in stop:
      new.append(w)
  
  s = ' '.join(new)


  # Remove punctuation and numbers
  remove = ['`', '!', '@', '#', '$', '%', '^', '&' ,'*' ,'(',')','_','+','\\','-','=','[',']','{','}',';',':','"','|',',','.','<','>','/','?','~', '\'', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

  for char in remove:
    s = s.replace(char, '')
  
  return s

In [18]:
example = '30 Scientists Land on the Mars Today!'
print( preprocess( example ))

 scientists land mars today


Split into Training and Testing Data

In [35]:
traindf, testdf = train_test_split(df, test_size=0.2, random_state=42)
traindf.head()

Unnamed: 0,texts,labels
17712,annoying ad turns man prowhaling,1
24708,david cameron scottish people ill kill leave,1
5394,report texting driving okay look every couple ...,1
15813,verizon introduces new chargeyouatwhim plan,1
3429,kim kardashian wants everyone honest kanye,0


Create Word2Vec Model


In [36]:
# Split into list of lists
words = []
for headline in traindf['texts'].values:
  words.append(headline.split())

In [37]:
# Create word2vec model
w2v = gs.models.word2vec.Word2Vec(size=256, window=8, min_count=1)
w2v.build_vocab(words)

# vocab size
len(w2v.wv.vocab)

24784

In [38]:
# Train model
numEpochs=50
w2v.train(words, total_examples=len(words), epochs=numEpochs)

(7391834, 7475150)

In [41]:
w2v.wv.most_similar("storm")

[('injures', 0.8085179924964905),
 ('wounds', 0.8039955496788025),
 ('torrential', 0.8025774955749512),
 ('marys', 0.7939261198043823),
 ('cousin', 0.7916151285171509),
 ('collapse', 0.7851724028587341),
 ('defibrillator', 0.7834933996200562),
 ('barred', 0.7831355333328247),
 ('quash', 0.7774592638015747),
 ('waldorf', 0.7724045515060425)]

In [46]:
# Tokenize (unique token to each word)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(traindf['texts'])

# Number of total words
totalWords = len(tokenizer.word_index) + 1
print("Total Words: " + str(totalWords))

Total Words: 24785


In [49]:
# Use pad_sequences() to create train and test data
Xtrain = pad_sequences(tokenizer.texts_to_sequences(traindf.texts), maxlen=256)
Xtest = pad_sequences(tokenizer.texts_to_sequences(testdf.texts), maxlen=256)

Xtrain.shape

(21367, 256)

In [50]:
# Now use targets (0 = not sarcastic, 4 = sarcastic)
Ytrain = traindf.labels
Ytest = testdf.labels

Ytrain.shape

(21367,)

Create the LSTM Model and train it using K-Fold Cross Validation

In [53]:
# Create an embedding matrix (word to embedding of the word)

emb_mat = np.zeros((totalWords, 256))

for word, i in tokenizer.word_index.items():
  if word in w2v.wv:
    emb_mat[i] = w2v.wv[word]


# Embedding layer (first layer of LSTM)
emb_layer = Embedding(totalWords, 256, weights=[emb_mat], input_length=256, trainable=False)

In [63]:
# https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-k-fold-cross-validation-with-keras.md

kfold = KFold(n_splits=3, shuffle=True)

fold = 1

inputs = np.concatenate((Xtrain, Xtest), axis=0)
targets = np.concatenate((Ytrain, Ytest), axis=0)

acc_per_fold = []
loss_per_fold = []

# Save models for predictions
models = []

for train, test in kfold.split(inputs, targets):

  # Define the model architecture
  model = Sequential()
  model.add(emb_layer)
  model.add(Dropout(0.4))
  model.add(LSTM(100, dropout=0.15, recurrent_dropout=0.15))
  model.add(Dense(1, activation='sigmoid'))

  # Compile model
  model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold} ...')

  # Fit data to model
  model_history=model.fit(inputs[train], targets[train], batch_size=1024,epochs=10,validation_split=0.1,verbose=1)

  # Generate generalization metrics
  #scores = model.evaluate(inputs[test], targets[test], verbose=0)
  #print(scores)
  #print(f'Score for fold {fold}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  #acc_per_fold.append(scores[1] * 100)
  #loss_per_fold.append(scores[0])


  scores = model.predict(inputs[test], verbose=1, batch_size=2048)
  Ypred2 = np.where(scores>0.5,1,0)
  Ytest2 = targets[test]

  cm=confusion_matrix(Ypred2, Ytest2)
  print(cm)
  print("True Positives: " + str(cm[0][0]))
  print("False Negatives: " + str(cm[0][1]))
  print("False Positives: " + str(cm[1][0]))
  print("True Negatives: " + str(cm[1][1]))
  print()

  print(classification_report(Ytest2, Ypred2))
  print()

  fpr, tpr, thresholds = roc_curve(Ytest2, Ypred2, pos_label = 1)
  print("AUC: " + str(auc(fpr, tpr)))
  print()
  print()


  # Increase fold number
  fold = fold + 1

  # Save model
  models.append(model)



------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




[[4202 1259]
 [ 811 2631]]
True Positives: 4202
False Negatives: 1259
False Positives: 811
True Negatives: 2631

              precision    recall  f1-score   support

           0       0.77      0.84      0.80      5013
           1       0.76      0.68      0.72      3890

    accuracy                           0.77      8903
   macro avg       0.77      0.76      0.76      8903
weighted avg       0.77      0.77      0.77      8903


AUC: 0.7572851203836606


------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




[[3993 1038]
 [ 932 2940]]
True Positives: 3993
False Negatives: 1038
False Positives: 932
True Negatives: 2940

              precision    recall  f1-score   support

           0       0.79      0.81      0.80      4925
           1       0.76      0.74      0.75      3978

    accuracy                           0.78      8903
   macro avg       0.78      0.77      0.78      8903
weighted avg       0.78      0.78      0.78      8903


AUC: 0.7749131390158562


------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[4518 1477]
 [ 529 2379]]
True Positives: 4518
False Negatives: 1477
False Positives: 529
True Negatives: 2379

              precision    recall  f1-score   support

           0       0.75      0.90      0.82      5047
           1       0.82      0.62      0.70      3856

    accuracy                           0.77     

In [65]:
 # Second out of the three models has the best accuracy metrics
 models[1]

<keras.engine.sequential.Sequential at 0x7f2d7c771d00>

In [67]:
# Create functions to help with preprocessing and predicting
def predict(s, index):
  # Preprocess
  s = preprocess(s)
  s =pad_sequences(tokenizer.texts_to_sequences([s]), maxlen=256)

  # Predict
  score = models[index].predict(s)
  score = score[0]
  if score < 0.5:
    print("Not Sarcastic: " + str(score))
  else:
    print("Sarcastic: " + str(score))


Examples of Headlines

In [68]:
# https://www.huffpost.com/entry/im-cold-makeup-tiktok_l_63a0ca5fe4b0f4895ade22b5
headline = "TikTok's 'I'm Cold' Makeup Trend Makes You Look Unwell, But People Love It Anyway"
predict(headline, 1)

Not Sarcastic: [0.37517497]


In [72]:
# https://www.theonion.com/fetterman-struggling-to-adapt-to-size-of-capitol-buildi-1849773669
headline = "Fetterman Struggling To Adapt To Size Of Capitol Building"
predict(headline, 1)

Sarcastic: [0.6920613]
