## Sarcasm Detection in News Headlines

In this notebook, I fine-tune a pretrained BERT model with PyTorch and Hugging Face Transformers to classify news headlines as sarcastic or not sarcastic.

In [1]:
!pip install transformers



In [2]:
!pip install tqdm



### 1. Importing Libraries

In [33]:

import torch
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# BERT model
from transformers import AutoTokenizer, BertForSequenceClassification

from tqdm.auto import tqdm
from functools import partial
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import DataCollatorWithPadding

# T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration

import warnings
import string
warnings.filterwarnings("ignore")

In [5]:
# Colab-only step: upload the dataset JSON from the local machine into the runtime.
from google.colab import files
# NOTE(review): the result is stored in `uploades` (sic) but is not referenced again in the visible cells.
uploades = files.upload()

Saving Sarcasm_Headlines_Dataset_v2.json to Sarcasm_Headlines_Dataset_v2 (2).json


### 2. Importing dataset

In [6]:
# lines=True reads the file as newline-delimited JSON (one record per line).
df = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
# Preview the first rows; the columns are is_sarcastic, headline and article_link.
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [7]:
# The tokenizer and the model weights are loaded from the same Hugging Face checkpoint.
PRETRAINED_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

# The checkpoint's classifier shape differs from the 2-label head requested here;
# ignore_mismatched_sizes=True lets that head be newly initialised instead of raising.
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Select the GPU (CUDA) if one is available, otherwise the CPU

In [8]:
# Use the CUDA GPU when PyTorch reports one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [9]:
# Move the model to the previously selected `device` instead of hard-coding "cuda",
# so the notebook also runs on CPU-only runtimes.
model = model.to(device)

### 3. Collator

In [11]:
def collator(batch, tokenizer=None, device=None):
  """Pad a batch of tokenized examples to equal length and convert it to tensors.

  Args:
    batch: list of dict-like examples; each must contain "input_ids", and every
      value other than "labels" must be a Python list.
    tokenizer: object exposing `pad_token_id`, used to pad "input_ids".
    device: target passed to `Tensor.to` for every output tensor.

  Returns:
    dict with the same keys as the first example, each mapped to a
    `torch.long` tensor moved to `device`.
  """
  longest = max(len(example['input_ids']) for example in batch)
  pad_id = tokenizer.pad_token_id
  field_names = list(batch[0].keys())

  collated = {}
  for name in field_names:
    if name == "labels":
      # Labels are one scalar per example, so they need no padding.
      rows = [example[name] for example in batch]
    else:
      # Token ids are padded with the tokenizer's pad id; every other
      # sequence field is padded with zeros.
      fill = pad_id if name == "input_ids" else 0
      rows = [example[name] + [fill] * (longest - len(example[name])) for example in batch]
    collated[name] = torch.tensor(rows, dtype=torch.long).to(device)

  return collated

### 4. Define Dataset and DataLoader

In [12]:
class HeadlinesDataset(Dataset):
  """Map-style dataset that tokenizes one headline per item.

  Each item is the tokenizer output for the row's "headline", with the row's
  "is_sarcastic" value stored under the "labels" key.
  """

  def __init__(self, df, tokenizer):
    """Store the source frame and the tokenizer.

    Args:
      df: DataFrame with "headline" and "is_sarcastic" columns.
      tokenizer: callable applied to a headline string; its result must
        support item assignment.
    """
    super().__init__()

    self.df = df
    self.tokenizer = tokenizer

  def __getitem__(self, index) -> dict:
    """Return the tokenized headline at position `index` together with its label."""
    # Read from the frame this instance was built with (not a notebook global)
    # and index by position, so frames whose index labels are not 0..n-1
    # (e.g. after filtering) still line up with sampler indices.
    headline = self.df["headline"].iloc[index]
    label = self.df["is_sarcastic"].iloc[index]

    out_dict = self.tokenizer(headline)
    out_dict["labels"] = int(label)
    return out_dict

  def __len__(self):
    """Return the number of rows in the wrapped frame."""
    return self.df.shape[0]

In [13]:
# NOTE(review): this assignment is overwritten by the `partial(collator, ...)` assignment
# in the following cell, so DataCollatorWithPadding is not what the DataLoaders end up using.
data_collator = DataCollatorWithPadding(tokenizer, padding=True)

In [15]:
# Bind the tokenizer and device so the DataLoader can call the collator with only the batch.
data_collator = partial(collator, tokenizer=tokenizer, device=device)

In [16]:
dataset = HeadlinesDataset(df, tokenizer)

# 85% of the examples are used for training, the remainder for evaluation.
train_len = int(0.85 * len(dataset))
test_len = len(dataset) - train_len

# Seed the split so the same train/test partition is produced on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_len, test_len], generator=split_generator
)

# Reshuffle the training examples every epoch; evaluation order does not matter.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

### 5. Tuning the BERT model

In [18]:
# AdamW over every model parameter with a learning rate of 1e-5.
optim = AdamW(model.parameters(), lr=1e-5)

In [25]:
# Number of full passes over the training dataloader.
n_epoch = 3

In [28]:
def validation(test_dataloader):
  """Evaluate the global `model` on `test_dataloader` and print its accuracy.

  The model is put in eval mode for the pass and switched back to train
  mode before returning. Nothing is returned.
  """
  model.eval()

  predicted, expected = [], []
  with torch.no_grad():
    for batch in tqdm(test_dataloader):
      logits = model(**batch).logits
      # The predicted class is the index of the largest logit per example.
      predicted.extend(logits.argmax(dim=1).tolist())
      expected.extend(batch['labels'].to('cpu').tolist())

  print("Accuracy", accuracy_score(expected, predicted))
  model.train()

In [29]:
for epoch in range(n_epoch):
  # Models loaded with from_pretrained() start in eval mode, so switch to train
  # mode explicitly; otherwise the first epoch would run with dropout disabled.
  model.train()
  for batch in tqdm(train_dataloader, total=len(train_dataloader)):
    # Named `batch` rather than `input` to avoid shadowing the builtin.
    # Moving every tensor is a no-op when the collate function has already
    # placed it on `device`, and it does not depend on a fixed set of keys.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    optim.zero_grad()

    out = model(**batch)
    out.loss.backward()
    optim.step()

  # Report held-out accuracy after every epoch.
  validation(test_dataloader)


  0%|          | 0/3041 [00:00<?, ?it/s]

  0%|          | 0/537 [00:00<?, ?it/s]

Accuracy 0.9340787328208712


  0%|          | 0/3041 [00:00<?, ?it/s]

  0%|          | 0/537 [00:00<?, ?it/s]

Accuracy 0.9354763568600046


  0%|          | 0/3041 [00:00<?, ?it/s]

  0%|          | 0/537 [00:00<?, ?it/s]

Accuracy 0.9361751688795714


In [30]:
# Directory that the model and tokenizer are saved into by the following cells.
output = './models/bert_sarcasm_detection_v1'

In [31]:
# Write the trained model's files into the output directory.
model.save_pretrained(output)

In [32]:
# Save the tokenizer files into the same directory as the model.
tokenizer.save_pretrained(output)

('./models/bert_sarcasm_detection_v1/tokenizer_config.json',
 './models/bert_sarcasm_detection_v1/special_tokens_map.json',
 './models/bert_sarcasm_detection_v1/vocab.txt',
 './models/bert_sarcasm_detection_v1/added_tokens.json',
 './models/bert_sarcasm_detection_v1/tokenizer.json')

In [39]:
!tar -czvf models.tar.gz models/

models/
models/bert_sarcasm_detection_v1/
models/bert_sarcasm_detection_v1/vocab.txt
models/bert_sarcasm_detection_v1/tokenizer.json
models/bert_sarcasm_detection_v1/pytorch_model.bin
models/bert_sarcasm_detection_v1/config.json
models/bert_sarcasm_detection_v1/special_tokens_map.json
models/bert_sarcasm_detection_v1/tokenizer_config.json


In [41]:
# Download the archive created above from the Colab runtime (uses google.colab.files).
files.download('models.tar.gz')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>