In [1]:
import pandas as pd
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read in the data
data = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)

In [4]:
# Preprocess the headline text
def preprocess_text(text):
    doc = nlp(text)
    processed_text = []
    for token in doc:
        if not token.is_stop and not token.is_punct:
            processed_text.append(token.lemma_.lower())
    return " ".join(processed_text)

data["headline_processed"] = data["headline"].apply(preprocess_text)

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch

from torch.utils.data import TensorDataset, random_split, DataLoader



  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
# Tokenize the text data
tokenized_data = data["headline_processed"].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [8]:
# Pad the tokenized sequences to the same length
max_len = 0
for i in tokenized_data.values:
    if len(i) > max_len:
        max_len = len(i)

In [9]:
padded_data = torch.tensor([i + [0]*(max_len-len(i)) for i in tokenized_data.values])
attention_masks = torch.tensor([[int(token_id > 0) for token_id in i] for i in padded_data])

In [10]:
# Create the dataset and dataloader
dataset = TensorDataset(padded_data, attention_masks, torch.tensor(data["is_sarcastic"].values))
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)


In [11]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

for epoch in range(5):
    print(f'Epoch {epoch + 1}')
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for i, batch in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        input_ids = batch[0]
        attention_mask = batch[1]
        labels = batch[2]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Compute accuracy
        preds = outputs.logits.argmax(dim=-1)
        correct = (preds == labels).sum().item()
        total_correct += correct
        total_samples += len(labels)

        total_loss += loss.item()

        if i % 100 == 0:
            avg_loss = total_loss / (i+1)
            avg_acc = total_correct / total_samples
            print(f'Batch {i}: Loss = {avg_loss:.4f}, Accuracy = {avg_acc:.4f}')

    avg_loss = total_loss / len(train_loader)
    avg_acc = total_correct / total_samples
    print(f'Epoch {epoch+1}: Loss = {avg_loss:.4f}, Accuracy = {avg_acc:.4f}')




Epoch 1
Batch 0: Loss = 0.7133, Accuracy = 0.5625
Batch 100: Loss = 0.5870, Accuracy = 0.6971
Batch 200: Loss = 0.5406, Accuracy = 0.7299
Batch 300: Loss = 0.5073, Accuracy = 0.7494
Batch 400: Loss = 0.4860, Accuracy = 0.7629
Batch 500: Loss = 0.4685, Accuracy = 0.7745
Batch 600: Loss = 0.4612, Accuracy = 0.7794
Epoch 1: Loss = 0.4556, Accuracy = 0.7819
Epoch 2
Batch 0: Loss = 0.2907, Accuracy = 0.8125
Batch 100: Loss = 0.2875, Accuracy = 0.8775
Batch 200: Loss = 0.2826, Accuracy = 0.8804
Batch 300: Loss = 0.2839, Accuracy = 0.8794
Batch 400: Loss = 0.2805, Accuracy = 0.8808
Batch 500: Loss = 0.2792, Accuracy = 0.8814
Batch 600: Loss = 0.2776, Accuracy = 0.8816
Epoch 2: Loss = 0.2775, Accuracy = 0.8817
Epoch 3
Batch 0: Loss = 0.1895, Accuracy = 0.8750
Batch 100: Loss = 0.1221, Accuracy = 0.9527
Batch 200: Loss = 0.1274, Accuracy = 0.9538
Batch 300: Loss = 0.1291, Accuracy = 0.9537
Batch 400: Loss = 0.1325, Accuracy = 0.9512
Batch 500: Loss = 0.1371, Accuracy = 0.9491
Batch 600: Loss = 

NameError: name 'labels' is not defined