In [1]:
import pandas as pd
import spacy
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader
from sklearn.metrics import accuracy_score, f1_score



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load spaCy's small English pipeline once. preprocess_text() below reuses this
# shared `nlp` object to obtain stop-word/punctuation flags and lemmas.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read in the data. lines=True tells pandas the file holds one JSON record per
# line. The path is relative, so the file must sit in the working directory.
# Later cells read the "headline" and "is_sarcastic" columns of this frame.
data = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)


In [4]:
# Preprocess the headline text
def preprocess_text(text):
    """Normalise a headline into a space-separated string of lowercase lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or punctuation are dropped; every remaining token
    contributes its lemma, lowercased. The kept lemmas are joined with single
    spaces (an empty string results when nothing is kept).
    """
    kept_lemmas = [
        tok.lemma_.lower()
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

# Run every raw headline through preprocess_text and keep the result in a new
# column, leaving the original "headline" column untouched.
data["headline_processed"] = data["headline"].map(preprocess_text)

In [5]:
# Instantiate the BERT tokenizer and the sequence-classification model from the
# same pre-trained checkpoint name so the vocabulary and the weights match.
# num_labels=2 requests a two-class output for the binary is_sarcastic target.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
# Convert each preprocessed headline into a list of token ids, asking the
# tokenizer to include its special tokens in every sequence.
tokenized_data = data["headline_processed"].apply(
    lambda headline: tokenizer.encode(headline, add_special_tokens=True)
)

In [8]:
# Pad the tokenized sequences to the same length.
# Each id list is right-padded with 0 up to the length of the longest sequence
# so the whole corpus fits in one rectangular tensor.
# NOTE(review): assumes id 0 is the tokenizer's padding id -- confirm against
# tokenizer.pad_token_id if the checkpoint is ever changed.
max_len = max(len(i) for i in tokenized_data.values)
padded_data = torch.tensor([i + [0]*(max_len-len(i)) for i in tokenized_data.values])
# Attention mask: 1 where the id is > 0 (a real token), 0 on padding positions.
# A single vectorised comparison replaces the former Python loop over every
# tensor element; values and dtype (int64) are the same, it is just much faster.
attention_masks = (padded_data > 0).long()



In [10]:

# Create the dataset and dataloader
# Each dataset item is (token ids, attention mask, is_sarcastic label).
dataset = TensorDataset(padded_data, attention_masks, torch.tensor(data["is_sarcastic"].values))
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# Seed the 80/20 split so the same examples land in train/validation on every
# run; otherwise results are not comparable between executions.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Validation only aggregates metrics, so a fixed order is used: shuffling would
# change batch composition and make the per-batch-averaged loss vary run to run.
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Use whatever device the model currently lives on and move each batch there,
# so the loop works whether or not the model was moved to a GPU elsewhere.
device = next(model.parameters()).device

best_val_loss = float('inf')
best_model_state = None  # CPU copy of the weights with the lowest validation loss
best_epoch = None        # 1-based epoch number that produced best_model_state
patience = 3             # stop after this many consecutive epochs without improvement
epochs_without_improvement = 0

for epoch in range(10):
    print(f'Epoch {epoch + 1}')
    total_loss = 0
    total_correct = 0
    total_samples = 0

    # Training pass. Train mode is set once per epoch (validation below
    # switches the model to eval mode).
    model.train()
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Compute accuracy
        preds = outputs.logits.argmax(dim=-1)
        correct = (preds == labels).sum().item()
        total_correct += correct
        total_samples += len(labels)

        total_loss += loss.item()

        # Running averages since the start of the epoch, logged every 100 batches.
        if i % 100 == 0:
            avg_loss = total_loss / (i+1)
            avg_acc = total_correct / total_samples
            print(f'Batch {i}: Loss = {avg_loss:.4f}, Accuracy = {avg_acc:.4f}')

    avg_loss = total_loss / len(train_loader)
    avg_acc = total_correct / total_samples
    print(f'Epoch {epoch+1}: Loss = {avg_loss:.4f}, Accuracy = {avg_acc:.4f}')

    # Validation
    val_loss = 0
    val_correct = 0
    val_samples = 0

    model.eval()
    with torch.no_grad():
        for val_batch in val_loader:
            val_input_ids = val_batch[0].to(device)
            val_attention_mask = val_batch[1].to(device)
            val_labels = val_batch[2].to(device)
            val_outputs = model(val_input_ids, attention_mask=val_attention_mask, labels=val_labels)
            val_loss += val_outputs.loss.item()

            val_preds = val_outputs.logits.argmax(dim=-1)
            val_correct += (val_preds == val_labels).sum().item()
            val_samples += len(val_labels)

    # Mean of the per-batch losses.
    val_loss /= len(val_loader)
    val_acc = val_correct / val_samples
    print(f'Validation: Loss = {val_loss:.4f}, Accuracy = {val_acc:.4f}')

    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_epoch = epoch + 1
        # Snapshot the weights now: later epochs keep updating the model, so
        # without a copy the best checkpoint would be lost by the time we stop.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= patience:
        print(f'Early stopping at epoch {epoch+1}...')
        break

# Leave the model holding the weights that scored the lowest validation loss
# rather than whatever the final (possibly over-fitted) epoch produced.
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f'Restored model weights from epoch {best_epoch} (validation loss = {best_val_loss:.4f})')

Epoch 1
Batch 0: Loss = 0.4145, Accuracy = 0.8438
Batch 100: Loss = 0.4935, Accuracy = 0.7661
Batch 200: Loss = 0.4806, Accuracy = 0.7736
Batch 300: Loss = 0.4720, Accuracy = 0.7765
Batch 400: Loss = 0.4623, Accuracy = 0.7827
Batch 500: Loss = 0.4492, Accuracy = 0.7899
Batch 600: Loss = 0.4409, Accuracy = 0.7937
Epoch 1: Loss = 0.4363, Accuracy = 0.7964
Validation: Loss = 0.3810, Accuracy = 0.8282
Epoch 2
Batch 0: Loss = 0.3236, Accuracy = 0.8125
Batch 100: Loss = 0.2529, Accuracy = 0.8976
Batch 200: Loss = 0.2641, Accuracy = 0.8937
Batch 300: Loss = 0.2717, Accuracy = 0.8884
Batch 400: Loss = 0.2706, Accuracy = 0.8869
Batch 500: Loss = 0.2695, Accuracy = 0.8865
Batch 600: Loss = 0.2728, Accuracy = 0.8854
Epoch 2: Loss = 0.2709, Accuracy = 0.8865
Validation: Loss = 0.3856, Accuracy = 0.8478
Epoch 3
Batch 0: Loss = 0.0668, Accuracy = 1.0000
Batch 100: Loss = 0.1322, Accuracy = 0.9486
Batch 200: Loss = 0.1306, Accuracy = 0.9504
Batch 300: Loss = 0.1389, Accuracy = 0.9475
Batch 400: Loss 

: 