In [142]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pandas as pd
torch.manual_seed(123)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [143]:
import json

In [144]:
!wget https://raw.githubusercontent.com/Aditya-0911/Sarcasm-Detection/main/Sarcasm_Headlines_Dataset_v2.json -O sarcasm.json

--2025-05-23 12:12:16--  https://raw.githubusercontent.com/Aditya-0911/Sarcasm-Detection/main/Sarcasm_Headlines_Dataset_v2.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6057046 (5.8M) [text/plain]
Saving to: ‘sarcasm.json’


2025-05-23 12:12:16 (86.2 MB/s) - ‘sarcasm.json’ saved [6057046/6057046]



In [145]:
df = pd.read_json("sarcasm.json", lines=True)

In [146]:
df

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [147]:
df.drop(['article_link'], axis=1, inplace=True)

In [148]:
# tokenize

def tokenize(text):
  text = text.lower()
  text = text.replace('?', '')
  text = text.replace("'", '')
  return text.split()

In [149]:
tokenize(df['headline'][0])

['thirtysomething',
 'scientists',
 'unveil',
 'doomsday',
 'clock',
 'of',
 'hair',
 'loss']

In [150]:
# Vocab

vocab = {
    '<PAD>': 0,
    '<UNK>': 1,
    # ... then add your actual tokens starting from index 2
}

def build_vocab(row):

  tokenized_headline = tokenize(row['headline'])

  merged_tokens = tokenized_headline

  for token in merged_tokens:
    if token not in vocab:
      vocab[token] = len(vocab)

In [151]:
df.apply(build_vocab, axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
28614,
28615,
28616,
28617,


In [152]:
len(vocab)

34073

In [153]:
def text2index(text, vocab):
    indexed_text = []
    for token in tokenize(text):
        if token in vocab:
            indexed_text.append(vocab[token])
        else:
            indexed_text.append(vocab['<UNK>'])  # handle unknowns
    return torch.tensor(indexed_text, dtype=torch.long)

In [154]:
text2index(df['headline'][0], vocab)

tensor([2, 3, 4, 5, 6, 7, 8, 9])

In [155]:
class dataset(Dataset):

  def __init__(self, df, vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):

    text = self.df['headline'][idx]
    label = self.df['is_sarcastic'][idx]

    indexed_text = text2index(text, self.vocab)

    return torch.tensor(indexed_text, dtype=torch.long), torch.tensor(label, dtype=torch.long)

In [156]:
data = dataset(df, vocab)

In [157]:
data[1]

  return torch.tensor(indexed_text, dtype=torch.long), torch.tensor(label, dtype=torch.long)


(tensor([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]), tensor(0))

In [158]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=vocab['<PAD>'])  # PAD here
    labels = torch.tensor(labels, dtype=torch.float)  # for BCEWithLogitsLoss
    return padded_texts, labels

In [159]:
dataloader = DataLoader(data,batch_size=128,pin_memory=True,collate_fn=collate_fn, shuffle = True)

In [160]:
class SimpleRNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size,padding_idx=0)
        self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)  # single output for binary classification

    def forward(self, x):
        x = self.embedding(x)  # (batch_size, seq_len, embedding_size)
        _, h_n = self.rnn(x)   # h_n: (1, batch_size, hidden_size)
        h_n = h_n.squeeze(0)   # (batch_size, hidden_size)
        logits = self.fc(h_n)  # (batch_size, 1)
        return logits.squeeze(1)  # (batch_size,)

In [161]:
vocab_size = len(vocab)
embedding_size = 50
hidden_size = 64

lr = 0.001
epochs = 10

In [162]:
model = SimpleRNNClassifier(vocab_size, embedding_size, hidden_size).to(device)

lossfn = nn.BCEWithLogitsLoss()
opt = torch.optim.Adam(model.parameters(), lr=lr)

In [163]:
for epoch in range(epochs):

  total_loss = 0

  for text, label in dataloader:

    text, label = text.to(device), label.to(device)

    opt.zero_grad()

    pred = model(text)
    loss = lossfn(pred,label)

    loss.backward()

    opt.step()

    total_loss += loss.item()

  print(f'Epoch: {epoch+1}, Loss: {total_loss:3f}')

  return torch.tensor(indexed_text, dtype=torch.long), torch.tensor(label, dtype=torch.long)


Epoch: 1, Loss: 149.391882
Epoch: 2, Loss: 137.252198
Epoch: 3, Loss: 136.932996
Epoch: 4, Loss: 129.456074
Epoch: 5, Loss: 134.030686
Epoch: 6, Loss: 131.641384
Epoch: 7, Loss: 133.671421
Epoch: 8, Loss: 133.654012
Epoch: 9, Loss: 133.046583
Epoch: 10, Loss: 130.191495


In [164]:
def classify_text(text, model, vocab, device):
    model.eval()  # set to evaluation mode
    with torch.no_grad():
        indexed = text2index(text, vocab).unsqueeze(0).to(device)  # batch size = 1
        output = model(indexed)
        prob = torch.sigmoid(output).item()  # for BCEWithLogitsLoss
        return 1 if prob >= 0.5 else 0, prob  # return class and probability

In [165]:
text = "Man Struck By Lightning: Faces Battery Charge "
label, confidence = classify_text(text, model, vocab, device)

print(f"Prediction: {'Sarcastic' if label == 1 else 'Not Sarcastic'} (Confidence: {confidence:.4f})")

Prediction: Sarcastic (Confidence: 0.6548)
