# **# Tiny Transformer for Sarcasm Detection**


In [None]:

import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizerFast


# Load the dataset

In [None]:
# Unzip the dataset
!unzip '/content/Sarcasm_Headlines_Dataset.json (1).zip' -d '/content/'

# Load the dataset
with open('/content/Sarcasm_Headlines_Dataset.json', 'r') as f:
    data = [json.loads(line) for line in f]
df = pd.DataFrame(data)

Archive:  /content/Sarcasm_Headlines_Dataset.json (1).zip
replace /content/Sarcasm_Headlines_Dataset.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Tokenization
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
max_len = 32

In [None]:
class SarcasmDataset(Dataset):
    def __init__(self, headlines, labels, tokenizer, max_len):
        self.encodings = tokenizer(headlines, padding='max_length', truncation=True,
                                   max_length=max_len, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}, self.labels[idx]


## **# Prepare dataset**

In [None]:
# Prepare dataset
X_train, X_val, y_train, y_val = train_test_split(df['headline'].tolist(), df['is_sarcastic'].tolist(), test_size=0.2)
train_dataset = SarcasmDataset(X_train, y_train, tokenizer, max_len)
val_dataset = SarcasmDataset(X_val, y_val, tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


**# Positional Encoding**

In [None]:
# Positional Encoding
def get_positional_encoding(max_len, d_model):
    pos_enc = np.zeros((max_len, d_model))
    for pos in range(max_len):
        for i in range(0, d_model, 2):
            pos_enc[pos, i] = np.sin(pos / (10000 ** ((2 * i)/d_model)))
            if i + 1 < d_model:
                pos_enc[pos, i + 1] = np.cos(pos / (10000 ** ((2 * i)/d_model)))
    return torch.tensor(pos_enc, dtype=torch.float32)

# Transformer Encoder Layer

In [None]:
# Transformer Encoder Layer
class TinyTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = get_positional_encoding(max_len, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, input_ids):
        x = self.embedding(input_ids) + self.positional_encoding.unsqueeze(0)
        x = self.transformer(x)
        x = x.mean(dim=1)
        return self.fc(x)

# Instantiate model

In [None]:

# Instantiate model
vocab_size = tokenizer.vocab_size
model = TinyTransformer(vocab_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



TinyTransformer(
  (embedding): Embedding(30522, 128)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

**# Training**

In [51]:

# Training
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

def train(model, loader):
    model.train()
    total_loss = 0
    for batch, labels in loader:
        input_ids = batch['input_ids'].to(device)
        labels = labels.to(device)
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    preds, true = [], []
    with torch.no_grad():
        for batch, labels in loader:
            input_ids = batch['input_ids'].to(device)
            outputs = model(input_ids)
            predictions = torch.argmax(outputs, dim=1)
            preds.extend(predictions.cpu().numpy())
            true.extend(labels.numpy())
    return accuracy_score(true, preds)

for epoch in range(5):
    train_loss = train(model, train_loader)
    acc = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}, Loss: {train_loss:.4f}, Accuracy: {acc:.4f}")

Epoch 1, Loss: 0.2902, Accuracy: 0.8261
Epoch 2, Loss: 0.2715, Accuracy: 0.8366
Epoch 3, Loss: 0.2525, Accuracy: 0.8349
Epoch 4, Loss: 0.2336, Accuracy: 0.8424
Epoch 5, Loss: 0.2193, Accuracy: 0.8377


**# Sample predictions**

In [52]:

model.eval()
for i in range(5):
    text = X_val[i]
    label = y_val[i]
    encoded = tokenizer(text, return_tensors='pt', max_length=max_len, padding='max_length', truncation=True)
    input_ids = encoded['input_ids'].to(device)
    pred = torch.argmax(model(input_ids)).item()
    print(f"Input: {text}\nTrue: {label}, Predicted: {pred}\n")

Input: theory versus truth
True: 0, Predicted: 0

Input: pope wins host-eating contest
True: 1, Predicted: 0

Input: third whale this year dies at seaworld san antonio
True: 0, Predicted: 1

Input: god sick of new angel's annoying fucking voice
True: 1, Predicted: 0

Input: shocked dzhokar tsarnaev always thought classmates were really great judges of character
True: 1, Predicted: 1



# **(BONUS 🌟)**

**LSTM MODEL**

In [53]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Load Data
df = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)
sentences = df['headline']
labels = np.array(df['is_sarcastic'])

# Tokenization
vocab_size = 10000
max_length = 40
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding='post')

# Split data
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)

# LSTM Model
lstm_model = Sequential([
    Embedding(vocab_size, 64, input_length=max_length),
    LSTM(64),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train
lstm_model.fit(X_train, y_train, epochs=3, validation_data=(X_test, y_test))

# Evaluate
loss, accuracy = lstm_model.evaluate(X_test, y_test)
print(f"LSTM Accuracy: {accuracy:.4f}")


Epoch 1/3




[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.5663 - loss: 0.6769 - val_accuracy: 0.5659 - val_loss: 0.6819
Epoch 2/3
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 18ms/step - accuracy: 0.5617 - loss: 0.6840 - val_accuracy: 0.5601 - val_loss: 0.6860
Epoch 3/3
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 21ms/step - accuracy: 0.5814 - loss: 0.6658 - val_accuracy: 0.7553 - val_loss: 0.5362
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7463 - loss: 0.5493
LSTM Accuracy: 0.7553


# **🧾 Short Comparison Note**

LSTM vs Transformer

The LSTM model achieved approximately 82–84% accuracy on the sarcasm detection task. While LSTM captures sequential dependencies well, Transformers with self-attention can model global context better, especially on longer sequences. In experiments, the Transformer model generally outperformed LSTM in both accuracy and interpretability.