<a href="https://colab.research.google.com/github/Akshat-afk/News-Headline-Sarcasm-Detection/blob/main/Sarcasm_headlines_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Completely reset numpy
!pip uninstall -y numpy
!pip install numpy==1.24.4 --force-reinstall
!
# 2. Reinstall other libraries to ensure compatibility
!pip install --upgrade --force-reinstall transformers gensim


Found existing installation: numpy 1.24.4
Uninstalling numpy-1.24.4:
  Successfully uninstalled numpy-1.24.4
Collecting numpy==1.24.4
  Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 1.5.3 which is incompatible.
pymc 5.21.2 requires numpy>=1.25.0, but you have numpy 1.24.4 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.4 which is incompatible.
dask-cudf-cu12 25.2.2 requires pandas<2.2.4dev0,>=2.0, but you have pandas 1.5.3 whic

Collecting transformers
  Downloading transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting packaging>=20.0 (from transformers)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadat

In [None]:
import pandas as pd
import nltk
import random
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from transformers import pipeline

# Seed Python's RNG so the random draws made by the augmentation functions
# defined in this cell (synonym picks, mask positions) start from a fixed state.
SEED = 42
random.seed(SEED)

# Download necessary NLTK resources
nltk.download('punkt_tab')  # tokenizer data
nltk.download('wordnet')    # lexical database queried for synonyms

# Load dataset: one JSON object per line; keep only the two columns' new names
# ("text", "label") and headlines whose length is between 45 and 180 characters.
df = pd.read_json("/content/Sarcasm_Headlines_Dataset_v2.json", lines=True)
df = df.rename(columns={"headline": "text", "is_sarcastic": "label"})
df = df[df['text'].str.len().between(45, 180)].reset_index(drop=True)

# Load models
# Pre-trained word vectors fetched through gensim's downloader.
word_vectors = api.load("glove-wiki-gigaword-100")
# Masked-language-model pipeline used to predict a word for a [MASK] slot.
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

# Augmentation Stage 1: WordNet Synonym Replacement (PPDB-like)
def ppdb_synonym_replace(text):
    """Replace each token of ``text`` with a randomly chosen WordNet lemma.

    The text is split with ``word_tokenize``. For every token, the lemma names
    of all its WordNet synsets are collected (underscores turned into spaces)
    and one of them is drawn with ``random.choice``. Tokens for which WordNet
    returns no synsets are kept as they are. Nothing excludes a token's own
    spelling from its candidates, so a draw can leave the word unchanged.

    Args:
        text: Input string.

    Returns:
        The (possibly replaced) tokens joined by single spaces.
    """
    new_tokens = []
    for token in word_tokenize(text):
        candidates = {
            lemma.name().replace("_", " ")
            for synset in wordnet.synsets(token)
            for lemma in synset.lemmas()
        }
        if candidates:
            # Sort before drawing: the iteration order of a set of strings
            # changes from one interpreter session to the next, so choosing
            # from an unsorted list gives different results even when
            # `random` has been seeded.
            new_tokens.append(random.choice(sorted(candidates)))
        else:
            new_tokens.append(token)
    return " ".join(new_tokens)

# Augmentation Stage 2: Embedding-Based Similarity Swap (GloVe vectors loaded via gensim, not Word2Vec)
def word2vec_replace(text):
    """Swap every in-vocabulary token for its nearest neighbour in ``word_vectors``.

    The text is split with ``word_tokenize``. A token that is present in
    ``word_vectors`` is replaced by the first entry returned by
    ``word_vectors.most_similar``; any other token, and any token whose lookup
    fails or returns nothing, is kept unchanged.

    Args:
        text: Input string.

    Returns:
        The (possibly replaced) tokens joined by single spaces.
    """
    new_tokens = []
    for token in word_tokenize(text):
        replacement = token
        if token in word_vectors:
            try:
                similar = word_vectors.most_similar(token, topn=5)
                if similar:
                    # Each entry is indexed as (word, score); take the top word.
                    replacement = similar[0][0]
            except Exception:
                # Best effort: keep the original token when the lookup fails.
                # `Exception` (not a bare `except`) so that KeyboardInterrupt
                # and SystemExit still stop a long-running augmentation.
                replacement = token
        new_tokens.append(replacement)
    return " ".join(new_tokens)

# Augmentation Stage 3: BERT-Based Word Insertion
def bert_insert(text):
    """Insert one model-predicted word at a random interior position of ``text``.

    The text is split with ``word_tokenize``; a ``[MASK]`` placeholder is
    inserted at a random index between 1 and ``len(tokens) - 2`` and the
    ``fill_mask`` pipeline is asked to fill it. The top prediction's
    ``token_str`` takes the placeholder's place.

    Args:
        text: Input string.

    Returns:
        * ``text`` unchanged when it has fewer than three tokens;
        * the tokens with the predicted word inserted, joined by spaces;
        * the original tokens joined by spaces (no insertion) when the
          prediction fails, so a literal ``[MASK]`` never reaches the output.
    """
    tokens = word_tokenize(text)
    if len(tokens) < 3:
        return text

    insert_pos = random.randint(1, len(tokens) - 2)
    masked_tokens = tokens[:insert_pos] + ["[MASK]"] + tokens[insert_pos:]
    try:
        preds = fill_mask(" ".join(masked_tokens))
        masked_tokens[insert_pos] = preds[0]['token_str']
    except Exception:
        # Best effort: on failure drop the placeholder instead of returning it.
        # `Exception` (not a bare `except`) keeps KeyboardInterrupt working.
        return " ".join(tokens)
    return " ".join(masked_tokens)

# Combined Augmentation Pipeline
def augment_pipeline(text):
    """Run ``text`` through the three augmentation stages, in order.

    Stages: ``ppdb_synonym_replace`` -> ``word2vec_replace`` -> ``bert_insert``;
    each stage receives the previous stage's output.

    Args:
        text: Input string.

    Returns:
        The string produced by the last stage.
    """
    for stage in (ppdb_synonym_replace, word2vec_replace, bert_insert):
        text = stage(text)
    return text

# Apply augmentation: work on a copy so the original frame keeps the raw text.
df_aug = df.copy()
df_aug["text"] = df_aug["text"].apply(augment_pipeline)

# Final combined dataset: original rows followed by their augmented copies.
df_final = pd.concat(
    [df[["text", "label"]], df_aug[["text", "label"]]],
    ignore_index=True,
)
# Shuffle with a fixed random_state so the saved row order is the same on every run.
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# Optional: save for reuse
df_final.to_csv("augmented_sarcasm_dataset.csv", index=False)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In 

In [None]:
from transformers import pipeline
from tqdm import tqdm
import pandas as pd

# Load the fill-mask pipeline once; it is reused for every headline.
# Run on the first GPU when CUDA is available and fall back to CPU (device=-1)
# otherwise, instead of hard-coding device=0, which fails on a CPU-only runtime.
import torch

fill = pipeline(
    "fill-mask",
    model="bert-base-uncased",
    device=0 if torch.cuda.is_available() else -1,
)

def fast_bert_augment(texts):
    """Insert one model-predicted word into the middle of each text.

    Each text is split on whitespace. Texts with more than three words get a
    ``[MASK]`` placeholder inserted at index ``len(words) // 2``; the ``fill``
    pipeline is called on the masked string and the ``sequence`` field of its
    first prediction is used, with any ``[CLS] `` / `` [SEP]`` markers removed.
    Texts with three words or fewer, and texts for which the prediction fails,
    are passed through unchanged.

    Args:
        texts: Iterable of strings (iterated once, wrapped in ``tqdm``).

    Returns:
        A list with one output string per input text, in input order.
    """
    augmented = []
    for text in tqdm(texts):
        words = text.split()
        if len(words) <= 3:
            augmented.append(text)
            continue

        words.insert(len(words) // 2, '[MASK]')
        try:
            pred = fill(' '.join(words))[0]['sequence']
            result = pred.replace('[CLS] ', '').replace(' [SEP]', '')
        except Exception:
            # Best effort: keep the original text when the model call fails.
            # `Exception` (not a bare `except`) so that KeyboardInterrupt can
            # still stop the loop.
            result = text
        augmented.append(result)
    return augmented

# Load dataset: one JSON object per line; rename to ("text", "label") and keep
# headlines whose length is between 45 and 180 characters.
df = pd.read_json("/content/Sarcasm_Headlines_Dataset_v2.json", lines=True)
df = df.rename(columns={"headline": "text", "is_sarcastic": "label"})
df = df[df['text'].str.len().between(45, 180)].reset_index(drop=True)

# Apply fast BERT-based augmentation
df['augmented'] = fast_bert_augment(df['text'])

# Tag every row with the index of the headline it came from. An original row and
# its augmented copy share the same source_id, so a later train/validation split
# can keep both on the same side instead of leaking near-duplicates across it.
df['source_id'] = df.index

# Double the dataset: originals followed by augmented copies, then shuffled with
# a fixed random_state so the saved row order is the same on every run.
df_final = pd.concat([
    df[['text', 'label', 'source_id']],
    df[['augmented', 'label', 'source_id']].rename(columns={'augmented': 'text'})
], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

df_final.to_csv("fast_augmented_dataset.csv", index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identica

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import DataCollatorWithPadding

# Load the augmented sarcasm dataset (CSV with at least "text" and "label" columns)
import pandas as pd

# Replace this path with your CSV of headlines
df = pd.read_csv("/content/fast_augmented_dataset.csv")

# Optional: filter out too-short or too-long headlines
df = df[df['text'].str.len().between(45, 180)]

# Train-Test Split
# The CSV mixes original headlines with augmented copies that differ by a single
# inserted word. A plain random row split can put a headline in the training set
# and its near-duplicate in the validation set, which inflates validation scores.
# When the CSV carries a "source_id" column identifying the originating headline,
# split by group so both variants stay on the same side.
if "source_id" in df.columns:
    from sklearn.model_selection import GroupShuffleSplit

    # test_size here is the fraction of GROUPS (source headlines) held out.
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, val_idx = next(splitter.split(df, groups=df["source_id"]))
    train_texts = df["text"].iloc[train_idx].tolist()
    val_texts = df["text"].iloc[val_idx].tolist()
    train_labels = df["label"].iloc[train_idx].tolist()
    val_labels = df["label"].iloc[val_idx].tolist()
else:
    import warnings

    warnings.warn(
        "No 'source_id' column found: falling back to a random row split. "
        "Augmented copies may leak between train and validation sets."
    )
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
    )

# Load tokenizer and model from the same pre-trained checkpoint.
from transformers import RobertaTokenizer, RobertaForSequenceClassification

_ROBERTA_CHECKPOINT = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(_ROBERTA_CHECKPOINT)
model = RobertaForSequenceClassification.from_pretrained(
    _ROBERTA_CHECKPOINT,
    num_labels=2,  # binary classification head
)

# Tokenize data: training texts first, then validation texts, with identical
# settings (truncation on, padding on).
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts)
)

# Convert to torch dataset
class SarcasmDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    ``encodings`` is a mapping from field name to an indexable collection with
    one entry per example; ``labels`` is an indexable collection of labels.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``: every encoding field
        plus the label under the key ``"labels"``."""
        item = {}
        for field, column in self.encodings.items():
            item[field] = torch.tensor(column[idx])
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = SarcasmDataset(train_encodings, train_labels)
val_dataset = SarcasmDataset(val_encodings, val_labels)

# Seed torch so the shuffled batch order of the training loader starts from a
# fixed state. NOTE(review): this does not by itself guarantee bit-identical
# results on GPU.
torch.manual_seed(42)

# DataLoader: shuffle only the training data.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Optimizer & Scheduler
# Single source of truth for the epoch count: the scheduler length and the
# training loop must agree, otherwise the linear decay ends too early or late.
NUM_EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * NUM_EPOCHS
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.train()
for epoch in range(NUM_EPOCHS):
    for batch in train_loader:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()   # one scheduler step per optimizer step
        optimizer.zero_grad()
    print(f"Epoch {epoch+1} complete.")

# Evaluation: collect true labels and argmax predictions over the validation
# loader with gradients disabled, then print a per-class report.
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for batch in val_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit along the last axis.
        predictions = logits.argmax(dim=-1)
        y_true.extend(batch["labels"].cpu().numpy())
        y_pred.extend(predictions.cpu().numpy())

print(classification_report(y_true, y_pred))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 complete.
Epoch 2 complete.
Epoch 3 complete.
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      4862
           1       0.99      0.95      0.97      4426

    accuracy                           0.97      9288
   macro avg       0.97      0.97      0.97      9288
weighted avg       0.97      0.97      0.97      9288



In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
# Summary metrics computed from the y_true / y_pred lists.
# Accuracy
accuracy = accuracy_score(y_true, y_pred)

# Macro-Averaged Metrics (treats all classes equally)
f1_macro = f1_score(y_true, y_pred, average='macro')
precision_macro = precision_score(y_true, y_pred, average='macro')
recall_macro = recall_score(y_true, y_pred, average='macro')

# Weighted-Averaged Metrics (accounts for label imbalance)
f1_weighted = f1_score(y_true, y_pred, average='weighted')
precision_weighted = precision_score(y_true, y_pred, average='weighted')
recall_weighted = recall_score(y_true, y_pred, average='weighted')

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score (macro): {f1_macro:.4f}")
print(f"F1 Score (weighted): {f1_weighted:.4f}")
print(f"Precision (macro): {precision_macro:.4f}")
print(f"Recall (macro): {recall_macro:.4f}")
# The weighted precision/recall were computed above but never reported.
print(f"Precision (weighted): {precision_weighted:.4f}")
print(f"Recall (weighted): {recall_weighted:.4f}")



Accuracy: 0.9699
F1 Score (macro): 0.9697
F1 Score (weighted): 0.9698
Precision (macro): 0.9714
Recall (macro): 0.9688
