<a href="https://colab.research.google.com/github/FionaAmuda/Sarcasm-Detection-/blob/main/Copy_of_Sarcasm_detection_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Imported Libraries**

In [None]:
!pip install --upgrade transformers # Update the transformers library using pip

In [2]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

#### **Dataset**

In [3]:
# Read the sarcasm CSV from its hard-coded /content location into a DataFrame.
csv_path = "/content/Generalized Sarcasm Data.csv"
df = pd.read_csv(csv_path)

#### **EDA**

In [4]:
# Display the first rows of the frame to check the `text` / `label` columns.
df.head()

Unnamed: 0,text,label
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
text,1
label,0


In [6]:
# Discard every row that contains a missing value; rebinding `df` keeps the
# step explicit instead of mutating the frame in place.
df = df.dropna()

In [7]:
# Show (rows, columns) after the missing-value rows were removed.
df.shape

(30176, 2)

In [8]:
# Down-sample to at most 30,000 rows.
# - random_state pins the draw, so a kernel restart reproduces the same subset
#   (42 matches the seed used for the train/validation split).
# - min(...) keeps the cell from raising if fewer than 30,000 rows are available.
df = df.sample(n=min(30000, len(df)), random_state=42)
df.shape

(30000, 2)

#### **Preprocessing**

In [9]:
# Pull the headline text and its label out of the frame as plain Python lists.
texts, labels = (df[column].tolist() for column in ("text", "label"))

In [10]:
# Load the tokenizer from the same checkpoint as the classifier fine-tuned
# later in this notebook ("bert-base-uncased"), so the vocabulary and special
# tokens are guaranteed to match the model's embedding table.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode all texts in one call: pad to the longest sequence, truncate anything
# beyond the tokenizer's maximum length, and return PyTorch tensors.
# Only `input_ids` and `attention_mask` are consumed by the later cells.
tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
# Class-index targets for cross-entropy must be integer (long) tensors; make
# the dtype explicit so it does not depend on how pandas parsed the column.
labels_tensor = torch.tensor(labels, dtype=torch.long)

#### **Split Data**

In [12]:
# Split input ids, attention masks and labels in ONE call so all three are
# partitioned by the same shuffled indices (80% train / 20% validation).
# Alignment no longer relies on repeating an identical seed in two calls.
(
    train_texts, val_texts,
    train_masks, val_masks,
    train_labels, val_labels,
) = train_test_split(
    tokens["input_ids"],
    tokens["attention_mask"],
    labels_tensor,
    test_size=0.2,
    random_state=42,
)

In [13]:
from torch.utils.data import DataLoader, TensorDataset

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset, val_dataset = (
    TensorDataset(ids, masks, targets)
    for ids, masks, targets in (
        (train_texts, train_masks, train_labels),
        (val_texts, val_masks, val_labels),
    )
)

# Mini-batches of 8; only the training loader reshuffles its samples.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

#### **Fit the classification model**

In [14]:
from transformers import BertForSequenceClassification

# Pretrained BERT encoder with a new two-way classification head
# (0 / 1 labels, as in the `label` column).
checkpoint_name = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### **Initializing Training Parameters**

In [16]:

from torch.optim import AdamW

# Cross-entropy over the classifier logits, optimised with AdamW at lr = 5e-5
# across every model parameter.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(params=model.parameters(), lr=5e-5)

#### **Training**

In [17]:
from torch.nn.functional import softmax  # imported here; the evaluation cell uses it

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

epochs = 4
for epoch in range(epochs):
    model.train()  # training mode for this epoch
    total_loss = 0

    for input_ids, attention_mask, targets in train_loader:
        # Move the whole batch onto the training device.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = loss_fn(logits, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

Epoch 1, Loss: 0.361823529431286
Epoch 2, Loss: 0.234895987919687
Epoch 3, Loss: 0.1744810937171181
Epoch 4, Loss: 0.11593447928856282


#### **Evaluation**

In [21]:
# Score the fine-tuned model on the validation loader.
# Inference mode is set once (the duplicated setup lines were removed) and
# gradients are disabled for the whole pass.
model.eval()
correct, total = 0, 0

# Per-sample predictions and ground truth, kept for the metrics cell below.
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        b_input_ids, b_masks, b_labels = [x.to(device) for x in batch]
        outputs = model(b_input_ids, attention_mask=b_masks)
        # argmax directly on the logits: softmax is monotonic, so applying it
        # first would not change which class wins.
        preds = outputs.logits.argmax(dim=1)
        correct += (preds == b_labels).sum().item()
        total += b_labels.size(0)

        # Store plain Python ints rather than numpy scalars.
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(b_labels.cpu().tolist())

print(f"Accuracy: {correct / total:.2f}")


Accuracy: 0.89


In [22]:


# Overall accuracy on the collected validation predictions.
accuracy = accuracy_score(all_labels, all_preds)

# Precision, recall and F1 averaged across classes, weighted by class support.
# NOTE: support-weighted recall is algebraically equal to accuracy, so the
# "Recall" and "Accuracy" lines below always agree.
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels,
    all_preds,
    average='weighted',
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

Accuracy: 0.89
Precision: 0.89
Recall: 0.89
F1-score: 0.88
