In [None]:
%pip install opendatasets
import opendatasets as od

od.download('https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection')

In [None]:
%pip install torch
%pip install torchvision
%pip install torchsummary
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install scikit-learn
%pip install transformers

In [1]:
import torch
import torch.nn as nn
import pandas as pd

from torch.optim import Adam
from transformers import AutoTokenizer, AutoModelForMaskedLM
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
device

'mps'

In [3]:
df = pd.read_json('./news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json', lines=True)
df.head()


Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [4]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.drop(['article_link'], axis=1, inplace=True)
df.shape

(26708, 2)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['is_sarcastic'], test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [None]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
bert = AutoModelForMaskedLM.from_pretrained('google-bert/bert-base-uncased')

In [7]:
class dataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y.values

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        X = tokenizer(
            self.X.iloc[idx],
            max_length=100,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        X = {k: v.squeeze(0) for k, v in X.items()}
        label = torch.tensor(self.Y[idx], dtype=torch.float32)
        return X, label

In [8]:
train_data = dataset(X_train, y_train)
val_data = dataset(X_val, y_val)
test_data = dataset(X_test, y_test)

In [9]:
BATCH_SZIE = 32
EPOCHS = 10
LR = 1e-4

In [10]:
train_dataloader = DataLoader(train_data, batch_size=BATCH_SZIE, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SZIE, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SZIE, shuffle=True)

In [11]:
class Net(nn.Module):
    def __init__(self, bert):
        super(Net, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.25)
        self.fc1 = nn.Linear(30522, 384)
        self.fc2 = nn.Linear(384, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)[0][:,0]
        pooled_output = self.fc1(pooled_output)
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.fc2(pooled_output)
        return self.sigmoid(pooled_output)


In [12]:
for param in bert.parameters():
    param.requires_grad = False

model = Net(bert).to(device)

In [13]:
loss_fn = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=LR)

In [15]:
total_loss_train_plot = []
total_loss_val_plot = []
total_acc_train_plot = []
total_acc_val_plot = []

for epoch in range(EPOCHS):
    total_loss_train = 0
    total_loss_val = 0
    total_acc_train = 0
    total_acc_val = 0


    for idx, data in enumerate(train_dataloader):
        inputs, labels = data
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs['input_ids'].squeeze(1), inputs['attention_mask'].squeeze(1)).squeeze(1)
        train_loss = loss_fn(outputs, labels)

        train_loss.backward()
        optimizer.step()

        total_loss_train += train_loss.item()
        total_acc_train += (outputs.round() == labels).sum().item()

    with (torch.inference_mode()):
        for idx, data in enumerate(val_dataloader):
            inputs, labels = data
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            val_outputs = model(inputs['input_ids'].squeeze(1), inputs['attention_mask'].squeeze(1)).squeeze(1)
            total_loss_val += loss_fn(val_outputs, labels).item()
            total_acc_val += (val_outputs.round() == labels).sum().item()

    total_loss_train_plot.append(round(total_loss_train / 1000, 4))
    total_loss_val_plot.append(round(total_loss_val / 1000, 4))

    total_acc_train_plot.append(round(total_acc_train / train_data.__len__() * 100, 4))
    total_acc_val_plot.append(round(total_acc_val / val_data.__len__() * 100, 4))


    print(f'Epoch {epoch + 1}/{EPOCHS}, train_loss: {total_loss_train_plot[-1]}, train_acc: {total_acc_train_plot[-1]}, val_loss: {total_loss_val_plot[-1]}, val_acc: {total_acc_val_plot[-1]}')

Epoch 1/10, train_loss: 25.5487, train_acc: 56.2878, val_loss: 5.5521, val_acc: 55.8662
Epoch 2/10, train_loss: 25.5821, train_acc: 56.2878, val_loss: 5.5521, val_acc: 55.8662
Epoch 3/10, train_loss: 25.5933, train_acc: 56.2878, val_loss: 5.5521, val_acc: 55.8662
Epoch 4/10, train_loss: 25.5821, train_acc: 56.2878, val_loss: 5.5385, val_acc: 55.8662
Epoch 5/10, train_loss: 25.571, train_acc: 56.2878, val_loss: 5.5521, val_acc: 55.8662
Epoch 6/10, train_loss: 25.5821, train_acc: 56.2878, val_loss: 5.5656, val_acc: 55.8662
Epoch 7/10, train_loss: 25.5821, train_acc: 56.2878, val_loss: 5.5385, val_acc: 55.8662
Epoch 8/10, train_loss: 25.5598, train_acc: 56.2878, val_loss: 5.5385, val_acc: 55.8662
Epoch 9/10, train_loss: 25.571, train_acc: 56.2878, val_loss: 5.5656, val_acc: 55.8662
Epoch 10/10, train_loss: 25.571, train_acc: 56.2878, val_loss: 5.5656, val_acc: 55.8662
