In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics import accuracy_score

In [3]:
# Load the JSON-lines dataset and keep only the headline text and its label.
# Selection and dtype cast are chained into a single expression so the cast is
# never an assignment into a column-subset of another frame (that pattern can
# raise SettingWithCopyWarning on pandas versions without copy-on-write).
df = (
    pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
    .loc[:, ['headline', 'is_sarcastic']]
    .astype({'is_sarcastic': int})
)

In [None]:
import re
from gensim.utils import simple_preprocess

# Run gensim's simple_preprocess over every headline, in row order, and
# collect one result per headline into a plain Python list.
X = list(map(simple_preprocess, df['headline'].tolist()))

In [7]:
def load_glove_model(glove_file_path):
    """Parse a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is used as
    the word and each remaining field is parsed with ``float``.

    Args:
        glove_file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        dict mapping each word (str) to its vector (list of float). When a
        word appears on more than one line, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector field cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf8') as glove_file:
        for line in glove_file:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line (e.g. a trailing newline at the
                # end of the file): there is no word to index, so skip it
                # instead of failing on fields[0].
                continue
            word, *components = fields
            embeddings_index[word] = [float(component) for component in components]
    return embeddings_index

# Forward slash instead of a backslash: "\g" is not a valid string escape, so
# the previous literal produced an invalid-escape-sequence warning and only
# resolved as a path separator on Windows. "/" is accepted on every platform.
glove = load_glove_model("glove.6B/glove.6B.100d.txt")

  glove = load_glove_model("glove.6B\glove.6B.100d.txt")


In [8]:
import numpy as np

def add_padding(X, max_length):
    """Pad or truncate every token list to exactly ``max_length`` tokens.

    Args:
        X: Iterable of token lists (one list per sentence).
        max_length: Number of tokens each returned sentence must have.

    Returns:
        A new list of new lists. Sentences shorter than ``max_length`` are
        right-padded with ``'<PAD>'``; longer ones keep only their first
        ``max_length`` tokens. The lists in ``X`` are left unmodified.
    """
    padded_X = []
    for sentence in X:
        # Slicing copies, so the padding below never touches the caller's list
        # (an in-place ``+=`` on the original would).
        clipped = list(sentence[:max_length])
        # A non-positive multiplier yields an empty list, so sentences that
        # are already long enough get no padding.
        clipped.extend(['<PAD>'] * (max_length - len(clipped)))
        padded_X.append(clipped)
    return padded_X
# Force every tokenized headline to exactly 20 tokens (pad short, cut long).
X = add_padding(X, max_length=20)

In [9]:
# Pack the equal-length token lists into a NumPy array and report its shape
# as a quick check that padding produced a rectangular result.
X = np.array(X, copy=True)
print(X.shape)

(28619, 20)


In [10]:
def token_to_vectors(
    tokens: list[str],
    glove_model: dict,
    embedding_dim: int = 100,
) -> list[list[float]]:
    """Replace each token with its embedding vector.

    Args:
        tokens: Tokens of one document, in order.
        glove_model: Mapping from token to its embedding vector.
        embedding_dim: Length of the all-zero vector used for tokens that are
            missing from ``glove_model``. Defaults to 100; it should match the
            length of the vectors stored in ``glove_model``.

    Returns:
        One vector per token, in the same order as ``tokens``. Known tokens
        yield the vector object stored in ``glove_model``; unknown tokens
        (including any padding marker absent from the model) yield a fresh
        list of ``embedding_dim`` zeros.
    """
    return [
        glove_model[token] if token in glove_model else [0.0] * embedding_dim
        for token in tokens
    ]

# Look up the embedding of every token, then pack the nested Python lists into
# a single contiguous float32 NumPy array before handing them to torch.
# Building a tensor straight from a deeply nested list converts one Python
# float at a time, which is needlessly slow for a corpus of this size; the
# resulting values and dtype are the same either way.
X = torch.from_numpy(
    np.asarray([token_to_vectors(doc, glove) for doc in X], dtype=np.float32)
)
# Labels as a float32 column of shape (N, 1).
y = torch.tensor(df['is_sarcastic'].values, dtype=torch.float32).view(-1, 1)


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [12]:
class SarcasmDataset(Dataset):
    """Map-style dataset that pairs pre-vectorized inputs with their labels."""

    def __init__(self, vectorized_data, labels):
        """Keep references to the inputs and labels; nothing is copied or validated."""
        self.inputs, self.labels = vectorized_data, labels

    def __len__(self):
        """Number of samples, as reported by the inputs container."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair stored at position ``idx``."""
        sample = self.inputs[idx]
        target = self.labels[idx]
        return sample, target
# Training batches are reshuffled on every pass over the loader; the held-out
# split (exposed as `valid_loader`, built from X_test / y_test) keeps its order.
train_loader = DataLoader(
    SarcasmDataset(X_train, y_train),
    shuffle=True,
    batch_size=32,
)
valid_loader = DataLoader(
    SarcasmDataset(X_test, y_test),
    shuffle=False,
    batch_size=32,
)

# Sanity check: show the first (input, label) pair of the training dataset.
print(train_loader.dataset[0])

(tensor([[-0.2377,  0.5939,  0.5870,  ..., -0.5830,  0.2004,  0.5031],
        [-0.1573, -0.7550,  0.3684,  ..., -0.5414,  0.6782, -0.1725],
        [-0.0543,  0.6230,  0.6135,  ..., -0.6142, -0.1594,  1.0991],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]), tensor([0.]))


In [49]:
class GloveLSTM(nn.Module):
    """Single-output classifier: LSTM -> last timestep -> linear -> sigmoid.

    Args:
        embedding_dim: Size of each input vector fed to the LSTM.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, embedding_dim, hidden_dim):
        super().__init__()
        # Sub-modules are created in the order lstm, fc so that parameter
        # names and seeded initialization stay exactly as before.
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one sigmoid-activated score per sample.

        ``x`` is batch-first (the LSTM is built with ``batch_first=True``).
        The score is computed from the LSTM output at the final timestep only.
        """
        sequence_out, _state = self.lstm(x)
        # NOTE(review): if inputs are right-padded, this final position is a
        # padding step — confirm that is intended.
        final_step = sequence_out[:, -1, :]
        score = self.fc(final_step)
        return self.sigmoid(score)


In [52]:
# Seed torch's global RNG before the model is created so weight initialization
# and the shuffled batch order start from the same state on every fresh run.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BCELoss expects probabilities, which the model's final sigmoid provides.
loss_fn = nn.BCELoss()

# Move the model to its device BEFORE constructing the optimizer, so the
# optimizer is built over the parameters at their final location.
model = GloveLSTM(100, 128).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

for epoch in range(15):
    model.train()
    running_loss = 0.0
    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = loss_fn(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # running_loss is the SUM of the per-batch losses over the epoch, so its
    # magnitude scales with the number of batches.
    print(f"Epoch {epoch+1}, Loss: {running_loss:.4f}")

Epoch 1, Loss: 374.1336
Epoch 2, Loss: 270.1108
Epoch 3, Loss: 234.2584
Epoch 4, Loss: 206.1987
Epoch 5, Loss: 183.5362
Epoch 6, Loss: 158.7999
Epoch 7, Loss: 136.9773
Epoch 8, Loss: 115.4746
Epoch 9, Loss: 93.5934
Epoch 10, Loss: 77.2950
Epoch 11, Loss: 63.8382
Epoch 12, Loss: 54.0355
Epoch 13, Loss: 44.1434
Epoch 14, Loss: 36.9996
Epoch 15, Loss: 38.1665


In [53]:
def evaluate_accuracy(model, loader, device):
    """Compute the classification accuracy of ``model`` over ``loader``.

    The model is switched to eval mode (and left in it), gradients are
    disabled, and each output is thresholded at 0.5 to obtain a 0/1 prediction.

    Args:
        model: Module returning one probability per sample.
        loader: Iterable yielding ``(inputs, labels)`` batches of tensors.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        The fraction of samples whose thresholded prediction equals its label,
        as computed by ``accuracy_score``.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs.to(device))
            preds = (outputs > 0.5).float()
            # Flatten the (batch, 1) columns so both lists hold plain scalars.
            all_preds.extend(preds.cpu().numpy().ravel())
            all_labels.extend(labels.cpu().numpy().ravel())
    return accuracy_score(all_labels, all_preds)


# Same evaluation on both splits; a large gap between the two numbers
# indicates overfitting to the training data.
acc = evaluate_accuracy(model, valid_loader, device)
print(f"Validation Accuracy: {acc:.4f}")

acc = evaluate_accuracy(model, train_loader, device)
print(f"Train Accuracy: {acc:.4f}")

Validation Accuracy: 0.8592
Train Accuracy: 0.9920


In [55]:
# Persist only the learned parameters (the state dict), not the module object.
trained_weights = model.state_dict()
torch.save(trained_weights, 'SarcasmClassifier.pth')