<a href="https://colab.research.google.com/github/Altamash20/Text-classifier/blob/main/TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers --quiet
!pip install opendatasets --quiet

import opendatasets as od
od.download("https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection")

In [2]:
import torch
import torch.nn as nn
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

device = 'cuda' if torch.cuda.is_available() else "cpu"

print("Available device: ", device)

Available device:  cuda


In [None]:
data_df = pd.read_json('/content/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json', lines = True)
data_df.dropna(inplace = True)
data_df.drop_duplicates(inplace=True)
data_df.drop(["article_link"], inplace=True, axis=1)
print(data_df.shape)
data_df.head()

In [4]:
X_train, X_test, y_train, y_test = train_test_split(np.array(data_df["headline"]), np.array(data_df["is_sarcastic"]), test_size=0.3)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5)

print("Training size: ", X_train.shape[0], "rows which is:", round(X_train.shape[0]/data_df.shape[0], 4)*100, "%")
print("Validation size: ", X_val.shape[0], "rows which is:", round(X_val.shape[0]/data_df.shape[0], 4)*100, "%")
print("Testing size: ", X_test.shape[0], "rows which is:", round(X_test.shape[0]/data_df.shape[0], 4)*100, "%")

Training size:  18695 rows which is: 70.0 %
Validation size:  4006 rows which is: 15.0 %
Testing size:  4007 rows which is: 15.0 %


In [None]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
bert_model = AutoModel.from_pretrained("google-bert/bert-base-uncased")

In [6]:
class dataset(Dataset):
  def __init__(self, X, Y):
    self.X = [tokenizer(x,
                        max_length = 100,
                        truncation = True,
                        padding = "max_length",
                        return_tensors = 'pt').to(device)
                        for x in X
    ]
    self.Y = torch.tensor(Y, dtype = torch.float32).to(device)

  def __len__(self):
    return len(self.X)
  def __getitem__(self, idx):
    return self.X[idx], self.Y[idx]

training_data = dataset(X_train, y_train)
validation_data = dataset(X_val, y_val)
testing_data = dataset(X_test, y_test)

In [7]:
from math import exp
BATCH_SIZE = 20
EPOCHS = 10
LR = 1*exp(-4)

In [8]:
training_dataloader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)
testing_dataloader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
class MyModel(nn.Module):
  def __init__(self, bert):
    super(MyModel, self).__init__()

    self.bert = bert
    self.dropout = nn.Dropout(0.25)
    self.linear1 = nn.Linear(768, 384)
    self.linear2 = nn.Linear(384, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids, attention_mask):
    pooled_output = self.bert(input_ids, attention_mask, return_dict = False)[0][:,0]
    output = self.linear1(pooled_output)
    output = self.dropout(output)
    output = self.linear2(output)
    output = self.sigmoid(output)
    return output

In [10]:
for param in bert_model.parameters():
  param.requires_grad = False
model = MyModel(bert_model).to(device)

In [11]:
model

MyModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [12]:
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=LR)

In [13]:
total_loss_train_plot = []
total_loss_validation_plot = []
total_acc_train_plot = []
total_acc_validation_plot = []

for epoch in range(EPOCHS):
  total_acc_train = 0
  total_loss_train = 0
  total_acc_val = 0
  total_loss_val = 0
  for idx, data in enumerate(training_dataloader):
    inputs, labels = data
    inputs.to(device)
    labels.to(device)

    prediction = model(inputs["input_ids"].squeeze(1), inputs["attention_mask"].squeeze(1)).squeeze(1)
    batch_loss = criterion(prediction, labels)
    total_loss_train += batch_loss.item()

    acc = (prediction.round() == labels).sum().item()

    total_acc_train += acc

    batch_loss.backward()
    optimizer.step()
    optimizer.zero_grad()

  with torch.no_grad():
    for idx, data in enumerate(validation_dataloader):
      inputs, labels = data
      inputs.to(device)
      labels.to(device)

      prediction = model(inputs["input_ids"].squeeze(1), inputs["attention_mask"].squeeze(1)).squeeze(1)
      batch_loss = criterion(prediction, labels)
      total_loss_val += batch_loss.item()

      acc = (prediction.round()==labels).sum().item()

      total_acc_val += acc

  total_loss_train_plot.append(round(total_loss_train/1000, 4))
  total_loss_validation_plot.append(round(total_loss_train/1000, 4))
  total_acc_train_plot.append(round((total_acc_train/training_data.__len__())*100, 4))
  total_acc_validation_plot.append(round((total_acc_val/validation_data.__len__())*100, 4))

  print(f"""
  Epoch No. {epoch+1} Train Loss: {round(total_loss_train/1000, 4)} Train Accuracy: {round((total_acc_train/training_data.__len__())*100, 4)}
                      Validation Loss: {round(total_loss_train/1000, 4)} Validation Accuracy: {round((total_acc_val/validation_data.__len__())*100, 4)}
  """)


  Epoch No. 1 Train Loss: 13.5242 Train Accuracy: 73.993
                      Validation Loss: 13.5242 Validation Accuracy: 373.4149
  

  Epoch No. 2 Train Loss: 24.9882 Train Accuracy: 72.827
                      Validation Loss: 24.9882 Validation Accuracy: 249.9251
  

  Epoch No. 3 Train Loss: 51.3548 Train Accuracy: 45.0709
                      Validation Loss: 51.3548 Validation Accuracy: 205.7414
  

  Epoch No. 4 Train Loss: 52.29 Train Accuracy: 44.076
                      Validation Loss: 52.29 Validation Accuracy: 205.7164
  

  Epoch No. 5 Train Loss: 52.2933 Train Accuracy: 44.076
                      Validation Loss: 52.2933 Validation Accuracy: 205.7664
  

  Epoch No. 6 Train Loss: 50.5322 Train Accuracy: 45.9267
                      Validation Loss: 50.5322 Validation Accuracy: 206.3155
  

  Epoch No. 7 Train Loss: 52.17 Train Accuracy: 44.2043
                      Validation Loss: 52.17 Validation Accuracy: 206.2406
  

  Epoch No. 8 Train Loss: 51.6023 Trai

In [None]:
with torch.no_grad():
  total_loss_test = 0
  total_acc_test = 0

  for idx, data in enumerate(testing_dataloader):
    inputs, labels = data
    inputs.to(device)
    labels.to(device)

    prediction = model(inputs["input_ids"].squeeze(1), inputs["attention_mask"].squeeze(1)).squeeze(1)
    batch_loss = criterion(prediction, labels)
    total_loss_val += batch_loss.item()

    acc = (prediction.round()==labels).sum().item()

    total_acc_val += acc
print(f"Accuracy Score on testing Data is: {round((total_acc_test/testing_data.__len__())*100, 4)}")


In [None]:
fig, axs = plt.subplots(nrows = 1, ncols = 2, figsize = (15, 5))

axs[0].plot(total_loss_train_plot, label = 'Training Loss')
axs[0].plot(total_loss_validation_plot, label = 'Validation Loss')
axs[0].set_title('Training and Validation Loss over Epochs')
axs[0].set_xlabel('Epochs')
axs[0].set_ylabel('Loss')
axs[0].set_ylim([0, 1])
axs[0].legend()


axs[1].plot(total_acc_train_plot, label = 'Training Accuracy')
axs[1].plot(total_acc_validation_plot, label = 'Validation Accuracy')
axs[1].set_title('Training and Validation Accuracy over Epochs')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Accuracy')
axs[1].set_ylim([0, 100])
axs[1].legend()


plt.tight_layout()
plt.show()