<a href="https://colab.research.google.com/github/Dhruv-2020EE30592/Extra-Notebooks/blob/main/Sarcasm_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# checking device

from torch import cuda

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# importing data and checking for missing values

import os
import pandas as pd

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# Training data, test data and the sample submission.
df = pd.read_csv('/kaggle/input/sarcasm-detection/training_data.csv')
df_test = pd.read_csv('/kaggle/input/sarcasm-detection/test_data.csv')
df_sample = pd.read_csv('/kaggle/input/sarcasm-detection/sample_submisson.csv')

# info() reports dtypes and non-null counts per column (missing-value check);
# head() is left as the last expression so the notebook renders the first rows.
df.info()
df.head()

In [None]:
# data preprocessing

import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset that tokenizes one labelled dataframe row per item.

    Args:
        dataframe: Table with ``text``, ``ID`` and ``label`` columns. Rows are
            read by position (``iloc``), so the index does not need resetting.
        tokenizer: Object with an ``encode_plus`` method that returns a
            mapping holding ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` tensors.
        max_length: Length each sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the dataframe."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            dict with keys ``ID``, ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``targets``; the three encoding entries are
            flattened long tensors.
        """
        # Use the tokenizer given to the constructor, not a module-level
        # global that may not exist or may be a different tokenizer.
        encoding = self.tokenizer.encode_plus(
            self.df.text.iloc[index],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        item = {'ID': torch.tensor(self.df.ID.iloc[index])}
        # return_tensors='pt' adds a leading batch dimension; flatten drops it.
        for key in ('input_ids', 'attention_mask', 'token_type_ids'):
            item[key] = encoding[key].flatten().long()
        item['targets'] = torch.tensor(self.df.label.iloc[index])
        return item

In [None]:
# dataloader

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer


MAX_LEN = 512     # max_length passed to the datasets (padding/truncation length)
BATCH_SIZE = 16   # examples per batch for both loaders
SEED = 42         # fixes the train/validation split so runs are comparable
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Hold out 20% of the labelled data for validation. Without a fixed
# random_state the split differs on every run, so validation losses could not
# be compared between runs.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=SEED)

ds_train = CustomDataset(df_train, tokenizer, MAX_LEN)
dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True)

ds_val = CustomDataset(df_val, tokenizer, MAX_LEN)
# Validation order does not affect the averaged loss, so skip shuffling.
dl_val = DataLoader(ds_val, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# defining model
# The final hidden state of the first token ([CLS]) is used as the sequence-level representation that feeds the classification layer.

from transformers import DistilBertModel, AdamW
import torch.nn as nn

# Learning rate passed to the AdamW optimizer created below.
LEARNING_RATE = 1e-05

class CustomModel(torch.nn.Module):
    """DistilBERT encoder followed by a single linear classification layer.

    Args:
        num_labels: Number of output logits per example. Defaults to ``2``,
            the value that was previously hard-coded.
        pretrained_name: Checkpoint name passed to
            ``DistilBertModel.from_pretrained``.
    """

    def __init__(self, num_labels=2, pretrained_name="distilbert-base-uncased"):
        super().__init__()
        # Attribute names l1/l2 are kept so existing state dicts still load.
        self.l1 = DistilBertModel.from_pretrained(pretrained_name)
        # Read the head's input width from the encoder config rather than
        # hard-coding 768, so other DistilBERT checkpoints also work.
        self.l2 = torch.nn.Linear(self.l1.config.dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``num_labels`` logits per input sequence.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder.
        """
        encoded = self.l1(input_ids, attention_mask=attention_mask)
        # Position 0 along the sequence axis is the first token of each example.
        first_token_state = encoded['last_hidden_state'][:, 0]
        return self.l2(first_token_state)

# Build the classifier on the selected device (Module.to returns the module
# itself), then hand all of its parameters to AdamW.
model = CustomModel().to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# training

from tqdm import tqdm

# Number of passes over dl_train made by the training loop below.
NUM_EPOCHS = 5

def loss_fnx(outputs, targets):
    """Return the cross-entropy loss between ``outputs`` and ``targets``.

    Args:
        outputs: Model logits, one row per example.
        targets: Targets in a form accepted by PyTorch's cross-entropy loss;
            the training loop in this file passes long class indices.
    """
    # Functional form of torch.nn.CrossEntropyLoss with its default settings.
    return torch.nn.functional.cross_entropy(outputs, targets)

# Each epoch: train on dl_train, save a checkpoint, then measure the loss on
# dl_val with gradients disabled.
for epoch in range(NUM_EPOCHS):
    # training
    model.train()
    train_loop = tqdm(dl_train, desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}, Training')
    train_loss = []
    for data in train_loop:
        input_ids = data['input_ids'].to(device, dtype=torch.long)
        attention_mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)
        outputs = model(input_ids, attention_mask)
        loss = loss_fnx(outputs, targets)
        # Store a plain float. Appending the tensor itself would keep every
        # batch's autograd graph alive until the end of the epoch, so memory
        # would grow with each step.
        train_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f'Training Loss: {torch.mean(torch.tensor(train_loss))}')

    # checkpoint: one file per epoch so any epoch can be restored later
    checkpoint_filename = f'checkpoint_epoch_{epoch + 1}.pth'
    torch.save({
        'epoch': epoch+1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_filename)

    # validation
    model.eval()
    val_loop = tqdm(dl_val, desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}, Validation')
    val_loss = []
    with torch.no_grad():
        for data in val_loop:
            input_ids = data['input_ids'].to(device, dtype=torch.long)
            attention_mask = data['attention_mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)
            outputs = model(input_ids, attention_mask)
            val_loss.append(loss_fnx(outputs, targets).item())
    print(f'Validation Loss: {torch.mean(torch.tensor(val_loss))}')

In [None]:
# testing

class CustomDatasetTesting(Dataset):
    """Dataset that tokenizes one unlabelled dataframe row per item.

    Args:
        dataframe: Table with ``text`` and ``ID`` columns. Rows are read by
            position (``iloc``); no ``label`` column is needed.
        tokenizer: Object with an ``encode_plus`` method that returns a
            mapping holding ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` tensors.
        max_length: Length each sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the dataframe."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            dict with keys ``ID``, ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; the three encoding entries are flattened long
            tensors.
        """
        # Use the tokenizer given to the constructor, not a module-level
        # global that may not exist or may be a different tokenizer.
        encoding = self.tokenizer.encode_plus(
            self.df.text.iloc[index],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        item = {'ID': torch.tensor(self.df.ID.iloc[index])}
        # return_tensors='pt' adds a leading batch dimension; flatten drops it.
        for key in ('input_ids', 'attention_mask', 'token_type_ids'):
            item[key] = encoding[key].flatten().long()
        return item

ds_test = CustomDatasetTesting(df_test, tokenizer, MAX_LEN)
# Bind the loader to dl_test: the inference loop below iterates dl_test, and
# the old name dl_train overwrote the training loader instead.
dl_test = DataLoader(ds_test, batch_size=BATCH_SIZE, shuffle=False)

# Restore the weights saved after epoch 2. map_location places the tensors on
# the active device, so a checkpoint written on a GPU also loads on a CPU-only
# runtime.
checkpoint = torch.load('checkpoint_epoch_2.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

model.eval()
test_loop = tqdm(dl_test, desc = f'Testing')
correct = 0
incorrect = 0
with torch.no_grad():
    for _, data in enumerate(test_loop):
        input_ids = data['input_ids'].to(device, dtype = torch.long)
        attention_mask = data['attention_mask'].to(device, dtype = torch.long)
        outputs = model(input_ids, attention_mask)
        pred_outputs = torch.argmax(outputs, dim=1)