<a href="https://colab.research.google.com/github/Arindam-18/BTP/blob/main/from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn.metrics import classification_report
from torch.optim import AdamW
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torchtext.vocab import build_vocab_from_iterator
from transformers import BertModel, BertConfig
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the dataset and one-hot encode the `sarcasm` column; get_dummies puts
# the generated indicator columns after the remaining original columns.
df = pd.read_csv("sample_sarcasm.csv")
df = pd.get_dummies(df, columns=['sarcasm'])

# Column 0 is used as the input text, every remaining column as the label vector.
text = df.iloc[:, 0]
labels = df.iloc[:, 1:].values

# First 75% of the rows (in file order, no shuffling) train, the rest validate.
split = round(len(df) * 0.75)
train_text, val_text = text.iloc[:split], text.iloc[split:]
train_labels, val_labels = labels[:split], labels[split:]

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on CPU-only runtimes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Measure sentence length with the same tokenizer that get_inputs uses to fill
# the padded rows. word_tokenize splits punctuation into separate tokens, so a
# whitespace split can under-count and make the padded width too small.
seq_len = [len(word_tokenize(sample)) for sample in text]
max_seq_len = max(seq_len)

In [None]:
def token(text):
    """Lazily yield the list of lower-cased NLTK word tokens for each sample in ``text``."""
    yield from (word_tokenize(sample.lower()) for sample in text)

# Special tokens are placed first, so their ids follow this list order:
# [PAD]=0, [CLS]=1, [SEP]=2, [UNK]=3.
special_symbols = ['[PAD]', '[CLS]', '[SEP]', '[UNK]']
vocab = build_vocab_from_iterator(
    token(text),
    min_freq=1,
    specials=special_symbols,
    special_first=True,
)
# Out-of-vocabulary words resolve to id 3, i.e. '[UNK]'.
vocab.set_default_index(3)

In [None]:
def get_inputs(text):
    """Convert raw sentences into padded token-id and attention-mask arrays.

    Each row is laid out as ``[CLS] tok_1 ... tok_n [SEP] [PAD] ...`` and has
    width ``max_seq_len + 2``. Relies on the notebook-level ``vocab`` and
    ``max_seq_len``.

    Args:
        text: sized iterable of sentence strings.

    Returns:
        Tuple ``(seq, mask)`` of NumPy arrays with shape
        ``(len(text), max_seq_len + 2)``. ``seq`` holds the token ids and
        ``mask`` holds 1 on real tokens (including [CLS]/[SEP]) and 0 on padding.
    """
    width = max_seq_len + 2
    seq = np.zeros((len(text), width))
    mask = np.zeros((len(text), width))
    for idx, line in enumerate(text):
        # The vocabulary is built from lower-cased tokens, so lower-case here
        # too; otherwise capitalised words would all map to [UNK].
        # Truncate so a sentence longer than max_seq_len cannot overflow the row.
        wds = word_tokenize(line.lower())[:max_seq_len]
        n_tokens = len(wds)

        seq[idx, 0] = 1                                  # [CLS]
        seq[idx, 1:n_tokens + 1] = [vocab[w] for w in wds]
        seq[idx, n_tokens + 1] = 2                       # [SEP]

        mask[idx, :n_tokens + 2] = 1

    return seq, mask

In [None]:
# Encode both splits into padded id/mask arrays.
seq_train, mask_train = get_inputs(train_text)
seq_val, mask_val = get_inputs(val_text)

# Token ids and masks become integer tensors; the one-hot labels become floats.
train_seq, train_mask = (
    torch.tensor(arr, dtype=torch.int) for arr in (seq_train, mask_train)
)
train_y = torch.tensor(train_labels.tolist(), dtype=torch.float)

val_seq, val_mask = (
    torch.tensor(arr, dtype=torch.int) for arr in (seq_val, mask_val)
)
val_y = torch.tensor(val_labels.tolist(), dtype=torch.float)

In [None]:
batch_size = 16

# Wrap the (ids, mask, labels) triples of each split.
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# Training batches are drawn in random order, validation batches in dataset order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(
    train_data, sampler=train_sampler, batch_size=batch_size
)
val_dataloader = DataLoader(
    val_data, sampler=val_sampler, batch_size=batch_size
)

In [None]:
# Build the encoder from a config only (no pretrained weights are loaded).
# The embedding table is sized to the custom vocabulary; hidden_size is 1536,
# the same width the classifier head's first linear layer takes as input.
config = BertConfig(
    hidden_size=1536,
    vocab_size=len(vocab),
)
bert = BertModel(config)

In [None]:
class Classifier(nn.Module):
    """Sentence classifier: an encoder followed by a two-layer MLP head.

    Args:
        bert: encoder module. It must expose ``config.hidden_size`` and, when
            called as ``bert(ids, attention_mask=mask)``, return an output
            whose element ``[1]`` is the pooled sentence representation of
            shape ``(batch, hidden_size)``.
        hidden_dim: width of the intermediate fully connected layer.
        num_classes: number of output classes.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, bert, hidden_dim=512, num_classes=2, dropout=0.1):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        # Size the head from the encoder rather than a hard-coded 1536, so the
        # classifier keeps working if the encoder's hidden size is changed.
        self.fc1 = nn.Linear(bert.config.hidden_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities of shape ``(batch, num_classes)``.

        Args:
            sent_id: integer tensor of token ids, one row per sentence.
            mask: attention mask aligned with ``sent_id``.
        """
        encoded = self.bert(sent_id, attention_mask=mask)
        pooled = encoded[1]  # pooled sentence-level output of the encoder
        x = self.fc1(pooled)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)

In [None]:
# Build the classifier around the encoder and place it on the selected device.
model = Classifier(bert).to(device)

# All parameters (encoder and head) are optimised together.
optimizer = AdamW(model.parameters(), lr=1e-3)
cross_entropy = nn.CrossEntropyLoss()

In [None]:
def train():
    """Run one training epoch over ``train_dataloader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``cross_entropy``,
    ``train_dataloader`` and ``device``.

    Returns:
        Mean of the per-batch losses for the epoch.
    """
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        sent_id, mask, labels = (r.to(device) for r in batch)

        model.zero_grad()

        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss += loss.item()
        loss.backward()
        # Cap the global gradient norm at 1.0 before the optimiser step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    return total_loss / len(train_dataloader)

In [None]:
def evaluate():
    """Compute the mean validation loss over ``val_dataloader``.

    Relies on the notebook-level ``model``, ``cross_entropy``,
    ``val_dataloader`` and ``device``. Puts the model in eval mode and runs
    without gradient tracking.

    Returns:
        Mean of the per-batch losses.
    """
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in val_dataloader:
            sent_id, mask, labels = (t.to(device) for t in batch)

            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss += loss.item()

    return total_loss / len(val_dataloader)

In [None]:
def save_checkpoint(filename, epoch, model, optimizer):
    """Serialise ``epoch``, ``model`` and ``optimizer`` to ``filename`` with ``torch.save``.

    The objects themselves are stored in a dict under the keys ``"epoch"``,
    ``"model"`` and ``"optimizer"``.
    """
    torch.save(dict(epoch=epoch, model=model, optimizer=optimizer), filename)

In [None]:
# Lowest validation loss seen so far; any first result beats infinity.
best_valid_loss = float("inf")

# Per-epoch loss history.
train_losses, valid_losses = [], []

for epoch in range(5):
    print(f"Epoch {epoch+1} / 5")

    train_loss = train()
    valid_loss = evaluate()

    # Checkpoint only when validation loss improves, overwriting the previous best.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        file_name = "topic_saved_weights.pt"
        save_checkpoint(file_name, epoch, model, optimizer)

    print(f"Training Loss: {train_loss}, Valid Loss: {valid_loss}\n")
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

Epoch 1 / 5
Training Loss: 0.9089615494012833, Valid Loss: 0.6262342408299446

Epoch 2 / 5
Training Loss: 0.6409081419308981, Valid Loss: 0.6634743511676788

Epoch 3 / 5
Training Loss: 0.6778949884076914, Valid Loss: 0.6253784224390984

Epoch 4 / 5
Training Loss: 0.6400723395248255, Valid Loss: 0.6206180527806282

Epoch 5 / 5
Training Loss: 0.630627109358708, Valid Loss: 0.6192828118801117



In [None]:
path = "topic_saved_weights.pt"

# NOTE: the checkpoint holds the whole pickled model object, not a state_dict.
# Unpickling can execute arbitrary code, so only load files this notebook wrote.
checkpoint = torch.load(path, map_location=device)
model = checkpoint.get("model")
# Make inference mode explicit (disables dropout) instead of relying on the
# mode the model was in when it was saved.
model.eval()

# NOTE: `text` / `labels` are the full dataset, so the reported accuracy
# includes the rows the model was trained on.
seq, mask = get_inputs(text)
with torch.no_grad():
    preds = model(torch.tensor(seq, dtype=torch.int).to(device), torch.tensor(mask, dtype=torch.int).to(device))
    preds = preds.detach().cpu().numpy()

preds = np.argmax(preds, axis=1)
targets = np.argmax(labels, axis=1)

# Vectorised count of matches; avoids rebinding the builtin `sum`.
correct = int((preds == targets).sum())

print(correct * 100 / len(labels))

68.0
