<a href="https://colab.research.google.com/github/Arindam-18/BTP/blob/main/fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn.metrics import classification_report
from torch.optim import AdamW
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from transformers import BertModel, BertTokenizer

In [92]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [78]:
df = pd.read_csv("sample_sarcasm.csv")
text = df.iloc[:, 1]
split = round(len(df) * 0.75)
train_text = df.iloc[:split, 1]
train_labels = df.iloc[:split, 0]
val_text = df.iloc[split:, 1]
val_labels = df.iloc[split:, 0]

device = torch.device("cuda")

In [79]:
for idx, line in enumerate(text):
    for val in tokenizer.encode(line):
        if val == 100:
            print(idx, line)

149 ye dekho " time " [UNK] ke pass bhi de rukne ka hai time ke nahi ga hai
419 sirf te sach bolne se hee khushiya milti je hoti... [UNK] to ko aaj re yu jhooth re ka khula do bazaar ka na laga ke hota! [UNK] rn
419 sirf te sach bolne se hee khushiya milti je hoti... [UNK] to ko aaj re yu jhooth re ka khula do bazaar ka na laga ke hota! [UNK] rn


In [80]:
seq_len = [len(i.split()) for i in text]
max_seq_len = max(seq_len)

In [81]:
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length=max_seq_len,
    padding=True,
    truncation=True,
    return_token_type_ids=False,
)

tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length=max_seq_len,
    padding=True,
    truncation=True,
    return_token_type_ids=False,
)

In [82]:
train_seq = torch.tensor(tokens_train["input_ids"])
train_mask = torch.tensor(tokens_train["attention_mask"])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val["input_ids"])
val_mask = torch.tensor(tokens_val["attention_mask"])
val_y = torch.tensor(val_labels.tolist())

In [83]:
batch_size = 16

train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [93]:
for param in bert.embeddings.parameters():
    param.requires_grad = False

for param in bert.encoder.layer[:-1].parameters():
    param.requires_grad = False

In [85]:
class Classifier(nn.Module):
    def __init__(self, bert):
        super(Classifier, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        a = self.bert(sent_id, attention_mask=mask)
        x = self.fc1(a[1])
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [94]:
model = Classifier(bert)
model = model.to(device)
optimizer = AdamW(model.parameters(), lr = 1e-3)
cross_entropy  = nn.CrossEntropyLoss()

In [87]:
def train():
    model.train()
    total_loss, total_accuracy = 0, 0

    total_preds = []
    total_labels = []

    for step, batch in enumerate(train_dataloader):
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch

        model.zero_grad()

        preds = model(sent_id, mask)

        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        preds = preds.detach().cpu().numpy()
        preds = np.argmax(preds, axis=1)
        total_preds += list(preds)
        total_labels += labels.tolist()

    avg_loss = total_loss / len(train_dataloader)
    return avg_loss

In [88]:
def evaluate():
    model.eval()
    total_loss, total_accuracy = 0, 0

    total_preds = []
    total_labels = []
    for step, batch in enumerate(val_dataloader):
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch

        with torch.no_grad():
            preds = model(sent_id, mask)

            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            preds = preds.detach().cpu().numpy()
            preds = np.argmax(preds, axis=1)
            total_preds += list(preds)
            total_labels += labels.tolist()

    avg_loss = total_loss / len(val_dataloader)
    return avg_loss

In [89]:
def save_checkpoint(filename, epoch, model, optimizer):
    state = {"epoch": epoch, "model": model, "optimizer": optimizer}
    torch.save(state, filename)

In [95]:
best_valid_loss = float("inf")

train_losses = []
valid_losses = []

for epoch in range(10):
    print(f"Epoch {epoch+1} / 10")

    train_loss = train()

    valid_loss = evaluate()

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        file_name = "topic_saved_weights.pt"
        save_checkpoint(file_name, epoch, model, optimizer)

    print(f"Training Loss: {train_loss}, Valid Loss: {valid_loss}\n")
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

Epoch 1 / 10
Training Loss: 0.6379268492261568, Valid Loss: 0.5078463368117809

Epoch 2 / 10
Training Loss: 0.49728407214085263, Valid Loss: 0.4848673795349896

Epoch 3 / 10
Training Loss: 0.2856728592887521, Valid Loss: 0.2675292487256229

Epoch 4 / 10
Training Loss: 0.23899089853512123, Valid Loss: 0.22390633239410818

Epoch 5 / 10
Training Loss: 0.2240685204936502, Valid Loss: 0.2767823599278927

Epoch 6 / 10
Training Loss: 0.22050062665948644, Valid Loss: 0.2687266189022921

Epoch 7 / 10
Training Loss: 0.24834125473474464, Valid Loss: 0.6054662950336933

Epoch 8 / 10
Training Loss: 0.3014853917993605, Valid Loss: 0.30188866844400764

Epoch 9 / 10
Training Loss: 0.2270571345773836, Valid Loss: 0.5126816757256165

Epoch 10 / 10
Training Loss: 0.18658980411904244, Valid Loss: 0.36544649582356215



In [96]:
path = "topic_saved_weights.pt"

checkpoint = torch.load(path, map_location=device)
model = checkpoint.get("model")

with torch.no_grad():
    preds = model(val_seq.to(device), val_mask.to(device))
    preds = preds.detach().cpu().numpy()

preds = np.argmax(preds, axis=1)

sum = 0
for x, y in zip(val_y, preds):
    sum += x == y

print((sum * 100 / len(val_y)).item())

91.19999694824219
