This notebook fine-tunes a pretrained BERT model (`bert-base-uncased`) as a binary sentence classifier on the GLUE SST-2 training data.

In [1]:
# Imports
import os, sys
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm


import torch
from torch import cuda
from transformers import BertTokenizer, BertModel

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [2]:
# Training Data
#
# Load the SST-2 training file (tab-separated, first row is the header) and
# shuffle its rows, resetting the index to 0..n-1.

# Directory holding train.tsv. It can be overridden with the SST2_DIR
# environment variable; the default is the path this notebook was written with.
SST2_DIR = os.environ.get('SST2_DIR', 'D:/SDS/KGA_Bert/data/glue_data/SST-2')

# Fixed seed so the shuffle (and any positional split made from it) is the
# same on every "Restart & Run All".
SEED = 42

data = pd.read_csv(os.path.join(SST2_DIR, 'train.tsv'), sep='\t', header=0)
data = data.sample(frac=1, random_state=SEED, ignore_index=True)
data.head()

Unnamed: 0,sentence,label
0,", flashy , overlong soap opera .",0
1,"succeeds with its dark , delicate treatment of...",1
2,manages to escape the shackles of its own clic...,1
3,that it certainly does n't feel like a film th...,0
4,about the worst thing chan has done in the uni...,0


In [4]:
# Data preprocessing
#
# Trim surrounding whitespace from every sentence and coerce every label to int.

X = data['sentence'].map(str.strip)
y = data['label'].map(int)
X[:5], y[:5]

(0                     , flashy , overlong soap opera .
 1    succeeds with its dark , delicate treatment of...
 2    manages to escape the shackles of its own clic...
 3    that it certainly does n't feel like a film th...
 4    about the worst thing chan has done in the uni...
 Name: sentence, dtype: object,
 0    0
 1    1
 2    1
 3    0
 4    0
 Name: label, dtype: int64)

In [5]:
# Train/Test split
#
# The first 85% of the rows become the training frame; the remaining rows
# become the test frame. Both frames keep the row labels of X and y.
num_examples = len(data)
train_split = int(num_examples * 0.85)

train_rows = slice(None, train_split)
test_rows = slice(train_split, None)

train_data = pd.DataFrame({'sentence': X[train_rows], 'label': y[train_rows]})
test_data = pd.DataFrame({'sentence': X[test_rows], 'label': y[test_rows]})

In [18]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Args:
        dataset: Object with a 'sentence' column of strings and a 'label'
            column of numeric labels (e.g. a pandas DataFrame).
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.

    Each item is a dict with:
        'sentences': 1-D long tensor of the token ids returned by the tokenizer.
        'labels': 0-D float tensor holding the label.
    """

    def __init__(self, dataset, tokenizer):
        self.sentences = dataset['sentence']
        self.labels = dataset['label']
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentences)

    @staticmethod
    def _at(column, idx):
        """Return the idx-th element of `column` by position.

        A pandas Series indexed with ``[]`` looks up by label, so a frame
        sliced out of a larger one (whose labels do not start at 0) would
        raise KeyError for idx 0. ``.iloc`` is positional; plain sequences
        without ``.iloc`` are indexed directly.
        """
        return column.iloc[idx] if hasattr(column, 'iloc') else column[idx]

    def __getitem__(self, idx):
        text = self._at(self.sentences, idx)
        inputs = self.tokenizer.encode(text)
        return {'sentences' : torch.tensor(inputs, dtype=torch.long),
                'labels' : torch.tensor(self._at(self.labels, idx), dtype=torch.float)}

In [19]:
class DefaultBERTClass(torch.nn.Module):
    """Pretrained BERT encoder -> one RNN step on the first-token vector -> linear head.

    ``forward`` returns one raw logit per input row (no sigmoid applied).
    """

    def __init__(self) -> None:
        super(DefaultBERTClass, self).__init__()

        self.bert_layer = BertModel.from_pretrained("bert-base-uncased")
        self.rnn = torch.nn.RNN(768, 100)
        self.bc = torch.nn.Linear(100, 1)

    def forward(self, text, attention_mask=None):
        """Compute logits for a batch of token-id rows.

        Args:
            text: Long tensor of token ids, one row per example.
            attention_mask: Optional mask forwarded to the BERT layer; needed
                when rows are padded to a common length. Defaults to None,
                which matches calling the layer with ids only.

        Returns:
            1-D tensor with one logit per row of `text`.
        """
        embeddings = self.bert_layer(text, attention_mask=attention_mask)
        # First element of the encoder output, first position of every row.
        # Assumes this is the per-token hidden state with 768 features, to
        # match the RNN's input size -- TODO confirm against the BertModel docs.
        cls_vector = embeddings[0][:, 0]
        # nn.RNN reads a 2-D input as ONE unbatched sequence, which would chain
        # the batch rows together as time steps and yield a single logit for
        # the whole batch. Add an explicit length-1 time axis so every row is
        # processed independently: (1, batch, 768).
        _, hidden = self.rnn(cls_vector.unsqueeze(0))
        output = self.bc(hidden.squeeze(0))
        # Drop the trailing size-1 feature axis so the logits line up with a
        # 1-D target vector.
        return output.squeeze(-1)

In [32]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits `outputs` against `targets`, averaged over all elements."""
    bce_with_logits = torch.nn.functional.binary_cross_entropy_with_logits
    return bce_with_logits(outputs, targets)

def train(model, optimizer, data_loader):
    """Run one training epoch and return the mean batch loss.

    Relies on the notebook-level names `device`, `loss_fn` and `tqdm`.

    Args:
        model: Module called as ``model(inputs)`` that returns logits.
        optimizer: Optimizer over the model's parameters.
        data_loader: Iterable of dict batches with a 'sentences' tensor of
            token ids and a 'labels' tensor of targets.

    Returns:
        0-D float tensor with the loss averaged over every batch of the epoch
        (not just the last batch), or NaN if `data_loader` yielded nothing.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for data in tqdm(data_loader):
        inputs = data['sentences'].to(device, dtype = torch.long)
        targets = data['labels'].to(device, dtype = torch.float)

        # Clear gradients left over from the previous step (once is enough).
        optimizer.zero_grad()
        outputs = model(inputs)
        # The loss needs logits and targets of identical shape; this raises if
        # the model did not produce exactly one logit per target.
        loss = loss_fn(outputs.reshape(targets.shape), targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Nothing was trained on; returning NaN keeps `.item()` callers working
        # instead of failing on an unbound `loss`.
        return torch.tensor(float('nan'))
    return torch.tensor(total_loss / num_batches)

def validation(model, valid_data):
    """Score every batch of `valid_data` and collect predictions and targets.

    Relies on the notebook-level names `device` and `tqdm`.

    Args:
        model: Module called as ``model(inputs)`` that returns logits.
        valid_data: Iterable of batches. Each batch is either a dict with
            'sentences' and 'labels' tensors (the format `train` consumes) or
            an ``(inputs, targets)`` pair.

    Returns:
        Tuple ``(outputs, targets)`` of stacked CPU tensors: the sigmoid of
        the model's logits and the corresponding targets. Both are empty
        tensors when `valid_data` yields nothing.
    """
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(valid_data):
            # Unpacking a dict yields its keys, not its tensors, so dict
            # batches must be read by key.
            if isinstance(data, dict):
                X_i, y_i = data['sentences'], data['labels']
            else:
                X_i, y_i = data
            # Inputs must sit on the same device as the model.
            X_i = X_i.to(device, dtype = torch.long)
            outputs = model(X_i)
            outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs)
            fin_targets.extend(torch.as_tensor(y_i).cpu())
    if not fin_outputs:
        # torch.stack rejects an empty list.
        return torch.empty(0), torch.empty(0)
    return torch.stack(fin_outputs), torch.stack(fin_targets)

In [33]:
# Tokenizer for the 'bert-base-uncased' vocabulary, the training dataset built
# on it, and a loader that serves that dataset one example at a time in its
# stored order (the DataLoader defaults, spelled out).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
training_data = CustomDataset(dataset=train_data, tokenizer=tokenizer)
training_loader = DataLoader(dataset=training_data, batch_size=1, shuffle=False)

In [34]:
# Hyperparameters.
# MAX_LEN, BATCH_SIZE and NUM_OUT are not referenced by the cells visible in
# this notebook; EPOCHS and LEARNING_RATE drive the training cell.
MAX_LEN, BATCH_SIZE, NUM_OUT = 128, 32, 1
EPOCHS = 3
LEARNING_RATE = 2e-5

from sklearn.metrics import accuracy_score

In [35]:
model = DefaultBERTClass()
optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE)

model.to(device)

# Held-out examples go through the same tokenizing dataset as the training
# data. The index is reset because test_data keeps the row labels of the
# frame it was sliced from, while the dataset is indexed from 0.
testing_data = CustomDataset(test_data.reset_index(drop=True), tokenizer)
testing_loader = DataLoader(testing_data)


def _as_pairs(loader):
    """Yield (inputs, targets) pairs on `device` from the loader's dict batches."""
    for batch in loader:
        yield batch['sentences'].to(device), batch['labels'].to(device)


for epoch in range(EPOCHS):
    loss = train(model, optimizer, training_loader)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')
    guess, targs = validation(model, _as_pairs(testing_loader))
    # `guess` holds sigmoid probabilities for a single class, so the predicted
    # label is a 0.5 threshold. An argmax over one column would always be 0.
    guesses = (guess.cpu().reshape(-1) >= 0.5).long().numpy()
    targets = targs.cpu().reshape(-1).long().numpy()
    print('accuracy on test set {}'.format(accuracy_score(targets, guesses)))

  0%|          | 0/57246 [00:00<?, ?it/s]

KeyboardInterrupt: 