This notebook is for fine-tuning BERT

In [3]:
# Imports
import os, sys
import numpy as np
import pandas as pd
from gc import collect
from tqdm.notebook import tqdm
import nltk


import torch
from torch import cuda
from transformers import BertTokenizer, BertModel

device = 'cuda' if cuda.is_available() else 'cpu'
device

  from .autonotebook import tqdm as notebook_tqdm


'cpu'

In [4]:
# Training Data

data_train = pd.read_csv(r'D:\SDS\KGA_Bert\data\glue_data\SST-2\train.tsv', sep='\t', header=0)
data_dev = pd.read_csv(r'D:\SDS\KGA_Bert\data\glue_data\SST-2\dev.tsv', sep='\t', header=0)
data_train = data_train.sample(frac = 1, ignore_index=True)
data_dev = data_dev.sample(frac = 1, ignore_index=True)

embedding_table = pd.read_csv(r"D:\SDS\KGA_Bert\data\KG_data\embedding_table.csv", index_col=0, delimiter='|',
                    converters={
                        '1' : lambda x: np.array(x.removeprefix('[').removesuffix(']').split(','), dtype='float'),
                        '2' : lambda x: np.array(x.removeprefix('[').removesuffix(']').split(','), dtype='float'),
                        '3' : lambda x: np.array(x.removeprefix('[').removesuffix(']').split(','), dtype='float')
                    })

In [5]:
MAX_LEN = len(max(data_train.sentence, key=len))
MAX_LEN

268

In [6]:
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 0.0001
NUM_OUT = 1

In [7]:
# Train/Test split
train_data = pd.DataFrame({
    'sentence' : data_train['sentence'].apply(str.strip),
    'label' : data_train['label'].apply(int)
}).reset_index()

test_data = pd.DataFrame({
    'sentence' : data_dev['sentence'].apply(str.strip),
    'label' : data_dev['label'].apply(int)
}).reset_index()

In [8]:
def get_nouns(sentence):
    tokens = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(tokens)
    retVal = []
    i = 0
    while i < len(tags):
        key, tag = tags[i]
        if 'NN' in tag:
            retVal.append(key)
        i += 1
    return retVal

In [9]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_size):
        self.sentences = dataset['sentence']
        self.labels = dataset['label']
        self.tokenizer = tokenizer
        self.max_size = max_size

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        text = self.sentences[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_size,
            pad_to_max_length=True,
            truncation=True,
            return_token_type_ids=True
        )

        return {'sentences' : torch.tensor(inputs['input_ids'], dtype=torch.long, device=device),
                'mask' : torch.tensor(inputs['attention_mask'], dtype=torch.long, device=device),
                'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long, device=device),
                'labels' : torch.tensor(self.labels[idx], dtype=torch.float, device=device)}

In [10]:
class DefaultBERTClass(torch.nn.Module):

    def __init__(self) -> None:
        super(DefaultBERTClass, self).__init__()
        
        self.bert_layer = BertModel.from_pretrained(
            "bert-base-uncased"
            )
        #self.dropout = torch.nn.Dropout(p=0.3)
        self.hidd = torch.nn.Linear(self.bert_layer.config.hidden_size + 200, NUM_OUT)
        self.sig = torch.nn.Sigmoid()
        

    def forward(self, text, attention_mask, token_type_ids):
        embeddings = self.bert_layer(text, attention_mask = attention_mask)
        pooler = embeddings[0][:, 0]
        #dropout = self.dropout(pooler)

        # Concat Embeddings
        noun_embeddings = np.zeros(200)
        length = 0
        for noun in get_nouns(text):
            if noun in embedding_table.index:
                noun_embeddings += ((1/2) * embedding_table.loc[noun][1] + (1/3) * embedding_table.loc[noun][2] + (1/6) * embedding_table.loc[noun][3])
                length += 1
        
        if length != 0:
            noun_embeddings /= length

        concat_layer = np.concatenate([pooler, noun_embeddings])

        hidden = self.hidd(concat_layer)
        
        output = self.sig(hidden)
        return output

In [11]:
from sklearn.metrics import accuracy_score

def loss_fn(outputs, targets):
    return torch.nn.BCELoss()(outputs, targets)

def train(model, optimizer, data_loader):
    model.train()
    for data in tqdm(data_loader):
        inputs = data['sentences']
        mask = data['mask']
        token_type_ids = data['token_type_ids']
        targets = data['labels'].unsqueeze(1)

        #print(inputs, mask)

        outputs = model(inputs, mask, token_type_ids)

        #print(outputs, targets)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        #optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Memory optimization
        del inputs, mask, token_type_ids, targets
        collect()
        with torch.cuda.device(device):
            torch.cuda.empty_cache()

    return loss

def validation(model, data_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(data_loader):
            inputs = data['sentences']
            mask = data['mask']
            token_type_ids = data['token_type_ids']
            targets = data['labels'].unsqueeze(1)

            outputs = model(inputs, mask, token_type_ids)

            del inputs, mask, token_type_ids
            collect()
            with torch.cuda.device(device):
                torch.cuda.empty_cache()

            fin_outputs.extend(outputs)
            fin_targets.extend(targets)

    return torch.stack(fin_outputs), torch.stack(fin_targets)

def get_accuracy(guess, targs):
    guesses = (guess >= 0.5).cpu().numpy()
    targets = (targs >= 0.5).cpu().numpy()
    return accuracy_score(guesses, targets)

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

training_data = CustomDataset(train_data, tokenizer, MAX_LEN)
training_loader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)

testing_data = CustomDataset(test_data, tokenizer, MAX_LEN)
testing_loader = DataLoader(testing_data, batch_size=BATCH_SIZE, shuffle=False)

tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 3.64MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
from IPython.display import clear_output
from matplotlib import pyplot as plt

def live_plot(accuracies):
    clear_output(wait=True)
    plt.figure()
    plt.xlim(0, EPOCHS)
    plt.ylim(0, 1)
    x= [float(i) for i in range(len(accuracies))]
    y= [float(i) for i in accuracies]
    
    if len(x) > 1:
        plt.plot(x,y)

    plt.grid(True)
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.show()

In [16]:
model = DefaultBERTClass()
optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE)

model.to(device)

validation_accuracies = []

for epoch in range(EPOCHS):
    loss = train(model, optimizer, training_loader)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')
    guess, targs = validation(model, testing_loader)

    #print(guess, targs)
    
    accuracy = get_accuracy(guess, targs)
    validation_accuracies.append(accuracy)

    live_plot(validation_accuracies)

    print('accuracy on test set {}'.format(accuracy))

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html