In [None]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer
from transformers import BertTokenizer, BertForSequenceClassification

# Data Exploration

In [None]:
# Read the train and test CSV files from the local datasets/ directory.
dftrain = pd.read_csv('datasets/train.csv')
dftest = pd.read_csv('datasets/test.csv')

# Hold out 20% of the training rows for validation; the fixed seed keeps the split reproducible.
trainset, valset = train_test_split(dftrain, test_size=0.2, random_state=69)
# drop=True gives both frames a clean 0..n-1 index instead of keeping the
# shuffled original index around as an extra 'index' column.
trainset = trainset.reset_index(drop=True)
valset = valset.reset_index(drop=True)

In [None]:
# Number of whitespace-separated words in each training excerpt.
word_counts = dftrain['excerpt'].map(lambda text: len(text.split()))
longest = word_counts.max()
print(f'max number of words: {longest}')

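The longest excerpt contains 205 whitespace-separated words, so 205 is used below as a rough guide for the tokenized sequence length. Note that this is only an approximation: BERT's WordPiece tokenizer can split a single word into several tokens and also adds the special `[CLS]` and `[SEP]` tokens, so an excerpt's token count can exceed its word count, and excerpts longer than the chosen length will be truncated.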

In [None]:
# Load the tokenizer for the "bert-base-uncased" checkpoint; it is handed to the
# Dataset objects further down so that every excerpt is tokenized the same way.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Set up functions and datasets to train a model.
We will fine-tune a pretrained BERT model from the Hugging Face `transformers` library.

In [None]:
from transformers import BertPreTrainedModel, BertModel
from transformers import AutoConfig, AutoTokenizer

class BertRegresser(BertPreTrainedModel):
    """BERT encoder followed by a small feed-forward regression head.

    The head reads the hidden state at sequence position 0 (the [CLS] token)
    and maps it hidden_size -> 128 -> 128 -> 1.
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        # Regression head applied to the [CLS] representation.
        self.cls_layer1 = nn.Linear(config.hidden_size, 128)
        self.relu1 = nn.ReLU()
        self.ff1 = nn.Linear(128, 128)
        self.tanh1 = nn.Tanh()
        self.ff2 = nn.Linear(128, 1)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return the head's output (last dimension of size 1)."""
        # Contextualized representations for every token in the batch.
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Keep only the first position of each sequence, i.e. the [CLS] token.
        cls_state = encoded.last_hidden_state[:, 0, :]
        hidden = self.relu1(self.cls_layer1(cls_state))
        hidden = self.tanh1(self.ff1(hidden))
        return self.ff2(hidden)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Args:
        data: DataFrame with an 'excerpt' text column and a numeric 'target' column.
        maxlen: Length the tokenizer pads/truncates every excerpt to.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length', truncation=True,
            return_tensors='pt')``; it must return a mapping with 'input_ids' and
            'attention_mask' tensors (expected to carry a leading batch dimension of 1).
    """

    def __init__(self, data, maxlen, tokenizer):
        self.df = data
        self.tokenizer = tokenizer
        self.maxlen = maxlen

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, i):
        """Return ``(input_ids, attention_mask, target)`` for the row at position ``i``.

        ``input_ids`` and ``attention_mask`` have identical shapes; ``target`` is a
        float32 scalar tensor.
        """
        # Positional lookup: samplers yield positions 0..len-1, which are not
        # guaranteed to coincide with the DataFrame's index labels.
        excerpt = self.df['excerpt'].iloc[i]
        target = self.df['target'].iloc[i]
        tokens = self.tokenizer(
            excerpt,
            max_length=self.maxlen,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Drop the leading batch dimension from BOTH tensors so that the default
        # collate function produces (batch, seq_len) for ids and mask alike.
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)
        target = torch.tensor(target, dtype=torch.float32)

        return input_ids, attention_mask, target

In [None]:
from transformers import AdamW
import torch.nn as nn
from tqdm import tqdm, trange
def train(model, train_loader, val_loader, epochs, device, lr=2e-5):
    """Fine-tune ``model`` with an MSE objective, reporting losses after each epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns one prediction per example. It is not moved to ``device`` here.
        train_loader: Iterable of ``(input_ids, attention_mask, target)`` batches.
        val_loader: Batches in the same format, passed to ``evaluate`` every epoch.
        epochs: Number of passes over ``train_loader``.
        device: ``torch.device`` each batch is moved to.
        lr: AdamW learning rate. Defaults to 2e-5, inside the 2e-5 to 5e-5 range
            commonly recommended for fine-tuning BERT; a rate such as 1e-3 is far
            too large for updating pretrained encoder weights.

    Returns:
        None. The mean training loss per batch and the validation loss are printed.
    """
    criterion = nn.MSELoss()
    # torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
    # keeps the regularisation that the deprecated optimizer applied by default.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.0)
    for epoch in trange(epochs, desc="Epoch"):
        # Validation at the end of the previous epoch may have left the model in
        # eval mode, so training mode (dropout on) is re-enabled every epoch.
        model.train()
        train_loss = 0.0
        for input_ids, attention_mask, target in train_loader:
            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target = target.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Flatten both sides: a (batch, 1) prediction against a (batch,) target
            # would broadcast to (batch, batch) inside MSELoss and give a wrong loss.
            preds = outputs.reshape(-1)
            loss = criterion(preds, target.reshape(-1).type_as(preds))
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        print(f'Training loss: {train_loss/len(train_loader)}')
        val_loss = evaluate(model, val_loader, device)
        print(f'epoch :{epoch} Val loss: {val_loss}')

In [None]:
def evaluate(model, val_loader, device):
    """Return the mean squared error of ``model`` over every example in ``val_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns one
            prediction per example.
        val_loader: Iterable of ``(input_ids, attention_mask, target)`` batches.
        device: ``torch.device`` each batch is moved to.

    Returns:
        float: Sum of squared errors divided by the number of examples, so batches
        of different sizes are weighted correctly.

    Raises:
        ZeroDivisionError: If ``val_loader`` yields no examples.
    """
    # Remember the caller's mode so evaluation does not silently disable dropout
    # for any training that continues afterwards.
    was_training = model.training
    model.eval()
    criterion = nn.MSELoss(reduction='sum')
    total_sq_err, n_examples = 0.0, 0
    try:
        with torch.no_grad():
            for input_ids, attention_mask, target in val_loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                target = target.to(device)

                # Flatten both sides: (batch, 1) vs (batch,) would otherwise
                # broadcast to (batch, batch) inside MSELoss.
                preds = model(input_ids, attention_mask).reshape(-1)
                target = target.reshape(-1).type_as(preds)

                total_sq_err += criterion(preds, target).item()
                n_examples += preds.numel()
    finally:
        model.train(was_training)

    return total_sq_err / n_examples

In [None]:
# Run on the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Build the regressor from the "bert-base-uncased" checkpoint and place it on the device.
config = AutoConfig.from_pretrained("bert-base-uncased")
model = BertRegresser.from_pretrained("bert-base-uncased", config=config).to(device)

In [None]:
# Length the tokenizer pads/truncates to. 205 is the maximum *word* count measured
# above, so excerpts whose token count exceeds it are truncated.
MAX_LEN = 205
BATCH_SIZE = 64

train_dataset = Dataset(data=trainset, maxlen=MAX_LEN, tokenizer=tokenizer)
val_dataset = Dataset(data=valset, maxlen=MAX_LEN, tokenizer=tokenizer)

# Only the training batches are shuffled; validation order is kept fixed so the
# reported validation loss is computed the same way every epoch.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Fine-tune for 10 epochs, printing training and validation loss after each one.
train(model, train_loader, val_loader, epochs=10, device=device)

Unfortunately, I could not finish training the model with the computing power available to me. I tried decreasing the learning rate and using larger batches, but neither seemed to help.