This notebook was inspired by:
https://www.kaggle.com/code/bradley13p/bert-pretrained

In [3]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import transformers
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm.notebook import tqdm

In [4]:
# Load the labelled corpus; the path is relative to the notebook's working
# directory. Later cells read the `text` and `target` columns of frames
# split off from this one.
df = pd.read_csv("dataset.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'dataset.csv'

In [None]:
# Name of the pretrained checkpoint. Classifier passes this same name to
# BertModel.from_pretrained, so the tokenizer and the encoder stay matched.
model_name = 'bert-base-cased'
# Tokenizer loaded for that checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
class DatasetClass(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: DataFrame providing a ``text`` column (passed to the tokenizer)
            and a ``target`` column (converted to a ``torch.long`` tensor).
        tokenizer: Hugging Face tokenizer; invoked as ``tokenizer(text, ...)``.
        max_len: Exact number of tokens every example is padded or truncated
            to, so that all items share one shape and can be batched.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (examples) in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, item):
        """Return the tokenized example at integer position ``item``.

        Returns:
            dict with keys ``text`` (the original text value), ``input_ids``
            and ``attention_mask`` (1-D tensors of length ``max_len``) and
            ``targets`` (0-dim ``torch.long`` tensor).
        """
        # Positional lookup: a DataLoader hands out positions 0..len-1, which
        # only coincide with index labels when the frame has a fresh
        # RangeIndex. `.iloc` is correct for any index.
        text = self.df["text"].iloc[item]
        target = self.df["target"].iloc[item]

        # `padding="max_length"` replaces the deprecated `pad_to_max_length`.
        # `truncation=True` is required as well: without it a text longer
        # than `max_len` yields a longer tensor and default batch collation
        # fails on the mismatched shapes.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "text": text,
            # The tokenizer returns (1, max_len); flatten to (max_len,) so the
            # DataLoader stacks items into (batch, max_len).
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "targets": torch.tensor(target, dtype=torch.long),
        }

In [None]:
# Fixed seed so the train/val/test partition (and therefore the reported
# metrics) is reproducible after a kernel restart.
SPLIT_SEED = 42

# Hold out 7.5% of all rows as the final test set ...
df_train, df_test = train_test_split(df, test_size=0.075, random_state=SPLIT_SEED)
# ... then take 10% of the remaining rows as the validation set.
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=SPLIT_SEED)

In [None]:
# Give every split a clean 0..n-1 index so rows can be addressed by the
# integer positions a DataLoader supplies.
# `drop=True` discards the shuffled pre-split index instead of storing it as
# a stray "index" column, and rebinding (rather than `inplace=True`) keeps
# this cell idempotent: it can be re-run any number of times.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
def create_data_loader(df, tokenizer, MAX_LEN=60, batch_size=200):
    """Build a ``DataLoader`` that serves tokenized rows of ``df``.

    Args:
        df: DataFrame handed to ``DatasetClass``.
        tokenizer: Tokenizer handed to ``DatasetClass``.
        MAX_LEN: Token length forwarded to ``DatasetClass`` as ``max_len``.
        batch_size: Number of examples per batch.

    Returns:
        A ``DataLoader`` over the dataset, using a single worker process.
    """
    dataset = DatasetClass(df=df, tokenizer=tokenizer, max_len=MAX_LEN)
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=1)
    return loader

In [None]:
# One loader per split, all built with create_data_loader's default
# MAX_LEN and batch_size; construction order is train, val, test.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split_df, tokenizer)
    for split_df in (df_train, df_val, df_test)
)

In [None]:
class Classifier(nn.Module):
    """Pretrained BERT encoder with a dropout + linear classification head.

    Args:
        n_classes: Size of the output layer (number of target classes).
    """

    def __init__(self, n_classes):
        super().__init__()
        # Encoder weights come from the checkpoint named by the module-level
        # `model_name`.
        self.bert = BertModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded["pooler_output"]
        return self.out(self.drop(pooled))

In [None]:
# Label names; the list length sets the size of the classifier's output layer.
class_names = ["negative", "positive"]

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = Classifier(n_classes=len(class_names))
model = model.to(device)

In [None]:
epochs = 4

optimizer = Adam(params=model.parameters(), lr=2e-5, weight_decay=1e-5)

# The scheduler is stepped once per batch in the training loop, so its
# horizon is (batches per epoch) x (number of epochs).
total_steps = epochs * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` over ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``;
            its output is scored with classes along dim 1.
        data_loader: Iterable of batches, each a mapping with ``input_ids``,
            ``attention_mask`` and ``targets`` entries.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar
            tensor.
        device: Device every batch tensor is moved to before the forward pass.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: ``accuracy`` is a 0-dim float64
        tensor equal to ``correct / n_examples``; ``mean_loss`` is the
        ``np.mean`` of the per-batch losses.
    """
    # Switches the model to eval mode; the caller is responsible for calling
    # `.train()` again before resuming training.
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in tqdm(data_loader):
            # Batches must live on the same device as the model; leaving
            # them on the CPU raises a device-mismatch error with a CUDA model.
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            # Accumulate as a plain int so an empty loader still produces a
            # well-defined result below.
            correct_predictions += torch.sum(preds == targets).item()
            losses.append(loss.item())

    accuracy = torch.tensor(correct_predictions, dtype=torch.float64) / n_examples
    return accuracy, np.mean(losses)

In [None]:
# Fine-tune for `epochs` passes over the training loader, validating after
# each pass.
for epoch in range(epochs):
    # ---- train ----
    model = model.train()

    losses = []
    correct_predictions = 0

    for batch in tqdm(train_data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        loss.backward()
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        # Clip before the optimizer step so a single large gradient cannot
        # blow up the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()
        optimizer.zero_grad()

    # Epoch metrics are computed once, after the last batch. Accuracy is
    # correct / number of training rows; loss is the mean per-batch loss.
    # (These two names were previously assigned the other way round, so the
    # printed labels did not match their values.)
    train_acc = correct_predictions.double() / len(df_train)
    train_loss = np.mean(losses)

    print(f'Train loss {train_loss} accuracy {train_acc}')

    # ---- validate ----
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val),
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')

In [None]:
# Final evaluation on the held-out test split; eval_model returns
# (accuracy, mean loss) in that order.
test_acc, test_loss = eval_model(
    model=model,
    data_loader=test_data_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(df_test),
)
print(f"Test loss: {test_loss:.4f} Test Accuracy: {test_acc:.4f}")

Results:
Test loss: 0.308615, Test Accuracy: 0.868821