# Simple BERT Sentiment Classification

Pretrained BERT (cased, `bert-base-cased`) followed by a two-layer fully connected classification head.

In [8]:
import torch
import torch.nn as nn
import torchtext
import numpy as np
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pandas as pd
from pathlib import Path
import time

from tqdm import tqdm

# Seed both NumPy and PyTorch with the same value for reproducibility.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# The CSV files are expected under ./.data/sentence-classification,
# relative to the notebook's working directory.
root_dir = Path('.')
data_dir = root_dir / '.data' / 'sentence-classification'

# One checkpoint name is shared by the tokenizer and the encoder so that
# the vocabulary and the weights always match.
PRETRAINED_MODEL = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
bert_model = BertModel.from_pretrained(PRETRAINED_MODEL)

# Hand unused cached GPU memory back before training starts.
torch.cuda.empty_cache()

## Load Dataset

In [9]:
from torch.utils.data import DataLoader, Dataset

class KaggleDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Args:
        df: DataFrame with a ``'Sentence'`` column and, when ``is_train`` is
            true, a ``'Category'`` column holding the integer labels.
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: every sentence is padded / truncated to this many tokens.
        is_train: whether labels are available and should be returned.
    """

    def __init__(self, df, tokenizer, max_len, is_train):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_train = is_train
        self.sentences = df['Sentence'].to_numpy()
        # Labels only exist for the labelled (train / validation) frames.
        if is_train:
            self.targets = df['Category'].to_numpy()

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return a dict with the raw text, token ids, attention mask and,
        for labelled data, the target as a ``torch.long`` tensor."""
        text = self.sentences[idx]

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens = True, # Add CLS, SEP
            max_length = self.max_len,
            return_token_type_ids = False,
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt',
        )

        # The tokenizer returns (1, max_len) tensors; flatten to (max_len,)
        # so the DataLoader can stack items into (batch, max_len).
        sample = {
            'text' : text,
            'input_ids' : encoded['input_ids'].flatten(),
            'attention_mask' : encoded['attention_mask'].flatten(),
        }
        if self.is_train:
            sample['targets'] = torch.tensor(self.targets[idx], dtype=torch.long)
        return sample
    
def get_data_loader(df, tokenizer, max_len, batch_size, is_train, shuffle):
    """Wrap ``df`` in a ``KaggleDataset`` and return a ``DataLoader`` over it.

    Args:
        df: DataFrame passed straight to ``KaggleDataset``.
        tokenizer: tokenizer passed straight to ``KaggleDataset``.
        max_len: padded / truncated sequence length.
        batch_size: number of items per batch.
        is_train: whether ``df`` carries labels that should be returned.
        shuffle: whether the loader reshuffles the data on every pass.
    """
    return DataLoader(
        KaggleDataset(df, tokenizer=tokenizer, max_len=max_len, is_train=is_train),
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [10]:
# Data / loader hyperparameters
max_len = 80              # sentences are padded / truncated to this many tokens
train_valid_frac = 0.8    # share of the labelled data used for training
batch_size = 16

train_raw = pd.read_csv(data_dir.joinpath('train_final.csv'))

# Seeded random split: sample the training rows, the remainder is validation.
train_df = train_raw.sample(frac=train_valid_frac, random_state=RANDOM_SEED)
valid_df = train_raw.drop(train_df.index)
test_df = pd.read_csv(data_dir.joinpath('eval_final_open.csv'))

# Dropping by index label only yields a clean partition when labels are unique.
assert len(train_df) + len(valid_df) == len(train_raw), \
    'train/valid split does not partition the labelled data'

print('Dataset Configuration')
print('-'*25)
print(f'Train/Valid = {train_valid_frac:.2f}/{1-train_valid_frac:.2f}')
print(f'Batch size = {batch_size}')
print('-'*25)
print(f'Train set : {len(train_df)}')
print(f'Valid set : {len(valid_df)}')
print(f'Test set : {len(test_df)}')

train_loader = get_data_loader(train_df, tokenizer, max_len, batch_size, is_train=True, shuffle=True)
# Validation data carries labels (is_train=True) but must not be shuffled:
# shuffling adds nothing for evaluation, consumes the global RNG and changes
# the batch composition (and thus the mean-of-batch-means loss) on every pass.
valid_loader = get_data_loader(valid_df, tokenizer, max_len, batch_size, is_train=True, shuffle=False)
test_loader = get_data_loader(test_df, tokenizer, max_len, batch_size, is_train=False, shuffle=False)

Dataset Configuration
-------------------------
Train/Valid = 0.80/0.20
Batch size = 16
-------------------------
Train set : 9235
Valid set : 2309
Test set : 4311


In [11]:
class SentimentModel(nn.Module):
    """BERT encoder followed by a two-layer fully connected classifier.

    Args:
        bert: encoder module whose ``config`` reports ``hidden_size`` and whose
            output exposes ``pooler_output``.
        hidden_dim: width of the intermediate fully connected layer.
        output_dim: number of output logits (classes).
    """

    def __init__(self, bert, hidden_dim, output_dim):
        super().__init__()
        self.bert = bert
        # Width of the encoder's pooled output, read from its configuration.
        encoder_width = bert.config.to_dict()['hidden_size']
        self.fc1 = nn.Linear(encoder_width, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of token ids."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded.pooler_output
        return self.fc2(self.relu(self.fc1(pooled)))
    
def train_epoch(model, loader, loss_fn, optimizer, scheduler, dataset_size):
    """Run one optimisation pass over ``loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            and returning ``(batch, n_classes)`` logits.
        loader: iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'targets'`` tensors.
        loss_fn: loss module applied to ``(logits, targets)``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        dataset_size: number of samples used as the accuracy denominator.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim ``float64`` tensor,
        mean_loss is the average of the per-batch losses (``nan`` when the
        loader yields no batches).
    """
    use_cuda = torch.cuda.is_available()

    model = model.train()
    if use_cuda:
        model = model.cuda()
        loss_fn = loss_fn.cuda()
    device = torch.device('cuda' if use_cuda else 'cpu')

    losses = []
    # Keep the counter as a tensor from the start so the final `.double()`
    # also works when the loader is empty (a plain int has no `.double()`).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for batch in tqdm(loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(
            input_ids = input_ids,
            attention_mask = attention_mask
        )

        preds = torch.argmax(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.detach().item())

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    # np.mean([]) would emit a RuntimeWarning; return nan explicitly instead.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / dataset_size, mean_loss

def valid_epoch(model, loader, loss_fn, dataset_size):
    """Evaluate ``model`` on ``loader`` without computing gradients.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            and returning ``(batch, n_classes)`` logits.
        loader: iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'targets'`` tensors.
        loss_fn: loss callable applied to ``(logits, targets)``.
        dataset_size: number of samples used as the accuracy denominator.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim ``float64`` tensor,
        mean_loss is the average of the per-batch losses (``nan`` when the
        loader yields no batches).
    """
    use_cuda = torch.cuda.is_available()

    model = model.eval()
    if use_cuda:
        model = model.cuda()
    device = torch.device('cuda' if use_cuda else 'cpu')

    losses = []
    # Keep the counter as a tensor from the start so the final `.double()`
    # also works when the loader is empty (a plain int has no `.double()`).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask
            )

            preds = torch.argmax(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.detach().item())

    # np.mean([]) would emit a RuntimeWarning; return nan explicitly instead.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / dataset_size, mean_loss

def get_predictions(model, loader):
    """Predict a class index for every sample yielded by ``loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            and returning ``(batch, n_classes)`` logits.
        loader: iterable of dict batches with ``'input_ids'`` and
            ``'attention_mask'`` tensors (no targets required).

    Returns:
        1-D ``torch.long`` tensor on the CPU with one predicted class index
        per sample, in loader order (empty when the loader yields nothing).
    """
    use_cuda = torch.cuda.is_available()

    model = model.eval()
    # Inputs are moved to the GPU below, so the model has to live there too;
    # previously this only worked if a train/valid pass had already moved it.
    if use_cuda:
        model = model.cuda()

    batch_predictions = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']

            if use_cuda:
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()

            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask
            )

            # Softmax is monotonic, so the argmax of the raw logits is the
            # predicted class; no need to normalise first.
            batch_predictions.append(torch.argmax(outputs, dim=1).cpu())

    if not batch_predictions:
        return torch.empty(0, dtype=torch.long)
    return torch.cat(batch_predictions)


In [12]:
# Classifier head: 1024-unit hidden layer, 5 output logits.
model = SentimentModel(bert_model, 1024, 5)
epochs = 10
# Adam optimizer with weight decay
# NOTE(review): `transformers.AdamW` is reported as deprecated in newer
# transformers releases - confirm against the installed version before upgrading.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# train_epoch steps the scheduler once per batch, so the schedule spans
# (batches per epoch) * (epochs) steps.
total_steps = len(train_loader) * epochs

# linearly decrease learning rate
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):
    print(f'Epoch {epoch + 1} / {epochs}')
    # Short pause before each progress bar - presumably to keep the printed
    # lines and the tqdm output from interleaving; TODO confirm.
    time.sleep(1)
    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        scheduler,
        len(train_df)
    )

    print(f'Train : loss {train_loss} accuracy {train_acc}')
    time.sleep(1)
    valid_acc, valid_loss = valid_epoch(
        model,
        valid_loader,
        loss_fn,
        len(valid_df)
    )
    print(f'Valid : loss {valid_loss} accuracy {valid_acc}')
    print('-'*25)

predictions = get_predictions(model, test_loader)

# Hand pandas a NumPy array rather than a torch tensor: depending on the
# pandas version a tensor may be stored element-wise as 0-dim tensor objects,
# which would be written to the CSV as 'tensor(k)' instead of plain integers.
submission = pd.DataFrame({'Id' : range(len(predictions)), 'Category' : predictions.numpy()})
submission.to_csv('submission.csv', index=False)

Epoch 1 / 10


100%|██████████| 578/578 [01:01<00:00,  9.44it/s]


Train : loss 1.2333220199111423 accuracy 0.46388738494856524


100%|██████████| 145/145 [00:04<00:00, 34.93it/s]


Valid : loss 1.0739132823615238 accuracy 0.5361628410567345
-------------------------
Epoch 2 / 10


100%|██████████| 578/578 [00:58<00:00,  9.81it/s]


Train : loss 0.8194719771612887 accuracy 0.6723335138061722


100%|██████████| 145/145 [00:04<00:00, 35.06it/s]


Valid : loss 1.0001939296722413 accuracy 0.613685578172369
-------------------------
Epoch 3 / 10


100%|██████████| 578/578 [00:59<00:00,  9.69it/s]


Train : loss 0.5024804762567822 accuracy 0.8238224147265837


100%|██████████| 145/145 [00:04<00:00, 35.74it/s]


Valid : loss 1.0546015491773342 accuracy 0.6543958423559982
-------------------------
Epoch 4 / 10


100%|██████████| 578/578 [00:58<00:00,  9.91it/s]


Train : loss 0.30617062958234526 accuracy 0.9013535462912832


100%|██████████| 145/145 [00:04<00:00, 35.17it/s]


Valid : loss 1.223678456914836 accuracy 0.6691208315288003
-------------------------
Epoch 5 / 10


100%|██████████| 578/578 [00:58<00:00,  9.84it/s]


Train : loss 0.20156482838506723 accuracy 0.939794260963725


100%|██████████| 145/145 [00:04<00:00, 34.66it/s]


Valid : loss 1.4650620137951498 accuracy 0.6734517106972715
-------------------------
Epoch 6 / 10


100%|██████████| 578/578 [00:58<00:00,  9.89it/s]


Train : loss 0.15105154690377826 accuracy 0.9599350297780184


100%|██████████| 145/145 [00:04<00:00, 35.09it/s]


Valid : loss 1.677885841295637 accuracy 0.678648765699437
-------------------------
Epoch 7 / 10


100%|██████████| 578/578 [00:57<00:00,  9.97it/s]


Train : loss 0.10163411226709274 accuracy 0.9747698971304819


100%|██████████| 145/145 [00:04<00:00, 35.03it/s]


Valid : loss 1.834524476769027 accuracy 0.6873105240363794
-------------------------
Epoch 8 / 10


100%|██████████| 578/578 [00:58<00:00,  9.96it/s]


Train : loss 0.08072449907114258 accuracy 0.9806172171088251


100%|██████████| 145/145 [00:04<00:00, 35.27it/s]


Valid : loss 1.932583728535422 accuracy 0.6938068427890861
-------------------------
Epoch 9 / 10


100%|██████████| 578/578 [00:57<00:00, 10.07it/s]


Train : loss 0.0488019031869283 accuracy 0.9875473741201949


100%|██████████| 145/145 [00:04<00:00, 35.15it/s]


Valid : loss 2.007214207716029 accuracy 0.6938068427890861
-------------------------
Epoch 10 / 10


100%|██████████| 578/578 [00:57<00:00, 10.04it/s]


Train : loss 0.043822625163736334 accuracy 0.9896047644829453


100%|██████████| 145/145 [00:04<00:00, 35.24it/s]


Valid : loss 2.062837316795124 accuracy 0.6903421394543092
-------------------------
