# Simple BERT Sentiment Classification

Pretrained BERT-large (cased) encoder + fully connected classification layer

-- Colab Version

In [3]:
!pip3 install torch
!pip3 install torchtext
!pip3 install transformers
!pip3 install tqdm
!pip3 install pathlib

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b0/9e/5b80becd952d5f7250eaf8fc64b957077b12ccfe73e9c03d37146ab29712/transformers-4.6.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 4.2MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 31.1MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 35.6MB/s 
Installing collect

In [4]:

import torch
import torch.nn as nn
import torchtext
import numpy as np
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pandas as pd
from pathlib import Path
import time
import matplotlib.pyplot as plt
from google.colab import drive

from tqdm import tqdm

# Mount Google Drive; the dataset directory below lives inside the mount point.
drive.mount('/content/gdrive')

root_dir = Path('/content/gdrive/My Drive')
data_dir = root_dir / 'dataset' / 'sentence-classification'

# Checkpoint name shared by the tokenizer and the encoder.
PRETRAINED_MODEL = 'bert-large-cased'

# Tokenizer first, then the encoder weights (same order as the downloads).
tokenizer, bert_model = (
    BertTokenizer.from_pretrained(PRETRAINED_MODEL),
    BertModel.from_pretrained(PRETRAINED_MODEL),
)

# Release cached CUDA memory held by the allocator before training starts.
torch.cuda.empty_cache()

Mounted at /content/gdrive


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=762.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1338740706.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Load Dataset

In [5]:
from torch.utils.data import DataLoader, Dataset

class KaggleDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Sentences are read from the ``'Sentence'`` column of ``df``. When
    ``is_train`` is truthy the labels are read from the ``'Category'`` column
    and returned alongside the encoded sentence.

    Args:
        df: DataFrame holding a ``'Sentence'`` column (and ``'Category'`` when
            ``is_train`` is truthy).
        tokenizer: Object exposing ``encode_plus``; called once per item.
        max_len: Value passed to the tokenizer as ``max_length``; sentences
            are padded / truncated to it.
        is_train: Whether labels are available and should be returned.
    """

    def __init__(self, df, tokenizer, max_len, is_train):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_train = is_train
        self.sentences = df['Sentence'].to_numpy()
        if self.is_train:
            self.targets = df['Category'].to_numpy()

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Encode the sentence at ``idx``.

        Returns:
            dict with ``'text'``, ``'input_ids'`` and ``'attention_mask'``
            (both flattened to 1-D), plus ``'targets'`` (a ``torch.long``
            tensor) when the dataset was built with ``is_train`` truthy.
        """
        text = self.sentences[idx]

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # Add CLS, SEP
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns a leading batch dimension; drop it so the
        # DataLoader can stack items itself.
        sample = {
            'text': text,
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
        }
        if self.is_train:
            sample['targets'] = torch.tensor(self.targets[idx], dtype=torch.long)
        return sample
    
def get_data_loader(df, tokenizer, max_len, batch_size, is_train, shuffle):
    """Wrap ``df`` in a ``KaggleDataset`` and return a ``DataLoader`` over it.

    Args:
        df: DataFrame handed to ``KaggleDataset``.
        tokenizer: Tokenizer handed to ``KaggleDataset``.
        max_len: Maximum sequence length handed to ``KaggleDataset``.
        batch_size: Number of items per batch.
        is_train: Forwarded to ``KaggleDataset`` (whether labels are present).
        shuffle: Whether the loader reshuffles the data.

    Returns:
        A ``torch.utils.data.DataLoader`` over the constructed dataset.
    """
    kaggle_ds = KaggleDataset(df, tokenizer=tokenizer, max_len=max_len, is_train=is_train)
    loader_options = {'shuffle': shuffle, 'batch_size': batch_size}
    return DataLoader(kaggle_ds, **loader_options)

In [6]:
RANDOM_SEED = 884532
# The train/valid split below is already seeded through `random_state`.
# Uncomment the two lines below to also seed the torch / numpy global RNGs.
# torch.manual_seed(RANDOM_SEED)
# np.random.seed(RANDOM_SEED)

max_len = 100            # passed to the tokenizer as max_length (pad / truncate)
train_valid_frac = 0.8   # fraction of train_final.csv used for training
batch_size = 32

train_raw = pd.read_csv(data_dir.joinpath('train_final.csv'))

# Sample the training rows, then use every remaining row for validation.
train_df = train_raw.sample(frac=train_valid_frac, random_state=RANDOM_SEED)
valid_df = train_raw.drop(train_df.index)
test_df = pd.read_csv(data_dir.joinpath('eval_final_open.csv'))

print('Dataset Configuration')
print('-' * 25)
print(f'Train/Valid = {train_valid_frac:.2f}/{1-train_valid_frac:.2f}')
print(f'Batch size = {batch_size}')
print('-' * 25)
print(f'Train set : {len(train_df)}')
print(f'Valid set : {len(valid_df)}')
print(f'Test set : {len(test_df)}')

train_loader = get_data_loader(train_df, tokenizer, max_len, batch_size, True, True)
# Validation is not shuffled: the epoch loss is a mean of per-batch means, so
# with a smaller final batch a shuffled order makes the reported validation
# loss vary between runs for identical weights.
valid_loader = get_data_loader(valid_df, tokenizer, max_len, batch_size, True, False)
# The test loader must keep file order so predictions line up with row ids.
test_loader = get_data_loader(test_df, tokenizer, max_len, batch_size, False, False)

Dataset Configuration
-------------------------
Train/Valid = 0.80/0.20
Batch size = 32
-------------------------
Train set : 9235
Valid set : 2309
Test set : 4311


In [7]:
class SentimentModel(nn.Module):
    """BERT encoder followed by dropout and a single linear classifier.

    Args:
        bert: Encoder module. It must expose ``config.to_dict()['hidden_size']``
            and, when called with ``input_ids`` / ``attention_mask``, return an
            object with a ``pooler_output`` attribute.
        output_dim: Number of output classes (size of the logits vector).
        dropout_p: Dropout probability applied to the pooled output.
    """

    def __init__(self, bert, output_dim, dropout_p):
        super().__init__()
        self.bert = bert
        self.dropout_p = dropout_p
        hidden_size = bert.config.to_dict()['hidden_size']
        self.dropout = nn.Dropout(p=self.dropout_p)
        self.fc = nn.Linear(hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Tensor of logits produced by the linear layer.
        """
        result = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # nn.Dropout rescales kept activations by 1 / (1 - p) while training
        # and is the identity in eval mode, so no manual eval-time scaling is
        # needed. The previous `if not self.train:` rescale tested the bound
        # method `Module.train` (always truthy) and therefore never ran.
        out = self.dropout(result.pooler_output)
        return self.fc(out)
    
# def train_epoch(model, loader, loss_fn, optimizer, scheduler, dataset_size):
def train_epoch(model, loader, loss_fn, optimizer, dataset_size):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'targets'`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: Optimizer whose ``zero_grad`` / ``step`` are called per batch.
        dataset_size: Number of samples used as the accuracy denominator.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor
        (correct predictions / ``dataset_size``); mean_loss is the mean of the
        per-batch losses, or NaN when ``loader`` yields no batches.
    """
    model = model.train()

    # Move batches to wherever the model lives instead of assuming "CUDA is
    # available" implies "the model is on CUDA".
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    losses = []
    correct_predictions = 0

    for batch in tqdm(loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # NOTE: no learning-rate scheduler is stepped here.

    # `correct_predictions` is still the int 0 when the loader was empty;
    # as_tensor makes `.double()` valid in that case as well.
    accuracy = torch.as_tensor(correct_predictions).double() / dataset_size
    mean_loss = np.mean(losses) if losses else float('nan')
    return accuracy, mean_loss

def valid_epoch(model, loader, loss_fn, dataset_size):
    """Evaluate ``model`` over ``loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'targets'`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        dataset_size: Number of samples used as the accuracy denominator.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor
        (correct predictions / ``dataset_size``); mean_loss is the mean of the
        per-batch losses, or NaN when ``loader`` yields no batches.
    """
    model = model.eval()

    # Move batches to wherever the model lives instead of assuming "CUDA is
    # available" implies "the model is on CUDA".
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )

            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    # `correct_predictions` is still the int 0 when the loader was empty;
    # as_tensor makes `.double()` valid in that case as well.
    accuracy = torch.as_tensor(correct_predictions).double() / dataset_size
    mean_loss = np.mean(losses) if losses else float('nan')
    return accuracy, mean_loss

def get_predictions(model, loader):
    """Predict a class index for every item yielded by ``loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.

    Returns:
        1-D CPU tensor of predicted class indices (argmax over dim 1), in
        loader order. An empty ``torch.long`` tensor is returned when the
        loader yields no batches.
    """
    model = model.eval()

    # Move batches to wherever the model lives instead of assuming "CUDA is
    # available" implies "the model is on CUDA".
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    batch_predictions = []

    with torch.no_grad():
        for batch in loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # Keep one tensor per batch rather than one 0-dim tensor per item.
            batch_predictions.append(torch.argmax(outputs, dim=1))

    if not batch_predictions:
        # torch.cat / torch.stack raise on an empty list.
        return torch.empty(0, dtype=torch.long)
    return torch.cat(batch_predictions).cpu()


In [8]:
# 5 output classes, dropout probability 0.1 on the pooled BERT output.
model = SentimentModel(bert_model, 5, 0.1)

epochs = 5
total_steps = len(train_loader) * epochs  # only needed by the disabled scheduler below
learning_rate = 2e-5


loss_fn = nn.CrossEntropyLoss()

if torch.cuda.is_available():
    model = model.cuda()
    loss_fn = loss_fn.cuda()

# Adam optimizer with weight decay
# optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# linearly decrease learning rate (currently disabled)
# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=epochs,
#     num_training_steps=total_steps
# )

# Per-epoch metric history.
results = {
    'train_loss' : [],
    'train_acc' : [],
    'valid_loss' : [],
    'valid_acc' : []
}

best_valid_acc = 0

for epoch in range(epochs):
    print(f'Epoch {epoch + 1} / {epochs}')
    time.sleep(1)
    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        # scheduler,
        len(train_df)
    )
    print(f'Train : loss {train_loss:.5f} / accuracy {train_acc * 100:.5f}%')
    results['train_loss'].append(train_loss)
    results['train_acc'].append(train_acc)
    time.sleep(1)
    valid_acc, valid_loss = valid_epoch(
        model,
        valid_loader,
        loss_fn,
        len(valid_df)
    )
    print(f'Valid : loss {valid_loss:.5f} / accuracy {valid_acc * 100:.5f}%')
    results['valid_loss'].append(valid_loss)
    results['valid_acc'].append(valid_acc)

    # Checkpoint whenever validation accuracy improves.
    if best_valid_acc < valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), 'model.pt')
        print(f'Best valid acc : {best_valid_acc * 100:.5f}%')

    print('-' * 25)


# Keep the last-epoch weights, then predict with the best-validation checkpoint.
torch.save(model.state_dict(), 'model_final.pt')
model.load_state_dict(torch.load('model.pt'))
predictions = get_predictions(model, test_loader)

# Hand pandas a NumPy array, not a torch.Tensor: a tensor column can end up as
# an object column of 0-dim tensors and be written to the CSV as "tensor(3)".
submission = pd.DataFrame({'Id' : range(len(predictions)), 'Category' : predictions.numpy()})
submission.to_csv('submission.csv', index=False)

Epoch 1 / 5


100%|██████████| 289/289 [04:50<00:00,  1.01s/it]


Train : loss 1.17052 / accuracy 48.57607%


100%|██████████| 73/73 [00:24<00:00,  2.99it/s]


Valid : loss 1.02347 / accuracy 54.95886%
Best valid acc : 54.95886%
-------------------------
Epoch 2 / 5


100%|██████████| 289/289 [04:50<00:00,  1.01s/it]


Train : loss 0.79835 / accuracy 67.57986%


100%|██████████| 73/73 [00:24<00:00,  2.99it/s]


Valid : loss 0.94608 / accuracy 61.93157%
Best valid acc : 61.93157%
-------------------------
Epoch 3 / 5


100%|██████████| 289/289 [04:50<00:00,  1.01s/it]


Train : loss 0.49369 / accuracy 82.04656%


100%|██████████| 73/73 [00:24<00:00,  3.00it/s]


Valid : loss 1.03233 / accuracy 62.19142%
Best valid acc : 62.19142%
-------------------------
Epoch 4 / 5


100%|██████████| 289/289 [04:50<00:00,  1.01s/it]


Train : loss 0.32588 / accuracy 88.89009%


100%|██████████| 73/73 [00:24<00:00,  3.01it/s]


Valid : loss 1.12889 / accuracy 64.91988%
Best valid acc : 64.91988%
-------------------------
Epoch 5 / 5


100%|██████████| 289/289 [04:50<00:00,  1.00s/it]


Train : loss 0.23458 / accuracy 92.25772%


100%|██████████| 73/73 [00:24<00:00,  3.01it/s]


Valid : loss 1.22900 / accuracy 69.12083%
Best valid acc : 69.12083%
-------------------------
