# Simple BERT Sentiment Classification

Pretrained BERT(Cased) + Fully connected layer + Multi-Task-Learning

In [1]:
!pip3 install torch
!pip3 install torchtext
!pip3 install transformers
!pip3 install tqdm
!pip3 install pathlib

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 3.9MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 26.3MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 38.2MB/s 
Instal

In [10]:
import torch
import torch.nn as nn
import torchtext
import numpy as np
from transformers import BertTokenizer, BertModel, AdamW
import pandas as pd
from pathlib import Path
import time
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from enum import Enum
from pprint import pprint

SENTIMENT_LABEL = 'sentiment'
IMDB_LABEL = 'imdb'
SARCASM_LABEL = 'sarcasm'

CROSS_ENTROPY_LOSS_LABEL = 'crossentropyloss'
BCE_LOSS_WITH_LOGITS_LABEL = 'bcelosswithlogits'

configs = {
    'path' : {
        'colab' : {
            'root' : '/content/gdrive/My Drive',
            'data' : 'dataset',
            'checkpoint' : 'checkpoints',
        },
        'local' : {
            'root' : '../',
            'data' : '.data',
            'checkpoint' : 'checkpoints',
        },   
    },
    'task' : {
        SENTIMENT_LABEL : {
            'train_file' : 'sentence-classification/train_final.csv',
            'test_file' : 'sentence-classification/eval_final_open.csv',
            'train_valid_frac' : 0.8,
            'out_features' : 5,
            'loss_fn' : CROSS_ENTROPY_LOSS_LABEL,
        },
        IMDB_LABEL : {
            'train_file' : 'imdb/train_IM.csv',
            'train_valid_frac' : 0.8,
            'out_features' : 2,
            'loss_fn' : CROSS_ENTROPY_LOSS_LABEL,
        },
        SARCASM_LABEL : {
            'train_file' : 'reddit-sarcasm/train_sarcasm.csv',
            'train_valid_frac' : 0.8,
            'out_features' : 2,
            'loss_fn' : CROSS_ENTROPY_LOSS_LABEL,
        },
    }
}

# Enum for Task Label
class Task(Enum):
    SENTIMENT = 1
    IMDB = 2
    SARCASM = 3

start_time = datetime.now()

if 'google.colab' in str(get_ipython()):
    print('Running on CoLab')
    from google.colab import drive

    drive.mount('/content/gdrive')

    root_dir = Path(configs['path']['colab']['root'])
    data_dir = Path(root_dir, configs['path']['colab']['data'])
    checkpoint_dir = Path(root_dir, configs['path']['colab']['checkpoint'])

else:
    print('Not running on CoLab')
    
    root_dir = Path(configs['path']['local']['root'])
    data_dir = Path(root_dir, configs['path']['local']['data'])
    checkpoint_dir = Path(root_dir, configs['path']['local']['checkpoint'])

Path(checkpoint_dir).mkdir(parents=False, exist_ok=True)

PRETRAINED_MODEL = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
bert_model = BertModel.from_pretrained(PRETRAINED_MODEL)

torch.cuda.empty_cache()

Running on CoLab
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Load Dataset

In [11]:
from torch.utils.data import DataLoader, Dataset

class ClassificationDataset(Dataset):
    def __init__(self, df, task, tokenizer, max_len, is_train):
        self.task = task
        self.sentences = df['Sentence'].to_numpy()
        self.is_train = is_train
        if self.is_train:
            self.targets = df['Category'].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        if self.is_train:
            target = self.targets[idx]
               
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens = True, # Add CLS, SEP
            max_length = self.max_len,
            return_token_type_ids = False,
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt',
        )
        if self.is_train:
            return {
                'task' : self.task,
                'text' : sentence,
                'input_ids' : encoding['input_ids'].flatten(),
                'attention_mask' : encoding['attention_mask'].flatten(),
                'targets' : torch.tensor(target, dtype=torch.long)
            }
        else:
            return {
                'task' :self.task,
                'text' : sentence,
                'input_ids' : encoding['input_ids'].flatten(),
                'attention_mask' : encoding['attention_mask'].flatten(),
            }

def load_csv_data(configs, seed):
    train_data = {}
    valid_data = {}
    test_data = {}

    for task in configs['task']:
        raw_data = pd.read_csv(data_dir.joinpath(configs['task'][task]['train_file']))
        if task == 'imdb':
            raw_data.rename(columns = {'Phrase': 'Sentence', 'Sentiment': 'Category'}, inplace=True)
        train_data[task] = raw_data.sample(frac=configs['task'][task]['train_valid_frac'], random_state=seed)
        valid_data[task] = raw_data.drop(train_data[task].index)
        if 'test_file' in configs['task'][task]:
            test_data[task] = pd.read_csv(data_dir.joinpath(configs['task'][task]['test_file']))
            if task == 'imdb':
                test_data[task].rename(columns = {'Phrase': 'Sentence', 'Sentiment': 'Category'}, inplace=True)
    
    return train_data, valid_data, test_data

def print_dataset_configs(configs, train, valid, test):
    for task in train:
        print(f'{task} dataset')
        print(f'='*25)
        print(f'Train/Valid : {configs["task"][task]["train_valid_frac"]:.2f}/{1-configs["task"][task]["train_valid_frac"]:.2f}')
        print(f'='*25)
        print(f'Train dataset length : {len(train[task])}')
        if task in valid:
            print(f'Valid dataset length : {len(valid[task])}')
        if task in test:
            print(f'Test dataset length : {len(test[task])}')
        print(f'='*25)
        print('')

def get_data_loader(phase, task_df, tokenizer, max_len, batch_size, is_train, shuffle):
    total_dataset = []
    
    for task in task_df:
        dataset = ClassificationDataset(
            task_df[task],
            convert_label_to_enum(task),
            tokenizer = tokenizer,
            max_len = max_len,
            is_train=is_train,
        )
        time.sleep(1)
        
        loader = DataLoader(
            dataset,
            batch_size = batch_size
        )
        
        print(f'Combine {phase} - {task} dataset')
        time.sleep(1)
        for batch in tqdm(loader):
            total_dataset.append(batch)
    
    return DataLoader(
        total_dataset,
        shuffle = shuffle,
        batch_size = 1,
    )

def convert_enum_to_label(enum):
    if enum == Task.SENTIMENT.value:
        return SENTIMENT_LABEL
    elif enum == Task.IMDB.value:
        return IMDB_LABEL
    elif enum == Task.SARCASM.value:
        return SARCASM_LABEL

def convert_label_to_enum(label):
    if label == SENTIMENT_LABEL:
        return Task.SENTIMENT.value
    elif label == IMDB_LABEL:
        return Task.IMDB.value
    elif label == SARCASM_LABEL:
        return Task.SARCASM.value
    
def convert_name_to_func(name):
    if name == CROSS_ENTROPY_LOSS_LABEL:
        return nn.CrossEntropyLoss()
    elif name == BCE_LOSS_WITH_LOGITS_LABEL:
        return nn.BCEWithLogitsLoss()

In [12]:
RANDOM_SEED = 884532
# For same result
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

max_len = 100
batch_size = 8

train_data, valid_data, test_data = load_csv_data(configs, RANDOM_SEED)


print(f'Batch size = {batch_size}')
print(f'-'*50)
print(f'Task Configuration')
print(f'-'*50)
print_dataset_configs(configs, train_data, valid_data, test_data)
print(f'-'*50)

train_loader = get_data_loader('Train', train_data, tokenizer, max_len, batch_size, True, True)
valid_loader = get_data_loader('Valid', valid_data, tokenizer, max_len, batch_size, True, True)
test_loader = get_data_loader('Test', test_data, tokenizer, max_len, batch_size, False, False)

Batch size = 8
--------------------------------------------------
Task Configuration
--------------------------------------------------
sentiment dataset
Train/Valid : 0.80/0.20
Train dataset length : 9235
Valid dataset length : 2309
Test dataset length : 4311

imdb dataset
Train/Valid : 0.80/0.20
Train dataset length : 40000
Valid dataset length : 10000

sarcasm dataset
Train/Valid : 0.80/0.20
Train dataset length : 808618
Valid dataset length : 202154

--------------------------------------------------
Combine Train - sentiment dataset


HBox(children=(FloatProgress(value=0.0, max=1155.0), HTML(value='')))


Combine Train - imdb dataset


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


Combine Train - sarcasm dataset


HBox(children=(FloatProgress(value=0.0, max=101078.0), HTML(value='')))


Combine Valid - sentiment dataset


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Combine Valid - imdb dataset


HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


Combine Valid - sarcasm dataset


HBox(children=(FloatProgress(value=0.0, max=25270.0), HTML(value='')))


Combine Test - sentiment dataset


HBox(children=(FloatProgress(value=0.0, max=539.0), HTML(value='')))




In [13]:
class SentimentModel(nn.Module):
    def __init__(self, bert, configs, dropout_p):
        super(SentimentModel, self).__init__()
        self.bert = bert
        self.dropout_p = dropout_p
        hidden_size = bert.config.to_dict()['hidden_size']
        self.dropout = nn.Dropout(p=self.dropout_p)
        
        if SENTIMENT_LABEL in configs['task']:
            self.fc_sent = nn.Linear(
                hidden_size,
                configs['task'][SENTIMENT_LABEL]['out_features'],
            )
        
        if IMDB_LABEL in configs['task']:
            self.fc_im = nn.Linear(
                hidden_size,
                configs['task'][IMDB_LABEL]['out_features'],
            )

        if SARCASM_LABEL in configs['task']:
            self.fc_sarc = nn.Linear(
                hidden_size,
                configs['task'][SARCASM_LABEL]['out_features'],
            )
        
    def forward(self, input_ids, attention_mask, target_task):
        result = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask
        )
        out = self.dropout(result.pooler_output)

        if target_task == SENTIMENT_LABEL:
            out = self.fc_sent(out)
        elif target_task == IMDB_LABEL:
            out = self.fc_im(out)
        elif target_task == SARCASM_LABEL:
            out = self.fc_sarc(out)
        
        if not self.train:
            out = out * (1-self.dropout_p)

        return out

def train_epoch(model, loader, loss_fn, optimizer, scheduler, dataset_size):    
# def train_epoch(model, loader, loss_fn, optimizer, dataset_size):
    losses = {}
    means = {}
    correct_predictions = {}
    
    for task in loss_fn:
        losses[task] = []
        correct_predictions[task] = 0.0
    
    model = model.train()
    
    for batch in tqdm(loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'][0]
        attention_mask = batch['attention_mask'][0]
        targets = batch['targets'][0]
        task = convert_enum_to_label(batch['task'][0][0])
        if torch.cuda.is_available():
            input_ids = input_ids.cuda()
            attention_mask = attention_mask.cuda()
            targets = targets.cuda()
        
        outputs = model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            target_task = task,
        )
        
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn[task](outputs, targets)
        
        correct_predictions[task] += torch.sum(preds == targets)
        losses[task].append(loss.detach().item())
        
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
    
    for task in correct_predictions:
        correct_predictions[task] = correct_predictions[task].double() / dataset_size[task]
        means[task] = np.sum(losses[task]) / dataset_size[task]
        
    return correct_predictions, losses, means

def valid_epoch(model, loader, loss_fn, dataset_size):
    losses = {}
    means = {}
    correct_predictions = {}

    for task in loss_fn:
        losses[task] = []
        correct_predictions[task] = 0.0
    
    model = model.eval()
    
    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'][0]
            attention_mask = batch['attention_mask'][0]
            targets = batch['targets'][0]
            task = convert_enum_to_label(batch['task'][0][0])
            
            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()
                targets = targets.cuda()
                
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                target_task = task
            )
            
            _, preds = torch.max(outputs, dim=1)
            
            loss = loss_fn[task](outputs, targets)
            
            correct_predictions[task] += torch.sum(preds == targets)
            losses[task].append(loss.detach().item())

        for task in correct_predictions:
            correct_predictions[task] = correct_predictions[task].double() / dataset_size[task]
            means[task] = np.sum(losses[task]) / dataset_size[task] 

    return correct_predictions, losses, means

def get_predictions(model, loader, task):
    model = model.eval()
    
    predictions = []
    predictions_probs = []
    
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'][0]
            attention_mask = batch['attention_mask'][0]
            
            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()
                
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                target_task = task
            )         
            predictions.extend(torch.argmax(outputs, dim=1))
            
    return torch.stack(predictions).cpu()


def print_model_results(phase, epoch, accuracy, losses):
    for task in accuracy:
        print(f'{phase} : {task} accruacy/loss : {accuracy[task]:.5f}/{losses[task]}')

In [14]:
model = SentimentModel(bert_model, configs, 0.1)

epochs = 10
total_steps = len(train_loader) * epochs
learning_rate = 2e-5

loss_fn = {}
train_size = {}
valid_size = {}

for task in configs['task']:
    loss_fn[task] = convert_name_to_func(configs['task'][task]['loss_fn'])
    train_size[task] = len(train_data[task])
    valid_size[task] = len(valid_data[task])

if torch.cuda.is_available():
    model = model.cuda()
    for task in loss_fn:
        # model fully connected layer cuda
        loss_fn[task] = loss_fn[task].cuda()
    
# Adam optimizer with weight decay
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Cosine annealing warm restarts
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0 = int(epochs / 3),
    T_mult = 1,
    eta_min = 2e-8
)

results = {
    'train_loss' : [],
    'train_acc' : [],
    'valid_loss' : [],
    'valid_acc' : []
}

best_valid_acc = 0

for epoch in range(epochs):
    print(f'Epoch {epoch + 1} / {epochs}')
    time.sleep(1)
    train_acc, train_loss, train_loss_mean = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        scheduler,
        train_size
    )
    print_model_results('Train', epoch, train_acc, train_loss_mean)
    results['train_loss'].append(train_loss)
    results['train_acc'].append(train_acc)
    time.sleep(1)
    valid_acc, valid_loss, valid_loss_mean = valid_epoch(
        model,
        valid_loader,
        loss_fn,
        valid_size
    ) 
    print_model_results('Valid', epoch, valid_acc, valid_loss_mean)
    results['valid_loss'].append(valid_loss)
    results['valid_acc'].append(valid_acc)
    
    if best_valid_acc < valid_acc[SENTIMENT_LABEL]:
        best_valid_acc = valid_acc[SENTIMENT_LABEL]
        torch.save(model.state_dict(), Path(checkpoint_dir, f'Model_Valid_{start_time.strftime("%Y_%m_%d_%H_%M_%S")}.pt'))
        print(f'Best valid acc : {best_valid_acc * 100:.5f}%')
        
    print(f'-'*25)


torch.save(model.state_dict(), Path(checkpoint_dir, f'Model_Train_{start_time.strftime("%Y_%m_%d_%H_%M_%S")}.pt'))
model.load_state_dict(torch.load(Path(checkpoint_dir, f'Model_Valid_{start_time.strftime("%Y_%m_%d_%H_%M_%S")}.pt')))
predictions = get_predictions(model, test_loader, SENTIMENT_LABEL)

submission = pd.DataFrame({'Id' : range(len(predictions)), 'Category' : predictions})
submission.to_csv('submission.csv', index=False)

Epoch 1 / 10


HBox(children=(FloatProgress(value=0.0, max=107233.0), HTML(value='')))


Train : sentiment accruacy/loss : 0.45447/0.15379848925565215
Train : imdb accruacy/loss : 0.81990/0.05104908660305664
Train : sarcasm accruacy/loss : 0.73674/0.06575120051765802


HBox(children=(FloatProgress(value=0.0, max=26809.0), HTML(value='')))

KeyboardInterrupt: ignored

In [None]:
# sarcasm_raw = pd.read_csv(data_dir['sarcasm'].joinpath('train-balanced-sarcasm.csv'))
# sarcasm_df = pd.concat([sarcasm_df['parent_comment'], sarcasm_df['comment'], sarcasm_df['label']], axis=1)