# BERT Sentiment Classification
## Applied Methodology
- Multi-task learning
- Active dropout (dropout stays enabled at inference time for Monte Carlo prediction)

## Available choices
- Pretrained model : RoBERTa, BERT
- Tasks : Kaggle, IMDB, Reddit Sarcasm, Amazon Movie&TV Review

In [None]:
# Install the third-party packages used by this notebook (shell commands run by the notebook)
!pip3 install torch
!pip3 install torchtext
!pip3 install transformers
!pip3 install tqdm
# NOTE(review): pathlib is part of the Python 3 standard library, so this install looks unnecessary - confirm before removing
!pip3 install pathlib

In [None]:
import re
import copy
import torch
import torch.nn as nn
import torchtext
import numpy as np
import pandas as pd
from pathlib import Path
import time
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from enum import Enum
from pprint import pprint
from torch.utils.data import DataLoader, Dataset

RANDOM_SEED = 42
# Seed torch and NumPy with the same value so repeated runs start from the same random state
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Label for Task
# Used as the keys of configs['dataset'] / configs['model'] and as the `target_task` argument of the models
SENTIMENT_LABEL = 'sentiment'
IMDB_LABEL = 'imdb'
SARCASM_LABEL = 'sarcasm'
AMAZON_LABEL = 'amazon'
# Pseudo task: asks SentimentModel for the concatenated hidden features of every head
ALL_LABEL ='all'

# Label for loss function (turned into an nn loss module by convert_name_to_func)
CROSS_ENTROPY_LOSS_LABEL = 'crossentropyloss'

# Configurations for running code
configs = {
    # Directory layout, one entry per runtime environment ('data' and 'checkpoint' are joined onto 'root')
    'path' : {
        'colab' : {
            'root' : '/content/gdrive/My Drive',
            'data' : 'dataset',
            'checkpoint' : 'checkpoints',
        },
        'local' : {
            'root' : '../',
            'data' : '.data',
            'checkpoint' : 'checkpoints',
        },
    },
    # Name handed to from_pretrained(); its prefix selects the RoBERTa or the BERT classes
    'bert_model' : 'roberta-base',
    # Output widths of the hidden fully connected layers placed in front of each task's output layer
    'hidden_layers' : [1024],
    # True: build heads with ActiveDropout and predict with mc_prediction; False: nn.Dropout and get_predictions
    'active_dropout' : True,
    # Number of prediction passes averaged by mc_prediction
    'mc_num' : 50,
    'dropout_p' : 0.5,
    # Sentences are padded / truncated to this many tokens
    'max_len' : 128,
    'batch_size' : 64,
    'epoch' : 5,
    'learning_rate' : 2e-5,
    # Checkpoint file (inside the checkpoint directory) loaded before training when it exists
    'saved_checkpoint' : 'Model_Valid_Multi_ALL_20.pt',
    # Wrap the loaded model in ConcatModel (only applied when the checkpoint above was found)
    'use_concat_model' : True,
    # Only the tasks listed here are loaded and trained; the commented entries are alternatives to switch on.
    # 'train_valid_frac' is the fraction of 'train_file' sampled for training, the remainder is used for validation.
    'dataset' : {
        # SENTIMENT_LABEL : {
        #     'train_file' : 'sentence-classification/train_plus.csv',
        #     'test_file' : 'sentence-classification/eval_final_open.csv',
        #     'train_valid_frac' : 0.9,
        # },
        SENTIMENT_LABEL : {
            'train_file' : 'sentence-classification/train_final_p.csv',
            'test_file' : 'sentence-classification/eval_final_open.csv',
            'train_valid_frac' : 0.9,
        },
        # IMDB_LABEL : {
        #     'train_file' : 'imdb/train_IM.csv',
        #     'train_valid_frac' : 0.9,
        # },
        # SARCASM_LABEL : {
        #     'train_file' : 'reddit-sarcasm/train_sarcasm.csv',
        #     'train_valid_frac' : 0.9,
        # },
        # AMAZON_LABEL : {
        #     'train_file' : 'amazon-review/movie_tv_200000.csv',
        #     'train_valid_frac' : 0.9,
        # },
    },
    # Per-task head definition: number of output classes and the loss label used for that task
    'model': {
        SENTIMENT_LABEL : {
            'out_features' : 5,
            'loss_fn' : CROSS_ENTROPY_LOSS_LABEL,
        },
        IMDB_LABEL : {
            'out_features' : 2,
            'loss_fn' : CROSS_ENTROPY_LOSS_LABEL,
        },
        SARCASM_LABEL : {
            'out_features' : 2,
            'loss_fn' : CROSS_ENTROPY_LOSS_LABEL,
        },
        AMAZON_LABEL : {
            'out_features' : 5,
            'loss_fn' : CROSS_ENTROPY_LOSS_LABEL,
        },
    }
}

# Value for each Task
class Task(Enum):
    # Integer id of each task. The id is stored in every dataset item under 'task' and is
    # translated to / from the string task labels by convert_enum_to_label / convert_label_to_enum.
    SENTIMENT = 1
    IMDB = 2
    SARCASM = 3
    AMAZON = 4

# Timestamp of this run; it becomes part of the checkpoint filenames written during training
start_time = datetime.now()

# Choose the path settings that match the runtime (Google Colab or anything else)
if 'google.colab' in str(get_ipython()):
    print('Running on CoLab')
    from google.colab import drive

    # Colab keeps data and checkpoints on the mounted Google Drive
    drive.mount('/content/gdrive')
    _path_cfg = configs['path']['colab']
else:
    print('Not running on CoLab')
    _path_cfg = configs['path']['local']

root_dir = Path(_path_cfg['root'])
data_dir = Path(root_dir, _path_cfg['data'])
checkpoint_dir = Path(root_dir, _path_cfg['checkpoint'])

# Create the checkpoint directory if it is missing (its parent must already exist)
Path(checkpoint_dir).mkdir(parents=False, exist_ok=True)
torch.cuda.empty_cache()

In [None]:
# Load the pretrained encoder together with its matching tokenizer
PRETRAINED_MODEL = configs['bert_model']

# The prefix of the model name decides which transformers classes are used.
# AdamW is deliberately not imported from transformers: the optimizer in this notebook is
# torch.optim.AdamW, and transformers deprecated its own AdamW in favour of the torch one,
# so the unused import could only ever break this cell.
if PRETRAINED_MODEL.startswith('robert'):
    from transformers import RobertaModel, RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_MODEL)
    bert_model = RobertaModel.from_pretrained(PRETRAINED_MODEL)

elif PRETRAINED_MODEL.startswith('bert'):
    from transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
    bert_model = BertModel.from_pretrained(PRETRAINED_MODEL)

else:
    # Fail here instead of with a NameError on `tokenizer` / `bert_model` in a later cell
    raise ValueError(
        f"Unsupported configs['bert_model'] {PRETRAINED_MODEL!r}: expected a name starting with 'robert' or 'bert'"
    )

## Preprocessing Dataset

In [None]:
class ClassificationDataset(Dataset):
    '''
    Dataset that tokenizes one sentence per item for a single task.
    The dataframe needs a "Sentence" column and, when is_train is True, a "Sentiment" column with the labels.
    :params
      df: dataframe loaded from csv
      task: value for each task - reference class Task(Enum)
      tokenizer: object providing encode_plus (the tokenizer of the pretrained model)
      max_len: every sentence is padded / truncated to this many tokens
      is_train: True when labels are available; items then also carry 'targets'
    '''
    def __init__(self, df, task, tokenizer, max_len, is_train):
        self.task = task
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_train = is_train
        self.sentences = df['Sentence'].to_numpy()
        # Labels are only read when they are expected to exist
        if is_train:
            self.targets = df['Sentiment'].to_numpy()

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]

        encoded = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,  # wrap the sentence in the model's special tokens
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # flatten() drops the leading batch dimension added by return_tensors='pt'
        item = {
            'task' : self.task,
            'text' : sentence,
            'input_ids' : encoded['input_ids'].flatten(),
            'attention_mask' : encoded['attention_mask'].flatten(),
        }
        if self.is_train:
            item['targets'] = torch.tensor(self.targets[idx], dtype=torch.long)
        return item

def load_csv_data(configs, seed):
    '''
    Read the csv files of every task in configs['dataset'] and split each train file into train / valid parts
    :params
      configs: configuration dictionary; each task entry gives 'train_file', 'train_valid_frac' and optionally 'test_file'
      seed: random_state used when sampling the training rows
    :return
      dict, dict, dict: train, valid, test dataframes. Task label is key (e.g. SENTIMENT_LABEL).
      A task appears in the test dict only when it has a 'test_file'.
    '''
    train_data, valid_data, test_data = {}, {}, {}

    for task, task_cfg in configs['dataset'].items():
        full_df = pd.read_csv(data_dir.joinpath(task_cfg['train_file']))

        # Sample the training rows; whatever was not sampled becomes the validation set
        train_part = full_df.sample(frac=task_cfg['train_valid_frac'], random_state=seed)
        train_data[task] = train_part
        valid_data[task] = full_df.drop(train_part.index)

        if 'test_file' in task_cfg:
            test_data[task] = pd.read_csv(data_dir.joinpath(task_cfg['test_file']))

    return train_data, valid_data, test_data

def print_dataset_configs(configs, train, valid, test):
    '''
    Print, for every task in train, the train/valid ratio and the size of each available split
    :params
      configs: configuration dictionary holding 'train_valid_frac' per task
      train, valid, test: dictionaries of datasets keyed by task label
    '''
    divider = '=' * 25

    for task in train:
        train_frac = configs["dataset"][task]["train_valid_frac"]
        report = [
            f'{task} dataset',
            divider,
            f'Train/Valid : {train_frac:.2f}/{1-train_frac:.2f}',
            divider,
            f'Train dataset length : {len(train[task])}',
        ]
        # Valid / test sizes are reported only for tasks that have such a split
        if task in valid:
            report.append(f'Valid dataset length : {len(valid[task])}')
        if task in test:
            report.append(f'Test dataset length : {len(test[task])}')
        report.extend([divider, ''])

        for line in report:
            print(line)

def get_data_loader(phase, task_df, tokenizer, max_len, batch_size, is_train, shuffle):
    '''
    Build one dataloader over the pre-built batches of every task, so that a batch never mixes tasks.
    Each task is tokenized and batched under the same conditions (e.g. batch_size).
    :params
      phase: name printed in the progress message (e.g. 'Train')
      task_df: dataset for each task, dictionary of dataframes keyed by task label
      tokenizer, max_len, is_train: forwarded to ClassificationDataset
      batch_size: number of sentences per pre-built batch
      shuffle: whether the returned loader shuffles the order of the pre-built batches
    :return
      DataLoader with batch_size 1 whose elements are whole batches, i.e. every tensor
      carries an extra leading dimension of size 1
    '''
    prebuilt_batches = []

    for task, frame in task_df.items():
        task_dataset = ClassificationDataset(
            frame,
            convert_label_to_enum(task),
            tokenizer=tokenizer,
            max_len=max_len,
            is_train=is_train,
        )
        time.sleep(1)

        task_loader = DataLoader(task_dataset, batch_size=batch_size)

        print(f'Combine {phase} - {task} dataset')
        time.sleep(1)
        # Iterating the loader tokenizes the whole task once, up front
        prebuilt_batches.extend(tqdm(task_loader))

    return DataLoader(prebuilt_batches, shuffle=shuffle, batch_size=1)

def convert_enum_to_label(enum):
    '''
    Translate a Task value into its task label
    :params
      enum: value of a Task member; compared with == so that tensor elements taken from a batch work as well
    :return
      task label(string), or None when the value belongs to no task
    '''
    value_to_label = (
        (Task.SENTIMENT.value, SENTIMENT_LABEL),
        (Task.IMDB.value, IMDB_LABEL),
        (Task.SARCASM.value, SARCASM_LABEL),
        (Task.AMAZON.value, AMAZON_LABEL),
    )
    for value, label in value_to_label:
        if enum == value:
            return label
    return None

def convert_label_to_enum(label):
    '''
    Translate a task label into the value of its Task member
    :params
      label: task label(string), e.g. SENTIMENT_LABEL
    :return
      value of the matching Task member, or None for an unknown label
    '''
    label_to_task = (
        (SENTIMENT_LABEL, Task.SENTIMENT),
        (IMDB_LABEL, Task.IMDB),
        (SARCASM_LABEL, Task.SARCASM),
        (AMAZON_LABEL, Task.AMAZON),
    )
    return next((task.value for known, task in label_to_task if label == known), None)
    
def convert_name_to_func(name):
    '''
    Build the loss module that belongs to a loss label from the configs
    :params
      name: loss label, e.g. CROSS_ENTROPY_LOSS_LABEL
    :return
      newly constructed nn loss module
    :raises
      ValueError: when name is not a supported loss label
    '''
    # This file never defines BCE_LOSS_WITH_LOGITS_LABEL, so comparing against it directly raised
    # NameError for every label except cross entropy. Use the constant if some other cell defines it,
    # otherwise fall back to a label spelled like CROSS_ENTROPY_LOSS_LABEL.
    bce_with_logits_label = globals().get('BCE_LOSS_WITH_LOGITS_LABEL', 'bcewithlogitsloss')

    if name == CROSS_ENTROPY_LOSS_LABEL:
        return nn.CrossEntropyLoss()
    if name == bce_with_logits_label:
        return nn.BCEWithLogitsLoss()

    # Fail loudly instead of handing None to the training loop
    raise ValueError(f'Unsupported loss function label: {name!r}')

## Define model, Train, Valid, Prediction

In [None]:
class ActiveDropout(nn.Module):
    '''
    Dropout that is applied on every forward pass. forward() never checks the module's
    train/eval mode, so the random mask is also drawn after model.eval().
    :params
      p: probability of zeroing an element, 0 <= p <= 1
    :raises
      ValueError: when p is outside [0, 1]
    '''
    def __init__(self, p=0.5):
        super().__init__()
        if not 0.0 <= p <= 1.0:
            raise ValueError(f'dropout probability has to be between 0 and 1, but got {p}')
        self.p = p

    def forward(self, input):
        if self.p >= 1.0:
            # Every element is dropped; return zeros instead of computing 0 / 0 = NaN
            return torch.zeros_like(input)
        # Keep an element when its uniform sample exceeds p, i.e. with probability 1 - p
        mask = torch.rand_like(input) > self.p
        # Rescale the kept elements so the expected value of the output equals the input
        return input * mask.to(input) / (1-self.p)

def linear_dropout_blocks(in_f, out_f, p, is_activated, *args, **kwargs):
    '''
    Build one hidden block: Linear -> ReLU -> dropout
    :params
      in_f, out_f: input / output width of the linear layer
      p: dropout probability
      is_activated: True selects ActiveDropout, False selects nn.Dropout
      *args, **kwargs: forwarded to nn.Linear
    :return
      nn.Sequential holding the three layers
    '''
    linear = nn.Linear(in_f, out_f, *args, **kwargs)
    dropout = ActiveDropout(p) if is_activated else nn.Dropout(p=p)
    return nn.Sequential(linear, nn.ReLU(), dropout)

class SentimentModel(nn.Module):
    '''
    Multi-task learning is applied
    The BERT layer is shared. Every task in configs['model'] gets its own hidden stack
    (fc_sent, fc_imdb, fc_sarc, fc_amaz) and its own output layer (fc_*_out), trained separately.
    :params
      bert: pretrained encoder; must expose config.to_dict()['hidden_size'] and return pooler_output
      configs: configuration dictionary ('dropout_p', 'active_dropout', 'hidden_layers', 'model')
    '''
    def __init__(self, bert, configs):
        super(SentimentModel, self).__init__()
        self.bert = bert
        self.dropout_p = configs['dropout_p']
        self.activation_flag = configs['active_dropout']

        hidden_size = bert.config.to_dict()['hidden_size']
        # Layer widths, starting from the encoder output (the list in configs is left untouched)
        h_layers = [hidden_size, *configs['hidden_layers']]

        def make_hidden_stack():
            # Build new layers on every call. Re-using one list of layer instances for all heads
            # made every head point at the same weights, so the heads were not separate at all.
            # Attribute names, and with them the state_dict keys, are the same as before.
            return nn.Sequential(*[
                linear_dropout_blocks(in_f, out_f, self.dropout_p, self.activation_flag)
                for in_f, out_f in zip(h_layers, h_layers[1:])
            ])

        if SENTIMENT_LABEL in configs['model']:
            self.fc_sent = make_hidden_stack()
            self.fc_sent_out = nn.Linear(h_layers[-1], configs['model'][SENTIMENT_LABEL]['out_features'])

        if IMDB_LABEL in configs['model']:
            self.fc_imdb = make_hidden_stack()
            self.fc_imdb_out = nn.Linear(h_layers[-1], configs['model'][IMDB_LABEL]['out_features'])

        if SARCASM_LABEL in configs['model']:
            self.fc_sarc = make_hidden_stack()
            self.fc_sarc_out = nn.Linear(h_layers[-1], configs['model'][SARCASM_LABEL]['out_features'])

        if AMAZON_LABEL in configs['model']:
            self.fc_amaz = make_hidden_stack()
            self.fc_amaz_out = nn.Linear(h_layers[-1], configs['model'][AMAZON_LABEL]['out_features'])

    def forward(self, input_ids, attention_mask, target_task):
        '''
          forward for each task
          :params
            input_ids, attention_mask: tokenizer output passed to the BERT layer
            target_task: task label(string); ALL_LABEL returns the hidden features of all four
              heads concatenated along dim 1 (this needs every head to be configured)
          :return
            output of the task's output layer, or the concatenated hidden features for ALL_LABEL
          :raises
            ValueError: when target_task is not a known task label
        '''
        result = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask
        )
        pooled = result.pooler_output

        if target_task == SENTIMENT_LABEL:
            return self.fc_sent_out(self.fc_sent(pooled))
        if target_task == IMDB_LABEL:
            return self.fc_imdb_out(self.fc_imdb(pooled))
        if target_task == SARCASM_LABEL:
            return self.fc_sarc_out(self.fc_sarc(pooled))
        if target_task == AMAZON_LABEL:
            return self.fc_amaz_out(self.fc_amaz(pooled))
        if target_task == ALL_LABEL:
            return torch.cat([
                self.fc_sent(pooled),
                self.fc_imdb(pooled),
                self.fc_sarc(pooled),
                self.fc_amaz(pooled)
                ], dim=1)

        # Previously an unknown label ended in an UnboundLocalError on `out`
        raise ValueError(f'Unknown target_task: {target_task!r}')

class ConcatModel(nn.Module):
    '''
    Sentiment classifier on top of the concatenated hidden features of all task heads of a SentimentModel
    :params
      sent_model: SentimentModel queried with ALL_LABEL
      configs: configuration dictionary; the last entry of 'hidden_layers' is the feature width of one head
    '''
    def __init__(self, sent_model, configs):
        super(ConcatModel, self).__init__()
        head_width = configs['hidden_layers'][-1]
        n_classes = configs['model'][SENTIMENT_LABEL]['out_features']

        self.sent_model = sent_model
        # Four heads are concatenated, hence the input width of 4 * head_width
        self.fc_dropout = linear_dropout_blocks(head_width * 4, head_width * 2, configs['dropout_p'], configs['active_dropout'])
        self.fc_out = nn.Linear(head_width * 2, n_classes)

    def forward(self, input_ids, attention_mask, target_task):
        # target_task keeps the call signature of SentimentModel.forward but is not used:
        # the wrapped model is always asked for ALL_LABEL
        features = self.sent_model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            target_task = ALL_LABEL
        )
        return self.fc_out(self.fc_dropout(features))

def train_epoch(model, loader, loss_fn, optimizer, scheduler, dataset_size):
    '''
    Train the model for one pass over loader
    :params
      model: model called with input_ids, attention_mask and target_task
      loader: dataloader from get_data_loader (every element is one pre-built batch of a single task)
      loss_fn: dictionary of loss modules, task label is key
      optimizer, scheduler: both are stepped once after every batch
      dataset_size: dictionary with the number of training samples per task
    :return
      dict, dict: accuracy and mean loss per sample for each task, task label is key
    '''
    loss_sums = {task: 0.0 for task in loss_fn}
    correct_predictions = {task: 0.0 for task in loss_fn}

    model = model.train()
    use_cuda = torch.cuda.is_available()

    for batch in tqdm(loader):
        optimizer.zero_grad()
        # [0] removes the extra dimension added by the outer loader (batch_size = 1)
        input_ids = batch['input_ids'][0]
        attention_mask = batch['attention_mask'][0]
        targets = batch['targets'][0]
        # A pre-built batch holds a single task, so the first element identifies it
        task = convert_enum_to_label(batch['task'][0][0])
        if use_cuda:
            input_ids = input_ids.cuda()
            attention_mask = attention_mask.cuda()
            targets = targets.cuda()

        outputs = model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            target_task = task,
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn[task](outputs, targets)

        correct_predictions[task] += torch.sum(preds == targets)
        # The loss modules built by convert_name_to_func average over the batch. Weight that average
        # by the batch size so that dividing by the dataset size below gives a real per-sample mean
        # (summing batch means and dividing by the sample count shrank the value by ~batch size).
        loss_sums[task] += loss.detach().item() * targets.size(0)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    means = {}
    for task in correct_predictions:
        correct = correct_predictions[task]
        # The counter is still the initial float when no batch of this task was seen
        if isinstance(correct, torch.Tensor):
            correct = correct.double()
        correct_predictions[task] = correct / dataset_size[task]
        means[task] = loss_sums[task] / dataset_size[task]
    return correct_predictions, means

def valid_epoch(model, loader, loss_fn, dataset_size):
    '''
    Evaluate the model on loader without updating any weights
    :params
      model: model called with input_ids, attention_mask and target_task
      loader: dataloader from get_data_loader (every element is one pre-built batch of a single task)
      loss_fn: dictionary of loss modules, task label is key
      dataset_size: dictionary with the number of validation samples per task
    :return
      dict, dict: accuracy and mean loss per sample for each task, task label is key
    '''
    loss_sums = {task: 0.0 for task in loss_fn}
    correct_predictions = {task: 0.0 for task in loss_fn}

    model = model.eval()
    use_cuda = torch.cuda.is_available()

    with torch.no_grad():
        for batch in tqdm(loader):
            # [0] removes the extra dimension added by the outer loader (batch_size = 1)
            input_ids = batch['input_ids'][0]
            attention_mask = batch['attention_mask'][0]
            targets = batch['targets'][0]
            # A pre-built batch holds a single task, so the first element identifies it
            task = convert_enum_to_label(batch['task'][0][0])

            if use_cuda:
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()
                targets = targets.cuda()

            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                target_task = task
            )

            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn[task](outputs, targets)

            correct_predictions[task] += torch.sum(preds == targets)
            # The loss modules built by convert_name_to_func average over the batch. Weight that
            # average by the batch size so that dividing by the dataset size below gives a real
            # per-sample mean instead of a value shrunk by roughly the batch size.
            loss_sums[task] += loss.detach().item() * targets.size(0)

    means = {}
    for task in correct_predictions:
        correct = correct_predictions[task]
        # The counter is still the initial float when no batch of this task was seen
        if isinstance(correct, torch.Tensor):
            correct = correct.double()
        correct_predictions[task] = correct / dataset_size[task]
        means[task] = loss_sums[task] / dataset_size[task]

    return correct_predictions, means

def get_predictions(model, loader, task, is_argmax):
    '''
    Run the model over loader and collect its predictions for a task
    :params
      model: model called with input_ids, attention_mask and target_task
      loader: dataloader from get_data_loader; targets are not needed
      task: task label(string) passed to the model as target_task
      is_argmax: True returns the index of the largest output per sentence,
        False returns the softmax over the outputs per sentence
    :return
      tensor on the cpu with one entry (row) per sentence
    '''
    model = model.eval()
    on_gpu = torch.cuda.is_available()

    collected = []

    with torch.no_grad():
        for batch in tqdm(loader):
            # [0] removes the extra dimension added by the outer loader (batch_size = 1)
            ids = batch['input_ids'][0]
            mask = batch['attention_mask'][0]

            if on_gpu:
                ids = ids.cuda()
                mask = mask.cuda()

            logits = model(
                input_ids = ids,
                attention_mask = mask,
                target_task = task
            )

            if is_argmax:
                rows = torch.argmax(logits, dim=1)
            else:
                rows = nn.functional.softmax(logits, dim=1)
            # extend() iterates the tensor, adding one element per sentence
            collected.extend(rows)

    return torch.stack(collected).cpu()


def mc_prediction(model, loader, task, mc_num):
    '''
    Monte Carlo prediction: average the softmax output of mc_num passes and pick the best class.
    The model is put in eval mode, so the passes only differ through layers that stay
    random in eval mode (such as ActiveDropout).
    :params
      model, loader, task: forwarded to get_predictions
      mc_num: number of passes over loader
    :return
      tensor with the index of the class with the highest averaged probability per sentence
    '''
    model = model.eval()

    prob_sum = None

    for run in range(1, mc_num + 1):
        print(f'Monte Carlo Predictions : {run} / {mc_num}')
        time.sleep(1)
        sample = get_predictions(model, loader, task, False)
        if prob_sum is None:
            prob_sum = sample
        else:
            prob_sum += sample

    mean_probs = prob_sum / mc_num
    return torch.argmax(mean_probs, dim=1)
    

def print_model_results(phase, epoch, accuracy, losses):
    '''
    Print accuracy and loss of every task on one line each
    :params
      phase: name printed in front of the line (e.g. 'Train')
      epoch: not used
      accuracy, losses: dictionaries keyed by task label
    '''
    for task, task_acc in accuracy.items():
        task_loss = losses[task]
        print(f'{phase} : {task} accruacy/loss : {task_acc:.5f}/{task_loss}')

### Load Dataset

In [None]:
# Tokenization length and batch size come from the configs
max_len, batch_size = configs['max_len'], configs['batch_size']

# Dataframes per task, already split into train / valid (plus test where a test file is configured)
train_data, valid_data, test_data = load_csv_data(configs, RANDOM_SEED)


print(f'Batch size = {batch_size}')
print('-' * 50)
print('Task Configuration')
print('-' * 50)
print_dataset_configs(configs, train_data, valid_data, test_data)
print('-' * 50)

# Train and valid sets carry labels; the test set does not and keeps its original order
train_loader = get_data_loader('Train', train_data, tokenizer, max_len, batch_size, is_train=True, shuffle=True)
valid_loader = get_data_loader('Valid', valid_data, tokenizer, max_len, batch_size, is_train=True, shuffle=True)
test_loader = get_data_loader('Test', test_data, tokenizer, max_len, batch_size, is_train=False, shuffle=False)

### Train, Valid, Test model

In [None]:
model = SentimentModel(bert_model, configs)

# Start from the configured checkpoint when that file exists
saved_checkpoint_path = Path(checkpoint_dir, configs['saved_checkpoint'])

if saved_checkpoint_path.is_file():
    model.load_state_dict(torch.load(saved_checkpoint_path))
    print(f'{saved_checkpoint_path} Model Loaded')
    # The concat model is only built on top of a loaded multi-task model
    if configs['use_concat_model']:
        model = ConcatModel(model, configs)
        print('Concat Model Applied')

epochs = configs['epoch']
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * epochs
learning_rate = configs['learning_rate']

loss_fn = {}
train_size = {}
valid_size = {}

# One loss module and one train / valid sample count per configured task
for task in configs['dataset']:
    train_size[task] = len(train_data[task])
    valid_size[task] = len(valid_data[task])
    loss_fn[task] = convert_name_to_func(configs['model'][task]['loss_fn'])

if torch.cuda.is_available():
    model = model.cuda()
    for task in loss_fn:
        # move the loss modules to the gpu as well
        loss_fn[task] = loss_fn[task].cuda()

# Adam optimizer with weight decay
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Length of one cosine cycle, counted in epochs
Cosine_T_0 = int(epochs / 3)

if Cosine_T_0 <= 5:
    Cosine_T_0 = epochs

# Cosine annealing warm restarts
# train_epoch() calls scheduler.step() after every batch, so T_0 has to be given in steps.
# Passing the epoch count directly restarted the learning rate every few batches.
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0 = max(1, Cosine_T_0 * steps_per_epoch),
    T_mult = 1,
    eta_min = 2e-8
)

# Per-epoch history of the dictionaries returned by train_epoch / valid_epoch
results = {
    'train_loss' : [],
    'train_acc' : [],
    'valid_loss' : [],
    'valid_acc' : []
}

best_valid_acc = 0

# Checkpoint files of this run; the start time keeps runs from overwriting each other
run_stamp = start_time.strftime("%Y_%m_%d_%H_%M_%S")
best_checkpoint_path = Path(checkpoint_dir, f'Model_Valid_{run_stamp}.pt')
last_checkpoint_path = Path(checkpoint_dir, f'Model_Train_{run_stamp}.pt')

for epoch in range(epochs):
    print(f'Epoch {epoch + 1} / {epochs}')
    time.sleep(1)
    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        scheduler,
        train_size
    )
    print_model_results('Train', epoch, train_acc, train_loss)
    results['train_loss'].append(train_loss)
    results['train_acc'].append(train_acc)
    time.sleep(1)
    valid_acc, valid_loss = valid_epoch(
        model,
        valid_loader,
        loss_fn,
        valid_size
    )
    print_model_results('Valid', epoch, valid_acc, valid_loss)
    results['valid_loss'].append(valid_loss)
    results['valid_acc'].append(valid_acc)

    # Keep the weights of the epoch with the best validation accuracy on the sentiment task
    if best_valid_acc < valid_acc[SENTIMENT_LABEL]:
        best_valid_acc = valid_acc[SENTIMENT_LABEL]
        torch.save(model.state_dict(), best_checkpoint_path)
        print(f'Best valid acc : {best_valid_acc * 100:.5f}%')

    print('-' * 25)


# Store the weights of the last epoch, then continue with the best-validation weights
torch.save(model.state_dict(), last_checkpoint_path)
if best_checkpoint_path.is_file():
    model.load_state_dict(torch.load(best_checkpoint_path))
else:
    # No epoch improved on the initial accuracy (or no epoch ran), so there is nothing to reload
    print('No best-validation checkpoint was saved; keeping the weights of the last epoch')

In [None]:
# Active dropout: average several stochastic passes; otherwise a single arg-max pass
predictions = (
    mc_prediction(model, test_loader, SENTIMENT_LABEL, configs['mc_num'])
    if configs['active_dropout']
    else get_predictions(model, test_loader, SENTIMENT_LABEL, True)
)

# One row per test sentence: running index and predicted class
submission = pd.DataFrame({'Id' : range(len(predictions)), 'Category' : predictions})
submission.to_csv('submission.csv', index=False)