In [1]:
import csv
import math
import os
import random
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from hyperopt import hp, fmin, tpe, Trials, STATUS_FAIL, STATUS_OK
from sklearn.metrics import f1_score, roc_auc_score
from torch.utils import data
from transformers import DistilBertTokenizer, DistilBertModel

SEED = 42

# Seed Python's, NumPy's and PyTorch's generators with the same value.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

<torch._C.Generator at 0x20587283890>

In [2]:
# Use the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [3]:
# Upper bound on the number of hyperopt evaluations requested by the search loop.
NUM_EVALS = 1
# Path of the CSV file the search loop opens for writing the per-trial results.
output_filename = "./search_result/search.csv"

In [4]:
# Define the dataset and data iterators
class Dataset(data.Dataset):
    'A torch Dataset that pairs a feature container with its labels'

    def __init__(self, x, labels):
        'Keep references to the features and to their labels'
        self.x, self.labels = x, labels

    def __len__(self):
        'Sample count, read from the first dimension of the features'
        n_samples = self.x.shape[0]
        return n_samples

    def __getitem__(self, index):
        'Return the (features, label) pair stored at `index`'
        return self.x[index], self.labels[index]

In [5]:
class BERTGRUSentiment(nn.Module):
    """Classifier that runs a GRU over the token embeddings of a supplied encoder.

    The encoder is always called under ``torch.no_grad()``, so this module's
    forward pass never builds a gradient graph through it.

    Args:
        bert: encoder module; ``bert.config.to_dict()['dim']`` gives the GRU
            input size, and element 0 of its output is used as the token
            embeddings.
        hidden_dim: GRU hidden size.
        output_dim: output size of the final linear layer.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability applied to the pooled hidden state, and
            inside the GRU when ``n_layers`` is at least 2.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.bert = bert
        embedding_dim = bert.config.to_dict()['dim']

        # GRU-internal dropout is only enabled when there are at least two layers.
        inter_layer_dropout = dropout if n_layers >= 2 else 0
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          dropout=inter_layer_dropout)

        # A bidirectional GRU contributes two hidden states to the classifier input.
        n_directions = 2 if bidirectional else 1
        self.out = nn.Linear(hidden_dim * n_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token ids of shape [batch size, sent len] to scores of shape [batch size, out dim]."""
        # mask = 1 for every non-zero token id, 0 where the id is 0
        attention_mask = (text != 0).to(text.dtype)

        with torch.no_grad():
            # embedded = [batch size, sent len, emb dim]
            embedded = self.bert(text, attention_mask=attention_mask)[0]

        # hidden = [n layers * n directions, batch size, hid dim]
        _, hidden = self.rnn(embedded)

        if self.rnn.bidirectional:
            # concatenate the last two slices of `hidden` (one per direction)
            pooled = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            pooled = hidden[-1]

        # pooled = [batch size, hid dim * n directions]
        return self.out(self.dropout(pooled))

In [6]:
def get_max_len(tokenized):
    """Return the length of the longest entry in `tokenized.values` (0 when there are none)."""
    return max((len(entry) for entry in tokenized.values), default=0)

In [7]:
def multi_acc(y_pred, y_label):
    """Compute accuracy, macro ROC-AUC and weighted F1 for one batch.

    Args:
        y_pred: tensor of per-class scores indexed as [sample, class];
            softmax is taken over dim 1.
        y_label: tensor of integer class indices, one per sample.

    Returns:
        (acc, roc_auc, f1): `acc` is a 0-dim tensor holding the fraction of
        correct predictions; `roc_auc` and `f1` are the values returned by
        sklearn's `roc_auc_score` and `f1_score`.
    """
    y_pred_softmax = torch.softmax(y_pred, dim=1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim=1)

    # accuracy
    correct_pred = (y_pred_tags == y_label).float()
    acc = correct_pred.sum() / len(y_label)

    # roc-auc
    # Pin the one-hot width to the number of predicted classes. Without
    # `num_classes`, one_hot sizes itself from the largest label in the batch,
    # so a batch missing the highest class would not line up with the scores.
    # NOTE(review): roc_auc_score may still reject a batch in which some class
    # has no positive sample at all — confirm against the sklearn version in use.
    one_hot_label = nn.functional.one_hot(y_label, num_classes=y_pred.shape[1])
    roc_auc = roc_auc_score(one_hot_label.detach().cpu(), y_pred_softmax.detach().cpu(), average="macro")

    # f1
    f1 = f1_score(y_label.detach().cpu(), y_pred_tags.detach().cpu(), average='weighted')

    return acc, roc_auc, f1

In [8]:
def train(model, data_loader, optimizer, criterion):
    """Run one optimisation pass over `data_loader`.

    Returns:
        (loss, accuracy, roc_auc, f1), each summed over the batches and
        divided by the number of batches in `data_loader`.
    """
    model.train()

    # running sums, in order: loss, accuracy, ROC-AUC, F1
    totals = [0, 0, 0, 0]

    for inputs, labels in data_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        logits = model(inputs).squeeze(1)
        loss = criterion(logits, labels)
        acc, roc_auc, f1 = multi_acc(logits, labels)

        loss.backward()
        optimizer.step()

        batch_metrics = (loss.item(), acc.item(), roc_auc, f1)
        totals = [total + value for total, value in zip(totals, batch_metrics)]

    n_batches = len(data_loader)
    return tuple(total / n_batches for total in totals)

In [9]:
def evaluate(model, data_loader, criterion):
    """Score `model` on `data_loader` in eval mode with gradients disabled.

    Returns:
        (loss, accuracy, roc_auc, f1), each summed over the batches and
        divided by the number of batches in `data_loader`.
    """
    model.eval()

    # running sums, in order: loss, accuracy, ROC-AUC, F1
    totals = [0, 0, 0, 0]

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            logits = model(inputs).squeeze(1)
            loss = criterion(logits, labels)
            acc, roc_auc, f1 = multi_acc(logits, labels)

            batch_metrics = (loss.item(), acc.item(), roc_auc, f1)
            totals = [total + value for total, value in zip(totals, batch_metrics)]

    n_batches = len(data_loader)
    return tuple(total / n_batches for total in totals)

In [10]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and leftover whole seconds."""
    elapsed = end_time - start_time
    whole_mins = int(elapsed / 60)
    # whatever is left after removing the whole minutes, truncated to an int
    leftover_secs = int(elapsed - whole_mins * 60)
    return whole_mins, leftover_secs

In [11]:
def run_model(space):
    """Train and validate one BERT+GRU configuration (the hyperopt objective).

    Args:
        space: sampled hyper-parameters with the keys 'lr', 'n_layers',
            'dropout', 'b1', 'b2' and 'weight_decay'.

    Returns:
        dict with
          - 'loss' and 'status': the keys hyperopt's fmin reads from a dict
            result; 'loss' is the lowest validation loss seen, and 'status'
            is STATUS_FAIL when no finite validation loss was observed.
          - 'v_loss', 'v_acc', 'v_f1': validation loss / accuracy / F1 at the
            epoch with the lowest validation loss.
          - 'name': string encoding the sampled optimiser settings.
    """
    args = {
        'batch_size': 128,
        'lr': space['lr'],
        'hidden_dim': 128,
        'n_layers': space['n_layers'],
        'bidirectional': True,
        'dropout': space['dropout'],
        'n_epochs': 20,
        'b1': space['b1'],
        'b2': space['b2'],
        'weight_decay': space['weight_decay'],
        'weight': torch.tensor([0.1568, 0.4639, 0.3793], dtype=torch.float32),
    }

    # NOTE(review): torch.load unpickles arbitrary objects; only point these
    # paths at loader files produced by a trusted run.
    # The test loader is deliberately not loaded: the search only needs the
    # training and validation splits.
    train_loader = torch.load("train_loader.pth")
    valid_loader = torch.load("valid_loader.pth")

    opt_name = '_'.join(['b1_'+str(args['b1']), 'b2_'+str(args['b2']), 'lr'+str(args['lr']),
                         'drop'+str(args['dropout']), 'l2_'+str(args['weight_decay'])])

    bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
    model = BERTGRUSentiment(bert,
                             args['hidden_dim'],
                             3,
                             args['n_layers'],
                             args['bidirectional'],
                             args['dropout']).to(device)

    # Freeze the encoder; only the GRU and the linear head are trained.
    for param in model.bert.parameters():
        param.requires_grad = False

    trainable_params = [param for param in model.parameters() if param.requires_grad]
    optimizer = optim.Adam(trainable_params,
                           lr=args['lr'],
                           betas=(args["b1"], args["b2"]),
                           weight_decay=args["weight_decay"])
    criterion = nn.CrossEntropyLoss(weight=args['weight']).to(device)

    best_valid_loss = float('inf')
    best_valid_acc = 0
    best_valid_f1 = 0

    for _epoch in range(args['n_epochs']):
        train(model, train_loader, optimizer, criterion)
        valid_loss, valid_acc, _valid_rocauc, valid_f1 = evaluate(model, valid_loader, criterion)

        # Remember the metrics of the epoch with the lowest validation loss.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_valid_acc = valid_acc
            best_valid_f1 = valid_f1

    # hyperopt requires a dict result to carry 'status' and, when the status
    # is OK, a numeric 'loss'. A run whose validation loss never became finite
    # (e.g. it went to NaN) is reported as failed instead of as a candidate.
    status = STATUS_OK if math.isfinite(best_valid_loss) else STATUS_FAIL
    return {
        "loss": best_valid_loss,
        "status": status,
        "v_loss": best_valid_loss,
        "v_acc": best_valid_acc,
        "v_f1": best_valid_f1,
        "name": opt_name,
    }

In [None]:
# Hyper-parameter search space
space = {
    'lr': hp.loguniform('lr', np.log(1e-4), np.log(3e-2)),
    'n_layers': hp.choice("n_layers", [2, 3]),
    'dropout': hp.uniform("dropout", 0.25, 0.5),
    'b1': hp.loguniform('b1', np.log(0.5), np.log(0.9)),
    'b2': hp.loguniform('b2', np.log(0.5), np.log(0.999)),
    'weight_decay': hp.loguniform('weight_decay', np.log(0.01), np.log(1))
}

# Run the search in increments, reusing the same Trials object so each fmin
# call resumes where the previous one stopped, and rewrite the CSV after every
# increment so partial results survive an interrupted run.
trials = Trials()
evals_inc = min(NUM_EVALS, 1)
while evals_inc <= NUM_EVALS:
    best = fmin(fn=run_model, space=space, algo=tpe.suggest, max_evals=evals_inc,
                trials=trials)

    results = [trial['result'] for trial in trials.trials]
    if results:
        # Create the output directory on demand so the write cannot fail
        # after the (expensive) trials have already finished.
        os.makedirs(os.path.dirname(output_filename) or ".", exist_ok=True)
        # newline="" lets the csv module control line endings itself.
        with open(output_filename, "w", newline="") as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=list(results[0].keys()))
            dict_writer.writeheader()
            dict_writer.writerows(results)

    if evals_inc == NUM_EVALS:
        break
    evals_inc = min(NUM_EVALS, evals_inc + 5)


  0%|                                                                            | 0/1 [00:00<?, ?trial/s, best loss=?]