## Data Loading

In [1]:
from transformers import RobertaTokenizer, RobertaModel
from dataloader import DataLoader
# Load the pretrained 'roberta-base' tokenizer; the same instance is handed to
# every DataLoader built in the next cell.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [2]:
# Build one loader per 'fce_*_origin' dataset (train / dev / test). All three
# share the tokenizer, max_len=132 and batch_size=16.
# NOTE(review): `DataLoader` is the project-local class imported from
# `dataloader`, not torch's. The dev and test loaders also pass test=True --
# presumably this switches off training-only behaviour; confirm in dataloader.py.
train_dataloader = DataLoader('fce_train_origin', tokenizer, max_len=132, batch_size=16)
validation_dataloader = DataLoader('fce_dev_origin', tokenizer, max_len=132, test=True, batch_size=16)
test_dataloader = DataLoader('fce_test_origin', tokenizer, max_len=132, test=True, batch_size=16)

## Model

In [3]:
from model import RobertaForTokenZeroShotClassification
import torch
from transformers import AdamW
import torch.optim as optim
import numpy as np
import time
import datetime
import random

def set_seed(seed_value=42):
    """Seed every random number generator this notebook relies on.

    The same value is applied, in order, to Python's ``random`` module,
    NumPy's global generator, PyTorch's default generator and PyTorch's
    CUDA generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_value)

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before being formatted
    through ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [4]:
from tqdm.notebook import trange, tqdm
from sklearn.metrics import f1_score, accuracy_score
def train(model, train_dataloader):
    """Run one training epoch and return the summed batch loss.

    Relies on notebook globals defined in other cells: ``device``,
    ``optimizer``, ``scheduler``, ``validation_dataloader``, ``eval`` and
    ``t0`` (the epoch start time set by the training loop).

    Args:
        model: Called as ``model(input_ids, attention_mask=..., labels=...)``
            and expected to return a mapping with a ``'loss'`` entry.
        train_dataloader: Iterable of batches that also exposes
            ``total_step``. Batch items 0, 1 and 3 are used as input ids,
            attention mask and labels respectively.

    Returns:
        The sum of ``loss.item()`` over every batch of the epoch.
    """
    total_train_loss = 0

    model.train()

    for step, batch in tqdm(enumerate(train_dataloader), total=train_dataloader.total_step):

        # Periodic progress report with a validation pass every 2000 steps.
        if step % 2000 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, train_dataloader.total_step, elapsed))
            total_eval_loss, token_result, label_actual, label_predict = eval(model, validation_dataloader)
            acc = accuracy_score(label_actual, label_predict)
            print(' Accuracy is {}'.format(acc))
            print(token_result)
            # eval() leaves the model in evaluation mode (it calls
            # model.eval()); switch back so the rest of the epoch trains
            # with training-mode behaviour.
            model.train()

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[3].to(device)

        # Clear gradients accumulated by the previous step.
        model.zero_grad()

        result_t = model(b_input_ids,
                         attention_mask=b_input_mask,
                         labels=b_labels)
        loss = result_t['loss']
        total_train_loss += loss.item()

        loss.backward()

        # not clip for roberta
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()

    return total_train_loss

In [5]:
def eval_result(predicts, labels):
    """Compute precision, recall, F1 and F0.5 for label ``0``.

    Only positions whose gold label is ``<= 1`` are scored; every other
    position is ignored. ``predicts`` and ``labels`` must have equal length.

    Returns:
        A dict with keys ``"p"``, ``"r"``, ``"f1"`` and ``"f05"``. Each value
        is ``0.0`` when its denominator would be zero.
    """
    assert len(predicts) == len(labels)

    main_label = 0
    # Keep only the (prediction, gold) pairs that take part in scoring.
    scored = [(pred, gold) for pred, gold in zip(predicts, labels) if gold <= 1]

    predicted_main = sum(1 for pred, _ in scored if pred == main_label)
    gold_main = sum(1 for _, gold in scored if gold == main_label)
    hit_main = sum(1 for pred, gold in scored if pred == gold and gold == main_label)

    p = 0.0
    if predicted_main > 0:
        p = float(hit_main) / float(predicted_main)
    r = 0.0
    if gold_main > 0:
        r = float(hit_main) / float(gold_main)

    if p + r > 0.0:
        f = 2.0 * p * r / (p + r)
        # F-beta with beta = 0.5, which weights precision above recall.
        f05 = (1.0 + 0.5 * 0.5) * p * r / ((0.5 * 0.5 * p) + r)
    else:
        f = 0.0
        f05 = 0.0

    return {"p": p, "r": r, "f1": f, "f05": f05}

In [6]:
def eval(model, validation_dataloader, threshold=0.5):
    """Evaluate ``model`` on every batch of ``validation_dataloader``.

    NOTE: this definition shadows Python's built-in ``eval`` for the rest of
    the notebook. It reads the notebook-level global ``device``.

    Args:
        model: Called with ``return_mask=True``; its result must provide
            ``'loss'``, ``'logits'`` and ``'average_mask'`` entries.
        validation_dataloader: Iterable of batches. Items 0, 1 and 3 are used
            as input ids, attention mask and labels. Item 2 supplies the
            per-token gold values, with its first column dropped.
        threshold: Cut-off applied to ``sigmoid(logits)`` to obtain the 0/1
            label predictions.

    Returns:
        A 4-tuple ``(summed loss, token-level metrics from eval_result,
        concatenated gold labels, concatenated predicted labels)``.
    """
    model.eval()

    loss_sum = 0
    gold_batches, predicted_batches = [], []
    token_predictions, token_scores = [], []

    for batch in validation_dataloader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        gold = batch[3].to(device)

        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            outputs = model(input_ids,
                            token_type_ids=None,
                            attention_mask=attention_mask,
                            labels=gold, return_mask=True)
            loss = outputs['loss']
            logits = outputs['logits']
            mask = outputs['average_mask']

        # A token is predicted as 1 where its averaged mask value is
        # negative, 0 otherwise.
        token_flags = (mask.squeeze(1).detach().cpu().numpy() < 0).astype(int)
        token_predictions.extend(token_flags.reshape(-1).tolist())
        # Gold token values: every column of batch[2] except the first.
        # NOTE(review): presumably the first column is a special start
        # token -- confirm against the dataloader.
        token_scores.extend(batch[2][:, 1:].reshape(-1).tolist())

        loss_sum += loss.item()

        predicted = (logits.sigmoid().detach().cpu().numpy() > threshold).astype(int)
        gold_batches.append(gold.to('cpu').numpy())
        predicted_batches.append(predicted)

    token_metrics = eval_result(token_predictions, token_scores)
    return loss_sum, token_metrics, np.concatenate(gold_batches), np.concatenate(predicted_batches)

In [7]:
from transformers import get_linear_schedule_with_warmup

# Seed before the model is built. The training cell also calls set_seed(42),
# but only after this cell has run, so any randomly initialised parameters
# created here would otherwise differ from run to run.
# NOTE(review): assumes the non-RoBERTa parameters of the classifier are
# randomly initialised in its constructor -- confirm in model.py.
set_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained encoder, loaded without the pooling layer.
roberta = RobertaModel.from_pretrained('roberta-base', add_pooling_layer=False)
# TODO: why does kmin influence the recall rate that much?
model = RobertaForTokenZeroShotClassification(roberta, num_maps=8, num_labels=1, kmax=0.1, kmin=0.1, alpha=0.3, beta=0, penalty_ratio=0.01, random_drop=0).to(device)
# Used to name the checkpoint files written by the training cell.
model_name = 'model'
model.update_dropout(0)

# Two parameter groups so that the encoder and the remaining parameters can
# be given different learning rates; both are currently 2e-5.
optimizer = AdamW([{'params': [param for name, param in model.named_parameters() if 'roberta' not in name], 'lr': 2e-5},
                    {'params': model.roberta.parameters(), 'lr': 2e-5}],
            lr=2e-5,
            eps = 1e-6,
            betas = (0.9, 0.98),
            weight_decay=0.1
            )

epochs = 5

# The scheduler is stepped once per batch, so its horizon is
# (batches per epoch) * epochs.
total_steps = train_dataloader.total_step * epochs

# Linear warm-up over the first 40% of the steps (total_steps / 2.5), then
# linear decay.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = int(total_steps / 2.5),
                                            num_training_steps = total_steps)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Run Training

In [8]:
import os
import random
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

set_seed(42)

# Checkpoints are written to ./models below. Create the directory up front so
# the first torch.save does not fail on a fresh checkout.
os.makedirs("./models", exist_ok=True)

total_t0 = time.time()

# Per-epoch history, used to decide whether the current epoch is a new best.
eval_loss_list = []
f1_score_list = []

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Disabled experiment: ramp the dropout up over the first epochs.
    # if epoch_i <= 5:
    #     model.update_dropout(0.1*(epoch_i))
    # else:
    #     model.update_dropout(0.5)

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # train() reads this global for its in-epoch progress report.
    t0 = time.time()

    total_train_loss = train(model, train_dataloader)

    avg_train_loss = total_train_loss / train_dataloader.total_step

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    total_eval_loss, token_result, label_actual, label_predict = eval(model, validation_dataloader)

    # Micro-F1 on the validation labels: printed below and also used as the
    # model-selection score, so compute it once.
    f1_s = f1_score(label_actual, label_predict, average='micro', zero_division=1)

    print("  Accuracy: {0:.2f}".format(accuracy_score(label_actual, label_predict)))
    print("  f1_Macro_Score: {0:.2f}".format(f1_score(label_actual, label_predict,average = 'macro', zero_division=1)))
    print("  f1_Micro_Score: {0:.2f}".format(f1_s))
    print(token_result)

    # The test split is evaluated for reporting only; model selection below
    # uses validation loss and validation F1.
    _, token_result, _, _ = eval(model, test_dataloader)
    print("test result:", token_result)

    avg_val_loss = total_eval_loss / validation_dataloader.total_step

    # Note: this duration also covers the test-set pass above.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    if len(eval_loss_list) == 0 or avg_val_loss < min(eval_loss_list):
        print(" New best val loss, save to disc.")
        torch.save(model.state_dict(), "./models/best_val_{}.pt".format(model_name))

    if len(f1_score_list) == 0 or f1_s > max(f1_score_list):
        print(" New best f1 score, save to disc.")
        torch.save(model.state_dict(), "./models/best_f1_{}.pt".format(model_name))

    eval_loss_list.append(avg_val_loss)
    f1_score_list.append(f1_s)

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


  0%|          | 0/1705.9375 [00:00<?, ?it/s]


  Average training loss: 0.59
  Training epcoh took: 0:06:39

Running Validation...
  Accuracy: 0.73
  f1_Macro_Score: 0.72
  f1_Micro_Score: 0.73
{'p': 0.19550105444036553, 'r': 0.804305912596401, 'f1': 0.31454602576185986, 'f05': 0.23037699727560557}
test result: {'p': 0.21804246321909965, 'r': 0.7850911974623315, 'f1': 0.3412969283276451, 'f05': 0.25485774304161135}
  Validation Loss: 0.55
  Validation took: 0:00:22
 New best val loss, save to disc.
 New best f1 score, save to disc.

Training...


  0%|          | 0/1705.9375 [00:00<?, ?it/s]


  Average training loss: 0.49
  Training epcoh took: 0:06:37

Running Validation...
  Accuracy: 0.67
  f1_Macro_Score: 0.67
  f1_Micro_Score: 0.67
{'p': 0.2544849537037037, 'r': 0.5652313624678663, 'f1': 0.3509577015163608, 'f05': 0.28592327698309494}
test result: {'p': 0.2744718718563801, 'r': 0.5625693893735131, 'f1': 0.36894112752236324, 'f05': 0.3057916788799421}
  Validation Loss: 0.62
  Validation took: 0:00:21

Training...


  0%|          | 0/1705.9375 [00:00<?, ?it/s]


  Average training loss: 0.41
  Training epcoh took: 0:06:37

Running Validation...
  Accuracy: 0.76
  f1_Macro_Score: 0.76
  f1_Micro_Score: 0.76
{'p': 0.23482393605530352, 'r': 0.6985861182519281, 'f1': 0.35149555375909464, 'f05': 0.27077520924671183}
test result: {'p': 0.2614665147790635, 'r': 0.6916732751784298, 'f1': 0.37948137835015666, 'f05': 0.29861272784541437}
  Validation Loss: 0.50
  Validation took: 0:00:21
 New best val loss, save to disc.
 New best f1 score, save to disc.

Training...


  0%|          | 0/1705.9375 [00:00<?, ?it/s]


  Average training loss: 0.31
  Training epcoh took: 0:06:38

Running Validation...
  Accuracy: 0.78
  f1_Macro_Score: 0.77
  f1_Micro_Score: 0.78
{'p': 0.2378365326572498, 'r': 0.6330334190231363, 'f1': 0.3457656867046951, 'f05': 0.2717691204061362}
test result: {'p': 0.25470129205577585, 'r': 0.6315622521808089, 'f1': 0.3630065180728383, 'f05': 0.28921718162141746}
  Validation Loss: 0.50
  Validation took: 0:00:21
 New best val loss, save to disc.
 New best f1 score, save to disc.

Training...


  0%|          | 0/1705.9375 [00:00<?, ?it/s]


  Average training loss: 0.24
  Training epcoh took: 0:06:37

Running Validation...
  Accuracy: 0.79
  f1_Macro_Score: 0.78
  f1_Micro_Score: 0.79
{'p': 0.24497393894266567, 'r': 0.634318766066838, 'f1': 0.3534467323187108, 'f05': 0.2792553191489362}
test result: {'p': 0.26310299869621906, 'r': 0.6401268834258524, 'f1': 0.37292677292677295, 'f05': 0.29823394664893227}
  Validation Loss: 0.54
  Validation took: 0:00:21
 New best f1 score, save to disc.

Training complete!
Total training took 0:35:38 (h:mm:ss)


## Inference

In [12]:
# Put the model in evaluation mode. nn.Module.eval() returns the module
# itself, so `test_model` is another name for `model`, not a copy.
test_model = model.eval()
# Restore the weights saved at the best validation loss. map_location=device
# lets the checkpoint load even when this session's device differs from the
# one it was saved on (e.g. saved on GPU, reloaded on a CPU-only machine).
test_model.load_state_dict(
    torch.load("./models/best_val_{}.pt".format(model_name), map_location=device)
)

<All keys matched successfully>

In [10]:
# Score the test split with the weights currently held by `model`.
# `eval` is the notebook's evaluation function, not the Python built-in.
total_eval_loss, token_result, label_actual, label_predict = eval(model, test_dataloader)
# Bare expression so the cell displays the token-level metrics dict.
token_result

{'p': 0.25470129205577585,
 'r': 0.6315622521808089,
 'f1': 0.3630065180728383,
 'f05': 0.28921718162141746}

In [11]:
# F1 of the thresholded label predictions against the gold labels returned by
# the eval(...) call on the test split, using scikit-learn's default
# `average` setting.
f1_score(label_actual, label_predict)

0.8055808310585381