In [1]:
import os
import sys
import logging
import argparse
import numpy as np
import tensorflow as tf
import preliminary
import pt_modeler
import preprocessing as pp

from collections import deque
from pt_modeler import ConstructPtModeler
from huggingface_utils import MODELS
from scipy.special import softmax
from sklearn.utils import shuffle
from sklearn.metrics import log_loss, f1_score, accuracy_score
from scipy.spatial.distance import cosine,euclidean

# Notebook-wide logger: INFO and above go to a log file, DEBUG and above to stdout.
logger = logging.getLogger('BUS-stop')
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s',"%H:%M:%S")
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same object on every call, so re-running this
# cell in a live kernel would attach a second pair of handlers and every
# message would be emitted twice. Detach (and close) whatever is already there.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# FileHandler opens the file immediately and does not create parent
# directories, so make sure ./logs exists on a fresh checkout.
os.makedirs('./logs', exist_ok=True)
fhandler = logging.FileHandler(filename='./logs/run-cell-by-cell.log', mode='w')
fhandler.setFormatter(formatter)
fhandler.setLevel(logging.INFO)
logger.addHandler(fhandler)

consoleHandler = logging.StreamHandler(sys.stdout)
consoleHandler.setFormatter(formatter)
consoleHandler.setLevel(logging.DEBUG)
logger.addHandler(consoleHandler)

#Variables for preprocessing
# Comma-separated list of GPUs exposed to this process (split on ',' further down).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
GLOBAL_SEED = 0
pt_model = "TFBertModel"
pt_model_checkpoint = "./params/bert_base/"

# Each MODELS entry unpacks into (model class, tokenizer class, config class);
# pick the one whose model class is named `pt_model`.
matching_entries = [entry for entry in MODELS if entry[0].__name__ == pt_model]
if not matching_entries:
    # Without this check a typo in `pt_model` only surfaces later as an
    # unrelated NameError on `Tokenizer`.
    raise ValueError(
        "pt_model={!r} not found in huggingface_utils.MODELS; available: {}".format(
            pt_model, [entry[0].__name__ for entry in MODELS]))
# Last match wins, as in a plain loop that keeps overwriting on every hit.
TFModel, Tokenizer, Config = matching_entries[-1]

tokenizer = Tokenizer.from_pretrained(pt_model_checkpoint)

# CUDA_VISIBLE_DEVICES lists *physical* GPU ids, but inside the process the
# visible GPUs are renumbered from 0. Device strings must therefore use the
# position in the list ("2,3" -> GPU:0, GPU:1), not the physical id itself.
visible_gpu_ids = [gpu_id for gpu_id in os.environ["CUDA_VISIBLE_DEVICES"].split(',')
                   if gpu_id.strip()]
devices = ['/device:GPU:{}'.format(ordinal) for ordinal in range(len(visible_gpu_ids))]

# An empty list (no GPU exposed) is passed as None so that MirroredStrategy
# falls back to its own device discovery instead of receiving an invalid name.
strategy = tf.distribute.MirroredStrategy(devices=devices or None)
gpus = strategy.num_replicas_in_sync

logger.info("***Logging start***")
logger.info("os.environ['CUDA_VISIBLE_DEVICES'] = {}".format(os.environ['CUDA_VISIBLE_DEVICES']))
logger.info("devices = {}".format(devices))
logger.info("Number of devices: {}".format(gpus))

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
03:23:37:BUS-stop:INFO: ***Logging start***
03:23:37:BUS-stop:INFO: os.environ['CUDA_VISIBLE_DEVICES'] = 0
03:23:37:BUS-stop:INFO: devices = ['/device:GPU:0']
03:23:37:BUS-stop:INFO: Number of devices: 1


In [2]:
task = "SST-2"
max_seq_length = 64

logger.info("*******************")
logger.info("***Preprocessing***")
logger.info("*******************")
# Normalise the task name *before* deriving anything from it, so the data
# directory and the processor lookup both see the same, stripped name.
task = task.strip()
task_path = os.path.join("./data",task)
Processor = pp.task_to_processor(task)
processor = Processor(task, task_path, tokenizer, max_seq_length)

# Read the three splits from the task directory.
label_list = processor.get_label_list()
lab_examples = processor.tsv_to_examples('labeled.tsv')
tst_examples = processor.tsv_to_examples('test_with_gold.tsv')
unl_examples = processor.tsv_to_examples('unlabeled.tsv')

# Each X_* is indexed by feature name ('input_ids', 'attention_mask') below;
# each y_* holds the matching labels.
X_lab,y_lab = processor.examples_to_features(lab_examples)
X_tst,y_tst = processor.examples_to_features(tst_examples)
X_unl,y_unl = processor.examples_to_features(unl_examples)

lab_len, unl_len = len(lab_examples), len(unl_examples)
num_labels = len(label_list)

logger.info('Labeled//Test//Unlabeled matrix shape = {} // {} // {}'.format(
    X_lab['input_ids'].shape,X_tst['input_ids'].shape,X_unl['input_ids'].shape))

# Log the first two examples of every split as a sanity check on tokenisation.
# Each entry is (banner, features, labels); labels is None for the unlabeled
# split, whose "Label" line is not logged.
preview_splits = (
    ("***Labeled***", X_lab, y_lab),
    ("***Test***", X_tst, y_tst),
    ("***Unlabeled***", X_unl, None),
)
for banner, features, labels in preview_splits:
    for i in range(2):
        logger.info(banner)
        logger.info("Example {}".format(i))
        if labels is not None:
            logger.info("Label {}".format(labels[i]))
        token_ids = features["input_ids"][i]
        logger.info("Token ids {}".format(token_ids))
        logger.info("Tokens {}".format(tokenizer.convert_ids_to_tokens(token_ids)))
        # "token_type_ids" is deliberately not logged.
        logger.info("Token mask {}".format(features["attention_mask"][i]))


03:23:48:BUS-stop:INFO: *******************
03:23:48:BUS-stop:INFO: ***Preprocessing***
03:23:48:BUS-stop:INFO: *******************
03:23:48:BUS-stop:INFO: In file ./data/SST-2/labeled.tsv, we read 100 samples, 
03:23:48:BUS-stop:INFO: where the class distribution is {'0': 50, '1': 50, None: 0}.
03:23:48:BUS-stop:INFO: In file ./data/SST-2/test_with_gold.tsv, we read 1000 samples, 
03:23:48:BUS-stop:INFO: where the class distribution is {'0': 200, '1': 800, None: 0}.
03:23:48:BUS-stop:INFO: In file ./data/SST-2/unlabeled.tsv, we read 1000 samples, 
03:23:48:BUS-stop:INFO: where the class distribution is {'0': 0, '1': 0, None: 1000}.
03:23:49:BUS-stop:INFO: Labeled//Test//Unlabeled matrix shape = (100, 64) // (1000, 64) // (1000, 64)
03:23:49:BUS-stop:INFO: ***Labeled***
03:23:49:BUS-stop:INFO: Example 0
03:23:49:BUS-stop:INFO: Label 0
03:23:49:BUS-stop:INFO: Token ids [  101  1037 18856 18163  6588  7609 14427 17312  1010  1037  2806  1011
  2489  6912  1999 16924  1998 26865  1012   1

In [3]:
# One dropout rate shared by the dense head, the attention probabilities and
# the hidden layers.
drop_rate = 0.2
with strategy.scope():
    # Built inside the distribution strategy's scope.
    # NOTE(review): word_freeze=True presumably freezes the word embeddings -
    # confirm against pt_modeler.ConstructPtModeler.
    modeler = ConstructPtModeler(
        TFModel,
        Config,
        pt_model_checkpoint,
        max_seq_length,
        num_labels,
        dense_dropout_prob=drop_rate,
        word_freeze=True,
        attention_probs_dropout_prob=drop_rate,
        hidden_dropout_prob=drop_rate,
    )

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already

In [4]:
logger.info("***********************")
logger.info("***Preliminary stage***")
logger.info("***********************")
# Run the preliminary stage on the labeled examples and the unlabeled features.
# verbose=0/1/2 -> print silent/progress_bar/one_line_per_epoch
preliminary_records = preliminary.run_stage(
    strategy, modeler, processor, lab_examples, X_unl,
    rand_seed=GLOBAL_SEED,
    epochs=30, patience=10, batch_size=16, learning_rate=3e-5, val_ratio=0.5,
    T=3, n_base=3, verbose=0,
)
# p_l_conf and c_u_cali are the two reference signals the main-stage stopping
# criterion compares against.
p_l_conf, c_u_cali = preliminary.obtain_outputs(
    preliminary_records, cali_acc_or_f1='f1', bias_lab_or_val='val')

# Only the first and last three rounded confidences are shown in the log.
p_l_ = list(np.around(p_l_conf,4))
head_and_tail = p_l_[:3] + p_l_[-3:]
logger.info("p_l_conf = [{}, {}, {}, ..., {}, {}, {}]".format(*head_and_tail))

# Mean predicted class distribution on the unlabeled data vs. the calibrated one.
predicted_dist = np.around(np.mean(preliminary_records['ulb_dist'],0),4)
calibrated_dist = np.around(c_u_cali,4)
logger.info("class distribution of unlabeled data: pred {} -> cali {}".format(
    predicted_dist, calibrated_dist))

03:24:00:BUS-stop:INFO: ***********************
03:24:00:BUS-stop:INFO: ***Preliminary stage***
03:24:00:BUS-stop:INFO: ***********************
03:24:00:BUS-stop:DEBUG: Labels in the labeled set mixed evenly like this, ['0', '1', '0', '1', '...'].
03:24:00:BUS-stop:DEBUG:  
03:24:00:BUS-stop:DEBUG: 0-th run / total 3 runs
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/

In [5]:
# Main-stage hyper-parameters.
epochs = 50
batch_size = 16
n_que = 5  # length of the s_class queue and the patience limit of the stopping rule

logger.info("****************")
logger.info("***Main stage***")
logger.info("****************")
with strategy.scope():
    # Model, optimizer, loss and metric are all created inside the strategy scope.
    model = modeler.build_model()
    adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
    xent_from_logits = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy(name="acc")
    model.compile(optimizer=adam, loss=xent_from_logits, metrics=[accuracy_metric])

# Integer division: a trailing partial batch is not trained on within an epoch.
steps_per_epoch = lab_len // batch_size
# Fixed-length window of recent s_class scores, seeded with zeros.
queue = deque([0] * n_que, maxlen=n_que)
best_conf = np.inf
n_pat = 0
rand_indices = np.arange(lab_len)
# Fallbacks so the code after the loop never hits an unbound name (e.g. when
# epochs == 0 or no epoch ever improves s_class); stop_epoch == 0 then means
# "no epoch was selected, initial weights kept".
best_weights = model.get_weights()
stop_epoch = 0

# Evenly spaced positions that downsample the unlabeled confidences to exactly
# lab_len entries. np.arange with a float step can return one element too many
# because of rounding; np.linspace fixes the count and yields the same positions.
_ids = np.linspace(0, unl_len, num=lab_len, endpoint=False).astype('int32') # for downsampling

for epoch in range(1,epochs+1):

    # The same seed is used every epoch, so the same permutation is re-applied
    # to the previous order; the batch order is reproducible across runs.
    rand_indices = shuffle(rand_indices,random_state=GLOBAL_SEED)
    for step in range(steps_per_epoch):
        batch_indices = rand_indices[step*batch_size:(step+1)*batch_size]
        X_bat = {key: pp.select_by_index(X_lab[key], batch_indices) for key in X_lab.keys()}
        y_bat = pp.select_by_index(y_lab, batch_indices)
        model.train_on_batch(X_bat,y_bat)

    trn_loss,trn_acc = model.evaluate(X_lab, y_lab)
    # Test metrics are only logged; they do not take part in the stopping rule.
    tst_loss,tst_acc = model.evaluate(X_tst, y_tst) #

    # Per-sample confidence (max probability) and mean class distribution of
    # the current model on the unlabeled data.
    unl_probs = softmax(model.predict(X_unl),axis=1)
    unl_confs = unl_probs.max(1)
    unl_dist = unl_probs.mean(0)

    # s_conf: distance between downsampled unlabeled confidences and p_l_conf (lower is better).
    # s_class: cosine similarity between predicted and calibrated class distributions (higher is better).
    s_conf = euclidean(unl_confs[_ids], p_l_conf)
    s_class = 1.-cosine(unl_dist, c_u_cali)
    logger.info("Epoch {}, s_conf={}, s_class={}, tst_acc={}, tst_loss={}".format(
                    epoch, round(s_conf,4), round(s_class,4), round(tst_acc,4), round(tst_loss,4)))

    if s_conf < best_conf:
        # New best confidence match: restart patience and the s_class window.
        n_pat = 0
        queue = deque(n_que*[0], n_que)
        best_conf = s_conf
    else:
        n_pat += 1

    if n_pat < n_que:
        # Within the patience window, snapshot the weights whenever s_class
        # beats every score currently in the window.
        if s_class > max(queue):
            best_weights = model.get_weights()
            stop_epoch = epoch
        queue.append(s_class)
    else:
        # s_conf has not improved for n_que consecutive epochs: stop training.
        break

logger.info('***End training***')

# Restore the weights snapshot chosen by the stopping rule and score it on the test set.
logger.info('***Load the model and Evaluate on test data***')
logger.info("BUS-stop's stop_epoch = {}".format(stop_epoch))
model.set_weights(best_weights)
final_metrics = model.evaluate(X_tst, y_tst)
tst_loss, tst_acc = final_metrics
final_report = 'Final tst_acc : {}, tst_loss : {} \n'.format(round(tst_acc,4), round(tst_loss,4))
logger.info(final_report)



03:28:46:BUS-stop:INFO: ****************
03:28:46:BUS-stop:INFO: ***Main stage***
03:28:46:BUS-stop:INFO: ****************
03:29:07:BUS-stop:INFO: Epoch 1, s_conf=3.1572, s_class=0.8669, tst_acc=0.743, tst_loss=0.651
03:29:14:BUS-stop:INFO: Epoch 2, s_conf=2.6461, s_class=0.9118, tst_acc=0.812, tst_loss=0.564
03:29:22:BUS-stop:INFO: Epoch 3, s_conf=1.8832, s_class=0.9693, tst_acc=0.861, tst_loss=0.4057
03:29:29:BUS-stop:INFO: Epoch 4, s_conf=1.4792, s_class=0.9832, tst_acc=0.857, tst_loss=0.3246
03:29:36:BUS-stop:INFO: Epoch 5, s_conf=1.5341, s_class=0.9999, tst_acc=0.897, tst_loss=0.2829
03:29:43:BUS-stop:INFO: Epoch 6, s_conf=1.8281, s_class=0.9911, tst_acc=0.86, tst_loss=0.4012
03:29:50:BUS-stop:INFO: Epoch 7, s_conf=1.872, s_class=0.9986, tst_acc=0.885, tst_loss=0.4063
03:29:56:BUS-stop:INFO: Epoch 8, s_conf=1.9279, s_class=0.9977, tst_acc=0.872, tst_loss=0.4771
03:30:03:BUS-stop:INFO: Epoch 9, s_conf=1.968, s_class=0.9893, tst_acc=0.845, tst_loss=0.64
03:30:03:BUS-stop:INFO: ***En