# Part-1 : Section A
Training a multi-exit ElasticBERT model on SST-2 dataset

In [60]:
import os
# GPU selection for this kernel (see issue #152 for the device-order setting).
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})
# os.environ["CUDA_LAUNCH_BLOCKING"]="1"


In [61]:
# The code closely follows the original ElasticBERT repository
# Feature to train models with a given exit configuration is added
# !git clone https://github.com/MLiONS/MutiExitDNNs.git


%cd path../ElasticBERT
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

/home/divya/divyaJyoti/UBERT/MutiExitDNNs/ElasticBERT


In [62]:
# All hyper-parameters and the location of the training dataset are set in
# MutiExitDNNs/ElasticBERT/finetune-dynamic/finetune_elue_entropy.sh

# 1) Set the correct location of the SST-2 dataset.
#    All models are trained on the SST-2 "train" split and evaluated on the "dev" split.
#    "train.tsv" and "dev.tsv" are expected to be in ELUE_DIR/TASK_NAME.
#    You can set both ELUE_DIR and TASK_NAME in finetune_elue_entropy.sh,
#    or change the dataset directory with the "data_dir" option.

# 2) Change the "num_output_layers" option to match the desired exit configuration.

# 3) Model checkpoints will be saved to "output_dir" and
#    logs will be available in "log_dir".
# !bash path../ElasticBERT/finetune-dynamic/finetune_elue_entropy.sh

# Part-1 : Section B
Generating the prediction matrix

In [63]:
#Evaluation on other datasets-IMDb or Yelp

In [64]:
from transformers import BertTokenizer as ElasticBertTokenizer

#Set the current directory location inside "finetune-dynamic" folder
%cd path../ElasticBERT/finetune-dynamic

from models.configuration_elasticbert import ElasticBertConfig
from models.modeling_elasticbert_entropy import ElasticBertForSequenceClassification

/home/divya/divyaJyoti/UBERT/MutiExitDNNs/ElasticBERT/finetune-dynamic


In [65]:
#Set location to the best performing model
#Model checkpoints are saved at "output_dir" from Part-1: Section A
# All four checkpoints live under the same entropy-training checkpoint folder.
_ENTROPY_CKPT_ROOT = 'path../ElasticBERT/finetune-dynamic/ckpts/elue/entropy'

checkpoint_sst = _ENTROPY_CKPT_ROOT + '/SST-2/checkpoint-1335'
checkpoint_mrpc = _ENTROPY_CKPT_ROOT + '/MRPC/checkpoint-575'
checkpoint_rte = _ENTROPY_CKPT_ROOT + '/RTE/checkpoint-390'
checkpoint_mnli = _ENTROPY_CKPT_ROOT + '/MNLI/checkpoint-61360'

In [66]:
# Evaluation dataset -> fine-tuned checkpoint that is used to score it.
data_checkpoint = dict(
    imdb=checkpoint_sst,
    yelp=checkpoint_sst,
    qqp=checkpoint_mrpc,
    scitail=checkpoint_rte,
    snli=checkpoint_mnli,
)
#model

In [67]:
def get_args(arg_vec):
    """Parse an ElasticBERT fine-tuning/evaluation argument vector.

    Args:
        arg_vec: List of command-line style tokens, e.g.
            ``['--task_name', 'snli', '--num_hidden_layers', '12', ...]``.
            The options marked "required" below must be present.

    Returns:
        argparse.Namespace holding one attribute per option.

    Raises:
        SystemExit: raised by argparse when a required option is missing or a
            value cannot be converted to the declared type.
    """
    # Imported here so the function does not depend on `argparse` having been
    # imported by some other notebook cell.
    import argparse

    parser = argparse.ArgumentParser()

    # ---- Required parameters -------------------------------------------
    parser.add_argument(
        "--num_hidden_layers", default=None, type=int, required=True,
        help='The number of layers to import.',
    )
    # Exactly 12 integers are expected, one per layer position.
    parser.add_argument(
        "--num_output_layers", nargs=12, default=None, type=int, required=True,
        help='The number of layers to output.',
    )
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_name_or_path", default=None, type=str, required=True,
        help="Path to pre-trained model or shortcut name.",
    )
    parser.add_argument(
        "--task_name", default=None, type=str, required=True,
        help="The name of the task to train selected in the list.",
    )
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--log_dir", default=None, type=str, required=True,
        help="The output directory where the logs will be written.",
    )

    # ---- Optional evaluation / early-exit parameters --------------------
    parser.add_argument(
        "--spec_eval", default=None, type=str, required=False,
        help="'Set as train or test based on specific split on which to evaluate'",
    )
    # NOTE: patience and early_exit_entropy are deliberately kept as strings,
    # exactly as the original parser declared them.
    parser.add_argument("--patience", default='0', type=str, required=False)
    parser.add_argument("--regression_threshold", default=0, type=float, required=False)
    parser.add_argument("--early_exit_entropy", default='0.1', type=str, required=False)

    # ---- Other parameters ------------------------------------------------
    parser.add_argument(
        "--load", default=None, type=str,
        help="The path of ckpts used to continue training.",
    )
    parser.add_argument(
        "--config_name", default="", type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name", default="", type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir", default="", type=str,
        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
    )
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
             "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--debug", action="store_true", help="Whether to use debug mode.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true",
        help="Run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case", action="store_true",
        help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size", default=8, type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=1, type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps", type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate", default=5e-5, type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps", default=-1, type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--warmup_rate", default=0, type=float, help="Linear warmup over warmup_rate.")

    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument(
        "--save_steps", type=int, default=500,
        help="Save checkpoint every X updates steps.",
    )
    parser.add_argument(
        "--eval_all_checkpoints", action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true",
        help="Overwrite the content of the output directory",
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true",
        help="Overwrite the cached training and evaluation sets",
    )
    parser.add_argument(
        "--not_save_model", action="store_true",
        help="Do not save model checkpoints",
    )
    parser.add_argument("--seed", type=int, default=6, help="random seed for initialization")

    parser.add_argument(
        "--fp16", action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level", type=str, default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
             "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument(
        "--local_rank", type=int, default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")

    return parser.parse_args(arg_vec)

In [68]:
from load_data import (
    load_and_cache_examples_glue,
    load_and_cache_examples_elue,
)

def evaluate_elue_entropy(args, model, tokenizer, prefix="", eval_highway=False, entropy=0.):
    """Run a multi-exit ElasticBERT classifier over one dataset split and score it.

    Args:
        args: Namespace produced by get_args() that additionally carries
            `device`, `n_gpu` and `output_mode`. `args.eval_batch_size` is set
            by this function. `args.spec_eval` selects the split; when it is
            falsy the 'train' split is used.
        model: Model exposing an `elasticbert` sub-module (it may already be
            wrapped in torch.nn.DataParallel).
        tokenizer: Tokenizer forwarded to the dataset loaders.
        prefix: Text inserted into the log banners.
        eval_highway: Forwarded to `elasticbert.set_eval_state`. When False the
            model output is indexed per exit and every exit is scored; when
            True a single prediction per example is scored.
            NOTE(review): presumably True enables entropy-based early exit -
            confirm in modeling_elasticbert_entropy.
        entropy: Threshold forwarded to `elasticbert.set_early_exit_entropy`.

    Returns:
        eval_highway=False:
            (results_all, preds_all, pred_tuple, out_label_ids) where
            results_all is one metrics dict per exit, preds_all is one
            prediction array per exit (argmax for classification, squeezed
            output for regression), pred_tuple is the raw per-exit model output
            (classification only, otherwise a list of None) and out_label_ids
            holds the gold labels.
        eval_highway=True:
            (results, speed_up, exit_layer) where results is the metrics dict,
            speed_up is the value returned by `elasticbert.log_stats()` and
            exit_layer is a one-element list holding
            `elasticbert.exiting_layer_every_ins`.
    """
    # Imported locally so the function does not rely on names that other
    # notebook cells may or may not have placed in the kernel namespace.
    import logging
    import os

    import numpy as np
    import torch
    from torch.utils.data import DataLoader, SequentialSampler
    from tqdm import tqdm

    from elue import elue_compute_metrics

    logger = logging.getLogger(__name__)

    # Tasks that are scored with the metric of another task. The original code
    # used `eval_task == 'rte' or 'qnli' or ...`, which is always true and
    # therefore scored *every* task as 'scitail'.
    metric_task_alias = {
        'rte': 'scitail',
        'qnli': 'scitail',
        'wnli': 'scitail',
        'qqp': 'scitail',
        'mnli': 'snli',
        'yelp': 'imdb',
    }

    # Keep a handle on the unwrapped model: DataParallel hides `elasticbert`
    # behind `.module`.
    core_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    core_model.elasticbert.set_early_exit_entropy(entropy)
    core_model.elasticbert.set_eval_state(eval_highway)
    core_model.elasticbert.reset_stats()

    eval_task = args.task_name.lower()
    eval_output_dir = args.output_dir
    num_exits = sum(args.num_output_layers)

    # Falls back to the 'train' split when no specific split was requested.
    data_type = args.spec_eval or 'train'
    if eval_task in ('imdb', 'yelp', 'scitail', 'snli'):
        eval_dataset = load_and_cache_examples_elue(args, eval_task, tokenizer, data_type=data_type)
    else:
        eval_dataset = load_and_cache_examples_glue(args, eval_task, tokenizer, data_type=data_type)

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu eval
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    eval_loss = 0.0
    nb_eval_steps = 0
    # Per-batch chunks are collected in lists and concatenated once at the end
    # (repeated np.append re-copies the whole array on every batch).
    label_chunks = []
    highway_chunks = []
    exit_chunks = [[] for _ in range(num_exits)]

    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
            "labels": batch[-1],
        }
        with torch.no_grad():
            outputs = model(**inputs)
        tmp_eval_loss, logits = outputs[:2]

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        label_chunks.append(inputs["labels"].detach().cpu().numpy())

        if eval_highway:
            highway_chunks.append(logits.detach().cpu().numpy())
        else:
            # `logits` is indexed per exit in this mode.
            for exit_idx in range(num_exits):
                exit_chunks[exit_idx].append(logits[exit_idx].detach().cpu().numpy())

    # As before, an empty dataset surfaces here as ZeroDivisionError.
    eval_loss = eval_loss / nb_eval_steps
    logger.info("  eval_loss = %s", eval_loss)

    out_label_ids = np.concatenate(label_chunks, axis=0)
    metric_task = metric_task_alias.get(eval_task, eval_task)

    if not eval_highway:
        exit_outputs = [np.concatenate(chunks, axis=0) for chunks in exit_chunks]
        pred_tuple = [None] * num_exits
        if args.output_mode == "classification":
            pred_tuple = list(exit_outputs)  # raw per-exit scores, before argmax
            preds_all = [np.argmax(scores, axis=1) for scores in exit_outputs]
        elif args.output_mode == "regression":
            preds_all = [np.squeeze(scores) for scores in exit_outputs]
        else:
            preds_all = exit_outputs

        results_all = [
            dict(elue_compute_metrics(metric_task, exit_pred, out_label_ids))
            for exit_pred in preds_all
        ]
        return results_all, preds_all, pred_tuple, out_label_ids

    preds = np.concatenate(highway_chunks, axis=0)
    if args.output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif args.output_mode == "regression":
        preds = np.squeeze(preds)

    result = elue_compute_metrics(metric_task, preds, out_label_ids)
    results = dict(result)

    logger.info("***** Eval results {} *****".format(prefix))
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
        print("  %s = %s" % (key, str(result[key])))

    exit_layer = [core_model.elasticbert.exiting_layer_every_ins]
    speed_up = core_model.elasticbert.log_stats()
    return results, speed_up, exit_layer


In [69]:
# !git clone https://github.com/hsm207/imdb_data.git

In [70]:
# %cd imdb_data
# !pip install -r requirements.txt

In [71]:
# !tf_upgrade_v2 --infile create_imdb_dataset.py --outfile bar.py

In [72]:
# !python bar.py --output_dir path../UBERT/elue_data/imdb_data/imdb_dataset

In [73]:
import numpy as np
import pandas as pd

def get_preds(eval_dataset='snli', data_split='train', args=None, model=None, tokenizer=None, elue_dir=None):
    """Evaluate the loaded model on one dataset and return its per-exit outputs.

    Args:
        eval_dataset: Dataset name; it is lower-cased and used both as
            `args.task_name` and as the sub-folder of the data root.
        data_split: Split to evaluate. With the default 'train' the behaviour
            is exactly what it was when this argument was ignored.
        args, model, tokenizer: Evaluation namespace, model and tokenizer. When
            left as None the notebook-level variables of the same name are used.
        elue_dir: Root folder of the datasets. When None the notebook-level
            `ELUE_DIR` is used.

    Returns:
        (results_all, exit_preds, pred_tuple, op_labels) as returned by
        evaluate_elue_entropy() in its default (eval_highway=False) mode.

    Raises:
        NameError: if a value is neither passed in nor defined at notebook level.
    """
    def _from_notebook(value, name):
        # Explicit argument wins; otherwise fall back to the notebook global.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                "get_preds needs `%s`: pass it explicitly or define it at notebook level" % name
            ) from None

    run_args = _from_notebook(args, 'args')
    net = _from_notebook(model, 'model')
    tok = _from_notebook(tokenizer, 'tokenizer')
    data_root = _from_notebook(elue_dir, 'ELUE_DIR')

    task = eval_dataset.lower()
    # Previously `data_split` was silently ignored: imdb/yelp were pinned to
    # 'train' and every other dataset got spec_eval=False (which the evaluator
    # also maps to 'train'). The default path below keeps those exact values;
    # any other requested split is now honoured.
    if eval_dataset in ('imdb', 'yelp') or data_split != 'train':
        run_args.spec_eval = data_split
    else:
        run_args.spec_eval = False
    run_args.task_name = task
    run_args.data_dir = data_root + '/' + task

    results_all, exit_preds, pred_tuple, op_labels = evaluate_elue_entropy(run_args, net, tok)
    return results_all, exit_preds, pred_tuple, op_labels

In [74]:
import numpy as np
import math
import scipy
def softmax(x, axis=None):
    """Numerically stable softmax.

    Args:
        x: Array-like of scores.
        axis: Axis to normalise over. The default (None) normalises over all
            elements of `x`, which is what the original one-liner did.

    Returns:
        numpy.ndarray of the same shape as `x` whose entries along `axis`
        (or over the whole array for axis=None) sum to 1. An empty input yields
        an empty float array.
    """
    scores = np.asarray(x)
    if scores.size == 0:
        return scores.astype(float)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing to inf (and the ratio from becoming NaN).
    shifted = scores - np.max(scores, axis=axis, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=axis, keepdims=True)

In [75]:
# Per-dataset prediction tables; data_generate() stores one DataFrame per dataset here.
result_dict = dict()

# Label-id remapping applied to the gold labels of the listed datasets.
replace_dict = {
    'scitail': {0: 1, 1: 0},
    'snli': {0: 1, 1: 2, 2: 0},
}

In [78]:
def data_generate(df_list):
    """Build the per-exit class-probability table for each dataset in `df_list`.

    For every dataset the matching checkpoint from `data_checkpoint` is loaded,
    the model is evaluated through get_preds(), the raw per-exit scores are
    turned into probabilities with softmax(), and the resulting DataFrame (one
    `tup_layer<k>` column per exit plus `output_labels`) is stored in
    `result_dict[dataset]`. Labels of datasets listed in `replace_dict` are
    remapped accordingly.

    Args:
        df_list: Iterable of dataset names; each must be a key of
            `data_checkpoint`.

    Returns:
        None. Results are written to the notebook-level `result_dict`.
    """
    # get_args(), get_preds() and evaluate_elue_entropy() look the names below
    # up at notebook (module) level. Binding them only as locals of this
    # function leaves those helpers with a NameError on a fresh kernel, so they
    # are published as globals here.
    global argparse, logging, os, torch, tqdm
    global DataLoader, SequentialSampler, elue_compute_metrics, logger
    global args, model, tokenizer, ELUE_DIR

    import argparse
    import logging
    import os

    import torch
    from torch.utils.data import DataLoader, SequentialSampler
    from tqdm import tqdm

    from elue import elue_compute_metrics

    logger = logging.getLogger(__name__)
    ELUE_DIR = 'path../UBERT/elue_data'

    for dataset in df_list:
        checkpoint = data_checkpoint[dataset]
        tokenizer = ElasticBertTokenizer.from_pretrained(checkpoint)
        model = ElasticBertForSequenceClassification.from_pretrained(checkpoint)

        # The leading blank in ' 32' is tolerated by int(); kept as in the original.
        arg_vec = [
            '--model_name_or_path', 'fnlp/elasticbert-base',
            '--task_name', dataset,
            '--do_train',
            '--do_lower_case',
            '--data_dir', ELUE_DIR,
            '--log_dir', 'path../ElasticBERT/logs/elue/entropy/SNLI-BTestCheck',
            '--output_dir', 'path../ElasticBERT/ckpts/elue/entropy/SNLI-BTestCheck',
            '--num_hidden_layers', '12',
            '--num_output_layers', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
            '--max_seq_length', '128',
            '--per_gpu_train_batch_size', '32',
            '--per_gpu_eval_batch_size', ' 32',
            '--learning_rate', '2e-5',
            '--weight_decay', '0.1',
            '--save_steps', '50',
            '--logging_steps', '50',
            '--num_train_epochs', '5',
            '--warmup_rate', '0.06',
            '--evaluate_during_training',
            '--overwrite_output_dir',
        ]
        args = get_args(arg_vec)

        if args.local_rank == -1 or args.no_cuda:
            device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
            args.n_gpu = torch.cuda.device_count()
        else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
            torch.cuda.set_device(args.local_rank)
            device = torch.device("cuda", args.local_rank)
            torch.distributed.init_process_group(backend="nccl")
            args.n_gpu = torch.cuda.device_count()
        args.device = device
        args.output_mode = 'classification'

        print(args.device)
        model.to(args.device)

        _, _, pred_tuple, op_labels = get_preds(eval_dataset=dataset, data_split='train')

        # One list per exit, each holding one probability vector per example.
        exit_probs = [
            [softmax(example_scores) for example_scores in exit_scores]
            for exit_scores in pred_tuple
        ]
        # Rows become examples and columns become exits after the transpose.
        prob_df = pd.DataFrame(exit_probs).transpose()
        # Column count follows the number of exits instead of a hard-coded 12.
        prob_df.columns = [f"tup_layer{k}" for k in range(1, len(pred_tuple) + 1)]
        prob_df['output_labels'] = op_labels
        if dataset in replace_dict:
            prob_df['output_labels'] = prob_df['output_labels'].replace(replace_dict[dataset])
        result_dict[dataset] = prob_df

In [None]:
# Build the per-exit probability table for SciTail only; data_generate() stores it in result_dict['scitail'].
data_generate(['scitail'])

In [None]:
# Generate the confidence dataset for every evaluation corpus and pickle it.
# Comment this cell out when the confidence dataset is not needed.
import pickle

dataset_name = ['imdb', 'scitail', 'yelp', 'qqp', 'snli']
data_generate(dataset_name)
with open("path../Dataset/confidence_label_dict.pkl", 'wb') as file:
    pickle.dump(result_dict, file)