In [1]:
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours
import logging, argparse
import numpy as np
import glob, os, shutil
import random, json

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, Subset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForMultipleChoice,
    BertTokenizer,
    RobertaConfig,
    RobertaForMultipleChoice,
    RobertaTokenizer,
    XLNetConfig,
    XLNetForMultipleChoice,
    XLNetTokenizer,
    get_linear_schedule_with_warmup,
)
# from transformers import WarmupLinearSchedule
from utils_multiple_choice import convert_examples_to_features, processors

In [2]:
import os

# Expose only the GPU with id "6" to this process and keep a default device
# string around for later cells.
os.environ.update({"CUDA_VISIBLE_DEVICES": "6"})
device = "cuda:0"

In [3]:
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

In [4]:
logger = logging.getLogger(__name__)

# Flat tuple with every key of the BERT, XLNet and RoBERTa pretrained-config
# archive maps, in that order (used in the --model_name_or_path help text).
ALL_MODELS = tuple(
    shortcut_name
    for archive_map in (
        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
    )
    for shortcut_name in archive_map.keys()
)

# model_type -> (config class, multiple-choice model class, tokenizer class)
MODEL_CLASSES = {
    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
    "roberta": (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer),
}

In [5]:
# NOTE: MODEL_CLASSES is defined once, in the previous cell next to ALL_MODELS.
# A verbatim second copy used to sit here; it was dropped so the two
# definitions cannot drift apart.


def select_field(features, field):
    """Collect ``field`` from every choice of every feature.

    Args:
        features: iterable of objects exposing ``choices_features``, an
            iterable of mappings.
        field: key looked up in each choice mapping.

    Returns:
        A list with one inner list per feature, holding ``choice[field]`` for
        each of that feature's choices.
    """
    selected = []
    for feature in features:
        per_choice = [choice[field] for choice in feature.choices_features]
        selected.append(per_choice)
    return selected


def simple_accuracy(preds, labels):
    """Return the fraction of positions where ``preds`` equals ``labels``.

    The element-wise comparison result must provide ``.mean()`` (as NumPy
    arrays do).
    """
    matches = preds == labels
    return matches.mean()


def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch from ``args.seed``.

    When ``args.n_gpu`` is positive the CUDA generators are seeded as well.
    """
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def dataset_with_indices(cls):
    """
    Create a subclass of ``cls`` whose ``__getitem__`` appends the requested
    index to the tuple returned by ``cls.__getitem__``.

    The new class keeps the name of ``cls``.
    """

    def getitem_with_index(self, index):
        sample = cls.__getitem__(self, index)
        return sample + (index,)

    namespace = {'__getitem__': getitem_with_index}
    return type(cls.__name__, (cls,), namespace)


# TensorDataset variant that yields (*tensors, index) for each item.
IndexedTensorDataset = dataset_with_indices(TensorDataset)


class SPLLoss(torch.nn.NLLLoss):
    """Per-sample thresholded loss computed from raw logits.

    (The "SPL" in the name presumably stands for self-paced learning.)
    Every call computes an un-reduced negative log-likelihood per sample,
    keeps the samples whose loss is strictly below ``self.threshold`` and
    averages the kept losses over the whole batch, so dropped samples
    contribute zero. The 0/1 selection of each sample is stored in ``self.v``
    at the positions given by ``index``.
    """

    def __init__(self, *args, device=torch.device("cpu"), n_samples=0, warmup_steps=500, **kwargs):
        """
        Args:
            *args, **kwargs: forwarded to ``torch.nn.NLLLoss``.
            device: device on which the selection vector ``v`` is allocated.
            n_samples: length of the selection vector ``v``.
            warmup_steps: stored on the instance; ``forward`` does not read it.
        """
        super().__init__(*args, **kwargs)
        self.warmup_steps = warmup_steps
        self.threshold = 0.5
        self.growing_factor = 1.3
        # One int32 flag per sample, all initially 0.
        self.v = torch.zeros(n_samples, dtype=torch.int32, device=device)

    def forward(self, input: torch.Tensor, target: torch.Tensor, index: torch.Tensor, n_steps) -> torch.Tensor:
        """Return the batch mean of the per-sample losses that pass the threshold.

        ``n_steps`` is accepted but not used: the warm-up bypass that would
        compare it to ``warmup_steps`` is disabled.
        """
        log_probs = input.log_softmax(dim=-1)
        per_sample = torch.nn.functional.nll_loss(log_probs, target, reduction="none")

        selected = self.spl_loss(per_sample)
        self.v[index] = selected
        masked = per_sample * selected.float()
        return masked.mean()

    def increase_threshold(self):
        """Multiply the current threshold by ``growing_factor``."""
        self.threshold = self.threshold * self.growing_factor

    def spl_loss(self, super_loss):
        """Return an int tensor with 1 where ``super_loss`` is below the threshold, else 0."""
        return (super_loss < self.threshold).int()

    def save_weights(self):
        """Write the selection vector ``v`` to ``weights.npy`` in the working directory."""
        np.save('weights.npy', self.v.detach().cpu().numpy())

### reconstruct

#### roberta wrapper

In [6]:
learning_rate = '5e-6'
batch_size = 8
n_epochs = 4
n_epochs = str(n_epochs)
if batch_size == 16:
    gradient_acc_steps = "8"
elif batch_size == 8:
    gradient_acc_steps = "4"
else:
    raise ValueError
warmup_steps = str(int(0.06 * 30000 / batch_size))

args_list = ['--model_type', 'roberta',
            '--task_name', 'cosmosqa',
            '--model_name_or_path', 'roberta-large',
            '--train_file', 'train.jsonl',
            '--eval_file', 'valid.jsonl',
            '--data_dir', '../data/cosmosqa/',
            '--output_dir', './baselines/cosmosqa-roberta-large/bayes-'  + learning_rate + '-' + n_epochs + '-' + str(batch_size),
            '--logging_steps', '200',
            '--do_train', '--do_eval',
            '--num_train_epochs', n_epochs,
            '--max_seq_length', '128',
            '--save_steps', '1000',
            '--overwrite_output',
            '--per_gpu_eval_batch_size', '8',
            '--per_gpu_train_batch_size', '2',
            '--gradient_accumulation_steps', gradient_acc_steps,
            '--warmup_steps', warmup_steps,
            '--learning_rate', learning_rate,
            '--weight_decay', '0.01']

#### roberta_train_and_eval

In [9]:
# Choose the compute device. Single-process mode (local_rank == -1) or an
# explicit --no_cuda takes the first branch; otherwise one GPU per process is
# used through the NCCL distributed backend.
if args.local_rank == -1 or args.no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # With --no_cuda everything runs on the CPU, so report zero GPUs. Counting
    # the visible devices here would make later `n_gpu > 0` checks (e.g. the
    # CUDA seeding in set_seed) behave as if GPUs were in use.
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl")
    args.n_gpu = 1
args.device = device

# INFO-level logging only for the single process / rank 0; other ranks log warnings and above.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    args.local_rank,
    device,
    args.n_gpu,
    bool(args.local_rank != -1),
    args.fp16,
)

# Set seed
set_seed(args)

# Resolve the task processor (task names are matched in lower case) and the
# number of labels it declares.
args.task_name = args.task_name.lower()
if args.task_name not in processors:
    raise ValueError("Task not found: %s" % (args.task_name))
processor = processors[args.task_name]()
label_list = processor.get_labels()
num_labels = len(label_list)



In [10]:
# Look up the config / model / tokenizer classes for the (lower-cased) model type.
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

# Empty --config_name / --tokenizer_name fall back to the model name or path;
# an empty --cache_dir is passed on as None.
config = config_class.from_pretrained(
    args.config_name or args.model_name_or_path,
    num_labels=num_labels,
    finetuning_task=args.task_name,
    cache_dir=args.cache_dir or None,
)
tokenizer = tokenizer_class.from_pretrained(
    args.tokenizer_name or args.model_name_or_path,
    do_lower_case=args.do_lower_case,
    cache_dir=args.cache_dir or None,
)
model = model_class.from_pretrained(
    args.model_name_or_path,
    # A ".ckpt" in the path is taken to mean the weights should be loaded via from_tf.
    from_tf=".ckpt" in args.model_name_or_path,
    config=config,
    cache_dir=args.cache_dir or None,
)

model.to(args.device)

logger.info("Training/evaluation parameters %s", args)
best_steps = 0

# Create output directory if needed (only in the single process / on rank 0)
if args.local_rank in [-1, 0] and not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForMultipleChoice: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

#### load_and_cache_examples

In [13]:
# This notebook walks through the training split only.
evaluate = False
test = False
cached_mode = "train"
task = args.task_name

# Cache file name: cached_<mode>_<last non-empty component of the model path>_<max_seq_length>_<task>,
# placed inside the output directory.
cached_features_file = os.path.join(
    args.output_dir,
    "cached_{}_{}_{}_{}".format(
        cached_mode,
        [part for part in args.model_name_or_path.split("/") if part][-1],
        str(args.max_seq_length),
        str(task),
    ),
)

In [14]:
logger.info("Creating features from dataset file at %s", args.data_dir)
label_list = processor.get_labels()

# Pick the split: train unless `evaluate` (dev) or `test` (test) is set;
# `evaluate` wins when both flags are set.
if not (evaluate or test):
    examples = processor.get_train_examples(args.data_dir, args.train_file)
elif evaluate:
    examples = processor.get_dev_examples(args.data_dir, args.eval_file)
else:
    examples = processor.get_test_examples(args.data_dir, args.eval_file)

10/23/2023 02:05:08 - INFO - __main__ -   Creating features from dataset file at ../data/cosmosqa/
10/23/2023 02:05:08 - INFO - utils_multiple_choice -   LOOKING AT ../data/cosmosqa/ directory and train.jsonl file
read cosmosqa data: 25262it [00:00, 62232.51it/s]


In [19]:
logger.info("Training number: %s", str(len(examples)))

# XLNet is the only model type that pads on the left and uses segment id 4 for
# padding; every other type pads on the right with segment id 0.
features = convert_examples_to_features(
    examples,
    label_list,
    args.max_seq_length,
    tokenizer,
    pad_on_left=args.model_type == "xlnet",
    pad_token_segment_id=4 if args.model_type == "xlnet" else 0,
)

# Only the single process / rank 0 writes the feature cache.
if args.local_rank in (-1, 0):
    logger.info("Saving features into cached file %s", cached_features_file)
    torch.save(features, cached_features_file)

10/23/2023 02:13:40 - INFO - __main__ -   Training number: 25262
convert examples to features: 0it [00:00, ?it/s]10/23/2023 02:13:40 - INFO - utils_multiple_choice -   Writing example 0 of 25262
10/23/2023 02:13:40 - INFO - utils_multiple_choice -   *** Example ***
10/23/2023 02:13:40 - INFO - utils_multiple_choice -   race_id: 3Q9SPIIRWJKVQ8244310E8TUS6YWAC##34V1S5K3GTZMDUBNBIGY93FLDOB690##A1S1K7134S2VUC##Blog_1044056##q1_a1##3XU9MCX6VQQG7YPLCSAFDPQNH4GR20
10/23/2023 02:13:40 - INFO - utils_multiple_choice -   choice: 0
10/23/2023 02:13:40 - INFO - utils_multiple_choice -   input_ids: 0 12350 3470 1771 8 621 226 4832 38 794 258 9 209 8577 307 363 2156 8 51 258 10879 162 409 479 3640 479 2497 3470 1771 16 21979 8 817 162 6675 479 38 269 64 45 244 53 28 1372 77 38 4161 7 106 25606 38 206 24 128 29 5 754 14 51 2551 98 1372 1235 77 51 702 479 2 2 1121 5 499 2156 40 42 621 213 7 192 97 8577 310 17487 9291 9 5 1065 5717 479 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

curriculum is False


In [None]:
    # Convert the per-choice feature fields and the labels to long tensors.
    all_input_ids, all_input_mask, all_segment_ids = (
        torch.tensor(select_field(features, field_name), dtype=torch.long)
        for field_name in ("input_ids", "input_mask", "segment_ids")
    )
    all_label_ids = torch.tensor([feature.label for feature in features], dtype=torch.long)

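### utils: argument parser (defines `args`; run this cell before the `roberta_train_and_eval` cells above, which read `args`)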

In [8]:
# Argument parser describing the run settings; it is fed an explicit argument
# list (see the parse_args call at the end of this cell).
parser = argparse.ArgumentParser()

# Required parameters: data location, model family and checkpoint, task, output location.
parser.add_argument(
    "--data_dir",
    default=None,
    type=str,
    required=True,
    help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
)
parser.add_argument(
    "--model_type",
    default=None,
    type=str,
    required=True,
    help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
    "--model_name_or_path",
    default=None,
    type=str,
    required=True,
    help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
)
parser.add_argument(
    "--task_name",
    default=None,
    type=str,
    required=True,
    help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
)
parser.add_argument(
    "--output_dir",
    default=None,
    type=str,
    required=True,
    help="The output directory where the model predictions and checkpoints will be written.",
)

# Other parameters
# Optional overrides; the empty-string defaults are falsy, which is what the
# loading code tests for before falling back to model_name_or_path / None.
parser.add_argument(
    "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
)
parser.add_argument(
    "--tokenizer_name",
    default="",
    type=str,
    help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
    "--cache_dir",
    default="",
    type=str,
    help="Where do you want to store the pre-trained models downloaded from s3",
)
parser.add_argument(
    "--max_seq_length",
    default=128,
    type=int,
    help="The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded.",
)
# Data file names.
parser.add_argument("--train_file", default='', type=str, help="Training file.")
parser.add_argument("--eval_file", default='', type=str, help="Evaluation file.")

# Which phases to run.
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument("--do_test", action="store_true", help="Whether to run test on the test set")

# Curriculum-learning switch and its pacing parameters.
parser.add_argument("--curriculum_learning", action="store_true", help="Whether to use curriculum learning.")
parser.add_argument("--starting_percent", default=0.3, type=float,
                    help="Starting percentage of training data for curriculum learning")
parser.add_argument("--increase_factor", default=1.1, type=float,
                    help="Multiplication factor for incrasing data usage after step length iterations")
parser.add_argument("--step_length", default=750, type=int,
                    help="Number of iterations after which pacing function is updated")

parser.add_argument(
    "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
)
parser.add_argument(
    "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
)

# Batch sizes and gradient accumulation.
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument(
    "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
)
parser.add_argument(
    "--gradient_accumulation_steps",
    type=int,
    default=1,
    help="Number of updates steps to accumulate before performing a backward/update pass.",
)
# Optimizer and schedule hyper-parameters.
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
# Note: the epoch count is parsed as a float.
parser.add_argument(
    "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
)
parser.add_argument(
    "--max_steps",
    default=-1,
    type=int,
    help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

# Logging and checkpoint cadence.
parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
parser.add_argument(
    "--eval_all_checkpoints",
    action="store_true",
    help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
)
# Device, overwrite and reproducibility switches.
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument(
    "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
)
parser.add_argument(
    "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

# Mixed-precision options.
parser.add_argument(
    "--fp16",
    action="store_true",
    help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)
parser.add_argument(
    "--fp16_opt_level",
    type=str,
    default="O1",
    help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
            "See details at https://nvidia.github.io/apex/amp.html",
)
# Distributed-training rank (-1 by default) and remote-debugging endpoints.
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")

# Parse the explicit `args_list` (built in the configuration cell) rather than sys.argv.
args = parser.parse_args(args_list)