In [None]:
def set_seed(args):
    """Seed every random number generator used by the script.

    Args:
        args: Object exposing ``seed`` (the seed value) and ``n_gpu``
            (number of GPUs in use).

    The same seed is applied to Python's ``random`` module, NumPy and
    PyTorch. When ``args.n_gpu`` is greater than zero the CUDA generators
    are seeded as well.
    """
    seed = args.seed
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)
    # CUDA generators are only touched when at least one GPU is configured.
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


In [None]:
def collate_fn(data):
    """Collate a list of per-example tuples into one batch tuple.

    Args:
        data: Sequence of examples, each a tuple of fields in the same order.

    Returns:
        A tuple with one entry per field. Every field except the last two is
        stacked along a new leading dimension with ``torch.stack``; the last
        two fields are returned as plain tuples of the per-example values.
    """
    # Transpose: list of examples -> list of per-field tuples.
    columns = list(zip(*data))
    # Only the leading fields are stacked; the final two are passed through.
    n_stacked = max(len(columns) - 2, 0)
    stacked = [torch.stack(column, 0) for column in columns[:n_stacked]]
    return tuple(stacked + columns[n_stacked:])


In [None]:
def get_labels(path):
    """Read the label list from a text file, one label per line.

    Args:
        path: Path of the label file.

    Returns:
        The labels in file order. If ``"O"`` is not among them it is
        inserted at the front of the list.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        labels = f.read().splitlines()
    if "O" not in labels:
        labels = ["O"] + labels
    return labels



## Train Function

### Train Sampler

- [ ] What is a Sampler?

In [None]:
# Pick the sampler that decides which training examples are drawn:
# RandomSampler when args.local_rank is -1, DistributedSampler for any other
# local_rank value (-1 presumably means "not distributed" — confirm).
train_sampler = (
        RandomSampler(train_dataset)
        if args.local_rank == -1
        else DistributedSampler(train_dataset)
    )

### Train Dataloader

- [ ] What is a Dataloader?

In [None]:
# Build the training DataLoader: examples are drawn through train_sampler and
# grouped into batches of args.train_batch_size.
# NOTE(review): collate_fn is passed as None, so the collate_fn function
# defined earlier in this file is not used here — confirm this is intended.
train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=None,
    )

In [None]:
# Work out t_total, the total number of optimization steps.
# "Updates per epoch" below means
# len(train_dataloader) // args.gradient_accumulation_steps.
# - args.max_steps > 0: max_steps fixes t_total, and args.num_train_epochs is
#   overwritten with max_steps // (updates per epoch) + 1.
# - otherwise: t_total = (updates per epoch) * args.num_train_epochs.
# NOTE(review): if len(train_dataloader) is smaller than
# args.gradient_accumulation_steps, "updates per epoch" is 0 and the division
# in the first branch raises ZeroDivisionError.
if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = (
            args.max_steps
            // (len(train_dataloader) // args.gradient_accumulation_steps)
            + 1
        )
    else:
        t_total = (
            len(train_dataloader)
            // args.gradient_accumulation_steps
            * args.num_train_epochs
        )

- [ ] What is the deal with these double slashes?

`//` is floor division: it divides and rounds the result down, which gives [integer output](https://stackoverflow.com/questions/1535596/what-is-the-reason-for-having-in-python#1535601) when both operands are integers.

Still need to figure out the meanings of `dataloader`, `gradient_accumulation_steps`.

- [ ] Is there any relation between `args.max_steps` and `args.num_train_epochs`?



In [None]:
# Prepare optimizer and schedule (linear warmup and decay)
# The model's parameters are split into two groups by name: parameters whose
# name contains any substring listed in no_decay get weight_decay 0.0, and all
# remaining parameters get args.weight_decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            # Group 1: names that contain none of the no_decay substrings.
            "params": [
                p
                for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            # Group 2: names that contain at least one no_decay substring.
            "params": [
                p
                for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

In [None]:
    # Create the AdamW optimizer over the grouped parameters, using the
    # learning rate and epsilon taken from args.
    optimizer = AdamW(
        optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon
    )


### Optimizer

In [None]:
    # Learning-rate schedule for the optimizer (linear warmup and decay):
    # args.warmup_steps warmup steps out of t_total training steps in total.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

### Scheduler

In [None]:
    # Optional fp16 training through NVIDIA apex. The import happens here so
    # that apex is only required when args.fp16 is set.
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            # Replace the bare import failure with an installation hint.
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        # amp.initialize returns the model and optimizer to use from here on;
        # both names are rebound to those returned objects.
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=args.fp16_opt_level
        )


### About Half Precision arithmetic

Mostly for efficiency purposes.  
Check out [this](https://en.wikipedia.org/wiki/Half-precision_floating-point_format), [this](https://en.wikipedia.org/wiki/Single-precision_floating-point_format) and [this](https://en.wikipedia.org/wiki/Double-precision_floating-point_format) to know about what floating point precision means.  

However, for our specific problem, we're concerned with what half precision means for training. This is elaborated [here](https://developer.nvidia.com/blog/apex-pytorch-easy-mixed-precision-training/).  
Half precision is also where NVIDIA's apex library becomes a dependency.

However this looks optional. Seems like one could skip this and just work with the defaults. However that might also remove support for some other optimizations needed later, such as parallel and distributed training, as is mentioned in the next steps in the code.


In [None]:
# multi-gpu training (should be after apex fp16 initialization)
    # With more than one GPU configured, wrap the model in DataParallel.
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    # When args.local_rank is not -1, wrap the model in DistributedDataParallel,
    # using args.local_rank as both the device id and the output device.
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

Helper logs that can be ignored for now

In [None]:
# Train!
    # Log a summary of the training configuration before the loop starts.
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info(
        "  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size
    )
    # Total batch size = train_batch_size * gradient_accumulation_steps *
    # world size; the world size factor is 1 when args.local_rank is -1.
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

Till now we were setting up the infrastructure or hardware. We were making PyTorch aware of our operational or infrastructure preferences: how many GPUs, half- or single-precision arithmetic, etc. After this point, we go past the operational setup and start making the architectural decisions for setting up and training the neural network, so you'll start to see terms related to neural network and deep learning concepts.

In [None]:
    # Step counter, initialised to 0 (its updates are not shown in this excerpt).
    global_step = 0
    # tr_loss is the running sum of per-step loss values (loss.item() is added
    # to it inside the batch loop). logging_loss is not used in the code shown
    # here — presumably it supports periodic logging; confirm.
    tr_loss, logging_loss = 0.0, 0.0
    # Clear any existing gradients before training starts.
    model.zero_grad()
    # Progress bar over int(args.num_train_epochs) epochs; it is disabled
    # unless args.local_rank is -1 or 0.
    train_iterator = trange(
        int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)

Let's look at each of these things one by one. 

1. Read up about `model.zero_grad`.   
[This](https://stackoverflow.com/a/48009142/3727642) is a useful discussion because it also talks about `loss.backward()`, which we'll encounter later.

2. `trange()` seems to be just a helper/convenience function. Nothing to understand here; it seems to be a wrapper around/proxy for [**tqdm**](https://tqdm.github.io/).  


In [None]:
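There seem to be two nested loops to be run here:
1. Training Loop (`train_iterator`): the outer loop, one iteration per epoch
2. Epoch Loop (`epoch_iterator`): the inner loop, one iteration per batch of the dataloader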

In [None]:
    # Outer loop: one iteration per epoch. The loop variable is unused.
    for _ in train_iterator:
        # Wrap the dataloader in a tqdm progress bar for this epoch; the bar is
        # disabled unless args.local_rank is -1 or 0.
        epoch_iterator = tqdm(
            train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]
        )


In the next section, the input dictionary is set up. 

In [None]:
        # Inner loop: one iteration per batch produced by the dataloader.
        for step, batch in enumerate(epoch_iterator):
            model.train()
            # Build the keyword arguments for the model from the batch, moving
            # each tensor to args.device. Positions used from the batch:
            # 0 -> input_ids, 1 -> attention_mask, 2 -> token_type_ids,
            # 3 -> labels, 4 -> bbox.
            inputs = {
                "input_ids": batch[0].to(args.device),
                "attention_mask": batch[1].to(args.device),
                "labels": batch[3].to(args.device),
            }
            # Bounding boxes are only passed for the "layoutlm" model type.
            if args.model_type in ["layoutlm"]:
                inputs["bbox"] = batch[4].to(args.device)
            # token_type_ids is passed for "bert" and "layoutlm"; every other
            # model type receives None.
            inputs["token_type_ids"] = (
                batch[2].to(args.device) if args.model_type in ["bert", "layoutlm"] else None
            )  # RoBERTa doesn't use segment_ids


- [ ] Describe the input dictionary and the significance of each of its key-value pairs 

The crux of the loop is in a single line, as follows:

In [None]:
            # Forward pass: the inputs dict is unpacked into keyword arguments.
            outputs = model(**inputs)

- [ ] Similarly, describe the Output dictionary and the significance of each of its key-value pairs.

In the following code, collect the output from the model run.

In [None]:
            # model outputs are always tuple in pytorch-transformers (see doc)
            # The first element of the output tuple is taken as the loss.
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            # When accumulating gradients over several steps, divide each step's
            # loss by the number of accumulation steps.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            # Backward pass. With fp16 the loss goes through apex's
            # amp.scale_loss context manager and backward() is called on the
            # scaled loss it yields; otherwise backward() is called directly.
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Add this step's loss value to the running total.
            tr_loss += loss.item()


It's worth looking into the following individually: 
- [ ] `loss.backward()`: 
- [ ] `scaled_loss.backward()`:
- [ ] `args.gradient_accumulation_steps`: 