In [None]:
pip install datasets pytorch_lightning transformers

In [None]:
from datasets import load_dataset, list_datasets
import datasets
from torch.utils.data import Dataset, DataLoader
from argparse import Namespace
import torch
import pandas as pd
import pytorch_lightning as pl
import transformers
import torch.nn.functional as F
from typing import Union
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np

# Load only the first 1% of the train and validation splits so the notebook
# stays quick to run end to end.
squad_train, squad_valid = (
    load_dataset('squad', split=f'{split_name}[:1%]')
    for split_name in ('train', 'validation')
)

Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


# 1) Data Loading
The Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage. In SQuAD 2.0 the question might also be unanswerable; the `squad` dataset loaded above is version 1.1, in which every question has an answer.

[Kaggle Reference](https://www.kaggle.com/code/karthikrangasai/chaii-q-a-with-pytorch-lightining)

In [None]:
# Central configuration for the notebook. Each sub-namespace is handed to one
# consumer: `trainer` is unpacked into `pl.Trainer`, `model` goes to `Model`,
# and `data` goes to `DataModuleFit`.
config = Namespace(
    seed = 7,
    trainer = Namespace(
        precision = 16,
        accumulate_grad_batches = 2,
        max_epochs = 3,
        weights_summary='top',
        num_sanity_val_steps = 0,
        gpus = 1,
        # Debug switch: runs a single batch of train/val and then stops.
        # Set to False for a real training run.
        fast_dev_run=True,
        # stochastic_weight_avg=True,
    ),

    model = Namespace(
        model_name_or_path = "deepset/tinyroberta-squad2",
        config_name = "deepset/tinyroberta-squad2",
        # NOTE(review): `optimizer_type` and `max_grad_norm` are not read by the
        # code visible in this notebook.
        optimizer_type = 'AdamW',
        learning_rate = 3e-5,
        weight_decay = 1e-2,
        epsilon = 1e-8,
        max_grad_norm = 1.0,
        lr_scheduler = 'cosine',
        # Fraction of the total training steps used for learning-rate warmup.
        warmup_ratio = 0.1,
    ),

    data = Namespace(
        train_batch_size = 4,
        eval_batch_size = 4,
        max_seq_length = 512,
        doc_stride = 128,
        # NOTE(review): `valid_split` is not read by the code visible in this notebook.
        valid_split = 0.25,
        # The tokenizer must come from the same checkpoint as the encoder:
        # token ids produced by a different vocabulary (the previous value was a
        # BERT-uncased tokenizer) are meaningless to the RoBERTa embeddings.
        tokenizer_name = "deepset/tinyroberta-squad2",
    ),
)

In [None]:
class DatasetRetriever(Dataset):
    """Map-style dataset that turns pre-tokenized feature dicts into tensors.

    Args:
        features: indexable collection of dicts holding at least ``input_ids``
            and ``attention_mask``.
        mode: ``'train'`` attaches the ``start_position`` / ``end_position``
            labels; any other value attaches the raw ``id`` (read from
            ``example_id``), ``context`` and ``question`` fields instead.
    """

    def __init__(self, features, mode='train'):
        super().__init__()
        self.mode = mode
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, item):
        record = self.features[item]

        # Model inputs are always present and always converted to long tensors.
        sample = {
            key: torch.tensor(record[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }

        if self.mode != 'train':
            # Non-training samples carry bookkeeping fields as-is (no tensors).
            sample['id'] = record['example_id']
            sample['context'] = record['context']
            sample['question'] = record['question']
            return sample

        # Training samples carry the answer span labels.
        for key in ('start_position', 'end_position'):
            sample[key] = torch.tensor(record[key], dtype=torch.long)
        return sample

class DataModuleFit(pl.LightningDataModule):
    """Tokenizes question/context pairs into QA features and serves DataLoaders.

    Args:
        config: namespace saved as ``self.hparams``; this class reads
            ``tokenizer_name``, ``max_seq_length``, ``doc_stride``,
            ``train_batch_size`` and ``eval_batch_size`` from it.
        train_set: training examples, indexable by the column names
            ``"question"``, ``"context"`` and ``"answers"``.
        valid_set: validation examples with the same columns.
    """

    def __init__(self, config, train_set, valid_set, **kwargs):
        super().__init__()

        # built in method to extract config and save as self.hparams
        self.save_hyperparameters(config)
        self._tokenizer = transformers.AutoTokenizer.from_pretrained(self.hparams.tokenizer_name)
        self.train_set = train_set
        self.valid_set = valid_set

        # Filled by prepare_data(), or by setup() when prepare_data() did not
        # run in the current process.
        self._train_features = None
        self._valid_features = None

    def _prepare_features(self, example):
        """Tokenize a batch of examples and label each feature with its answer span.

        A context longer than ``max_seq_length`` is split into several
        overlapping features (overlap controlled by ``doc_stride``), so the
        returned list can be longer than the number of input examples.

        Args:
            example: column-indexable batch with ``"question"``, ``"context"``
                and ``"answers"`` (each answer holding ``"answer_start"`` and
                ``"text"`` lists).

        Returns:
            A list of dicts with ``input_ids``, ``attention_mask``,
            ``offset_mapping``, ``start_position`` and ``end_position``. The
            positions are token indices; both point at the CLS token when the
            answer is missing or not fully contained in the feature.
        """
        # stride = number of overlapping tokens between a truncated sequence
        # and the overflow that follows it; return_overflowing_tokens emits the
        # truncated remainder as additional features.
        tokenized_example = self._tokenizer(
            [x.lstrip() for x in example["question"]],
            example["context"],
            truncation="only_second",
            max_length=self.hparams.max_seq_length,
            stride=self.hparams.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        # For every feature, the index of the example it was cut from.
        sample_mapping = tokenized_example.pop("overflow_to_sample_mapping")

        # For every token, the (start_char, end_char) span it covers in its text.
        offset_mapping = tokenized_example.pop("offset_mapping")

        # Read the answers column once instead of once per feature.
        all_answers = example["answers"]

        features = []
        for i, offsets in enumerate(offset_mapping):
            feature = {}
            input_ids = tokenized_example["input_ids"][i]
            attention_mask = tokenized_example["attention_mask"][i]
            feature['input_ids'] = input_ids
            feature['attention_mask'] = attention_mask
            feature['offset_mapping'] = offsets

            # Position used as the label when the feature holds no answer.
            cls_index = input_ids.index(self._tokenizer.cls_token_id)

            # Per token: None for special tokens, 0 for question, 1 for context.
            sequence_ids = tokenized_example.sequence_ids(i)

            # Several features can originate from one example, so the answer
            # must be looked up by the originating example index, not by the
            # feature index `i`.
            sample_index = sample_mapping[i]
            answers = all_answers[sample_index]

            ### converting the answer start char position to token position
            if len(answers["answer_start"]) == 0:
                feature["start_position"] = cls_index
                feature["end_position"] = cls_index
            else:
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # first token that belongs to the context
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # last token that belongs to the context (skips padding/specials)
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1

                # Answer not fully inside this feature's slice of the context.
                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    feature["start_position"] = cls_index
                    feature["end_position"] = cls_index
                else:
                    # Walk forward past the last token starting at or before
                    # the answer start, then step back one.
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    feature["start_position"] = token_start_index - 1
                    # Walk backward past the first token ending at or after
                    # the answer end, then step forward one.
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    feature["end_position"] = token_end_index + 1

            features.append(feature)
        return features

    def prepare_data(self):
        """Tokenize both splits and keep the resulting features on the instance."""
        self._train_features = self._prepare_features(self.train_set)
        self._valid_features = self._prepare_features(self.valid_set)

    def setup(self, stage=None):
        """Wrap the tokenized features in datasets.

        Lightning does not call ``prepare_data`` on every process, so the
        features are built here when they are not available yet.
        """
        if self._train_features is None:
            self._train_features = self._prepare_features(self.train_set)
        if self._valid_features is None:
            self._valid_features = self._prepare_features(self.valid_set)

        self._train_dset = DatasetRetriever(self._train_features)
        self._valid_dset = DatasetRetriever(self._valid_features)

    def train_dataloader(self):
        """Shuffled loader over the training features."""
        return DataLoader(
            self._train_dset,
            batch_size=self.hparams.train_batch_size,
            num_workers=4,
            pin_memory=True,
            drop_last=False,
            shuffle=True
        )

    def val_dataloader(self):
        """Ordered loader over the validation features."""
        return DataLoader(
            self._valid_dset,
            batch_size=self.hparams.eval_batch_size,
            num_workers=4,
            pin_memory=True,
            drop_last=False,
            shuffle=False,
        )

In [None]:
# Build the data module by hand (outside the Trainer) so that individual
# batches can be inspected in the cells below.
module = DataModuleFit(config.data, train_set=squad_train, valid_set=squad_valid)
module.prepare_data()
module.setup()
train_dataloader, val_dataloader = module.train_dataloader(), module.val_dataloader()

  cpuset_checked))


# 2) Model Training

In [None]:
class Model(pl.LightningModule):
    """Extractive QA model: a pretrained encoder followed by a linear span head.

    The head maps every token's hidden state to two logits (answer start and
    answer end). The training loss is the mean of the start and end
    cross-entropies.

    Args:
        config: namespace saved as ``self.hparams``; this class reads
            ``config_name``, ``model_name_or_path``, ``learning_rate``,
            ``weight_decay``, ``epsilon``, ``lr_scheduler`` and
            ``warmup_ratio`` from it.
    """

    def __init__(self, config, **kwargs):
        super().__init__()
        self.save_hyperparameters(config)
        self.model_config = transformers.AutoConfig.from_pretrained(self.hparams.config_name)
        self.model = transformers.AutoModel.from_pretrained(self.hparams.model_name_or_path, config=self.model_config)
        self.qa_outputs = torch.nn.Linear(self.model_config.hidden_size, 2)
        # NOTE(review): this dropout layer is created but not applied in forward().
        self.dropout = torch.nn.Dropout(self.model_config.hidden_dropout_prob)
        self._init_weights(self.qa_outputs)

    def forward(self, input_ids, attention_mask):
        """Compute per-token start and end logits.

        Returns:
            ``(start_logits, end_logits)``, each with the trailing size-1
            dimension squeezed away (one logit per input token).
        """
        # First element of the encoder output: the last hidden state,
        # one vector per token.
        sequence_output = self.model(input_ids, attention_mask=attention_mask)[0]

        # Two logits per token.
        qa_logits = self.qa_outputs(sequence_output)

        # Split the last dimension into the start and end channels.
        start_logits, end_logits = qa_logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

    def predict(self, batch):
        """Run a forward pass on ``batch`` and return the raw logits in a dict."""
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        pred_start, pred_end = self(input_ids, attention_mask=attention_mask)
        return {
            'pred_start': pred_start,
            'pred_end': pred_end,
        }

    def _init_weights(self, module):
        """Initialize a linear layer: normal weights (config std), zero bias."""
        if isinstance(module, torch.nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def _num_train_batches(self) -> int:
        """Number of batches in one pass over the training dataloader.

        Uses the datamodule attached to the trainer when there is one,
        otherwise falls back to this module's own ``train_dataloader`` hook.
        """
        datamodule = getattr(self.trainer, "datamodule", None)
        if datamodule is not None:
            return len(datamodule.train_dataloader())
        return len(self.train_dataloader())

    def get_num_training_steps(self) -> int:
        """Total number of optimizer steps inferred from the trainer settings."""
        limit = self.trainer.limit_train_batches
        if isinstance(limit, int) and limit != 0:
            num_batches = limit
        else:
            num_batches = self._num_train_batches()
            if isinstance(limit, float):
                # limit_train_batches is a fraction of the available batches
                num_batches = int(num_batches * limit)

        # Prefer the current device-count attribute; the older attributes are
        # only consulted on Lightning versions that do not provide it.
        num_devices = getattr(self.trainer, "num_devices", None)
        if num_devices is None:
            num_devices = max(1, self.trainer.num_gpus, self.trainer.num_processes)
            if self.trainer.tpu_cores:
                num_devices = max(num_devices, self.trainer.tpu_cores)
        num_devices = max(1, num_devices)

        effective_batch_size = self.trainer.accumulate_grad_batches * num_devices
        # Round up: a trailing, partially accumulated group of batches still
        # ends in an optimizer step.
        steps_per_epoch = (num_batches + effective_batch_size - 1) // effective_batch_size
        max_estimated_steps = steps_per_epoch * self.trainer.max_epochs

        # Only a positive max_steps is a real cap; -1 / None / 0 mean "no limit".
        max_steps = self.trainer.max_steps
        if max_steps is not None and 0 < max_steps < max_estimated_steps:
            return max_steps
        return max_estimated_steps

    @staticmethod
    def _compute_warmup(num_training_steps: int, num_warmup_steps: Union[int, float]) -> int:
        """Resolve the warmup length to an absolute number of steps.

        Args:
            num_training_steps: total number of training steps.
            num_warmup_steps: an int (absolute step count) or a float in
                [0, 1] (fraction of ``num_training_steps``).

        Raises:
            Exception: if a float is outside [0, 1] or an int exceeds
                ``num_training_steps``.
        """
        if isinstance(num_warmup_steps, float) and (num_warmup_steps > 1 or num_warmup_steps < 0):
            raise Exception("`num_warmup_steps` as float should be provided between 0 and 1.")

        if isinstance(num_warmup_steps, int):
            if num_warmup_steps > num_training_steps:
                raise Exception("`num_warmup_steps` as int should be less than `num_training_steps`.")
            return num_warmup_steps

        if isinstance(num_warmup_steps, float):
            # Convert the fraction into a number of training steps
            num_warmup_steps *= num_training_steps
        return round(num_warmup_steps)

    def configure_optimizers(self):
        """Build AdamW (no decay on biases / LayerNorm weights) and the LR scheduler."""
        # Collect parameters from the whole module so the QA head is optimized
        # together with the encoder.
        named_params = list(self.named_parameters())
        no_decay = ("bias", "LayerNorm.weight")

        def skips_decay(name):
            return any(marker in name for marker in no_decay)

        # The per-group key understood by AdamW is "weight_decay".
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in named_params if not skips_decay(n)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in named_params if skips_decay(n)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters,
            lr=self.hparams.learning_rate,
            eps=self.hparams.epsilon,
        )

        if self.hparams.lr_scheduler is not None:
            num_training_steps = self.get_num_training_steps()
            lr_scheduler = transformers.get_scheduler(
                name=self.hparams.lr_scheduler,
                optimizer=optimizer,
                num_warmup_steps=self._compute_warmup(num_training_steps, self.hparams.warmup_ratio),
                num_training_steps=num_training_steps,
            )
            lr_scheduler_config = {
                "scheduler": lr_scheduler,
                "interval": "step",
                "frequency": 1,
            }
            return [optimizer], [lr_scheduler_config]
        return optimizer

    def _compute_loss(self, preds, labels):
        """Mean of the start and end cross-entropy losses (label -1 is ignored)."""
        start_preds, end_preds = preds
        start_labels, end_labels = labels
        start_loss = F.cross_entropy(start_preds, start_labels, ignore_index=-1)
        end_loss = F.cross_entropy(end_preds, end_labels, ignore_index=-1)
        total_loss = (start_loss + end_loss) / 2
        return total_loss

    def _shared_step(self, batch):
        """Forward pass plus loss for one labelled batch."""
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"])
        targets = (batch["start_position"], batch['end_position'])
        return self._compute_loss(outputs, targets)

    def training_step(self, batch, batch_idx):
        """Compute, log and return the training loss for one batch."""
        loss = self._shared_step(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._shared_step(batch)
        self.log('val_loss', loss, prog_bar=True)

In [None]:
model = Model(config.model)

# Take a single training batch to smoke-test the forward pass.
batch = next(iter(train_dataloader))

res = model(batch['input_ids'], batch['attention_mask'])
# Show the shape of both outputs: start logits and end logits.
print(res[0].size(), res[1].size())

Some weights of the model checkpoint at deepset/tinyroberta-squad2 were not used when initializing RobertaModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/tinyroberta-squad2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  cpuset_checked))


torch.Size([4, 512]) torch.Size([4, 512])


In [None]:
# training
# NOTE(review): the seed is set here, after `model` was already constructed in
# an earlier cell, so it does not cover the initialization of the QA head.
pl.seed_everything(config.seed)

# Logging: metrics go to CSV files under logs/, the learning rate is recorded every step.
logger = CSVLogger(save_dir='logs/')
lr_monitor = LearningRateMonitor(logging_interval='step')

# Checkpoint: keep only the weights of the single best epoch by validation loss.
checkpoint_options = dict(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_last=False,
    save_weights_only=True,
    dirpath='checkpoints',
    filename='{epoch:02d}-{val_loss:.4f}',
    verbose=False,
)
ckpt = ModelCheckpoint(**checkpoint_options)

trainer = pl.Trainer(
    **vars(config.trainer),
    logger=logger,
    callbacks=[ckpt, lr_monitor],
)

trainer.fit(model, datamodule=module)

Global seed set to 7
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in fast_dev_run mode: will run a full train, val, test and prediction loop using 1 batch(es).
`Trainer(limit_train_batches=1)` was configured so 1 batch per epoch will be used.
`Trainer(limit_val_batches=1)` was configured so 1 batch will be used.
`Trainer(limit_test_batches=1)` was configured so 1 batch will be used.
`Trainer(limit_predict_batches=1)` was configured so 1 batch will be used.
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  "`Trainer.num_gpus` was deprecated in v1.6 and will be removed in v1.8."
  "`Trainer.num_processes` is deprecated in v1.6 and will be removed in v1.8. "
  "`Trainer.tpu_cores` is deprecated in v1.6 and will be removed in v1.8. "


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
# prediction
batch = next(iter(val_dataloader))

# Put the tensors on whatever device the model currently lives on.
batch = {
    key: (value.to(model.device) if torch.is_tensor(value) else value)
    for key, value in batch.items()
}

# Inference: disable dropout and skip gradient tracking so the predictions are
# deterministic and no autograd graph is built.
model.eval()
with torch.no_grad():
    preds = model.predict(batch)

# Most likely start / end token index for each sample in the batch.
sub_pred_start = preds['pred_start'].argmax(dim=-1)
sub_pred_end = preds['pred_end'].argmax(dim=-1)
print(sub_pred_start)
print(sub_pred_end)

  cpuset_checked))


tensor([  6,  34,  44, 146])
tensor([ 2, 74, 15,  6])
