In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np

In [160]:
# lists = [[ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,
#           198,  2061,  3124,   318,  8701,    30, 23998,    25,   445],
#        [ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,
#           198,  2061,  3124,   318,  8701,    30, 23998,    25, 17585],
#        [ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,
#           198,  2061,  3124,   318,  8701,    30, 23998,    25, 14809]]

lists = [[ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,
          198,  2061,  3124,   318,  8701,    30, 23998,    25],
       [ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,
          198,  2061,  3124,   318,  8701,    30, 23998,    25, 17585],
       [ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,
          198,  2061,  3124,   318,  8701,    30, 23998]]

In [161]:
from typing import Optional, Tuple, List, Dict, Union


### Implement left pad ragged lists ###

In [164]:
def _left_pad_ragged_lists(
        ragged_lists: List[List[int]],
        pad_value: int,
        shape: Optional[Tuple[int, int]] = None,
    ) -> np.array:
        """Pad ragged lists from the left.

        Example: [[1,2,3], [4]] -> [[1,2,3], [pad_value, pad_value, 4]]

        Args:
        ragged_lists: List of lists with unequal lengths.
        pad_value: padding token
        shape: If None (default) it will padd to the longest element in the
            ragged_lists. If not None, it will return a tensor of this shape either
            padded or truncated

        Returns:
        Left padded regular 2D list.
        """

        max_len = max([len(lst) for lst in ragged_lists])
        reversed_tensor = torch.tensor([lst[::-1] + [pad_value] * (max_len - len(lst)) for lst in ragged_lists])
        padded_tensor = reversed_tensor.flip(dims=(1,))
        if shape is not None:
            padded_tensor = F.pad(torch.tensor(target_ids), (10,0), 'constant', pad_value)
            for d in range(len(shape)):
                if padded_tensor.size()[d] > shape[d]:
                    padded_tensor = torch.narrow(padded_tensor, d, 0, shape[d])
        return padded_tensor.numpy()

In [165]:
_left_pad_ragged_lists(ragged_lists=lists, pad_value=0)

array([[    0,  2061,  3124,   318,   262,  6766,    30, 23998,    25,
         4171,   198,  2061,  3124,   318,  8701,    30, 23998,    25],
       [ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,
          198,  2061,  3124,   318,  8701,    30, 23998,    25, 17585],
       [    0,     0,  2061,  3124,   318,   262,  6766,    30, 23998,
           25,  4171,   198,  2061,  3124,   318,  8701,    30, 23998]])

In [190]:
def _left_pad_constant_length(input, pad_value, length):
    return F.pad(torch.tensor(input), (length,0), 'constant', -100)

In [192]:
goal = [[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,   445],
       [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 17585],
       [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 14809]]
_left_pad_constant_length(input=[[445],[17585],[14809]], pad_value=-100, length=12)

tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,   445],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100, 17585],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100, 14809]])

### Implement GPT batch tokenize ###

In [171]:
from model_wrappers.gpt2_wrapper import GPT2Wrapper
pytorch_gpt2 = GPT2Wrapper(model_name = "gpt2-medium", use_cuda = False)

In [183]:
def _gpt_batch_tokenize(
        tokenizer,
        batch_inputs: List[str],
        batch_targets: Optional[List[str]] = None,
        mask_token_id: int = -100,
    ) -> Dict[str, np.array]:
        """Tokenize and prepare batches of input and target string pairs for GPT
        (gpt2 or openai-gpt) scoring or generation.

        The tokenization requires a pad_token to be defined, and the padding is
        done on the left side.

        Args:
            tokenizer: GPT compatible tokenizer. Assumes it has a defined
            pad_token defined.
            batch_inputs: List of string inputs
            batch_targets: Optional list of string targets. If None (default), the
            returned tokenization is equivalent to the targets being empty strings.
            mask_token_id: Token id that is used in the returned target_ids to mark
            tokens corresponing to the input.

        Returns:
            Dictionary with entries:
            inputs_and_targets_ids: token_ids for inputs and targets (concatenated)
            targets_ids: Copy of the token_ids for inputs and targets where the
                input tokens is masked with mask_token_id.
            attention_mask: 0 where input_ids is masked, 1 otherwise.
            position_ids: Position ids to account for padding according to the
                attention_mask.
        """

        assert tokenizer.pad_token_id is not None, "Tokenizer must set pad_token_id."

        ragged_inputs_ids = tokenizer(batch_inputs)["input_ids"]
        
        if batch_targets:
            assert len(batch_inputs) == len(batch_targets), "Inputs and targets must have the same length."
            ragged_targets_ids = tokenizer(batch_targets)["input_ids"]
        else:
            ragged_targets_ids = [[] for _ in batch_inputs]

        ragged_inputs_and_targets_ids = [
            inp + tar for inp, tar in zip(ragged_inputs_ids, ragged_targets_ids)
        ]

        inputs_and_targets_ids = _left_pad_ragged_lists(
            ragged_lists=ragged_inputs_and_targets_ids, pad_value=tokenizer.pad_token_id
        )
        
        targets_ids = _left_pad_constant_length(
            input=ragged_targets_ids,
            pad_value=mask_token_id,
            length=inputs_and_targets_ids.shape[1]-1,
        )

        # Infer the values of the attention_mask and position_ids:
        attention_mask = inputs_and_targets_ids != tokenizer.pad_token_id
        attention_mask = attention_mask.astype(inputs_and_targets_ids.dtype)

        position_ids = np.maximum(np.cumsum(attention_mask, axis=-1) - 1, 0)

        return {
            "inputs_and_targets_ids": inputs_and_targets_ids,
            "targets_ids": targets_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
        }

In [3]:
batch_inputs = ['What color is the sky? Answer: blue\nWhat color is grass? Answer:', 'What color is the sky? Answer: blue\nWhat color is grass? Answer:', 'What color is the sky? Answer: blue\nWhat color is grass? Answer:']
batch_targets = ['red', 'blue', 'green']

In [185]:
_gpt_batch_tokenize(
    tokenizer=pytorch_gpt2._tokenizer,
    batch_inputs=batch_inputs,
    batch_targets=batch_targets
)

{'inputs_and_targets_ids': array([[ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,
           198,  2061,  3124,   318,  8701,    30, 23998,    25,   445],
        [ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,
           198,  2061,  3124,   318,  8701,    30, 23998,    25, 17585],
        [ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,
           198,  2061,  3124,   318,  8701,    30, 23998,    25, 14809]]),
 'targets_ids': tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100,   445],
         [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100, 17585],
         [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100, 14809]]),
 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

### Implement Score Function ###

In [188]:
class ToyClass:
    def __init__(self, tokenizer, model):
        self._tokenizer = tokenizer
        self._model = model
        
    def score(
        self,
        inputs: Union[List[str], str],
        targets: Union[List[str], str],
        mask_token_id=-100,
    ) -> List[float]:
        """Scores one or a batch of example targets given their inputs.
        Args:
        inputs: input context
        targets:  targets to be scored
        Returns:
        list of log probabilities for each target given the input.
        """

        if isinstance(inputs, str):
            input_list = [inputs]
            target_list = [targets]
        else:
            input_list = inputs
            target_list = targets

        tokenized_ids = _gpt_batch_tokenize(
            tokenizer=self._tokenizer,
            batch_inputs=input_list,
            batch_targets=target_list,
        )

        inputs_and_targets_ids = torch.tensor(tokenized_ids["inputs_and_targets_ids"])
        targets_ids = torch.tensor(tokenized_ids["targets_ids"])
        attention_mask = torch.tensor(tokenized_ids["attention_mask"])

        print(inputs_and_targets_ids)
        print(targets_ids)
        print(attention_mask)
#         inputs_and_targets_ids = self._maybe_truncate_input(
#             inputs_and_targets_ids, verbose=True
#         )
#         targets_ids = self._maybe_truncate_input(targets_ids, verbose=False)
#         attention_mask = self._maybe_truncate_input(attention_mask, verbose=False)
        # Calculating position ids, since they might be changed by truncation
#         position_ids = torch.maximum(torch.cumsum(attention_mask, axis=-1) - 1, 0)
        position_ids = torch.tensor(np.maximum(np.cumsum(attention_mask, axis=-1) - 1, 0))
        print(position_ids)
        logits = self._model(
            inputs_and_targets_ids,
            labels=targets_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        ).logits

    
        return logits

#         return logits
#         return self.compute_loss(targets_ids, logits)

In [189]:
toy = ToyClass(tokenizer=pytorch_gpt2._tokenizer, model=pytorch_gpt2._model)
logits = toy.score(inputs=batch_inputs, targets=batch_targets)

  targets_ids = torch.tensor(tokenized_ids["targets_ids"])
  position_ids = torch.tensor(np.maximum(np.cumsum(attention_mask, axis=-1) - 1, 0))


tensor([[ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,   198,
          2061,  3124,   318,  8701,    30, 23998,    25,   445],
        [ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,   198,
          2061,  3124,   318,  8701,    30, 23998,    25, 17585],
        [ 2061,  3124,   318,   262,  6766,    30, 23998,    25,  4171,   198,
          2061,  3124,   318,  8701,    30, 23998,    25, 14809]])
tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,   445],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100, 17585],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100, 14809]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],


In [3]:
from bigbench.models.huggingface_models import _HFTransformerModel, BIGBenchHFModel
bigbench_gpt2 = BIGBenchHFModel("gpt2-medium")

2023-04-04 07:49:39.187344: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2-medium.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [5]:
batch_inputs = ['What color is the sky? Answer: blue\nWhat color is grass? Answer:', 'What color is the sky? Answer: blue\nWhat color is grass? Answer:', 'What color is the sky? Answer: blue\nWhat color is grass? Answer:']
batch_targets = ['red', 'blue', 'green']

In [8]:
bigbench_gpt2._model.score(inputs=batch_inputs, targets=batch_targets)

[-10.4252290725708, -11.22307300567627, -9.641101837158203]