In [None]:
# default_exp data.core


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by Hugging Face transformer implementations.

In [None]:
# export
import os, inspect
from dataclasses import dataclass
from functools import reduce, partial
from typing import Any, Callable, List, Optional, Union, Type

from datasets import Dataset, load_dataset, concatenate_datasets
from fastcore.all import *
from fastai.data.block import TransformBlock
from fastai.data.core import Datasets, DataLoader, DataLoaders, TfmdDL
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    PretrainedConfig,
    PreTrainedTokenizerBase,
    PreTrainedModel,
    logging,
)

from blurr.utils import BLURR

logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from fastai.data.block import CategoryBlock, ColReader, ColSplitter, DataBlock, ItemGetter, RandomSplitter
from fastcore.test import *
from nbverbose.showdoc import show_doc

from blurr.utils import print_versions

# NOTE(review): presumably disables tokenizer parallelism to avoid the Hugging Face fork/parallelism warnings — confirm
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Record the library versions that were in use when this documentation was generated
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.3
transformers: 4.15.0


In [None]:
# hide
# cuda
# GPU #1 was used to generate these docs; fall back to GPU #0 when only a single device is visible so that
# the notebook still runs top-to-bottom on single-GPU machines
gpu_idx = 1 if torch.cuda.device_count() > 1 else 0
torch.cuda.set_device(gpu_idx)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `imdb` to demonstrate how to configure your blurr code for sequence classification tasks

In [None]:
# Load the IMDB train/test splits (returned as a list) and tag every example with an `is_valid` flag:
# False for the training split, True for the test split
raw_datasets = load_dataset("imdb", split=["train", "test"])
for split_idx, split_is_valid in enumerate([False, True]):
    split_ds = raw_datasets[split_idx]
    raw_datasets[split_idx] = split_ds.add_column("is_valid", [split_is_valid] * len(split_ds))

# Keep a small shuffled sample of each split (1,000 training / 200 validation examples) for the demos below
train_subset = raw_datasets[0].shuffle().select(range(1000))
valid_subset = raw_datasets[1].shuffle().select(range(200))
final_ds = concatenate_datasets([train_subset, valid_subset])

imdb_df = pd.DataFrame(final_ds)
imdb_df.head()


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-ed05e60e9c1442ee.arrow


Unnamed: 0,text,label,is_valid
0,The storyline was okay. Akshay Kumar was good as always and that was the only good thing about the movie. Kareena Kapoor looked bad. There was so hue and cry over her size zero but she did not looked good leaner. I don't know why the hell did Anil Kapoor accepted such a bad role. There was nothing much to do for him in the movie. Just because it is a Yashraj film does not means that an actor should accept the role however bad it is. Said Ali khan was alright. I think that it is high time that Indian directors and producers start thinking of Indian customers as intelligent lot. What are we ...,0,False
1,"The movie's storyline is pat and quaint. Two women travel through the middle east and discover themselves. Unfortunately, if you are looking for a movie about the middle east and central Asia this is absolutely terrible.<br /><br />The producers of the film either did no research or were unbelievably lazy when filming it. To begin with, and most glaringly incorrect, the Nuristanis, as they were known in the thirties, and indeed since the 1890s and their forceful conversion by Abdul-Rahman Shah of Aghnaistan, were not nomads. In fact they have not been nomads since the Aryan invasions of ce...",0,False
2,"There's no romance or other side plot to this movie, it's action and intrigue all the way, making it a real man's kung-fu movie.<br /><br />An aging master dispatches his last disciple Yan Tieh to stop his five former pupils who's styles represents five venomous animals centipede,snake, scorpion, lizard and the toad. Despite the word ""Venom"" in the title, none of these pupil uses venoms to kill their opponents. Yan Tieh told by his teacher that he's no match for the five former pupil, must find one he can form an alliance with to defeat the other four. How Yan Tieh and the others find each...",1,False
3,"I found it a real task to sit through this film. The sound track was not the best and some of the accents made it difficult to understand what was being said. There was little to move the plot along and often the action simply stopped and there was a prolonged period of conversations which seemed extraneous to the movie. These conversations switched between family groups and the observer was left to try and piece together what the common thread was that tied them together. It is rare that I rate a film this low and do so in this case as the entire viewing experience left me thinking ""so wh...",0,False
4,"Nowadays it is sort of a trend to look upon all shows from begin 90's as classics (people are so easily blinded by nostalgia these days), and while some of those shows were/are undoubtedly good, this one is just pure crap. I watched this show a lot back in those days since it got A LOT of reruns on TV back then, and even as a child I didn't like it. Even a 8-year old can see how much the people in Power Rangers are overacting, and how much the special-FX sucked even back then. When the show doesn't resort to the painfully bad 'fighting'-scenes, it plagues the viewer with this unnecessary s...",0,False


In [None]:
# The label names are stored on the training split's `label` feature
label_feature = raw_datasets[0].features["label"]
labels = label_feature.names
labels


['neg', 'pos']

In [None]:
# Checkpoint and task head used throughout this notebook
pretrained_model_name = "roberta-base"  # "bert-base-multilingual-cased"
model_cls = AutoModelForSequenceClassification

# The classification head needs one output per label
n_labels = len(labels)

hf_objects = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls, config_kwargs={"num_labels": n_labels})
hf_arch, hf_config, hf_tokenizer, hf_model = hf_objects

hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)


('roberta',
 transformers.models.roberta.configuration_roberta.RobertaConfig,
 transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast,
 transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification)

## Preprocessing

Starting with version 2.0, Blurr provides a preprocessing base class that can be used to build task specific, pre-processed datasets, from either DataFrames or Hugging Face Datasets.

In [None]:
# export
class Preprocessor:
    """
    Base class for building pre-processed datasets from either DataFrames or Hugging Face Datasets. It merges
    optional training/validation sets into a single dataset and provides the tokenization helper used by subclasses.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The attribute holding the text of sequences
        text_attrs: Union[str, List[str]] = "text",
        # Tokenization kwargs that will be applied when calling the tokenizer
        tok_kwargs: dict = {},
        # The attribute that is created when separate training and validation datasets are merged into a single
        # dataset; it indicates which set each example belongs to (set to `None` to skip adding it)
        is_valid_attr: Optional[str] = "is_valid",
    ):
        self.hf_tokenizer = hf_tokenizer
        self.batch_size = batch_size
        self.text_attrs = text_attrs
        # copy so that neither the shared default dict nor the caller's dict is aliased by this instance
        self.tok_kwargs = dict(tok_kwargs)
        # previously only subclasses defined this attribute, which made the base `process_*` methods raise an
        # `AttributeError` whenever a validation set was supplied
        self.is_valid_attr = is_valid_attr

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """Returns a copy of `training_df`, with `validation_df` (if provided) concatenated after it"""
        df = training_df.copy()

        # concatenate the validation dataset if it is included
        if validation_df is not None:
            valid_df = validation_df.copy()
            # add an `is_valid_attr` column to both training/validation DataFrames to indicate what data is part of
            # the validation set
            if self.is_valid_attr:
                valid_df[self.is_valid_attr] = True
                df[self.is_valid_attr] = False

            df = pd.concat([df, valid_df])

        return df

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """Returns `training_ds`, with `validation_ds` (if provided) concatenated after it"""
        ds = training_ds

        # concatenate the validation dataset if it is included
        if validation_ds is not None:
            # add an `is_valid_attr` column to both training/validation Datasets to indicate what data is part of
            # the validation set
            if self.is_valid_attr:
                validation_ds = validation_ds.add_column(self.is_valid_attr, [True] * len(validation_ds))
                training_ds = training_ds.add_column(self.is_valid_attr, [False] * len(training_ds))

            ds = concatenate_datasets([training_ds, validation_ds])

        return ds

    def _tokenize_function(self, example):
        """Tokenizes the text attribute(s) of `example` (a single example or a batch of examples)"""
        # truncate by default, but let `tok_kwargs` override it rather than fail with a duplicate keyword argument
        call_kwargs = {"truncation": True, **self.tok_kwargs}

        text_attrs = self.text_attrs
        if is_listy(text_attrs):
            # two text attributes are tokenized together as a text/text-pair
            if len(text_attrs) > 1:
                return self.hf_tokenizer(example[text_attrs[0]], example[text_attrs[1]], **call_kwargs)
            # a one-element list means the same thing as passing the attribute name on its own
            text_attrs = text_attrs[0]

        return self.hf_tokenizer(example[text_attrs], **call_kwargs)


### `ClassificationPreprocessor`

Starting with version 2.0, blurr provides a sequence classification preprocessing class that can be used to preprocess DataFrames or Hugging Face Datasets.

The resulting pre-processed data can also be used with the Hugging Face `Trainer` API, `Accelerate`, or your own custom training loop, should you want to use one of those options instead of using blurr/fast.ai for training your models. This class works for both slow and fast tokenizers.

In [None]:
# export
class ClassificationPreprocessor(Preprocessor):
    """
    Pre-processes DataFrames or Hugging Face Datasets for sequence classification: merges the training/validation
    sets, optionally binarizes multi-label targets, optionally adds label-name columns, and tokenizes in batches.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # Whether the dataset should be processed for multi-label; if True, will ensure `label_attrs` are
        # converted to a value of either 0 or 1 indicating the existence of the class in the example
        is_multilabel: bool = False,
        # The unique identifier in the dataset
        id_attr: Optional[str] = None,
        # The attribute holding the text of sequences
        text_attrs: Union[str, List[str]] = "text",
        # The attribute holding the label(s) of the example
        label_attrs: Union[str, List[str]] = "label",
        # The attribute that should be created if you are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # A list indicating the valid labels for the dataset (optional, defaults to the unique set of labels
        # found in the full dataset)
        label_mapping: Optional[List[str]] = None,
        # Tokenization kwargs that will be applied when calling the tokenizer
        tok_kwargs: dict = {},
    ):
        # pass a copy so the shared default dict (or the caller's dict) is never aliased by this instance
        super().__init__(hf_tokenizer, batch_size, text_attrs, dict(tok_kwargs))

        self.is_multilabel = is_multilabel
        self.id_attr = id_attr
        self.label_attrs = label_attrs
        self.is_valid_attr = is_valid_attr
        self.label_mapping = label_mapping

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """Returns a pre-processed DataFrame containing the original columns plus the tokenizer outputs"""
        df = super().process_df(training_df, validation_df)

        # convert even single "labels" to a list to make things easier
        label_cols = listify(self.label_attrs)

        # if `is_multilabel`, convert all targets to an int, 0 or 1, rounding floats if necessary
        if self.is_multilabel:
            for label_col in label_cols:
                df[label_col] = df[label_col].apply(lambda v: int(bool(max(0, round(v)))))

        # if a `label_mapping` is included, add a "[label_col]_name" field with the label Ids converted to their label names
        if self.label_mapping:
            for label_col in label_cols:
                df[f"{label_col}_name"] = df[label_col].apply(lambda v: self.label_mapping[v])

        # tokenize in batches; the processed batches are collected and concatenated once at the end because growing
        # a DataFrame with `DataFrame.append` copies it on every iteration (and `append` no longer exists in pandas 2)
        processed_batches = []
        for _, batch_df in df.groupby(np.arange(len(df)) // self.batch_size):
            inputs_df = batch_df.apply(lambda r: pd.Series(self._tokenize_function(r)), axis=1)
            processed_batches.append(pd.concat([batch_df, inputs_df], axis=1))

        # nothing to tokenize yields an empty DataFrame (`pd.concat` raises on an empty list)
        if not processed_batches:
            return pd.DataFrame()

        # return the pre-processed DataFrame
        return pd.concat(processed_batches)

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """Returns a pre-processed Hugging Face Dataset containing the original columns plus the tokenizer outputs"""
        ds = super().process_hf_dataset(training_ds, validation_ds)

        # convert even single "labels" to a list to make things easier
        label_attrs = listify(self.label_attrs)

        # if `is_multilabel`, convert all targets to an int, 0 or 1, rounding floats if necessary
        if self.is_multilabel:
            for label_attr in label_attrs:
                # `Dataset.map` needs a dict of the columns to add/update; returning a bare int is an error.
                # `attr` is bound as a default so each function refers to its own label attribute.
                ds = ds.map(lambda example, attr=label_attr: {attr: int(bool(max(0, round(example[attr]))))})

        # if a `label_mapping` is included, add a "[label_col]_name" field with the label Ids converted to their label names
        if self.label_mapping:
            for label_attr in label_attrs:
                ds = ds.map(lambda example, attr=label_attr: {f"{attr}_name": self.label_mapping[example[attr]]})

        # tokenize in batches
        ds = ds.map(self._tokenize_function, batched=True, batch_size=self.batch_size)

        # return the pre-processed Dataset
        return ds


#### Using a `DataFrame`

In [None]:
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_df = preprocessor.process_df(imdb_df)

# sanity checks: every example survives preprocessing and gains its label name plus the tokenizer outputs
assert len(proc_df) == len(imdb_df)
assert {"label_name", "input_ids", "attention_mask"} <= set(proc_df.columns)

proc_df.head(2)


Unnamed: 0,text,label,is_valid,label_name,input_ids,attention_mask
0,The storyline was okay. Akshay Kumar was good as always and that was the only good thing about the movie. Kareena Kapoor looked bad. There was so hue and cry over her size zero but she did not looked good leaner. I don't know why the hell did Anil Kapoor accepted such a bad role. There was nothing much to do for him in the movie. Just because it is a Yashraj film does not means that an actor should accept the role however bad it is. Said Ali khan was alright. I think that it is high time that Indian directors and producers start thinking of Indian customers as intelligent lot. What are we ...,0,False,neg,"[0, 20, 19879, 21, 8578, 4, 83, 19437, 857, 6706, 21, 205, 25, 460, 8, 14, 21, 5, 129, 205, 631, 59, 5, 1569, 4, 19492, 4242, 7131, 4623, 1415, 1099, 4, 345, 21, 98, 30863, 8, 8930, 81, 69, 1836, 4276, 53, 79, 222, 45, 1415, 205, 11257, 254, 4, 38, 218, 75, 216, 596, 5, 7105, 222, 660, 718, 7131, 4623, 3903, 215, 10, 1099, 774, 4, 345, 21, 1085, 203, 7, 109, 13, 123, 11, 5, 1569, 4, 1801, 142, 24, 16, 10, 854, 1671, 763, 267, 822, 473, 45, 839, 14, 41, 2701, 197, 3264, 5, ...]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]"
1,"The movie's storyline is pat and quaint. Two women travel through the middle east and discover themselves. Unfortunately, if you are looking for a movie about the middle east and central Asia this is absolutely terrible.<br /><br />The producers of the film either did no research or were unbelievably lazy when filming it. To begin with, and most glaringly incorrect, the Nuristanis, as they were known in the thirties, and indeed since the 1890s and their forceful conversion by Abdul-Rahman Shah of Aghnaistan, were not nomads. In fact they have not been nomads since the Aryan invasions of ce...",0,False,neg,"[0, 20, 1569, 18, 19879, 16, 10512, 8, 36579, 4, 1596, 390, 1504, 149, 5, 1692, 3017, 8, 8286, 1235, 4, 6802, 6, 114, 47, 32, 546, 13, 10, 1569, 59, 5, 1692, 3017, 8, 1353, 1817, 42, 16, 3668, 6587, 49069, 3809, 1589, 49007, 3809, 48709, 133, 4426, 9, 5, 822, 1169, 222, 117, 557, 50, 58, 37064, 22414, 77, 9293, 24, 4, 598, 1642, 19, 6, 8, 144, 26077, 352, 17401, 6, 5, 17474, 7566, 354, 6, 25, 51, 58, 684, 11, 5, 3553, 9211, 918, 6, 8, 5329, 187, 5, 37708, 29, 8, 49, 32165, 10012, 30, ...]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]"


#### Using a Hugging Face `Dataset`

In [None]:
# Same configuration as the DataFrame example, this time applied to the Hugging Face `Dataset` built in the setup
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_ds = preprocessor.process_hf_dataset(final_ds)
# display the resulting dataset summary (features and number of rows)
proc_ds


  0%|          | 0/1200 [00:00<?, ?ex/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'is_valid', 'label', 'label_name', 'text'],
    num_rows: 1200
})

## Mid-level API

Base tokenization, batch transform, and DataBlock methods

### `HF_BaseInput`

A `HF_BaseInput` object is returned from the decodes method of `HF_AfterBatchTransform` as a means to customize @typedispatched functions like `DataLoaders.show_batch` and `Learner.show_results`. It uses the "input_ids" of a Hugging Face object as the representative tensor for `show` methods

In [None]:
# export
class HF_BaseInput(TensorBase):
    """The base representation of your inputs; the various fastai `show` methods use it to display decoded text"""

    def show(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The "context" associated to the current `show_batch/results` call
        ctx=None,
        # Any truncation you want to apply to the decoded tokenized inputs
        trunc_at: int = None,
        # A decoded string of your tokenized inputs (input_ids)
    ) -> str:
        # decoding happens on a CPU numpy copy of this tensor
        token_ids = self.cpu().numpy()
        text = hf_tokenizer.decode(token_ids, skip_special_tokens=True)

        # `trunc_at=None` keeps the full decoded string
        text = str(text)[:trunc_at]

        return show_title(text, ctx=ctx, label="text")


### `HF_BeforeBatchTransform` 

Inspired by this [article](https://docs.fast.ai/tutorial.transformers.html), inputs can come in as raw text, a list of words (e.g., tasks like Named Entity Recognition (NER), where you want to predict the label of each token), or pre-processed "input_ids"

**On-the-fly Batch-Time Tokenization**: 

The previous version of the library performed the tokenization/numericalization as a type transform when the raw data was read, and included a couple batch transforms to prepare the data for collation (e.g., to be made into a mini-batch). With this update, everything is done in a single batch transform.  

Why?  Part of the inspiration had to do with the mechanics of the Hugging Face tokenizer, in particular how by default it returns a collated mini-batch of data given a list of sequences. And where do we get a list of examples with fastai? In the batch transforms!  So I thought, hey, why not do everything dynamically at batch time?  And with a bit of tweaking, I got everything to work pretty well.  The result is *less code*, *faster mini-batch creation*, *less RAM utilization* and time spent tokenizing (really helps with very large datasets), and *more flexibility*.

In [None]:
# export
class HF_BeforeBatchTransform(Transform):
    """
    Tokenizes the inputs of a list of samples at batch time and rebuilds each sample so that its first element
    is the dictionary of tokenizer outputs (e.g., input_ids, attention_mask) for that example.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # If you are passing in the "input_ids" as your inputs, set `is_pretokenized` = True
        is_pretokenized: bool = False,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `HF_BeforeBatchTransform`
        **kwargs
    ):
        # every constructor argument (including the extra `kwargs` dict) is kept as an attribute of the same name
        store_attr(
            self=self,
            names=(
                "hf_arch, hf_config, hf_tokenizer, hf_model, is_pretokenized, "
                "max_length, padding, truncation, is_split_into_words, ignore_token_id, tok_kwargs, "
                "kwargs"
            ),
        )

    def encodes(self, samples, return_batch_encoding=False):
        """
        Performs the batch-time tokenization: the first element of every sample is run through the tokenizer
        and replaced with a dictionary of that example's tokenizer outputs. With `return_batch_encoding=True`
        the tokenizer's full return value is returned as well.
        """
        samples = L(samples)

        # a list-like first input is treated as a (text, text_pair) example unless the inputs are declared to be
        # word lists (`is_split_into_words`) or input_ids (`is_pretokenized`)
        first_inp = samples[0][0]
        is_raw_text = not (self.is_split_into_words or self.is_pretokenized)
        if is_listy(first_inp) and is_raw_text:
            inps = list(zip(samples.itemgot(0, 0), samples.itemgot(0, 1)))
        else:
            inps = samples.itemgot(0).items

        # "input_ids" have already been tokenized/numericalized, so the remaining sequence attributes are built with
        # `prepare_for_model`; raw inputs are tokenized by calling the tokenizer itself
        if self.is_pretokenized:
            tokenize = self.hf_tokenizer.prepare_for_model
        else:
            tokenize = self.hf_tokenizer

        inputs = tokenize(
            inps,
            max_length=self.max_length,
            padding=self.padding,
            truncation=self.truncation,
            is_split_into_words=self.is_split_into_words,
            return_tensors="pt",
            **self.tok_kwargs
        )

        # swap each sample's raw input for its slice of the tokenized inputs (e.g. input_ids, attention_mask, etc...)
        d_keys = inputs.keys()
        updated_samples = []
        for idx, sample in enumerate(samples):
            example_inputs = {k: inputs[k][idx] for k in d_keys}
            updated_samples.append((example_inputs, *sample[1:]))

        return (updated_samples, inputs) if return_batch_encoding else updated_samples


### `HF_AfterBatchTransform`

With fastai 2.1.5, before batch transforms no longer have a `decodes` method ... and so, I've introduced a standard batch transform here (one that occurs "after" the batch has been created) that will do the decoding for us.

In [None]:
# export
class HF_AfterBatchTransform(Transform):
    """Casts the inputs of a batch into a type that the fastai `show` methods know how to display"""

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
    ):
        store_attr(self=self, names="hf_tokenizer, input_return_type")

    def decodes(
        self,
        # The encoded samples for your batch. `input_ids` will be pulled out of your dictionary of Hugging Face
        # inputs, cast to `self.input_return_type` and returned for methods such as `show_batch`
        encoded_samples: Type,
    ):
        """Returns the object used by the show-related fastai methods"""
        # anything that is not a dictionary of Hugging Face inputs is handed back untouched
        if not isinstance(encoded_samples, dict):
            return encoded_samples

        input_ids = encoded_samples["input_ids"]
        return self.input_return_type(input_ids, hf_tokenizer=self.hf_tokenizer)


### `HF_TextBlock`

A basic wrapper that links default transforms for the Data Block API, `HF_TextBlock` is designed with sensible defaults to minimize user effort in defining their transforms pipeline. It handles setting up your `HF_BeforeBatchTransform` and `HF_AfterBatchTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever). 

You must either pass in your own instance of a `HF_BeforeBatchTransform` class or the Hugging Face objects returned from `BLURR.get_hf_objects` (e.g., architecture, config, tokenizer, and model). The other args are optional.

We also include a `blurr_sort_func` that works with `SortedDL` to sort based on the number of tokens in each example.

In [None]:
# export
def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # If you are passing in the "input_ids" as your inputs, set `is_pretokenized` = True
    is_pretokenized: bool = False,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """Sort key for `SortedDL`: the length of an example's input, measured in tokens for raw text"""
    sequence = example[0]

    # word lists and input_ids are measured as they are; only raw text has to be tokenized first
    needs_tokenizing = not (is_split_into_words or is_pretokenized)
    if needs_tokenizing:
        sequence = hf_tokenizer.tokenize(sequence, **tok_kwargs)

    return len(sequence)


In [None]:
# export
class HF_TextBlock(TransformBlock):
    """The core `TransformBlock` to prepare your data for training in Blurr with fastai's `DataBlock` API"""

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_arch: str = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_model: PreTrainedModel = None,
        # If you are passing in the "input_ids" as your inputs, set `is_pretokenized` = True
        is_pretokenized: bool = False,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # The before batch transform you want to use to tokenize your raw data on the fly
        # (defaults to an instance of `HF_BeforeBatchTransform` created using the Hugging Face objects defined above)
        before_batch_tfm: HF_BeforeBatchTransform = None,
        # The batch_tfms to apply to the creation of your DataLoaders,
        # (defaults to HF_AfterBatchTransform created using the Hugging Face objects defined above)
        after_batch_tfm: HF_AfterBatchTransform = None,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type: DataLoader = None,
        # Any keyword arguments you want applied to your before batch tfm
        before_batch_kwargs: dict = {},
        # Any keyword arguments you want applied to your after batch tfm (or referred to in fastai as `batch_tfms`)
        after_batch_kwargs: dict = {},
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs: dict = {},
        # Any keyword arguments you want to have applied with generating text
        text_gen_kwargs: dict = {},
        # Any keyword arguments you want applied to `HF_TextBlock`
        **kwargs
    ):
        if (not all([hf_arch, hf_config, hf_tokenizer, hf_model])) and before_batch_tfm is None:
            raise ValueError("You must supply an hf_arch, hf_config, hf_tokenizer, hf_model -or- a HF_BeforeBatchTransform")

        # build the default before batch transform from the Hugging Face objects; the dict arguments are copied so
        # the transform never shares state with the (mutable) defaults above
        if before_batch_tfm is None:
            before_batch_tfm = HF_BeforeBatchTransform(
                hf_arch,
                hf_config,
                hf_tokenizer,
                hf_model,
                is_pretokenized=is_pretokenized,
                ignore_token_id=ignore_token_id,
                max_length=max_length,
                padding=padding,
                truncation=truncation,
                is_split_into_words=is_split_into_words,
                tok_kwargs=tok_kwargs.copy(),
                **before_batch_kwargs.copy()
            )

        if after_batch_tfm is None:
            after_batch_tfm = HF_AfterBatchTransform(
                hf_tokenizer=before_batch_tfm.hf_tokenizer, input_return_type=input_return_type, **after_batch_kwargs.copy()
            )

        if dl_type is None:
            # every sort setting is read from the before batch transform so that sorting always agrees with how the
            # batches are actually tokenized, including when a custom `before_batch_tfm` is passed in; the
            # `is_pretokenized` argument is only a fallback for custom transforms that do not expose the attribute
            dl_sort_func = partial(
                blurr_sort_func,
                hf_tokenizer=before_batch_tfm.hf_tokenizer,
                is_pretokenized=getattr(before_batch_tfm, "is_pretokenized", is_pretokenized),
                is_split_into_words=before_batch_tfm.is_split_into_words,
                tok_kwargs=before_batch_tfm.tok_kwargs.copy(),
            )

            dl_type = partial(SortedDL, sort_func=dl_sort_func)

        # set the TransformBlock's Hugging Face objects
        self.hf_arch = before_batch_tfm.hf_arch
        self.hf_config = before_batch_tfm.hf_config
        self.hf_tokenizer = before_batch_tfm.hf_tokenizer
        self.hf_model = before_batch_tfm.hf_model

        super().__init__(dl_type=dl_type, dls_kwargs={"before_batch": before_batch_tfm}, batch_tfms=after_batch_tfm)


## Low-level API

For working with PyTorch and/or fast.ai Datasets & DataLoaders, the low-level API allows you to get back fast.ai specific features such as `show_batch`, `show_results`, etc... when using plain ol' PyTorch Datasets, Hugging Face Datasets, etc...

In [None]:
# export
@dataclass
class BlurrBatchCreator:
    """
    A callable that can be assigned to a `TfmdDL.create_batch` method; used in Blurr's low-level API
    to collate a list of examples into a batch that the Blurr library can work with
    """

    def __init__(
        self,
        # Your Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # Defaults to use Hugging Face's DataCollatorWithPadding(tokenizer=hf_tokenizer)
        data_collator: Type = None,
    ):
        self.hf_tokenizer = hf_tokenizer

        # only build the default padding collator when the caller did not supply one
        if data_collator:
            self.data_collator = data_collator
        else:
            self.data_collator = DataCollatorWithPadding(tokenizer=hf_tokenizer)

    def __call__(self, features):  # A mini-batch (list of examples to run through your model)
        """Collate `features` with `self.data_collator`.

        When the examples are dictionaries a 2-tuple is returned: a plain `dict` copy of the collated batch
        followed by the collated `labels` (if the first example has a "labels" key). Any other kind of
        example is returned exactly as the collator produced it.
        """
        collated = self.data_collator(features)

        first_example = features[0]
        if not isinstance(first_example, dict):
            return collated

        # NOTE(review): the conditional only picks the *second* element of the tuple, so unlabeled
        # dictionary examples come back as `(inputs, second_copy_of_inputs)` - confirm this is intended
        if "labels" in first_example:
            second = collated["labels"]
        else:
            second = dict(collated)

        return dict(collated), second


In [None]:
# export
class BlurrBatchTransform(HF_AfterBatchTransform):
    """A class used to cast your inputs into something understandable in fastai `show` methods.

    On top of what `HF_AfterBatchTransform` sets up, the Hugging Face objects and tokenization settings are
    stored as attributes so that methods such as `show_batch` can look them up on this transform. Any extra
    keyword arguments are kept in `self.kwargs` (e.g., `show_batch` reads `kwargs["labels"]` when present).
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch: str = None,
        # A Hugging Face configuration object
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model
        hf_model: PreTrainedModel = None,
        # If you are passing in the "input_ids" as your inputs, set `is_pretokenized` = True
        is_pretokenized: bool = False,
        # The token ID to ignore when calculating loss/metrics
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Any text generation keyword arguments
        text_gen_kwargs: dict = {},
        # The return type your decoded inputs should be cast too (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # Any other keyword arguments you want stored on the transform as `self.kwargs`
        **kwargs
    ):
        super().__init__(hf_tokenizer=hf_tokenizer, input_return_type=input_return_type)

        # Store private copies: the `{}` defaults are created once at definition time, so keeping a reference
        # to them (or to a caller-owned dict) would let a change made through one instance leak into others.
        # `store_attr` reads the current values of these local names, so rebinding them here is sufficient.
        tok_kwargs = dict(tok_kwargs) if tok_kwargs else {}
        text_gen_kwargs = dict(text_gen_kwargs) if text_gen_kwargs else {}

        store_attr(self=self, names="hf_arch, hf_config, hf_model, hf_tokenizer, is_pretokenized, ignore_token_id")
        store_attr(self=self, names="is_split_into_words, tok_kwargs, text_gen_kwargs, kwargs")


In [None]:
# export
@delegates()
class BlurrDataLoader(TfmdDL):
    """A class that makes it easy to create a fast.ai `DataLoader` that works with Blurr.

    Batches are built by a `BlurrBatchCreator` (installed as `create_batch`) and decoded by a
    `BlurrBatchTransform` (installed as `after_batch`). The Hugging Face objects are stored on the
    instance so that `new` can hand them to the copies it creates.
    """

    def __init__(
        self,
        # A standard PyTorch Dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch: str,
        # A Hugging Face configuration object
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # An instance of `BlurrBatchCreator` or equivalent
        batch_creator: BlurrBatchCreator = None,
        # The batch_tfm used to decode Blurr batches (default: a `BlurrBatchTransform` built from the
        # Hugging Face objects above and `batch_tfm_kwargs`)
        batch_tfm: BlurrBatchTransform = None,
        # (optional) A preprocessing function that will be applied to your dataset
        preproccesing_func: Callable[
            [Union[torch.utils.data.dataset.Dataset, Datasets], PreTrainedTokenizerBase, PreTrainedModel],
            Union[torch.utils.data.dataset.Dataset, Datasets],
        ] = None,
        # Keyword arguments to be applied to your `batch_tfm`
        batch_tfm_kwargs: dict = {},
        # Keyword arguments to be applied to `BlurrDataLoader`
        **kwargs,
    ):
        if preproccesing_func:
            dataset = preproccesing_func(dataset, hf_tokenizer, hf_model)

        # `new` forwards the current `create_batch`/`after_batch` objects as keyword arguments. They always
        # have to be removed from `kwargs` (they are passed explicitly to `super().__init__` below), but
        # instead of throwing them away we fall back to them so that a copy made by `new` keeps the custom
        # collator / batch transform (and its kwargs such as `labels`) of the loader it was created from.
        forwarded_create_batch = kwargs.pop("create_batch", None)
        if not batch_creator:
            if forwarded_create_batch is not None:
                batch_creator = forwarded_create_batch
            else:
                batch_creator = BlurrBatchCreator(hf_tokenizer=hf_tokenizer)

        forwarded_after_batch = kwargs.pop("after_batch", None)
        if not batch_tfm:
            if forwarded_after_batch is not None:
                batch_tfm = forwarded_after_batch
            else:
                # `.copy()` keeps the shared `{}` default (or the caller's dict) from being aliased
                batch_tfm = BlurrBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model, **batch_tfm_kwargs.copy())

        super().__init__(dataset=dataset, create_batch=batch_creator, after_batch=batch_tfm, **kwargs)
        store_attr(self=self, names="hf_arch, hf_config, hf_tokenizer, hf_model")

    def new(
        self,
        # A standard PyTorch and fastai dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets] = None,
        # The class you want to create an instance of (will be "self" if None)
        cls: Type = None,
        #  Any additional keyword arguments you want to pass to the __init__ method of `cls`
        **kwargs,
    ):
        """We have to override the new method in order to add back the Hugging Face objects in this factory
        method (called for example in places like `show_results`). With the exception of the additions to the kwargs
        dictionary, the code below follows fastai's `DataLoader.new` method.
        """
        if dataset is None:
            dataset = self.dataset
        if cls is None:
            cls = type(self)

        cur_kwargs = dict(
            dataset=dataset,
            num_workers=self.fake_l.num_workers,
            pin_memory=self.pin_memory,
            timeout=self.timeout,
            bs=self.bs,
            shuffle=self.shuffle,
            drop_last=self.drop_last,
            indexed=self.indexed,
            device=self.device,
        )

        # carry over every customized hook (anything in `_methods` that was replaced by a non-method object,
        # which includes the `create_batch` and `after_batch` objects installed in `__init__`)
        for n in self._methods:
            o = getattr(self, n)
            if not isinstance(o, MethodType):
                cur_kwargs[n] = o

        # we need to add these arguments back in (these, after_batch, and create_batch will go in as kwargs)
        kwargs["hf_arch"] = self.hf_arch
        kwargs["hf_config"] = self.hf_config
        kwargs["hf_tokenizer"] = self.hf_tokenizer
        kwargs["hf_model"] = self.hf_model

        return cls(**merge(cur_kwargs, kwargs))


## Utility classes and methods 

These methods are used internally to get the blurr transforms associated with your `DataLoaders`

In [None]:
# export
def get_blurr_tfm(
    # A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)
    tfms_list: Pipeline,
    # The transform to find
    tfm_class: Transform = HF_BeforeBatchTransform,
):
    """
    Walk through the batch transforms of a fastai DataLoaders and hand back the first one whose type is
    `tfm_class` (or a subclass of it); `None` is returned when no transform matches
    """
    for candidate in tfms_list:
        if issubclass(type(candidate), tfm_class):
            return candidate

    return None


In [None]:
show_doc(get_blurr_tfm)


<h4 id="get_blurr_tfm" class="doc_header"><code>get_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_blurr_tfm</code>(**`tfms_list`**:`Pipeline`, **`tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*)

Given a fastai DataLoaders batch transforms, this method can be used to get at a transform
instance used in your Blurr DataBlock

**Parameters:**


 - **`tfms_list`** : *`<class 'fastcore.transform.Pipeline'>`*	<p>A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)</p>


 - **`tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The transform to find</p>



In [None]:
# export
def first_blurr_tfm(
    dls: DataLoaders,  # Your fast.ai `DataLoaders`
    before_batch_tfm_class: Transform = HF_BeforeBatchTransform,  # The before_batch transform to look for
    blurr_batch_tfm_class: Transform = BlurrBatchTransform,  # The after_batch (or batch_tfm) to look for
):
    """
    Locate the Blurr transform that methods such as `show_batch` and `show_results` rely on. The
    `before_batch` transforms are searched first and the `after_batch` transforms second; the transform
    that is returned should carry everything needed to decode and 'show' your Hugging Face inputs/targets
    """
    # the mid-level DataBlock API registers its transform in `before_batch`
    before_tfm = get_blurr_tfm(dls.before_batch, tfm_class=before_batch_tfm_class)
    if not before_tfm:
        # the low-level Blurr data API registers its transform in `after_batch`
        return get_blurr_tfm(dls.after_batch, tfm_class=blurr_batch_tfm_class)

    return before_tfm


In [None]:
show_doc(first_blurr_tfm)


<h4 id="first_blurr_tfm" class="doc_header"><code>first_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>first_blurr_tfm</code>(**`dls`**:`DataLoaders`, **`before_batch_tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*, **`blurr_batch_tfm_class`**:`Transform`=*`BlurrBatchTransform`*)

This convenience method will find the first Blurr transform required for methods such as
`show_batch` and `show_results`. The returned transform should have everything you need to properly
decode and 'show' your Hugging Face inputs/targets

**Parameters:**


 - **`dls`** : *`<class 'fastai.data.core.DataLoaders'>`*	<p>Your fast.ai `DataLoaders</p>


 - **`before_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The before_batch transform to look for</p>


 - **`blurr_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The after_batch (or batch_tfm) to look for</p>



## `show_batch`

In [None]:
# export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `HF_BaseInput` typed inputs
    x: HF_BaseInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation your want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    """Display up to `max_n` decoded inputs next to their target(s) as a DataFrame and return `ctxs`."""
    # grab our tokenizer
    tfm = first_blurr_tfm(dataloaders)
    hf_tokenizer = tfm.hf_tokenizer

    # if we've included our labels list, we'll use it to look up the value of our target(s)
    trg_labels = tfm.kwargs["labels"] if ("labels" in tfm.kwargs) else None

    res = L()
    n_inp = dataloaders.n_inp

    for idx, (input_ids, label, sample) in enumerate(zip(x, y, samples)):
        if idx >= max_n:
            break

        # every row starts with exactly one (optionally truncated) decoded text column
        rets = [hf_tokenizer.decode(input_ids, skip_special_tokens=True)[:trunc_at]]
        for item in sample[n_inp:]:
            if not torch.is_tensor(item):
                trg = trg_labels[int(item)] if trg_labels else item
            elif is_listy(item.tolist()):
                # multi-label target: `.item()` only works on one-element tensors, so the raw values are
                # shown as a list when there are no label names to look up
                label_vals = label.numpy().tolist()
                if trg_labels:
                    trg = [trg_labels[lbl_idx] for lbl_idx, val in enumerate(label_vals) if (val == 1)]
                else:
                    trg = label_vals
            else:
                trg = trg_labels[label.item()] if (trg_labels) else label.item()

            rets.append(trg)
        res.append(tuplify(rets))

    # nothing to display (e.g., an empty batch or `max_n` of 0)
    if len(res) == 0:
        return ctxs

    # each row holds one text column followed by its targets, so the number of target columns is the row
    # length minus that single text column (this equals `len(res[0]) - n_inp` whenever `n_inp` is 1)
    cols = ["text"] + ["target" if (i == 0) else f"target_{i}" for i in range(len(res[0]) - 1)]
    display_df(pd.DataFrame(res, columns=cols)[:max_n])
    return ctxs


## Sequence classification

The following examples demonstrate several approaches to constructing your `DataBlock` for sequence classification tasks using the mid-level API, and also an example of how to accomplish the same using the low-level API and standard PyTorch/Hugging Face/fast.ai Datasets and DataLoaders.

### Using the mid-level API

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects.

There are a bunch of ways we can get at the four Hugging Face elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR`.

In [None]:
# hide_output
from transformers import AutoModelForSequenceClassification

model_cls = AutoModelForSequenceClassification

pretrained_model_name = "distilroberta-base"  # "distilbert-base-uncased" "bert-base-uncased"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


#####  Step 2: Create your `DataBlock`

In [None]:
blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, before_batch_kwargs={"labels": labels}), CategoryBlock)
dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)


In [None]:
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])


(2, 4, torch.Size([4, 512]), 4)

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)


{tuple: [dict, fastai.torch_core.TensorCategory]}

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"With Iphigenia, Mikhali Cacoyannis is perhaps the first film director to have successfully brought the feel of ancient Greek theatre to the screen. His own screenplay, an adaptation of Euripides' tragedy, was far from easy, compared to that of the other two films of the trilogy he directed. The story has been very carefully deconstructed from Euripides' version and placed in a logical, strictly chronological framework, better conforming to the modern methods of cinematic story-telling. Cacoyann",pos
1,"Well, maybe the PC version of this game was impressive. Maybe. I just finished playing the PS2 version and it's pretty much a complete mess.<br /><br />There are a couple elements that are okay or promising. I'll mention those first because it will be over quickly. First, the idea of a historical GTA-like game is a great one. The game Gun was a historical GTA-like game and unlike Mafia, Gun was excellent. I'd love to see a game set during Mafia's era done right. Next, the storyline is well writ",neg


#### Pre-tokenized/numericalized

BLURR now also works with pre-processed datasets where your inputs are actually "input_ids".  Preprocessing your raw data is the more traditional approach to using Transformers, and is required when you are working with documents that may be longer than your model can handle.  In the latter case, in addition to task specific preprocessing, you typically want to tell your tokenizer to create "chunks" of text from such documents by setting `"return_overflowing_tokens": True`.

Below is an example of how we can use pre-tokenized/numericalized inputs

##### Step 1: Get your Hugging Face objects.

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


##### Step 1b. Preprocess dataset

In [None]:
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_ds = preprocessor.process_hf_dataset(final_ds)
proc_ds


  0%|          | 0/1200 [00:00<?, ?ex/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'is_valid', 'label', 'label_name', 'text'],
    num_rows: 1200
})

##### Step 2: Create your `DataBlock`

In [None]:
blocks = (
    HF_TextBlock(
        hf_arch,
        hf_config,
        hf_tokenizer,
        hf_model,
        is_pretokenized=True,
        before_batch_kwargs={"labels": labels},
        tok_kwargs={"add_special_tokens": False},
    ),
    CategoryBlock,
)
dblock = DataBlock(blocks=blocks, get_x=ItemGetter("input_ids"), get_y=ItemGetter("label"), splitter=RandomSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(proc_ds, bs=4)


In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"This film is pure, distilled, unadulterated boredom. I knew nothing of it before I entered the dark room, took my seat. I was seduced by the ""mysterious and suspenseful"" blurb on the poster I suppose. Also, Lena Headey is nice and unconventionally sexy, and Richard Jenkins is always a reliable guy to have around, so the cast seemed reasonable. It may have been his name above the title that convinced me to go with this instead of whatever else was on. I should've gone to see Valkyrie for the sec",neg
1,"As we all know the sub-genre of sex comedies is pretty crowded. Simply being excessively raunchy isn't enough anymore. I've seen and heard so many disgusting jokes and actions that a sex comedy really needs to have other positive points to appeal to me these days.<br /><br />Coming into the 40 Year Old Virgin I knew basically what to expect; I did see the commercials after all; ""is it true that if you don't use it, you lose it?"" What I didn't expect to find is a heart and honest attempts at cha",pos


### Using the low-level API

#### Step 1: Build your datasets

In [None]:
raw_datasets = load_dataset("glue", "mrpc")


Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
raw_datasets["train"].features
raw_datasets["train"]


Dataset({
    features: ['idx', 'label', 'sentence1', 'sentence2'],
    num_rows: 3668
})

In [None]:
# MRPC ships its own label names; the `labels` list used earlier belongs to the IMDB examples, so passing
# it here would attach the wrong names to the `label_name` column
mrpc_label_names = raw_datasets["train"].features["label"].names

preprocessor = ClassificationPreprocessor(hf_tokenizer, text_attrs=["sentence1", "sentence2"], label_mapping=mrpc_label_names)
proc_dataests = preprocessor.process_hf_dataset(raw_datasets)
proc_dataests


  0%|          | 0/3668 [00:00<?, ?ex/s]

  0%|          | 0/408 [00:00<?, ?ex/s]

  0%|          | 0/1725 [00:00<?, ?ex/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'label_name', 'sentence1', 'sentence2'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'label_name', 'sentence1', 'sentence2'],
        num_rows: 408
    })
    test: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'label_name', 'sentence1', 'sentence2'],
        num_rows: 1725
    })
})

#### Step 2: Dataset pre-processing (optional)

In [None]:
# export
def preproc_hf_dataset(
    # A standard PyTorch Dataset or fast.ai Datasets
    dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A Hugging Face model
    hf_model: PreTrainedModel,
):
    """This method can be used to preprocess most Hugging Face Datasets for use in Blurr and other training
    libraries.

    A "label" column is renamed to "labels", every column that is not a parameter of `hf_model.forward` is
    removed, and the format of the resulting dataset is set to "torch". The processed dataset is returned.
    `hf_tokenizer` is not used here; it is part of the signature expected of a preprocessing function.
    """
    if "label" in dataset.column_names:
        dataset = dataset.rename_column("label", "labels")

    # only columns the model's `forward` accepts can be fed to it
    fwd_params = inspect.signature(hf_model.forward).parameters

    # `remove_columns` is documented to take a `str` or a `List[str]`, so build an (ordered) list rather
    # than a set. It is called even when the list is empty so that `set_format` below is applied to the
    # object it returns and not to the caller's dataset.
    bad_cols = [col for col in dataset.column_names if col not in fwd_params]
    dataset = dataset.remove_columns(bad_cols)

    dataset.set_format("torch")
    return dataset


#### Step 3: Build your `DataLoaders`.

Use `BlurrDataLoader` to build Blurr friendly dataloaders from your datasets. Passing `{'labels': label_names}` as your `batch_tfm_kwargs` will ensure that your label/target names are displayed in methods like `show_batch` and `show_results` (just as it works with the mid-level API)

In [None]:
label_names = raw_datasets["train"].features["label"].names

# everything the training and validation dataloaders have in common
blurr_hf_objects = (hf_arch, hf_config, hf_tokenizer, hf_model)
blurr_dl_kwargs = dict(preproccesing_func=preproc_hf_dataset, batch_tfm_kwargs={"labels": label_names})

# only the training dataloader is shuffled
trn_dl = BlurrDataLoader(proc_dataests["train"], *blurr_hf_objects, shuffle=True, batch_size=8, **blurr_dl_kwargs)
val_dl = BlurrDataLoader(proc_dataests["validation"], *blurr_hf_objects, batch_size=16, **blurr_dl_kwargs)

dls = DataLoaders(trn_dl, val_dl)


In [None]:
b = dls.one_batch()
b[0]["input_ids"].shape


torch.Size([8, 74])

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=800)


Unnamed: 0,text,target
0,Prosecutor Jim Hardin called the decision a victory for Kathleen Peterson's family. Members of Kathleen Peterson's family were not present.,not_equivalent
1,"Zulifquar Ali, a worshiper slightly wounded by shrapnel, said the attackers first targeted the mosque's guards. Witness Zulfiqar Ali, who was slightly wounded by shrapnel, said the attackers had focused on the mosque's guards.",equivalent


## Tests

The tests below ensure the core DataBlock code above works for **all** pretrained sequence classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# hide
[model_type for model_type in BLURR.get_models(task="SequenceClassification") if (not model_type.startswith("TF"))]


['AlbertForSequenceClassification',
 'BartForSequenceClassification',
 'BertForSequenceClassification',
 'BigBirdForSequenceClassification',
 'BigBirdPegasusForSequenceClassification',
 'CTRLForSequenceClassification',
 'CamembertForSequenceClassification',
 'CanineForSequenceClassification',
 'ConvBertForSequenceClassification',
 'DebertaForSequenceClassification',
 'DebertaV2ForSequenceClassification',
 'DistilBertForSequenceClassification',
 'ElectraForSequenceClassification',
 'FNetForSequenceClassification',
 'FlaubertForSequenceClassification',
 'FunnelForSequenceClassification',
 'GPT2ForSequenceClassification',
 'GPTJForSequenceClassification',
 'GPTNeoForSequenceClassification',
 'HubertForSequenceClassification',
 'IBertForSequenceClassification',
 'LEDForSequenceClassification',
 'LayoutLMForSequenceClassification',
 'LayoutLMv2ForSequenceClassification',
 'LongformerForSequenceClassification',
 'MBartForSequenceClassification',
 'MPNetForSequenceClassification',
 'MegatronB

In [None]:
# hide
pretrained_model_names = [
    "hf-internal-testing/tiny-albert",
    "hf-internal-testing/tiny-random-bart",
    "hf-internal-testing/tiny-bert",
    "google/bigbird-roberta-base",
    "google/bigbird-pegasus-large-arxiv",
    "hf-internal-testing/tiny-random-ctrl",
    "camembert-base",
    "hf-internal-testing/tiny-random-canine",
    "YituTech/conv-bert-base",
    "hf-internal-testing/tiny-deberta",
    "hf-internal-testing/tiny-random-deberta-v2",
    "hf-internal-testing/tiny-random-distilbert",
    "hf-internal-testing/tiny-electra",
    "google/fnet-base",
    "hf-internal-testing/tiny-random-flaubert",
    "hf-internal-testing/tiny-random-funnel",
    "hf-internal-testing/tiny-random-gpt2",
    "anton-l/gpt-j-tiny-random",
    "hf-internal-testing/tiny-random-gpt_neo",
    "kssteven/ibert-roberta-base",
    "hf-internal-testing/tiny-random-led",
    "hf-internal-testing/tiny-random-longformer",
    "hf-internal-testing/tiny-random-mbart",
    "hf-internal-testing/tiny-random-mpnet",
    # "nvidia/megatron-bert-cased-345m",                 could not test
    "hf-internal-testing/tiny-random-mobilebert",
    "openai-gpt",
    "google/reformer-crime-and-punishment",
    "google/rembert",
    "junnyu/roformer_chinese_sim_char_ft_small",
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    "hf-internal-testing/tiny-random-transfo-xl",
    "xlm-mlm-en-2048",
    "xlm-roberta-base",
    "xlnet-base-cased",
]


In [None]:
# hide
# for model_name in pretrained_model_names:
#     tok = AutoTokenizer.from_pretrained(model_name)
#     print(f'=== {model_name} ===')
#     print(f'=== {tok.padding_side} ===')
#     print(f'=== {tok.pad_token_id} ===')
#     print(tok(['hi', 'hello everyone. its good to be here'], ['yo', 'yo'], padding='max_length', max_length=128))


In [None]:
# hide
# Load the IMDB train/test splits and flag every row with the split it came from:
# `is_valid` is False for the training rows and True for the test rows
raw_datasets = load_dataset("imdb", split=["train", "test"])
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

# Keep a small sample (1,000 shuffled training rows + 200 shuffled test rows) and expose it as a DataFrame
# NOTE(review): `shuffle()` is called without a seed, so the sampled rows presumably differ between runs
final_ds = concatenate_datasets([raw_datasets[0].shuffle().select(range(1000)), raw_datasets[1].shuffle().select(range(200))])
imdb_df = pd.DataFrame(final_ds)


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# hide
from transformers import RobertaTokenizer

# Smoke-test `HF_TextBlock` inside a fastai `DataBlock` for every checkpoint in `pretrained_model_names`.
# Exactly one row is appended to `test_results` per checkpoint:
#   (architecture, tokenizer class name, checkpoint name, "PASSED" | "FAILED", "" | raised exception)
model_cls = AutoModelForSequenceClassification
bsz = 2
seq_sz = 128

test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    # checkpoints whose name contains "/ibert" get an explicit RobertaTokenizer; all others pass `tokenizer_cls=None`
    tok_class = RobertaTokenizer if ("/ibert" in model_name) else None

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls, tokenizer_cls=tok_class)

    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        # the vocabulary grew by one token, so the embedding matrix is resized to match
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    try:
        # pad every example to `seq_sz` so the batch shape asserted below is fixed
        blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz), CategoryBlock)

        dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())
        dls = dblock.dataloaders(imdb_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)

        dls.show_batch(dataloaders=dls, max_n=2, trunc_at=1000)

    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "FAILED", err))

    else:
        # record PASSED only once every check *and* `show_batch` succeeded; appending it before `show_batch`
        # would leave both a PASSED and a FAILED row for the same checkpoint if rendering raised
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", ""))


=== hf-internal-testing/tiny-albert ===

architecture:	albert
tokenizer:	AlbertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"beat a path to this important documentary that looks like an attractive feature. forbidden lie$(2007) is simply a better (cinematic) version of norma khouri's book forbidden love, and that was a best-seller. an onion-peeling of literary fraud and of a pretty woman, lie$ is the very best in editorialised reality tv.br /br /cleverly edited and colourful, broinowski's storytelling is chaptered",1
1,"before i begin, i want to briefly say that this movie in and of itself is very well made and well acted by all involved, including whittaker, who indeed deserves his nomination. it is highly entertaining, and... taken in the right context as a work of fiction, it is a very good movie. for that, i give it the two stars.br /br /however, rather than wasting your time with what you can read a hundred times elsewhere, i want instead to point out",0


=== hf-internal-testing/tiny-random-bart ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman,",1
1,"The documentary revolves around Eva Mozes Kor, a holocaust survivor, part of Mengele's experiments on twins, consisting primarily her version of what happened at Auschwitz, and a comparison of the emotions of the other survivors of the twin experiments. The movie obviously had great reviews. It's one of those",0


=== hf-internal-testing/tiny-bert ===

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"beat a path to this important documentary that looks like an attractive feature. forbidden lie $ ( 2007 ) is simply a better ( cinematic ) version of norma khouri's book forbidden love, and that was a best - seller. an onion - peeling of literary fraud and of a pretty woman, lie $ is the very best in editorialised reality tv. < br / > < br / > cleverly edited and colourful, broinowski's storytelling is chaptered by moving silhouettes of norma khouri meaningfully blowing smoke. i disagree ( with variety ) that it's overlong ; instead my one slight",1
1,"... and that's a bad thing, because at least if this had been a troma film, it would have had wanton violence and a greater sense of anarchic abandon that might have brought my rating up a bit. < br / > < br / > so what we have instead is a very tame ( rated pg ), barely lukewarm, low budget ( roger corman produced it with an unknown director who has subsequently remained unknown ) gremlins ( 1984 ) / critters ( 1986 ) - wannabe with almost exclusively flat humor, little of the logic that made gremlins work",0


=== google/bigbird-roberta-base ===



normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


architecture:	big_bird
tokenizer:	BigBirdTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in editorialised reality TV.<br /><br />Cleverly edited and colourful, Broinowski's storytelling is chaptered by moving silhouettes of Norma Khouri meaningfully blowing smoke. I disagree (with Variety) that it's overlong; instead my one slight problem was",1
1,"I did not know for some time in my youth all that could in general be known about this film however the ways of making a film was not what in fact drew my attention, what made this motion picture one the most liked films even to this very day that I have ever seen was of the Heroism,bravery and the Honor to have served in Her Majestys Service.This film is not always what it seems and that is perhaps as it should be,however I cant say enough for the courage exhibited by Sgt.Cutter in defense of The Uniform that he too would of sacrificed his life to save from peril",1


=== google/bigbird-pegasus-large-arxiv ===

architecture:	bigbird_pegasus
tokenizer:	PegasusTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in editorialised reality TV.br />br />Cleverly edited and colourful, Broinowski's storytelling is chaptered by moving silhouettes of Norma Khouri meaningfully blowing smoke. I disagree (with Variety) that it's overlong; instead my one slight problem was with the episodic",1
1,"I have never seen any of Spike Lee's prior films, as their trailers never caught my interest. I have seen, and admire Denzel Washington, and Jodie Foster's work, and have several of their DVDs. I was, however, entirely disappointed with this movie. If this film is any indication of Spike Lee's ability as a director, my advice would be to ""get a job"", and stop wasting the time and talent of others. br />br />I wonder if some of the other IMDb commentators watched the same movie that I'd seen. I can only assume, from their sappy love",0


=== hf-internal-testing/tiny-random-ctrl ===



  angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
Using pad_token, but it is not set yet.


architecture:	ctrl
tokenizer:	CTRLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in editorialised reality TV.<br /><br />Cleverly edited and colourful, Broinowski's storytelling is chaptered by moving silhouettes of Norma Khouri meaningfully blowing smoke. I disagree (with Variety) that it's overlong; instead my one slight problem was with the episodic nature of its key players commenting on others' just-recorded testimonials. On a",1
1,"I really wanted to love this show. I truly, honestly did.<br /><br />For the first time, gay viewers get their own version of the ""The Bachelor"". With the help of his obligatory ""hag"" Andra, James, a good looking, well-to-do thirty-something has the chance of love with 15 suitors (or ""mates"" as they are referred to in the show). The only problem is half of them are straight and James doesn't know this. If James picks a gay one, they get a trip to New Zealand, and If he picks a straight one, straight guy gets $25,000. How can this not be fun?! Take my hand, lets stroll: <b@@",0


=== camembert-base ===

architecture:	camembert
tokenizer:	CamembertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in editorialised reality TV.<br /><br />Cleverly edited and colourful, Broinowski's storytelling is chaptered",1
1,"I did not know for some time in my youth all that could in general be known about this film however the ways of making a film was not what in fact drew my attention, what made this motion picture one the most liked films even to this very day that I have ever seen was of the Heroism,bravery and the Honor to have served in Her Majestys Service.This film is not always what it seems and that is perhaps as it should",1


=== hf-internal-testing/tiny-random-canine ===



Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


architecture:	canine
tokenizer:	CanineTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cine,1
1,"Oftentimes, films of this nature come across as a mixed bag of great work along with slight drivel to fill the runtime. Whethe",1


=== YituTech/conv-bert-base ===

architecture:	convbert
tokenizer:	ConvBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"beat a path to this important documentary that looks like an attractive feature. forbidden lie $ ( 2007 ) is simply a better ( cinematic ) version of norma khouri's book forbidden love, and that was a best - seller. an onion - peeling of literary fraud and of a pretty woman, lie $ is the very best in editorialised reality tv. < br / > < br / > cleverly edited and colourful, broinowski's storytelling is chaptered by moving silhouettes of norma khouri meaningfully blowing smoke. i disagree ( with variety ) that it's overlong ; instead my one slight",1
1,"... and that's a bad thing, because at least if this had been a troma film, it would have had wanton violence and a greater sense of anarchic abandon that might have brought my rating up a bit. < br / > < br / > so what we have instead is a very tame ( rated pg ), barely lukewarm, low budget ( roger corman produced it with an unknown director who has subsequently remained unknown ) gremlins ( 1984 ) / critters ( 1986 ) - wannabe with almost exclusively flat humor, little of the logic that made gremlins work",0


=== hf-internal-testing/tiny-deberta ===

architecture:	deberta
tokenizer:	DebertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in",1
1,"Oftentimes, films of this nature come across as a mixed bag of great work along with slight drivel to fill the runtime. Whether it be the big name support or the project itself, Paris je t'aime never falls into this realm. I believe I can truly say that the movie as a whole is better than its parts. Between the wonderful transitions and the",1


=== hf-internal-testing/tiny-random-deberta-v2 ===

architecture:	deberta_v2
tokenizer:	DebertaV2Tokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in editorialised reality TV.<br /><br />Cleverly edited and colourful, Broinowski's storytelling is chaptered by moving silhouettes of Norma Khouri meaningfully blowing smoke. I disagree (with Variety) that it's overlong; instead my one slight problem was with the",1
1,"I have never seen any of Spike Lee's prior films, as their trailers never caught my interest. I have seen, and admire Denzel Washington, and Jodie Foster's work, and have several of their DVDs. I was, however, entirely disappointed with this movie. If this film is any indication of Spike Lee's ability as a director, my advice would be to ""get a job"", and stop wasting the time and talent of others. <br /><br />I wonder if some of the other IMDb commentators watched the same movie that I'd seen. I can only assume, from their sappy lovelorn",0


=== hf-internal-testing/tiny-random-distilbert ===

architecture:	distilbert
tokenizer:	DistilBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,beat a path to this important documentary that looks like an attractive feature. forbidden lie $ ( 2007 ) is simply a better ( cinematic ) version of norm,1
1,"i really wanted to love this show. i truly, honestly did. < br / > < br / > for the first time, gay viewers get their own version of the "" the bachelor "". with th",0


=== hf-internal-testing/tiny-electra ===

architecture:	electra
tokenizer:	ElectraTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"beat a path to this important documentary that looks like an attractive feature. forbidden lie $ ( 2007 ) is simply a better ( cinematic ) version of norma khouri's book forbidden love, and that was a best - seller. an onion - peeling of literary fraud and of a pretty woman, lie $ is the very best in editorialised reality tv. < br / > < br / > cleverly edited and colourful, broinow",1
1,"a double life has developed a mystique among film fans for two reasons : the plot idea of an actor getting so wrapped up into a role ( here othello ) as to pick up the great flaw of that character and put it into his life ; and that this is the film that won ronald colman the academy award ( as well as the golden globe ) as best actor. let's take the second point first. < br / > < br / > is anthony john colman's greatest role, or even his signatur",1


=== google/fnet-base ===

architecture:	fnet
tokenizer:	FNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in editorialised reality TV.<br /><br />Cleverly edited and colourful, Broinowski's storytelling is chaptered by moving silhouettes of Norma Khouri meaningfully blowing smoke.",1
1,"A DOUBLE LIFE has developed a mystique among film fans for two reasons: the plot idea of an actor getting so wrapped up into a role (here Othello) as to pick up the great flaw of that character and put it into his life; and that this is the film that won Ronald Colman the Academy Award (as well as the Golden Globe) as best actor. Let's take the second point first.<br /><br />Is Anthony John Colman's greatest role, or even his signature role? I have my doubts on either level",1


=== hf-internal-testing/tiny-random-flaubert ===

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie $ ( 2007 ) is simply a better ( cinematic ) version of Norma Khouri' s book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie $ is the very best in editorialised reality TV. < br / > < br / > Cleverly edited and colourful, Broinowski' s storyt",1
1,"Oftentimes, films of this nature come across as a mixed bag of great work along with slight drivel to fill the runtime. Whether it be the big name support or the project itself, Paris je t' aime never falls into this realm. I believe I can truly say that the movie as a whole is better than its parts. Between the wonderful transitions and the fantastic ending sequence, merging characters together in one last view of love in Paris, I think",1


=== hf-internal-testing/tiny-random-funnel ===

architecture:	funnel
tokenizer:	FunnelTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,beat a path to this important documentary that looks like an attractive feature. forbidden lie $ ( 2007 ) is simply a better ( cinematic ) version of norm,1
1,i did not know for some time in my youth all that could in general be known about this film however the ways of making a film was not what in fact drew my atten,1


=== hf-internal-testing/tiny-random-gpt2 ===



Using pad_token, but it is not set yet.


architecture:	gpt2
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie",1
1,"I am not going to lie. Despite looking interesting, I watched The Notorious Bettie Page because I had heard (and it was fairly obvious just by looking at a synopsis or anything about the film), that Gretchen Mol got naked in it. I have never been a fan of Mol, but I cannot resist seeing an attractive woman taking off her clot",1


=== anton-l/gpt-j-tiny-random ===



Using pad_token, but it is not set yet.


architecture:	gptj
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in editorialised reality TV.<br /><br />Cleverly edited and colourful, Broinowski's storytelling is chaptered by moving silhouettes of Norma Khouri meaningfully blowing smoke. I disagree (with Variety) that it's overlong; instead my one slight problem was with",1
1,"I did not know for some time in my youth all that could in general be known about this film however the ways of making a film was not what in fact drew my attention, what made this motion picture one the most liked films even to this very day that I have ever seen was of the Heroism,bravery and the Honor to have served in Her Majestys Service.This film is not always what it seems and that is perhaps as it should be,however I cant say enough for the courage exhibited by Sgt.Cutter in defense of The Uniform that he too would of sacrificed his life to save from peril of the",1


=== hf-internal-testing/tiny-random-gpt_neo ===



Using pad_token, but it is not set yet.


architecture:	gpt_neo
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie",1
1,Was'nt really bad for Raw's first PPV of 006. But the ending was really really shocking to everyone in attendance & the ones who were watching at home.<br /><br />FIRST MATCH- RIC FLAIR VS. EDGE W/ LITA FOR THE WWE INTERCONTINENTAL CH,1


=== kssteven/ibert-roberta-base ===

architecture:	ibert
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in editorialised reality TV.<br /><br />Cleverly edited and colourful, Broinowski's storytelling is chaptered by moving silhouettes of Norma Khouri meaningfully blowing smoke. I disagree (with Variety) that it's overlong; instead my one slight problem",1
1,"Enterprise, the latest high budget spin-off to the most successful franchise in film and or television history opens to the tune of a 90-minute episode called 'Broken Bow'. First we are swept into a massive action sequence with a Klingon being chased by some Suliban (who are the main enemy in the first season of the show). From there the televised movie takes us on a journey that seldom gets as good as it is, with some of the best character development, story and action/visual effects ever seen in such a short amount of time.<br /><br />The opening-credits is a debatable subject",1


=== hf-internal-testing/tiny-random-led ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman,",1
1,"Rumour has it that around the time that ABBA  the multi-award winning Swedish disco favourites 's star had reached its zenith, the band grew disillusioned with singing in English and yearned to perform in their native tongue. Soon after, problems began to emerge in the onetime-wed locked-watertight partnership and record",1


=== hf-internal-testing/tiny-random-longformer ===

architecture:	longformer
tokenizer:	LongformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman,",1
1,"Oftentimes, films of this nature come across as a mixed bag of great work along with slight drivel to fill the runtime. Whether it be the big name support or the project itself, Paris je t'aime never falls into this realm. I believe I can truly say that the movie as a whole is better than its parts. Between the wonderful transitions and the",1


=== hf-internal-testing/tiny-random-mbart ===

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie(200) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and",1
1,"Oftentimes, films of this nature come across as a mixed bag of great work along with slight drivel to fill the runtime. Whether it be the big name support or the project itself, Paris je t'aime never falls into this realm. I believe I can truly say that the movie as a whole is better than its parts. Between the wonderful tra",1


=== hf-internal-testing/tiny-random-mpnet ===

architecture:	mpnet
tokenizer:	MPNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,beat a path to this important documentary that looks like an attractive feature. forbidden lie $ ( 2007 ) is simply a better ( cinematic ) version of norm,1
1,"rumour has it that around the time that abba the multi - award winning swedish disco favourites's star had reached its zenith, the band grew disillusio",1


=== hf-internal-testing/tiny-random-mobilebert ===

architecture:	mobilebert
tokenizer:	MobileBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,beat a path to this important documentary that looks like an attractive feature. forbidden lie $ ( 2007 ) is simply a better ( cinematic ) version of norm,1
1,"oftentimes, films of this nature come across as a mixed bag of great work along with slight drivel to fill the runtime. whether it be the big name support",1


=== openai-gpt ===



Using pad_token, but it is not set yet.


architecture:	openai
tokenizer:	OpenAIGPTTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"i have never seen any of spike lee's prior films, as their trailers never caught my interest. i have seen, and admire denzel washington, and jodie foster's work, and have several of their dvds. i was, however, entirely disappointed with this movie. if this film is any indication of spike lee's ability as a director, my advice would be to "" get a job "", and stop wasting the time and talent of others. < br / > < br / > i wonder if some of the other imdb commentators watched the same movie that i'd seen. i can only assume,",0
1,"rumour has it that around the time that abba the multi - award winning swedish disco favourites's star had reached its zenith, the band grew disillusioned with singing in english and yearned to perform in their native tongue. soon after, problems began to emerge in the onetime - wed locked - watertight partnership and recordings became less and less frequent. the band dissolved, albeit unofficially, in 1982 and pop lost one of its most celebrated artists. although they have never admitted that there's any truth in those rumours, the fact remains that abba would never have been so successful had they only recorded in their native tongue. if",1


=== google/reformer-crime-and-punishment ===



Using pad_token, but it is not set yet.


architecture:	reformer
tokenizer:	ReformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie( is simply a better (cinematic version of Norma Khouris book Forbidden Love, and THAT was a best-seller. An onion-peeling of liter",1
1,"Cartoon Network seems to be desperate for ratings. Beginning with the cancellation of Samurai ack, the network seemed hellbent on removing all the shows that made it so popular, such as the Powerpuff Girls, Dexters Lab, Dragonball Z, etc.",0


=== google/rembert ===

architecture:	rembert
tokenizer:	RemBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in editorialised reality TV.<br /><br />Cleverly edited and colourful, Broinowski's storytelling is chaptered by moving silhouettes of Norma Khouri meaningfully blowing smoke. I disagree (with Variety) that it's overlong; instead my one",1
1,"Romance is in the air and love is in bloom in Victorian era England, in this light-hearted story set against a society in a time in which manners were still in vogue, the ladies were charming and elegant, and the gentlemen dashing. `Emma,' based on the novel by Jane Austen and written for the screen and directed by Douglas McGrath, stars the lovely Gwyneth Paltrow in the title role. A self-appointed matchmaker, Emma takes great delight in the romantic notion of playing Cupid and attempting to pair up those she feels are suited to one another.",1


=== junnyu/roformer_chinese_sim_char_ft_small ===

architecture:	roformer
tokenizer:	RoFormerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"beat a path to this important documentary that looks like an attractive feature. forbidden lie $ ( 2007 ) is simply a better ( cinematic ) version of norma khouri's book forbidden love, and that was a best - seller. an onion - peeling of literary fraud and of a pretty woman, lie $ is the very best in editorialised reality tv. < br / > < br / > cleverly edited and colourful",1
1,"enterprise, the latest high budget spin - off to the most successful franchise in film and or television history opens to the tune of a 90 - minute episode called'broken bow '. first we are swept into a massive action sequence with a klingon being chased by some suliban ( who are the main enemy in the first season of the show ). from there the televised movie takes us on a journey that se",1


=== roberta-base ===

architecture:	roberta
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in editorialised reality TV.<br /><br />Cleverly edited and colourful, Broinowski's storytelling is chaptered by moving silhouettes of Norma Khouri meaningfully blowing smoke. I disagree (with Variety) that it's overlong; instead my one slight problem",1
1,"I have never seen any of Spike Lee's prior films, as their trailers never caught my interest. I have seen, and admire Denzel Washington, and Jodie Foster's work, and have several of their DVDs. I was, however, entirely disappointed with this movie. If this film is any indication of Spike Lee's ability as a director, my advice would be to ""get a job"", and stop wasting the time and talent of others. <br /><br />I wonder if some of the other IMDb commentators watched the same movie that I'd seen. I can only assume, from their sappy lo",0


=== squeezebert/squeezebert-uncased ===

architecture:	squeezebert
tokenizer:	SqueezeBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"beat a path to this important documentary that looks like an attractive feature. forbidden lie $ ( 2007 ) is simply a better ( cinematic ) version of norma khouri's book forbidden love, and that was a best - seller. an onion - peeling of literary fraud and of a pretty woman, lie $ is the very best in editorialised reality tv. < br / > < br / > cleverly edited and colourful, broinowski's storytelling is chaptered by moving silhouettes of norma khouri meaningfully blowing smoke. i disagree ( with variety ) that it's overlong ; instead my one slight",1
1,"this film sat on my tivo for weeks before i watched it. i dreaded a self - indulgent yuppie flick about relationships gone bad. i was wrong ; this was an engrossing excursion into the screwed - up libidos of new yorkers. < br / > < br / > the format is the same as max ophuls'"" la ronde, "" based on a play by arthur schnitzler, who is given an "" inspired by "" credit. it starts from one person, a prostitute, standing on a street corner in brooklyn. she is picked up by a home",1


=== hf-internal-testing/tiny-random-transfo-xl ===



Using pad_token, but it is not set yet.


architecture:	transfo_xl
tokenizer:	TransfoXLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"I have never seen any of Spike Lee's prior films, as their trailers never caught my interest. I have seen, and admire Denzel Washington, and Jodie Foster's work, and have several of their DVDs. I was, however, entirely disappointed with this movie. If this film is any indication of Spike Lee's ability as a director, my advice would be to ""get a job,"" and stop wasting the time and talent of others. < br / > < br / > I wonder if some of the other IMDb commentators watched the same movie that I'd seen. I can only assume, from their sappy lovelorn reviews, that their",0
1,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie $(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie $is the very best in editorialised reality TV. < br / > < br / > edited and colourful,'s storytelling is by moving silhouettes of Norma Khouri meaningfully blowing smoke. I disagree (with Variety) that it's overlong; instead my one slight problem was with the episodic nature of its key players commenting on others' just-recorded",1


=== xlm-mlm-en-2048 ===

architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"beat a path to this important documentary that looks like an attractive feature. forbidden lie $ ( 2007 ) is simply a better ( cinematic ) version of norma khouri's book forbidden love, and that was a best-seller. an onion-peeling of literary fraud and of a pretty woman, lie $ is the very best in editorialised reality tv. < br / > < br / > cleverly edited and colourful, broinowski's storytelling is chaptered by moving silhouettes of norma khouri meaningfully blowing smoke. i disagree ( with variety ) that it's overlong ; instead",1
1,"a double life has developed a mystique among film fans for two reasons : the plot idea of an actor getting so wrapped up into a role ( here othello ) as to pick up the great flaw of that character and put it into his life ; and that this is the film that won ronald colman the academy award ( as well as the golden globe ) as best actor. let's take the second point first. < br / > < br / > is anthony john colman's greatest role, or even his signature role? i have my doubts on either level - but it is among his best known ro",1


=== xlm-roberta-base ===

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Beat a path to this important documentary that looks like an attractive feature. Forbidden Lie$(2007) is simply a better (cinematic) version of Norma Khouri's book Forbidden Love, and THAT was a best-seller. An onion-peeling of literary fraud and of a pretty woman, Lie$ is the very best in editorialised reality TV.<br /><br />Cleverly edited and colourful, Broinowski's storytelling is chaptered by moving silhouettes of Norma Khouri meaningfully blowing smoke. I disagree (with",1
1,"I did not know for some time in my youth all that could in general be known about this film however the ways of making a film was not what in fact drew my attention, what made this motion picture one the most liked films even to this very day that I have ever seen was of the Heroism,bravery and the Honor to have served in Her Majestys Service.This film is not always what it seems and that is perhaps as it should be,however I cant say enough for the courage exhibited by Sgt.Cutter in defense of The Uniform that he too would of sacrificed his life",1


=== xlnet-base-cased ===

architecture:	xlnet
tokenizer:	XLNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"I have never seen any of Spike Lee's prior films, as their trailers never caught my interest. I have seen, and admire Denzel Washington, and Jodie Foster's work, and have several of their DVDs. I was, however, entirely disappointed with this movie. If this film is any indication of Spike Lee's ability as a director, my advice would be to ""get a job"", and stop wasting the time and talent of others. <br /><br />I wonder if some of the other IMDb commentators watched the same movie that I'",0
1,"For those of you who've never heard of it (or seen it on A&E), Cracker is a brilliant British TV show about an overweight, chain-smoking, foulmouthed psychologist named Fitz who helps the Manchester police department get into the heads of violent criminals. It's considered to be one of the finest shows ever to come out of England (and that's saying something), and was tremendously successful in England and around the world back in 1993.<br /><br />Now, the original stars have re-teamed with the original writer to",0


In [None]:
# hide_input
# Tabulate the per-model outcomes held in `test_results` (assumed to be filled
# by the test loop in an earlier cell) so pass/fail status can be scanned at a glance.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=["arch", "tokenizer", "model_name", "result", "error"],
)
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizerFast,hf-internal-testing/tiny-albert,PASSED,
1,bart,BartTokenizerFast,hf-internal-testing/tiny-random-bart,PASSED,
2,bert,BertTokenizerFast,hf-internal-testing/tiny-bert,PASSED,
3,big_bird,BigBirdTokenizerFast,google/bigbird-roberta-base,PASSED,
4,bigbird_pegasus,PegasusTokenizerFast,google/bigbird-pegasus-large-arxiv,PASSED,
5,ctrl,CTRLTokenizer,hf-internal-testing/tiny-random-ctrl,PASSED,
6,camembert,CamembertTokenizerFast,camembert-base,PASSED,
7,canine,CanineTokenizer,hf-internal-testing/tiny-random-canine,PASSED,
8,convbert,ConvBertTokenizerFast,YituTech/conv-bert-base,PASSED,
9,deberta,DebertaTokenizerFast,hf-internal-testing/tiny-deberta,PASSED,


## Summary

The `blurr.data.core` module contains the fundamental bits for all data preprocessing tasks.

In [None]:
# hide
# Export step: run nbdev's `notebook2script` with no arguments. The captured
# output of this cell lists one "Converted <name>.ipynb." line per project
# notebook, so this call processes every notebook, not just this one.
from nbdev.export import notebook2script

notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
