In [None]:
# default_exp data.core


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to turn your raw datasets into modelable `DataLoaders`

In [None]:
# export
import os, inspect
from dataclasses import dataclass
from functools import reduce, partial
from typing import Any, Callable, List, Optional, Union, Type

from datasets import Dataset, load_dataset, concatenate_datasets
from fastcore.all import *
from fastai.data.block import TransformBlock
from fastai.data.core import Datasets, DataLoader, DataLoaders, TfmdDL
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    PretrainedConfig,
    PreTrainedTokenizerBase,
    PreTrainedModel,
    logging,
)

from blurr.utils import BLURR

logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from fastai.data.block import CategoryBlock, ColReader, ColSplitter, DataBlock, ItemGetter, RandomSplitter
from fastcore.test import *
from nbverbose.showdoc import show_doc

from blurr.utils import print_versions

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.3
transformers: 4.16.2


In [None]:
# hide
# cuda
# Select the CUDA device used for the rest of the notebook and report which one is active.
# NOTE(review): the device index 1 is hard-coded; presumably this fails on a machine exposing
# fewer than two CUDA devices — confirm/adjust for your environment.
torch.cuda.set_device(1)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `imdb` to demonstrate how to configure BLURR for sequence classification tasks

In [None]:
# Load the "train" and "test" splits of `imdb`; `raw_datasets[0]` is train, `raw_datasets[1]` is test.
raw_datasets = load_dataset("imdb", split=["train", "test"])
# Tag every example with an `is_valid` flag: False for the train split, True for the test split.
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

# Build a small demo set: 1,000 shuffled training examples followed by 200 shuffled test examples.
# NOTE(review): `shuffle()` is called without a seed, so the selected subset is presumably not
# reproducible across fresh environments — consider passing `seed=`.
final_ds = concatenate_datasets([raw_datasets[0].shuffle().select(range(1000)), raw_datasets[1].shuffle().select(range(200))])
# The same subset as a pandas DataFrame (used by the DataFrame-based examples below).
imdb_df = pd.DataFrame(final_ds)
imdb_df.head()


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-65b5588450d6b196.arrow


Unnamed: 0,text,label,is_valid
0,This movie was horrible. I swear they didn't even write a script they just kinda winged it through out the whole movie. Ice-T was annoying as hell. *SPOILERS Phht more like reasons not to watch it* They sit down and eat breakfast for 20 minutes. he coulda been long gone. The ground was hard it would of been close to impossible to to track him with out dogs. And when ICE-T is on that Hill and uses that Spaz-15 Assault SHOTGUN like its a sniper rifle (and then cuts down a tree with eight shells?? It would take 1000's of shells to cut down a tree that size.) Shotguns and hand guns are conside...,0,False
1,"I have seen this movie at the cinema many years ago, and one thing surprised me so negatively that I could not see any redeeming virtues in the movies: Dennis Quaid was cast as a policeman that never smiles or grin, while his smile and grin are two of his trademarks. Danny Glover was cast as the bad guy, but - again - most viewers' imagination could not go far enough as to believe him in that role. Also, Jared Leto was not believable as the former medicine student. The tension was just not there, since the killer was known very early. The finale was, again, neither dramatic nor tense: nobo...",0,False
2,"This is a fantastic series first and foremost. It is very well done and very interesting. As a huge WWII buff, I had learned a lot before seeing this series. One of the best things this has going for it is all the interviews with past individuals back when the war was relatively fresh in their minds, comparatively speaking that is. It is nothing against the men that you see getting interviewed in the programs of today, it is just that most of these men weren't really involved in the upper echelons of what was happening then. One of the best parts is the narrating by Sir Laurence Oliver. I ...",1,False
3,Kurosawa really blew it on this one. Every genius is allowed a failure. The concept is fine but the execution is badly blurred.<br /><br />There is an air of fantasy about this film making it something of an art film. The poverty stricken of Tokyo deserve a fairer and more realistic portrayal. Many of them have interesting stories to tell. A very disappointing film.,0,False
4,"MGM were unsure of how to market Garbo when she first arrived in Hollywood. Mayer had a lot of faith in her and her appearance in ""Torrent"" justified that. She did not speak a word of English so she must have found it difficult to work, also Ricardo Cortez did not make it very easy for her.<br /><br />The torrent of the title is the river Juscar that winds through a sleepy little village in Spain. Leonora (Greta Garbo) hopes someday that her voice will bring great wealth and happiness to her struggling parents. Leonora and Don Rafael (Ricardo Cortez) are in love but he is under his mother'...",1,False


In [None]:
# The class names of the `label` feature, read from the training split's schema.
labels = raw_datasets[0].features["label"].names
labels


['neg', 'pos']

In [None]:
# The Hugging Face model class to instantiate for this task.
model_cls = AutoModelForSequenceClassification

pretrained_model_name = "roberta-base"  # "bert-base-multilingual-cased"
# The number of target classes, forwarded to the model configuration below as `num_labels`.
n_labels = len(labels)

# Fetch the architecture name, config, tokenizer and model for the chosen checkpoint in one call.
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(
    pretrained_model_name, model_cls=model_cls, config_kwargs={"num_labels": n_labels}
)

# Display what was returned (the architecture name plus the concrete types of the other three objects).
hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)


('roberta',
 transformers.models.roberta.configuration_roberta.RobertaConfig,
 transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast,
 transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification)

## Preprocessing

Starting with version 2.0, BLURR provides a preprocessing base class that can be used to build task specific pre-processed datasets from pandas DataFrames or Hugging Face Datasets

In [None]:
# export
class Preprocessor:
    """
    Base class for building task specific pre-processed datasets from pandas DataFrames or
    Hugging Face Datasets.

    It merges an optional validation set into the training set (optionally flagging which rows
    belong to which) and exposes `_tokenize_function` for subclasses to tokenize a batch of examples.
    """

    # The name of the column flagging validation examples when separate training/validation sets are
    # merged. The base class does not set it in `__init__` (subclasses may assign it on the instance);
    # `None` means no flag column is added.
    is_valid_attr: Optional[str] = None

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The attribute holding the text
        text_attr: str = "text",
        # The attribute holding the text_pair
        text_pair_attr: Optional[str] = None,
        # Tokenization kwargs that will be applied when calling the tokenizer
        tok_kwargs: dict = {},
    ):
        self.hf_tokenizer = hf_tokenizer
        self.batch_size = batch_size
        self.text_attr, self.text_pair_attr = text_attr, text_pair_attr
        # store a copy so that neither the shared `{}` default nor the caller's dict is aliased
        self.tok_kwargs = dict(tok_kwargs) if tok_kwargs else {}

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """
        Returns a copy of `training_df`, with a copy of `validation_df` concatenated after it when one
        is supplied. The inputs are not modified; the original row indices are kept as-is.
        """
        df = training_df.copy()

        # concatenate the validation dataset if it is included
        if validation_df is not None:
            valid_df = validation_df.copy()
            # add an `is_valid_attr` column to both training/validation DataFrames to indicate what data is
            # part of the validation set
            if self.is_valid_attr:
                valid_df[self.is_valid_attr] = True
                df[self.is_valid_attr] = False

            df = pd.concat([df, valid_df])

        return df

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """
        Returns `training_ds`, with `validation_ds` concatenated after it when one is supplied.
        """
        ds = training_ds

        # concatenate the validation dataset if it is included
        if validation_ds is not None:
            # add an `is_valid_attr` column to both training/validation Datasets to indicate what data is part of
            # the validation set
            if self.is_valid_attr:
                validation_ds = validation_ds.add_column(self.is_valid_attr, [True] * len(validation_ds))
                training_ds = training_ds.add_column(self.is_valid_attr, [False] * len(training_ds))

            ds = concatenate_datasets([training_ds, validation_ds])

        return ds

    def _tokenize_function(self, example):
        """
        Tokenizes `example[self.text_attr]` (and `example[self.text_pair_attr]` when a pair attribute is
        configured) with `self.hf_tokenizer`, applying `self.tok_kwargs`. `truncation` defaults to `True`
        unless it is overridden in `self.tok_kwargs`.
        """
        # work on a copy: popping from `self.tok_kwargs` itself would drop a caller-supplied `truncation`
        # after the first call, so every subsequent batch would silently fall back to `True`
        tok_kwargs = dict(self.tok_kwargs)
        truncation = tok_kwargs.pop("truncation", True)

        txts = example[self.text_attr]
        txt_pairs = example[self.text_pair_attr] if self.text_pair_attr else None

        return self.hf_tokenizer(txts, txt_pairs, truncation=truncation, **tok_kwargs)


### `ClassificationPreprocessor`

Starting with version 2.0, BLURR provides a sequence classification preprocessing class that can be used to preprocess DataFrames or Hugging Face Datasets.

This class can be used for preprocessing both multiclass and multilabel classification datasets, and adds `proc_{your_text_attr}` and (optionally) `proc_{your_text_pair_attr}` attributes containing your modified text as a result of tokenization (e.g., if you specify a `max_length`, the `proc_{your_text_attr}` may contain truncated text).

**Note**: This class works for both slow and fast tokenizers

In [None]:
# export
class ClassificationPreprocessor(Preprocessor):
    """
    A `Preprocessor` for sequence classification (multiclass or multilabel) DataFrames and Hugging Face
    Datasets.

    For each configured text attribute it adds a `proc_{attr}` column with the text as seen after
    tokenization, along with `{attr}_start_char_idx` and `{attr}_end_char_idx` columns. When a
    `label_mapping` is supplied it also adds a `{label}_name` column for each label attribute.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # Whether the dataset should be processed for multi-label; if True, will ensure `label_attrs` are
        # converted to a value of either 0 or 1 indicating the existence of the class in the example
        is_multilabel: bool = False,
        # The unique identifier in the dataset
        id_attr: Optional[str] = None,
        # The attribute holding the text
        text_attr: str = "text",
        # The attribute holding the text_pair
        text_pair_attr: Optional[str] = None,
        # The attribute holding the label(s) of the example
        label_attrs: Union[str, List[str]] = "label",
        # The attribute that should be created if you are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # A list indicating the valid labels for the dataset (optional, defaults to the unique set of labels
        # found in the full dataset)
        label_mapping: Optional[List[str]] = None,
        # Tokenization kwargs that will be applied when calling the tokenizer
        tok_kwargs: dict = {},
    ):
        # the offset mapping is required by `_process_df_batch` to locate the tokenized text; building a
        # new dict here also keeps the caller's `tok_kwargs` untouched
        tok_kwargs = {**tok_kwargs, "return_offsets_mapping": True}
        super().__init__(hf_tokenizer, batch_size, text_attr, text_pair_attr, tok_kwargs)

        self.is_multilabel = is_multilabel
        self.id_attr = id_attr
        self.label_attrs = label_attrs
        self.is_valid_attr = is_valid_attr
        self.label_mapping = label_mapping

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """
        Merges the training/validation DataFrames, normalizes the labels, and tokenizes the text in
        mini-batches of `self.batch_size`. Returns a new DataFrame with a fresh 0..n-1 index.
        """
        df = super().process_df(training_df, validation_df)

        # convert even single "labels" to a list to make things easier
        label_cols = listify(self.label_attrs)

        # if `is_multilabel`, convert all targets to an int, 0 or 1, rounding floats if necessary
        if self.is_multilabel:
            for label_col in label_cols:
                df[label_col] = df[label_col].apply(lambda v: int(bool(max(0, round(v)))))

        # if a `label_mapping` is included, add a "[label_col]_name" field with the label Ids converted to their label names
        if self.label_mapping:
            for label_col in label_cols:
                df[f"{label_col}_name"] = df[label_col].apply(lambda v: self.label_mapping[v])

        # process df in mini-batches; collect the pieces and concatenate once instead of growing a
        # DataFrame with `DataFrame.append` inside the loop (quadratic, and removed in pandas 2.x)
        batch_ids = np.arange(len(df)) // self.batch_size
        proc_batches = [self._process_df_batch(batch_df) for _, batch_df in df.groupby(batch_ids)]

        # `pd.concat` raises on an empty list, so an empty input yields an empty DataFrame
        if not proc_batches:
            return pd.DataFrame()

        return pd.concat(proc_batches).reset_index(drop=True)

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """
        Merges the training/validation Datasets, runs the result through `process_df`, and converts it
        back into a Hugging Face `Dataset`.
        """
        ds = super().process_hf_dataset(training_ds, validation_ds)
        return Dataset.from_pandas(self.process_df(pd.DataFrame(ds)))

    # ----- utility methods -----
    def _process_df_batch(self, batch_df):
        """
        Tokenizes one mini-batch and, for the text attribute and then the text pair attribute (if set),
        adds the `{attr}_start_char_idx`, `{attr}_end_char_idx` and `proc_{attr}` columns.
        """
        # a fresh positional index is needed so the column-wise `pd.concat` below aligns row for row
        batch_df = batch_df.reset_index(drop=True)

        # grab our inputs
        inputs = self._tokenize_function(batch_df.to_dict(orient="list"))

        # `txt_seq_idx` is matched against the tokenizer's sequence ids: 0 for the text, 1 for the text pair
        for txt_seq_idx, txt_attr in enumerate([self.text_attr, self.text_pair_attr]):
            if txt_attr is None:
                break

            char_idxs = []
            for idx, offset_mapping in enumerate(inputs["offset_mapping"]):
                # keep only the offsets of tokens that belong to the sequence being processed
                text_offsets = [offset_mapping[i] for i, seq_id in enumerate(inputs.sequence_ids(idx)) if seq_id == txt_seq_idx]
                char_idxs.append([min(text_offsets)[0], max(text_offsets)[1]])

            batch_df = pd.concat(
                [batch_df, pd.DataFrame(char_idxs, columns=[f"{txt_attr}_start_char_idx", f"{txt_attr}_end_char_idx"])], axis=1
            )
            # NOTE(review): if the tokenizer's offsets are end-exclusive, the `+ 1` below includes one extra
            # trailing character in `proc_{txt_attr}` — confirm this is intended
            batch_df.insert(
                0,
                f"proc_{txt_attr}",
                batch_df.apply(lambda r: r[txt_attr][r[f"{txt_attr}_start_char_idx"] : r[f"{txt_attr}_end_char_idx"] + 1], axis=1),
            )

        # return only after *every* text attribute has been handled; returning from inside the loop meant
        # the text pair attribute was never processed
        return batch_df


#### Using a `DataFrame`

In [None]:
# Preprocess the demo DataFrame, asking the tokenizer for a `max_length` of 24.
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels, tok_kwargs={"max_length": 24})
proc_df = preprocessor.process_df(imdb_df)
# NOTE(review): this tuple is not the cell's last expression, so it is evaluated but never displayed.
proc_df.columns, len(proc_df)
proc_df.head(2)


Unnamed: 0,proc_text,text,label,is_valid,label_name,text_start_char_idx,text_end_char_idx
0,This movie was horrible. I swear they didn't even write a script they just kinda winged it through out,This movie was horrible. I swear they didn't even write a script they just kinda winged it through out the whole movie. Ice-T was annoying as hell. *SPOILERS Phht more like reasons not to watch it* They sit down and eat breakfast for 20 minutes. he coulda been long gone. The ground was hard it would of been close to impossible to to track him with out dogs. And when ICE-T is on that Hill and uses that Spaz-15 Assault SHOTGUN like its a sniper rifle (and then cuts down a tree with eight shells?? It would take 1000's of shells to cut down a tree that size.) Shotguns and hand guns are conside...,0,False,neg,0,102
1,"I have seen this movie at the cinema many years ago, and one thing surprised me so negatively that I could","I have seen this movie at the cinema many years ago, and one thing surprised me so negatively that I could not see any redeeming virtues in the movies: Dennis Quaid was cast as a policeman that never smiles or grin, while his smile and grin are two of his trademarks. Danny Glover was cast as the bad guy, but - again - most viewers' imagination could not go far enough as to believe him in that role. Also, Jared Leto was not believable as the former medicine student. The tension was just not there, since the killer was known very early. The finale was, again, neither dramatic nor tense: nobo...",0,False,neg,0,106


#### Using a Hugging Face `Dataset`

In [None]:
# Same preprocessing as above, but starting from (and returning) a Hugging Face `Dataset`;
# no `tok_kwargs` are passed here.
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_ds = preprocessor.process_hf_dataset(final_ds)
proc_ds


Dataset({
    features: ['proc_text', 'text', 'label', 'is_valid', 'label_name', 'text_start_char_idx', 'text_end_char_idx'],
    num_rows: 1200
})

## Mid-level API

Base tokenization, batch transform, and DataBlock methods

### `TextInput`

A `TextInput` object is returned from the `decodes` method of `BatchDecodeTransform` as a means to customize `@typedispatch`ed functions like `DataLoaders.show_batch` and `Learner.show_results`. The value will be your `input_ids`.

In [None]:
# export
class TextInput(TensorBase):
    """The base representation of your inputs; used by the various fastai `show` methods"""


### `BatchTokenizeTransform` 

Inspired by this [article](https://docs.fast.ai/tutorial.transformers.html), inputs can come in as raw **text**, **a list of words** (e.g., tasks like Named Entity Recognition (NER), where you want to predict the label of each token), or as a **dictionary** that includes extra information you want to use during post-processing.

**On-the-fly Batch-Time Tokenization**: 

Part of the inspiration for this derives from the mechanics of Hugging Face tokenizers, in particular their ability to return a collated mini-batch of data given a list of sequences. As such, the collating required for our inputs can be done during tokenization ***before*** our batch transforms run in a `before_batch_tfms` transform (where we get a list of examples)! This allows users of BLURR to have everything done dynamically at batch-time without prior preprocessing with at least four potential benefits:
1. Less code
2. Faster mini-batch creation
3. Less RAM utilization and time spent tokenizing beforehand (this really helps with very large datasets)
4. Flexibility

In [None]:
# export
class BatchTokenizeTransform(Transform):
    """
    Tokenizes a list of raw samples at batch time and returns, per sample, a dictionary of tokenizer
    outputs (plus any extra input fields and, optionally, the labels) followed by the sample's targets.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # Whether the first target is also added to the inputs under the "labels" key. If it is, the loss will be
        # calculated in the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s
        # loss function to use it
        include_labels: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `BatchTokenizeTransform`
        **kwargs
    ):
        store_attr()
        self.kwargs = kwargs

    def encodes(self, samples, return_batch_encoding=False):
        """
        Tokenizes the inputs of `samples` on-the-fly, one mini-batch at a time, so the full dataset does
        not need to be tokenized ahead of time. Returns the updated samples and, when
        `return_batch_encoding` is True, the tokenizer's output for the whole batch as well.
        """
        samples = L(samples)

        # inspect the first sample's input to work out how the whole batch is laid out
        first_inp = samples[0][0]
        is_dict = isinstance(first_inp, dict)
        probe = first_inp["text"] if is_dict else first_inp
        # a list-like input that is *not* pre-split into words is handled as a (text, text_pair) pair
        is_pair = is_listy(probe) and not self.is_split_into_words

        # gather what gets handed to the tokenizer
        if is_dict:
            dict_inps = samples.itemgot(0).items
            if is_pair:
                texts = [(d["text"][0], d["text"][1]) for d in dict_inps]
            else:
                texts = [d["text"] for d in dict_inps]
        elif is_pair:
            texts = list(zip(samples.itemgot(0, 0), samples.itemgot(0, 1)))
        else:
            texts = samples.itemgot(0).items

        batch_enc = self.hf_tokenizer(
            texts,
            max_length=self.max_length,
            padding=self.padding,
            truncation=self.truncation,
            is_split_into_words=self.is_split_into_words,
            return_tensors="pt",
            **self.tok_kwargs
        )

        enc_keys = batch_enc.keys()

        # rebuild each sample: its input becomes a dict of that example's tokenizer outputs
        # (e.g. input_ids, attention_mask, etc...) and its targets are carried over unchanged
        updated_samples = []
        for idx, sample in enumerate(samples):
            item = {k: batch_enc[k][idx] for k in enc_keys}
            if is_dict:
                # carry over every extra field of a dictionary input except the raw "text"
                item.update({k: v for k, v in sample[0].items() if k != "text"})

            trgs = sample[1:]
            if self.include_labels and len(trgs) > 0:
                item["labels"] = trgs[0]

            updated_samples.append((item, *trgs))

        if return_batch_encoding:
            return updated_samples, batch_enc

        return updated_samples


### `BatchDecodeTransform`

As of fastai 2.1.5, before batch transforms no longer have a `decodes` method ... and so, I've introduced a standard batch transform here (one that occurs "after" the batch has been created) that will do the decoding for us.

In [None]:
# export
class BatchDecodeTransform(Transform):
    """Casts the "input_ids" of a batch to `input_return_type` so the fastai `show` methods can work with it"""

    def __init__(self, input_return_type: Type = TextInput, **kwargs):
        store_attr()

    def decodes(self, items: dict):
        """Wraps `items["input_ids"]` in `self.input_return_type` and returns it"""
        input_ids = items["input_ids"]
        return self.input_return_type(input_ids)


### `TextBlock`

A basic `DataBlock` for our inputs, `TextBlock` is designed with sensible defaults to minimize user effort in defining their transforms pipeline. It handles setting up your `BatchTokenizeTransform` and `BatchDecodeTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever). 

**Note**: You must either pass in your own instance of a `BatchTokenizeTransform` class or the Hugging Face objects returned from `BLURR.get_hf_objects` (e.g., architecture, config, tokenizer, and model). The other args are optional.

We also include a `blurr_sort_func` that works with `SortedDL` to properly sort based on the number of tokens in each example.

In [None]:
# export
def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """The sort key used with `SortedDL`: the length of an example's input once it is tokenized"""
    inp = example[0]
    # the raw text lives under the "text" key when the input is a dictionary
    txt = inp["text"] if isinstance(inp, dict) else inp

    # pre-split inputs are measured as-is (one entry per word); anything else is tokenized first
    if is_split_into_words:
        return len(txt)

    return len(hf_tokenizer.tokenize(txt, **tok_kwargs))


In [None]:
# export
class TextBlock(TransformBlock):
    """
    The core `TransformBlock` for preparing your inputs for training in Blurr with fastai's `DataBlock` API.

    It wires a `BatchTokenizeTransform` in as the `before_batch` transform, a `BatchDecodeTransform` in as
    the batch transform, and (unless a `dl_type` is given) a `SortedDL` sorted with `blurr_sort_func`.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if you pass
        # your own `batch_tokenize_tfm`)
        hf_arch: Optional[str] = None,
        # A Hugging Face configuration object (not required if you pass your own `batch_tokenize_tfm`)
        hf_config: Optional[PretrainedConfig] = None,
        # A Hugging Face tokenizer (not required if you pass your own `batch_tokenize_tfm`)
        hf_tokenizer: Optional[PreTrainedTokenizerBase] = None,
        # A Hugging Face model (not required if you pass your own `batch_tokenize_tfm`)
        hf_model: Optional[PreTrainedModel] = None,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # The transform used to tokenize your raw data on the fly, set as the `before_batch` transform
        # (defaults to an instance of `BatchTokenizeTransform`)
        batch_tokenize_tfm: Optional[BatchTokenizeTransform] = None,
        # The batch transform used to decode your inputs into a type that can be used in the fastai show methods
        # (defaults to an instance of `BatchDecodeTransform`)
        batch_decode_tfm: Optional[BatchDecodeTransform] = None,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: Optional[int] = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = TextInput,
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type: Optional[DataLoader] = None,
        # Any keyword arguments you want applied to your `batch_tokenize_tfm`
        batch_tokenize_kwargs: dict = {},
        # Any keyword arguments you want applied to your `batch_decode_tfm` (will be set as a fastai `batch_tfms`)
        batch_decode_kwargs: dict = {},
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs: dict = {},
        # Any keyword arguments you want to have applied when generating text (not used by this block itself)
        text_gen_kwargs: dict = {},
        # Any keyword arguments you want applied to `TextBlock` (accepted, but not used by this block itself)
        **kwargs
    ):
        # either the full set of Hugging Face objects or a ready-made tokenize transform is required
        has_all_hf_objects = all([hf_arch, hf_config, hf_tokenizer, hf_model])
        if not has_all_hf_objects and batch_tokenize_tfm is None:
            raise ValueError("You must supply an hf_arch, hf_config, hf_tokenizer, hf_model -or- a BatchTokenizeTransform")

        # build the default transforms; the kwargs dicts are copied so the ones passed in are never shared
        if batch_tokenize_tfm is None:
            tokenize_settings = dict(
                include_labels=include_labels,
                ignore_token_id=ignore_token_id,
                max_length=max_length,
                padding=padding,
                truncation=truncation,
                is_split_into_words=is_split_into_words,
                tok_kwargs=tok_kwargs.copy(),
            )
            batch_tokenize_tfm = BatchTokenizeTransform(
                hf_arch, hf_config, hf_tokenizer, hf_model, **tokenize_settings, **batch_tokenize_kwargs.copy()
            )

        if batch_decode_tfm is None:
            batch_decode_tfm = BatchDecodeTransform(input_return_type=input_return_type, **batch_decode_kwargs.copy())

        # by default, sort with the same tokenizer settings the tokenize transform will use
        if dl_type is None:
            sort_by_token_count = partial(
                blurr_sort_func,
                hf_tokenizer=batch_tokenize_tfm.hf_tokenizer,
                is_split_into_words=batch_tokenize_tfm.is_split_into_words,
                tok_kwargs=batch_tokenize_tfm.tok_kwargs.copy(),
            )
            dl_type = partial(SortedDL, sort_func=sort_by_token_count)

        super().__init__(dl_type=dl_type, dls_kwargs={"before_batch": batch_tokenize_tfm}, batch_tfms=batch_decode_tfm)


## Low-level API

For working with PyTorch and/or fast.ai Datasets & DataLoaders, the low-level API allows you to get back fast.ai specific features such as `show_batch`, `show_results`, etc... when using plain ol' PyTorch Datasets, Hugging Face Datasets, etc...

### `BlurrBatchCreator` 

In [None]:
# export
# NOTE(review): `@dataclass` is applied although the class declares no fields and defines its own
# `__init__` — confirm the decorator is actually needed.
@dataclass
class BlurrBatchCreator:
    """
    A callable that can be assigned to a `TfmdDL.create_batch` method; used in Blurr's low-level API
    to create batches that can be used in the Blurr library
    """

    def __init__(
        self,
        # Your Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # Defaults to use Hugging Face's DataCollatorWithPadding(tokenizer=hf_tokenizer)
        data_collator: Type = None,
    ):
        self.hf_tokenizer = hf_tokenizer
        self.data_collator = data_collator or DataCollatorWithPadding(tokenizer=hf_tokenizer)

    def __call__(self, features):  # A mini-batch (list of examples to run through your model)
        """Collates `features` with `self.data_collator`. For dictionary features a 2-tuple is returned:
        the collated batch as a plain dict, followed by the collated "labels" when the features define them.
        Anything else is returned exactly as the collator produced it.
        """
        batch = self.data_collator(features)

        # non-dictionary features: hand back the collator's output untouched
        if not isinstance(features[0], dict):
            return batch

        inputs = dict(batch)
        if "labels" in features[0]:
            return inputs, batch["labels"]

        # NOTE(review): without "labels" the second element is another dict copy of the batch, not a bare
        # dict return. That is how the original one-line conditional return evaluates, so it is preserved
        # here — confirm this is the intended result.
        return inputs, dict(batch)


### `BlurrBatchDecodeTransform` 

In [None]:
# export
class BlurrBatchDecodeTransform(BatchDecodeTransform):
    """A class used to cast your inputs into something understandable in fastai `show` methods

    Only `input_return_type` is forwarded to `BatchDecodeTransform.__init__`. The named arguments are saved on
    the instance by fastcore's `store_attr()`, and any extra keyword arguments are kept on `self.kwargs`.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_arch: Optional[str] = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_config: Optional[PretrainedConfig] = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_tokenizer: Optional[PreTrainedTokenizerBase] = None,
        # A Hugging Face model (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_model: Optional[PreTrainedModel] = None,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID to ignore when calculating loss/metrics
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        # NOTE(review): the `{}` default is one dict shared by every call that omits this argument; avoid
        # mutating it in place
        tok_kwargs: dict = {},
        # Any text generation keyword arguments
        # NOTE(review): same shared-default caveat as `tok_kwargs`
        text_gen_kwargs: dict = {},
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = TextInput,
        # Any other keyword arguments; these are stored on `self.kwargs` (they are not forwarded to
        # `BatchDecodeTransform.__init__`)
        **kwargs
    ):
        # only the return type is handed to the parent transform
        super().__init__(input_return_type=input_return_type)
        # fastcore's `store_attr()` saves the named `__init__` arguments as attributes of the same name
        store_attr()
        # extras (e.g., the `labels` list that `show_batch` looks up) live on `self.kwargs`
        self.kwargs = kwargs


### `BlurrDataLoader`

In [None]:
# export
@delegates()
class BlurrDataLoader(TfmdDL):
    """A fast.ai `TfmdDL` whose batch creation and batch decoding are wired up to work with Blurr"""

    def __init__(
        self,
        # A standard PyTorch Dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch: str,
        # A Hugging Face configuration object
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # An instance of `BlurrBatchCreator` or equivalent (defaults to `BlurrBatchCreator`)
        batch_creator: Optional[BlurrBatchCreator] = None,
        # The batch_tfm used to decode Blurr batches (defaults to `BlurrBatchDecodeTransform`)
        batch_decode_tfm: Optional[BlurrBatchDecodeTransform] = None,
        # (optional) A preprocessing function that will be applied to your dataset
        preproccesing_func: Callable[
            [Union[torch.utils.data.dataset.Dataset, Datasets], PreTrainedTokenizerBase, PreTrainedModel],
            Union[torch.utils.data.dataset.Dataset, Datasets],
        ] = None,
        # Keyword arguments to be applied to your `batch_decode_tfm` (only used when the default
        # `BlurrBatchDecodeTransform` is built here; a copy is passed so the caller's dict is never touched)
        batch_decode_kwargs: dict = {},
        # Keyword arguments to be applied to `BlurrDataLoader`
        **kwargs,
    ):
        if preproccesing_func:
            dataset = preproccesing_func(dataset, hf_tokenizer, hf_model)

        # `create_batch` and `after_batch` are owned by this class (through `batch_creator` and
        # `batch_decode_tfm`), so any values arriving via kwargs (e.g., from `new`) are dropped
        kwargs.pop("create_batch", None)
        kwargs.pop("after_batch", None)

        if not batch_creator:
            batch_creator = BlurrBatchCreator(hf_tokenizer=hf_tokenizer)
        if not batch_decode_tfm:
            batch_decode_tfm = BlurrBatchDecodeTransform(hf_arch, hf_config, hf_tokenizer, hf_model, **batch_decode_kwargs.copy())

        super().__init__(dataset=dataset, create_batch=batch_creator, after_batch=batch_decode_tfm, **kwargs)
        # keep the resolved collator/decoder alongside the Hugging Face objects so `new` can hand them on
        store_attr(names="hf_arch, hf_config, hf_tokenizer, hf_model, batch_creator, batch_decode_tfm")

    def new(
        self,
        # A standard PyTorch or fastai dataset (defaults to this loader's dataset)
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets] = None,
        # The class you want to create an instance of (will be the type of "self" if None)
        cls: Type = None,
        #  Any additional keyword arguments you want to pass to the __init__ method of `cls`
        **kwargs,
    ):
        """We have to override the new method in order to add back the Hugging Face objects in this factory
        method (called for example in places like `show_results`). With the exception of the additions to the kwargs
        dictionary, the code below is pulled from the `DataLoader.new` method as is.

        The batch creator and batch decode transform of this loader are carried over as well (unless the caller
        supplies their own), because `__init__` ignores the `create_batch`/`after_batch` entries and would
        otherwise rebuild defaults, losing any custom collator and decode kwargs such as `labels`.
        """
        if dataset is None:
            dataset = self.dataset
        if cls is None:
            cls = type(self)

        cur_kwargs = dict(
            dataset=dataset,
            num_workers=self.fake_l.num_workers,
            pin_memory=self.pin_memory,
            timeout=self.timeout,
            bs=self.bs,
            shuffle=self.shuffle,
            drop_last=self.drop_last,
            indexed=self.indexed,
            device=self.device,
        )

        # forward every overridable hook that was replaced with a non-method callable
        for n in self._methods:
            o = getattr(self, n)
            if not isinstance(o, MethodType):
                cur_kwargs[n] = o

        # the Hugging Face objects are required positional arguments of `__init__`, so always add them back
        kwargs["hf_arch"] = self.hf_arch
        kwargs["hf_config"] = self.hf_config
        kwargs["hf_tokenizer"] = self.hf_tokenizer
        kwargs["hf_model"] = self.hf_model

        # reuse this loader's collator and decode transform; explicit overrides from the caller win
        kwargs.setdefault("batch_creator", self.batch_creator)
        kwargs.setdefault("batch_decode_tfm", self.batch_decode_tfm)

        return cls(**merge(cur_kwargs, kwargs))


## Utility classes and methods 

These methods are used internally to get the Blurr transforms associated with your `DataLoaders`

In [None]:
# export
def get_blurr_tfm(
    # A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)
    tfms_list: Pipeline,
    # The transform to find
    tfm_class: Transform = BatchTokenizeTransform,
):
    """
    Walk `tfms_list` in order and return the first transform whose type is `tfm_class` (or a subclass of it).
    Returns `None` when no such transform is present.
    """
    for candidate in tfms_list:
        if issubclass(type(candidate), tfm_class):
            return candidate

    return None


In [None]:
show_doc(get_blurr_tfm)


<h4 id="get_blurr_tfm" class="doc_header"><code>get_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_blurr_tfm</code>(**`tfms_list`**:`Pipeline`, **`tfm_class`**:`Transform`=*`BatchTokenizeTransform`*)

Given a fastai DataLoaders batch transforms, this method can be used to get at a transform
instance used in your Blurr DataBlock

**Parameters:**


 - **`tfms_list`** : *`<class 'fastcore.transform.Pipeline'>`*	<p>A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)</p>


 - **`tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The transform to find</p>



In [None]:
# export
def first_blurr_tfm(
    # Your fast.ai `DataLoaders`
    dls: DataLoaders,
    # The Blurr transforms to look for in order
    tfms: List[Transform] = [BatchTokenizeTransform, BatchDecodeTransform, BlurrBatchDecodeTransform],
):
    """
    Return the first Blurr transform found on `dls`, as needed by methods such as `show_batch` and
    `show_results`. The classes in `tfms` are tried in order, and for each one `dls.before_batch` is searched
    ahead of `dls.after_batch`. Returns `None` if nothing matches.
    """
    for tfm_class in tfms:
        # look the pipeline up by name so `after_batch` is only touched when `before_batch` had no match
        for pipeline_name in ("before_batch", "after_batch"):
            match = get_blurr_tfm(getattr(dls, pipeline_name), tfm_class=tfm_class)
            if match:
                return match

    return None


In [None]:
show_doc(first_blurr_tfm)


<h4 id="first_blurr_tfm" class="doc_header"><code>first_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>first_blurr_tfm</code>(**`dls`**:`DataLoaders`, **`tfms`**:`List`\[`Transform`\]=*`[<class '__main__.BatchTokenizeTransform'>, <class '__main__.BatchDecodeTransform'>, <class '__main__.BlurrBatchDecodeTransform'>]`*)

This convenience method will find the first Blurr transform required for methods such as
`show_batch` and `show_results`. The returned transform should have everything you need to properly
decode and 'show' your Hugging Face inputs/targets

**Parameters:**


 - **`dls`** : *`<class 'fastai.data.core.DataLoaders'>`*	<p>Your fast.ai `DataLoaders</p>


 - **`tfms`** : *`typing.List[fastcore.transform.Transform]`*, *optional*	<p>The Blurr transforms to look for in order</p>



## `show_batch`

In [None]:
# export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `TextInput` typed inputs
    x: TextInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation you want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    """Decode up to `max_n` examples into text/target rows, display them as a DataFrame and return `ctxs`"""
    # the Blurr transform gives us the tokenizer and, optionally, the label names
    blurr_tfm = first_blurr_tfm(dataloaders)
    tokenizer = blurr_tfm.hf_tokenizer

    # if a `labels` list was included, it is used to look up the display value of the target(s)
    trg_labels = blurr_tfm.kwargs.get("labels", None)

    rows = L()
    n_inp = dataloaders.n_inp

    def _target_for(item, label):
        # non-tensor targets are used to index the label names directly (or shown as-is)
        if not torch.is_tensor(item):
            return trg_labels[int(item)] if trg_labels else item

        # tensors holding a sequence: show the names at every position of `label` whose value is 1
        if is_listy(item.tolist()):
            if not trg_labels:
                return label.numpy()
            return [trg_labels[pos] for pos, flag in enumerate(label.numpy().tolist()) if flag == 1]

        # scalar tensors
        return trg_labels[label.item()] if trg_labels else label.item()

    for row_idx, (input_ids, label, sample) in enumerate(zip(x, y, samples)):
        if row_idx >= max_n:
            break

        text = tokenizer.decode(input_ids, skip_special_tokens=True)[:trunc_at]
        targets = [_target_for(item, label) for item in sample[n_inp:]]
        rows.append(tuplify([text, *targets]))

    # one "text" column followed by "target", "target_1", "target_2", ...
    n_targets = len(rows[0]) - n_inp
    cols = ["text"]
    cols.extend("target" if i == 0 else f"target_{i}" for i in range(n_targets))

    display_df(pd.DataFrame(rows, columns=cols)[:max_n])
    return ctxs


## Examples

The following examples demonstrate several approaches to constructing your `DataBlock` for sequence classification tasks using the mid-level API, as well as an example of how to accomplish the same using the low-level API and standard PyTorch/Hugging Face/fast.ai Datasets and DataLoaders.

### Using the mid-level API

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects.

There are a bunch of ways we can get at the four Hugging Face elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR`.

In [None]:
# hide_output
from transformers import AutoModelForSequenceClassification

model_cls = AutoModelForSequenceClassification

pretrained_model_name = "distilroberta-base"  # "distilbert-base-uncased" "bert-base-uncased"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


#####  Step 2: Create your `DataBlock`

In [None]:
blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, batch_tokenize_kwargs={"labels": labels}), CategoryBlock)
dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)


In [None]:
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])


(2, 4, torch.Size([4, 512]), 4)

In [None]:
b[0]


{'input_ids': tensor([[    0,  6142,    54,  ...,  6717,   619,     2],
         [    0,    38,    33,  ...,    14,    33,     2],
         [    0,   370,   218,  ...,  3809, 48709,     2],
         [    0,   152,  1012,  ...,     5, 28505,     2]], device='cuda:1'),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]], device='cuda:1'),
 'labels': TensorCategory([0, 1, 1, 0], device='cuda:1')}

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)


{tuple: [dict, fastai.torch_core.TensorCategory]}

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"Anyone who visited drive-ins in the 1950s, 60s, and 70s, must have seen a film or two by American International Pictures, a distributor that resembled 1980s giant Cannon Films. Wherever movie-goers ventured, AIP would be right there to supply the latest en vogue titles - in the 50s came horror movies like 'Voodoo Woman' and 'The Undead;' in the 60s were Frankie Avalon-Annette Funicello beach comedies and biker flicks like 'The Glory Stompers;' and into the 70s, AIP churned out grindhouse-level",neg
1,"(I'll indicate in this review the point where spoilers begin.) My dissatisfaction is split: 30% tone-deafness, 70% lackluster writing.<br /><br />The 30%: I agree with the first commenter's synopsis about the lack of diversity in the characters and scope of the stories. I was surprised how, this film, at best, woefully shortchanges the real NYC by presenting a collection of people and relationships so narrow as to come across as if it's inhabited only by the cast of Gossip Girl (this is coming",neg


#### Using a preprocessed dataset

Preprocessing your raw data is the more traditional approach to using Transformers. It is required, for example, when you want to work with documents longer than your model will allow. A preprocessed dataset is used in the same way a non-preprocessed dataset is.

##### Step 1a: Get your Hugging Face objects.

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


##### Step 1b. Preprocess dataset

In [None]:
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_ds = preprocessor.process_hf_dataset(final_ds)
proc_ds


Dataset({
    features: ['proc_text', 'text', 'label', 'is_valid', 'label_name', 'text_start_char_idx', 'text_end_char_idx'],
    num_rows: 1200
})

##### Step 2: Create your `DataBlock`

In [None]:
blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, batch_tokenize_kwargs={"labels": labels}), CategoryBlock)
dblock = DataBlock(blocks=blocks, get_x=ItemGetter("proc_text"), get_y=ItemGetter("label"), splitter=RandomSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(proc_ds, bs=4)


In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"Rarely does one find a movie so bad that it achieves the often-sought paradigm of having so little redeeming value that that alone makes it worth watching. ""Cyclone,"" I am happy to report, is such a film.<br /><br />I knew I was in for something good as soon as I found the videotape. I am at least its fourth owner: It has a ""Used Movie Sale! $9.95"" sticker on the front, and a yard-sale sticker for one dollar. I picked it up at a thrift store for fifty cents.<br /><br />The Used Movie Sale! stic",neg
1,"Un Gatto nel Cervello, or Nightmare Concert as it's more commonly know amongst English speaking audiences, starts as horror film director Lucio Fulci (played by the man himself Lucio Fulci) goes to lunch after filming a very gory & violent scene, however he orders steak & has a horrible vision relating to cannibalism. The grotesque visions, hallucinations & dreams continue & begin to affect his mental state, Fulci decides to seek help & contacts Professor Egon Schwarz (Dvid L. Thompson) for psy",pos


#### Passing extra information

As of v.2, BLURR now also allows you to pass extra information alongside your inputs in the form of a dictionary.  If you use this approach, you must assign your text(s) to the `text` attribute of the dictionary.  This is a useful approach when splitting long documents into chunks, but wanting to score/predict by example rather than chunk (for example in extractive question answering tasks).

**Note**: A good place to access this extra information during training/validation is in the `before_batch` method of a `Callback`.

In [None]:
blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, batch_tokenize_kwargs={"labels": labels}), CategoryBlock)


def get_x(item):
    """Wrap the item's text in a dictionary (under `text`) together with an extra, illustrative value"""
    return dict(text=item.text, another_val="testing123")


dblock = DataBlock(blocks=blocks, get_x=get_x, get_y=ColReader("label"), splitter=ColSplitter())


In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)


In [None]:
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])


(2, 4, torch.Size([4, 512]), 4)

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"Anyone who visited drive-ins in the 1950s, 60s, and 70s, must have seen a film or two by American International Pictures, a distributor that resembled 1980s giant Cannon Films. Wherever movie-goers ventured, AIP would be right there to supply the latest en vogue titles - in the 50s came horror movies like 'Voodoo Woman' and 'The Undead;' in the 60s were Frankie Avalon-Annette Funicello beach comedies and biker flicks like 'The Glory Stompers;' and into the 70s, AIP churned out grindhouse-level",neg
1,"Hear are some of the interesting things our combat hero faith healer Pat, his son Gordon (T.V. ministry seems like a family business.) and Terry Meeuwsen (Won Miss America in 1973 by wearing a swimsuit and showing her legs. Oh my goodness gracious!) say when our poor viewers are sick and need help.<br /><br />1. Someone with an ""abscessed right tooth""has just now been healed.2. Someone with ""twisted intestines"" has been healed.3.Then Terry said there was a person with a ""strange condition"",(You",neg


### Using the low-level API

#### Step 1: Build your datasets

In [None]:
raw_datasets = load_dataset("glue", "mrpc")


Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` pair(s) in `example` with the notebook's `hf_tokenizer`, truncating as needed"""
    first, second = example["sentence1"], example["sentence2"]
    return hf_tokenizer(first, second, truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

#### Step 2: Dataset pre-processing (optional)

In [None]:
# export
def preproc_hf_dataset(
    # The dataset to preprocess. NOTE: despite the annotation, it must expose the Hugging Face `Dataset`
    # API used below (`column_names`, `rename_column`, `remove_columns`, and `set_format`)
    dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
    # A Hugging Face tokenizer (unused here; part of the `preproccesing_func` signature of `BlurrDataLoader`)
    hf_tokenizer: PreTrainedTokenizerBase,
    # A Hugging Face model; the parameter names of its `forward` decide which columns are kept
    hf_model: PreTrainedModel,
):
    """This method can be used to preprocess most Hugging Face Datasets for use in Blurr and other training
    libraries

    It renames a `label` column to `labels`, drops every column that is not an argument of
    `hf_model.forward`, sets the output format to "torch", and returns the resulting dataset.
    """
    if "label" in dataset.column_names:
        dataset = dataset.rename_column("label", "labels")

    hf_model_fwd_args = set(inspect.signature(hf_model.forward).parameters)

    # `remove_columns` is documented to accept a str or a list of str, so pass a list (in the dataset's
    # own column order) rather than a set
    bad_cols = [col for col in dataset.column_names if col not in hf_model_fwd_args]
    dataset = dataset.remove_columns(bad_cols)

    dataset.set_format("torch")
    return dataset


#### Step 3: Build your `DataLoaders`.

Use `BlurrDataLoader` to build Blurr friendly dataloaders from your datasets. Passing `{'labels': label_names}` to your `batch_decode_kwargs` will ensure that your label/target names will be displayed in methods like `show_batch` and `show_results` (just as it works with the mid-level API)

In [None]:
label_names = raw_datasets["train"].features["label"].names

trn_dl = BlurrDataLoader(
    tokenized_datasets["train"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_decode_kwargs={"labels": label_names},
    shuffle=True,
    batch_size=8,
)

val_dl = BlurrDataLoader(
    tokenized_datasets["validation"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_decode_kwargs={"labels": label_names},
    batch_size=16,
)

dls = DataLoaders(trn_dl, val_dl)


In [None]:
b = dls.one_batch()
b[0]["input_ids"].shape


torch.Size([8, 68])

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=800)


Unnamed: 0,text,target
0,"It would remove $ 55 billion in tax subsidies for exporters, which are illegal under international trade law, in exchange for new tax breaks. In exchange for new tax breaks, it would remove $ 55 billion of tax subsidies for exporters that are illegal under international trade law.",equivalent
1,"When you crossed the line, you violated the constitutional right, "" said Charles Weisselberg, a UC Berkeley law professor. When you crossed the line, you violated the constitutional right, "" said Charles Weisselberg, who teaches law at the University of California, Berkeley.",equivalent


## Tests

The tests below ensure the core DataBlock code above works for **all** pretrained sequence classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# hide
[model_type for model_type in BLURR.get_models(task="SequenceClassification") if (not model_type.startswith("TF"))]


['AlbertForSequenceClassification',
 'BartForSequenceClassification',
 'BertForSequenceClassification',
 'BigBirdForSequenceClassification',
 'BigBirdPegasusForSequenceClassification',
 'CTRLForSequenceClassification',
 'CamembertForSequenceClassification',
 'CanineForSequenceClassification',
 'ConvBertForSequenceClassification',
 'DebertaForSequenceClassification',
 'DebertaV2ForSequenceClassification',
 'DistilBertForSequenceClassification',
 'ElectraForSequenceClassification',
 'FNetForSequenceClassification',
 'FlaubertForSequenceClassification',
 'FunnelForSequenceClassification',
 'GPT2ForSequenceClassification',
 'GPTJForSequenceClassification',
 'GPTNeoForSequenceClassification',
 'HubertForSequenceClassification',
 'IBertForSequenceClassification',
 'LEDForSequenceClassification',
 'LayoutLMForSequenceClassification',
 'LayoutLMv2ForSequenceClassification',
 'LongformerForSequenceClassification',
 'MBartForSequenceClassification',
 'MPNetForSequenceClassification',
 'MegatronB

In [None]:
# hide
pretrained_model_names = [
    "hf-internal-testing/tiny-albert",
    "hf-internal-testing/tiny-random-bart",
    "hf-internal-testing/tiny-bert",
    "google/bigbird-roberta-base",
    "google/bigbird-pegasus-large-arxiv",
    "hf-internal-testing/tiny-random-ctrl",
    "camembert-base",
    "hf-internal-testing/tiny-random-canine",
    "YituTech/conv-bert-base",
    "hf-internal-testing/tiny-deberta",
    "hf-internal-testing/tiny-random-deberta-v2",
    "hf-internal-testing/tiny-random-distilbert",
    "hf-internal-testing/tiny-electra",
    "google/fnet-base",
    "hf-internal-testing/tiny-random-flaubert",
    "hf-internal-testing/tiny-random-funnel",
    "hf-internal-testing/tiny-random-gpt2",
    "anton-l/gpt-j-tiny-random",
    "hf-internal-testing/tiny-random-gpt_neo",
    "kssteven/ibert-roberta-base",
    "hf-internal-testing/tiny-random-led",
    "hf-internal-testing/tiny-random-longformer",
    "hf-internal-testing/tiny-random-mbart",
    "hf-internal-testing/tiny-random-mpnet",
    # "nvidia/megatron-bert-cased-345m",                 could not test
    "hf-internal-testing/tiny-random-mobilebert",
    "openai-gpt",
    "google/reformer-crime-and-punishment",
    "google/rembert",
    "junnyu/roformer_chinese_sim_char_ft_small",
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    "hf-internal-testing/tiny-random-transfo-xl",
    "xlm-mlm-en-2048",
    "xlm-roberta-base",
    "xlnet-base-cased",
]


In [None]:
# hide
# for model_name in pretrained_model_names:
#     tok = AutoTokenizer.from_pretrained(model_name)
#     print(f'=== {model_name} ===')
#     print(f'=== {tok.padding_side} ===')
#     print(f'=== {tok.pad_token_id} ===')
#     print(tok(['hi', 'hello everyone. its good to be here'], ['yo', 'yo'], padding='max_length', max_length=128))


In [None]:
# hide
raw_datasets = load_dataset("imdb", split=["train", "test"])
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

final_ds = concatenate_datasets([raw_datasets[0].shuffle().select(range(1000)), raw_datasets[1].shuffle().select(range(200))])
imdb_df = pd.DataFrame(final_ds)


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# hide
from transformers import RobertaTokenizer

# Sweep over every checkpoint in `pretrained_model_names`: build a `TextBlock`-based `DataBlock` on
# `imdb_df`, pull one batch, check its structure, and render it with `show_batch`.
model_cls = AutoModelForSequenceClassification
bsz = 2
seq_sz = 128

# One row per checkpoint: (architecture, tokenizer class name, checkpoint name, "PASSED" | "FAILED", error or "")
test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    # checkpoints whose name contains "/ibert" are loaded with an explicit RobertaTokenizer;
    # for everything else `tokenizer_cls` is left as None
    tok_class = RobertaTokenizer if ("/ibert" in model_name) else None

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls, tokenizer_cls=tok_class)
    tok_name = type(hf_tokenizer).__name__

    print(f"architecture:\t{hf_arch}\ntokenizer:\t{tok_name}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    try:
        blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz), CategoryBlock)

        dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())
        dls = dblock.dataloaders(imdb_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        # a batch is an (inputs, targets) pair; inputs are padded to exactly `seq_sz` tokens
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)

        dls.show_batch(dataloaders=dls, max_n=2, trunc_at=1000)

    except Exception as err:
        test_results.append((hf_arch, tok_name, model_name, "FAILED", err))
    else:
        # Recorded only after every step above (including `show_batch`) has succeeded, so a checkpoint
        # can never end up with both a PASSED and a FAILED row.
        test_results.append((hf_arch, tok_name, model_name, "PASSED", ""))


=== hf-internal-testing/tiny-albert ===

architecture:	albert
tokenizer:	AlbertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"match 1: tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team table match against eddie guerrero and chris benoit. according to the rules of the match, both opponents have to go through tables in order to get the win. benoit and guerrero heated up early on by taking turns hammering first spike and then bubba ray. a german sup",1
1,"there is nothing cool, hip, or clever about this film-- liking it just reveals an ignorance of true art cinema. how can you so easily forget that the central fact of this entire film is that these mean & ugly people are... serial killers! if they have to dismember total strangers in order to ""be a family again,"" then we don't want them to ""be a family."" what part of that did you have trouble grasping? why applaud this fil",0


=== hf-internal-testing/tiny-random-bart ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables",1
1,"I respect Alex Cox the filmmaker, I really do. He's like the kid at school who you think at first is just trying a little too hard to be ""different"", a literary punk-rocker who has dipped more than his feet into spaghetti westerns and science fiction and fringe-culture and come out into the world read",0


=== hf-internal-testing/tiny-bert ===

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"match 1 : tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team table match against eddie guerrero and chris benoit. according to the rules of the match, both opponents have to go through tables in order to get the win. benoit and guerrero heated up early on by taking turns hammering first spike and then bubba ray. a german suplex by benoit to bubba took the wind out of the dudley brother. spike tried to help his brother, but the referee restrained him while benoit and guerrero ganged up on him in the corner. with benoit stomping",1
1,"i saw heartland when it was first released in 1980 and i have just seen it again. it improves with age. heartland is not just for lovers of "" indie "" films. at a time when most american films are little more than cynical attempts to make money with cgi, pyrotechnics, and / or vulgarity, heartland holds up as a slice of american history. it is also a reminder of how spoiled most of us modern, urbanized americans are. < br / > < br / > nothing in this film is overstated or stagey. no one declaims any hollywood movie speeches. the",1


=== google/bigbird-roberta-base ===



normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


architecture:	big_bird
tokenizer:	BigBirdTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero ganged up on",1
1,"There are many different versions of this one floating around, so make sure you can locate one of the unrated copies, otherwise some gore and one scene of nudity might be missing. Some versions also omit most of the opening sequence and other bits here and there. The cut I saw has the on-screen title WITCHCRAFT: EVIL ENCOUNTERS and was released by Shriek Show, who maintain the original US release title WITCHERY for the DVD release. It's a nice-looking print and seems to have all of the footage, but has some cropping/aspect ratio issues. In Italy",0


=== google/bigbird-pegasus-large-arxiv ===

architecture:	bigbird_pegasus
tokenizer:	PegasusTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero ganged up on him in the corner. With Benoit stomping away on Bubba,",1
1,"Just got through watching this version of ""Samhain"", and even though I still like it, it's nothing like the ""rough cut"" version I have. If you check the message board, you'll see an apology from the director for this cut down version, 79 minutes., and he says he had nothing to do with this R-rated trimmed down edit with a completely new screwed up ending. Christian really doesn't need to distant himself that much, because the basic gore elements still stand up, even though highly trimmed down. This is a damn shame, because this had the potential of being one of",1


=== hf-internal-testing/tiny-random-ctrl ===



  angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
Using pad_token, but it is not set yet.


architecture:	ctrl
tokenizer:	CTRLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero ganged up on him in the corner. With Benoit stomping away on Bubba, Guerrero set up a table",1
1,"It all starts with a suicide. Or is it a car crash? I guess it all depends on whether you choose to start at the beginning or the end. Director Gabriele Muccino gives you the ability to enter his new film Seven Pounds whichever way you prefer as he starts at the end and works his way back to the beginning, showing us the course of events that led us to that heartbreaking 911 call. This is one powerful movie; maybe that is because I'm a softy when it comes to dramas of this ilk, dripping with weighty moments and chock full of devastating performances, but either way, a film works best when it truly touches me, when it lingers in",1


=== camembert-base ===

architecture:	camembert
tokenizer:	CamembertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to",1
1,"He's stocky, sweaty, slightly cross-eyed and restless. He stands in front of us and calls himself a pervert. He claims that we the film viewers perceive the screen as a toilet bowl, and are all secretly wishing for all the s**t to explode from the inside. He's unpredictable and scary. Well? Come on, you could have guessed by now: he's one of the le",1


=== hf-internal-testing/tiny-random-canine ===



Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


architecture:	canine
tokenizer:	CanineTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started,1
1,"Originally conceived as a solo vehicle for Dudley Moore, 'Not Only...But Also' saw his ex-'Beyond The Fringe' collaborator Pet",1


=== YituTech/conv-bert-base ===

architecture:	convbert
tokenizer:	ConvBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"match 1 : tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team table match against eddie guerrero and chris benoit. according to the rules of the match, both opponents have to go through tables in order to get the win. benoit and guerrero heated up early on by taking turns hammering first spike and then bubba ray. a german suplex by benoit to bubba took the wind out of the dudley brother. spike tried to help his brother, but the referee restrained him while benoit and guerrero ganged up on him in the corner. with benoit stomping",1
1,"retitled from its original japanese name of laputa ( for being an offensive phrase, something which director hayao miyazaki was oblivious to at the time ), castle in the sky is the master animator's third film, and it's one of his most beloved of all time. initially a box office disappointment in its 1986 release, it has since been embraced by critics and audiences around the world. inspired by jonathan swift's "" gulliver's travels "", castle in the sky is a steampunk - themed action adventure tale about two young orphans - - young miner pazu, and mysterious girl sheet",1


=== hf-internal-testing/tiny-deberta ===

architecture:	deberta
tokenizer:	DebertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up ear",1
1,"On the way back from IMC6 (San Jose, California), all five (mind you, three of us hardcore Kamal fans) of us had reached a unanimous verdict; VV was solid crap and thanks to the movie we were going to have a pretty screwed up Monday. Not to mention, we swore to stay off the theatres for the next year.br /br /I won't blame Kamal here",0


=== hf-internal-testing/tiny-random-deberta-v2 ===

architecture:	deberta_v2
tokenizer:	DebertaV2Tokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero ganged up on him in the corner. With Benoit stomping away on Bubba,",1
1,Bill Crain's rarer than rare'slasher' movie certainly doesn't follow the standard stalk and slash guidelines that have become so essential of its counterparts. The bogeyman this time around uses grenades and small arms as well as an awesome array of melee weapons; - a sin that's virtually unacceptable in most post-Halloween genre pieces. But there's still just enough familiarity to keep slasher buffs from checking the rule book and the plot never strays too far from the path that you've grown to expect. Just as Wally Koz's surprisingly decent 555 was seemingly put together with help from various members of his family,1


=== hf-internal-testing/tiny-random-distilbert ===

architecture:	distilbert
tokenizer:	DistilBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,match 1 : tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team t,1
1,also known as the big spook war. the great yokai war is miike's attempt at a family film and damn fine job he does as well. the problem is that i can't imagin,1


=== hf-internal-testing/tiny-electra ===

architecture:	electra
tokenizer:	ElectraTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"match 1 : tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team table match against eddie guerrero and chris benoit. according to the rules of the match, both opponents have to go through tables in order to get the win. benoit and guerrero heated up early on by taking turns hammering first",1
1,"i liked batman : dead end. a dark edgy film - noir setting for batman was perfect. batman : dead end is good. this is not. < br / > < br / > first of all let me start off with the acting. none of it is really that good. the best would probably be clark bartram as batman. but that isn't saying much. he is good at first glance, and then you realize he is what he is, a body - builder who happens to be a",0


=== google/fnet-base ===

architecture:	fnet
tokenizer:	FNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out",1
1,"For some reason, this film has never turned up in its original language in my neck of the woods (despite owning the TCM UK Cable channel, which broadcasts scores of MGM titles week in week out). More disappointingly, it's still M.I.A. on DVD  even from Warners' recently-announced ""Western Classics Collection"" Box Set (which does include 3 other Robert Taylor genre efforts); maybe, they're saving it for an eventual ""Signature Collection"" devoted to this stalwart of MGM, which may be coming",1


=== hf-internal-testing/tiny-random-flaubert ===

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1 : Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the",1
1,"True, there are many movies much worse then this movie. This movie was no Manos : The Hands of Fate, or Troll 2 ( yes, I have seen them both.. twice ) but at the same time this movie is No Alien, Predator or even Alien Vs. Predator ( Yes, even that movie surpassed this ). Movies like this make Battlefield Earth look like a Star Wars it is so bad. Razzie awards lookout, your biggest competition has just arriv",0


=== hf-internal-testing/tiny-random-funnel ===

architecture:	funnel
tokenizer:	FunnelTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,match 1 : tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team t,1
1,"there aren't too many times when i see a film and go, "" huh, what? "", but this was one of them. maybe after seeing zabriskie point i felt much the same way woo",1


=== hf-internal-testing/tiny-random-gpt2 ===



Using pad_token, but it is not set yet.


architecture:	gpt2
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in or",1
1,"Now I love Bela Lugosi,don't get me wrong,he is one of the most interesting people to ever make a movie but he certainly did his share of clunkers.This is just another one of those.<br /><br />Lugosi plays Dr.Lorenz,a doctor who has had his medical license pulled for unexplained reasons.He",0


=== anton-l/gpt-j-tiny-random ===



Using pad_token, but it is not set yet.


architecture:	gptj
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero ganged up on him in",1
1,"There is NOTHING cool, hip, or clever about this film-- liking it just reveals an ignorance of true art cinema. How can you so easily forget that the central fact of this entire film is that these mean & ugly people are... SERIAL KILLERS! If they have to dismember total strangers in order to ""be a family again,"" then we don't WANT them to ""be a family."" What part of that did you have trouble grasping? Why applaud this filth?<br /><br />THIS silly filth is what you do if you can't do art! One's head & life must",0


=== hf-internal-testing/tiny-random-gpt_neo ===



Using pad_token, but it is not set yet.


architecture:	gpt_neo
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in or",1
1,"It all starts with a suicide. Or is it a car crash? I guess it all depends on whether you choose to start at the beginning or the end. Director Gabriele Muccino gives you the ability to enter his new film Seven Pounds whichever way you prefer as he starts at the end and works his way back to the beginning, showing us the course of",1


=== kssteven/ibert-roberta-base ===

architecture:	ibert
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero ganged up on",1
1,"Just got through watching this version of ""Samhain"", and even though I still like it, it's nothing like the ""rough cut"" version I have. If you check the message board, you'll see an apology from the director for this cut down version, 79 minutes., and he says he had nothing to do with this R-rated trimmed down edit with a completely new screwed up ending. Christian really doesn't need to distant himself that much, because the basic gore elements still stand up, even though highly trimmed down. This is a damn shame, because this had the potential of being one of the goriest and",1


=== hf-internal-testing/tiny-random-led ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables",1
1,"<br /><br />As usual, I was really looking forward to a new TV/film on a favourite subject of mine - makes a nice change from a *strangely familiar* documentary about Kursk or Stalingrad on the History Channel.<br /><br />I avidly looked forward to Pearl Harbour and Enemy",0


=== hf-internal-testing/tiny-random-longformer ===

architecture:	longformer
tokenizer:	LongformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables",1
1,"Retitled from its original Japanese name of LAPUTA (for being an offensive phrase, something which director Hayao Miyazaki was oblivious to at the time), CASTLE IN THE SKY is the master animator's third film, and it's one of his most beloved of all time. Initially a box office disappointment in its",1


=== hf-internal-testing/tiny-random-mbart ===

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the mat,1
1,"There aren't too many times when I see a film and go, ""huh, what?"", but this was one of them. Maybe after seeing abriskie Point I felt much the same way Woody Allen felt after seeing 2001- he only liked the film after seeing it three times over a two year period, realizing the filmmaker was ahead of him in what was going",1


=== hf-internal-testing/tiny-random-mpnet ===

architecture:	mpnet
tokenizer:	MPNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,match 1 : tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team t,1
1,"he's stocky, sweaty, slightly cross - eyed and restless. he stands in front of us and calls himself a pervert. he claims that we the film viewers perceiv",1


=== hf-internal-testing/tiny-random-mobilebert ===

architecture:	mobilebert
tokenizer:	MobileBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,match 1 : tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team t,1
1,"for some reason, this film has never turned up in its original language in my neck of the woods ( despite owning the tcm uk cable channel, which broadcast",1


=== openai-gpt ===



Using pad_token, but it is not set yet.


architecture:	openai
tokenizer:	OpenAIGPTTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"match 1 : tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team table match against eddie guerrero and chris benoit. according to the rules of the match, both opponents have to go through tables in order to get the win. benoit and guerrero heated up early on by taking turns hammering first spike and then bubba ray. a german suplex by benoit to bubba took the wind out of the dudley brother. spike tried to help his brother, but the referee restrained him while benoit and guerrero ganged",1
1,"originally conceived as a solo vehicle for dudley moore,'not only... but also'saw his ex -'beyond the fringe'collaborator peter cook guest on the first show, and so well received was it the controller of b. b. c. - 2 insisted that he be on it every week from then on. they were a classic comedy team - cook was tall, handsome and witty, while dudley was short, charismatic, and musically gifted. the sketch that brought the house down had them in a pub, wearing flat caps and mufflers, fantasising about movie stars such as jane russell and greta garbo",1


=== google/reformer-crime-and-punishment ===



Using pad_token, but it is not set yet.


architecture:	reformer
tokenizer:	ReformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,Match : Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. Accor,1
1,THHE remake was a superior movie remake in every way. Most remakes end up being total garbage but under the very talented direction of Alexandre Aja became one of the best ever made in terms of remakes and also as far as the mutant inbreed human sub-genre of horror,0


=== google/rembert ===

architecture:	rembert
tokenizer:	RemBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero",1
1,"He's stocky, sweaty, slightly cross-eyed and restless. He stands in front of us and calls himself a pervert. He claims that we  the film viewers  perceive the screen as a toilet bowl, and are all secretly wishing for all the s**t to explode from the inside. He's unpredictable and scary. Well? Come on, you could have guessed by now: he's one of the leading philosophers of our age.<br /><br />Slavoj iek is both a narrator and a subject of",1


=== junnyu/roformer_chinese_sim_char_ft_small ===

architecture:	roformer
tokenizer:	RoFormerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"match 1 : tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team table match against eddie guerrero and chris benoit. according to the rules of the match, both opponents have to go through tables in order to get the win. benoit and guerrero heated up early on by taking turns hammering first sp",1
1,"just got through watching this version of "" samhain "", and even though i still like it, it's nothing like the "" rough cut "" version i have. if you check the message board, you'll see an apology from the director for this cut down version, 79 minutes., and he says he had nothing to do with this r - rated trimmed down edit with a completely new screwed up ending. christian really doesn't need to di",1


=== roberta-base ===

architecture:	roberta
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero ganged up on",1
1,"The Shining is a weird example of adaptation: it has very little in common with the source novel, written by Stephen King, yet it is widely remembered as one of the best cinematic renditions of the horror master's work. This is due to two factors: Stanley Kubrick's masterful direction and Jack Nicholson's chilly acting.<br /><br />Nicholson plays Jack Torrance, a writer who accepts to take care of the Overlook Hotel in Canada during the winter period, unaffected by the gruesome stories surrounding the place: he claims a nice, isolated location is just what he needs to finish his new book. Therefore the Over",1


=== squeezebert/squeezebert-uncased ===

architecture:	squeezebert
tokenizer:	SqueezeBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"match 1 : tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team table match against eddie guerrero and chris benoit. according to the rules of the match, both opponents have to go through tables in order to get the win. benoit and guerrero heated up early on by taking turns hammering first spike and then bubba ray. a german suplex by benoit to bubba took the wind out of the dudley brother. spike tried to help his brother, but the referee restrained him while benoit and guerrero ganged up on him in the corner. with benoit stomping",1
1,"just got through watching this version of "" samhain "", and even though i still like it, it's nothing like the "" rough cut "" version i have. if you check the message board, you'll see an apology from the director for this cut down version, 79 minutes., and he says he had nothing to do with this r - rated trimmed down edit with a completely new screwed up ending. christian really doesn't need to distant himself that much, because the basic gore elements still stand up, even though highly trimmed down. this is a damn shame, because this had the potential of being one",1


=== hf-internal-testing/tiny-random-transfo-xl ===



Using pad_token, but it is not set yet.


architecture:	transfo_xl
tokenizer:	TransfoXLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero up on him in the corner. With Benoit stomping away on Bubba, Guerrero set",1
1,"The Choke starts as a rock band known as The Choke prepare for a gig at a nightclub called 'Club 905' owned & run by Guy Johnson (Andrew Parker). Lead singer Dylan (Sean Cook) & guitar player Mike (Jason McKee) plan to tell the other band members, bass player London (Brooke Bailey) & drummer Nancy (Tom Olson), that they are both going solo & their services won't be needed any longer. Once at the club Dylan prepares but Mike doesn't show up & the gig turns into a disaster. Then just as the band think things couldn't get any worse they find a dead body in the cellar",0


=== xlm-mlm-en-2048 ===

architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"match 1 : tag team table match bubba ray and spike dudley vs eddie guerrero and chris benoit bubba ray and spike dudley started things off with a tag team table match against eddie guerrero and chris benoit. according to the rules of the match, both opponents have to go through tables in order to get the win. benoit and guerrero heated up early on by taking turns hammering first spike and then bubba ray. a german suplex by benoit to bubba took the wind out of the dudley brother. spike tried to help his brother, but the referee restrained him while benoit and guerrero ganged up on him in the corner",1
1,"also known as the big spook war. the great yokai war is miike's attempt at a family film and damn fine job he does as well. the problem is that i can 't imagine many parents wanting to subject their children to this movie. the best kids movies are the ones that are scary or have mildly disturbing imagery, neverending story and return to oz spring to mind, but in the case of the great yokai war miike probably takes things a little too far. in fact at the screening i was at the person introducing the movie reiterated to the two families there that it was probably not very suitable",1


=== xlm-roberta-base ===

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother.",1
1,"Just got through watching this version of ""Samhain"", and even though I still like it, it's nothing like the ""rough cut"" version I have. If you check the message board, you'll see an apology from the director for this cut down version, 79 minutes., and he says he had nothing to do with this R-rated trimmed down edit with a completely new screwed up ending. Christian really doesn't need to distant himself that much, because the basic gore elements still stand up, even though highly trimmed down. This is a damn shame, because this had",1


=== xlnet-base-cased ===

architecture:	xlnet
tokenizer:	XLNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero ganged up on him in the corner.",1
1,"Retitled from its original Japanese name of LAPUTA (for being an offensive phrase, something which director Hayao Miyazaki was oblivious to at the time), CASTLE IN THE SKY is the master animator's third film, and it's one of his most beloved of all time. Initially a box office disappointment in its 1986 release, it has since been embraced by critics and audiences around the world. Inspired by Jonathan Swift's ""Gulliver's Travels"", CASTLE IN THE SKY is a steampunk-themed action adventure tale about two young",1


In [None]:
# hide_input
# Tabulate the accumulated `test_results` records and render them as a single summary table.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=["arch", "tokenizer", "model_name", "result", "error"],
)
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizerFast,hf-internal-testing/tiny-albert,PASSED,
1,bart,BartTokenizerFast,hf-internal-testing/tiny-random-bart,PASSED,
2,bert,BertTokenizerFast,hf-internal-testing/tiny-bert,PASSED,
3,big_bird,BigBirdTokenizerFast,google/bigbird-roberta-base,PASSED,
4,bigbird_pegasus,PegasusTokenizerFast,google/bigbird-pegasus-large-arxiv,PASSED,
5,ctrl,CTRLTokenizer,hf-internal-testing/tiny-random-ctrl,PASSED,
6,camembert,CamembertTokenizerFast,camembert-base,PASSED,
7,canine,CanineTokenizer,hf-internal-testing/tiny-random-canine,PASSED,
8,convbert,ConvBertTokenizerFast,YituTech/conv-bert-base,PASSED,
9,deberta,DebertaTokenizerFast,hf-internal-testing/tiny-deberta,PASSED,


## Summary

The `blurr.data.core` module contains the fundamental building blocks for all data preprocessing tasks.

In [None]:
# hide
from nbdev.export import notebook2script

# Run nbdev's notebook-to-script export. It is called with no arguments here;
# the cell output lists each notebook that was converted.
notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
