In [None]:
# default_exp data.core


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to turn your raw datasets into modelable `DataLoaders`

In [None]:
# export
import os, inspect
from dataclasses import dataclass
from functools import reduce, partial
from typing import Any, Callable, List, Optional, Union, Type

from datasets import Dataset, load_dataset, concatenate_datasets
from fastcore.all import *
from fastai.data.block import TransformBlock
from fastai.data.core import Datasets, DataLoader, DataLoaders, TfmdDL
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    PretrainedConfig,
    PreTrainedTokenizerBase,
    PreTrainedModel,
    logging,
)

from blurr.utils import BLURR

logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from fastai.data.block import CategoryBlock, ColReader, ColSplitter, DataBlock, ItemGetter, RandomSplitter
from fastcore.test import *
from nbverbose.showdoc import show_doc

from blurr.utils import print_versions

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.3
transformers: 4.16.2


In [None]:
# hide
# cuda
# Select the GPU used for the remainder of the notebook and report which one is active.
# NOTE(review): the device index is hard-coded to 1, which assumes at least two visible CUDA devices
torch.cuda.set_device(1)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `imdb` to demonstrate how to configure BLURR for sequence classification tasks

In [None]:
# Load the IMDB train and test splits (indexed below as element 0 and element 1) and tag every row
# with an `is_valid` flag: False for the training split, True for the test split
raw_datasets = load_dataset("imdb", split=["train", "test"])
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

# Keep a small shuffled subset (1,000 training rows + 200 validation rows) and view it as a DataFrame.
# NOTE(review): no seed is passed to `shuffle()`; pass `seed=` if the same subset is needed on every run
final_ds = concatenate_datasets([raw_datasets[0].shuffle().select(range(1000)), raw_datasets[1].shuffle().select(range(200))])
imdb_df = pd.DataFrame(final_ds)
imdb_df.head()


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-65b5588450d6b196.arrow


Unnamed: 0,text,label,is_valid
0,This movie was horrible. I swear they didn't even write a script they just kinda winged it through out the whole movie. Ice-T was annoying as hell. *SPOILERS Phht more like reasons not to watch it* They sit down and eat breakfast for 20 minutes. he coulda been long gone. The ground was hard it would of been close to impossible to to track him with out dogs. And when ICE-T is on that Hill and uses that Spaz-15 Assault SHOTGUN like its a sniper rifle (and then cuts down a tree with eight shells?? It would take 1000's of shells to cut down a tree that size.) Shotguns and hand guns are conside...,0,False
1,"I have seen this movie at the cinema many years ago, and one thing surprised me so negatively that I could not see any redeeming virtues in the movies: Dennis Quaid was cast as a policeman that never smiles or grin, while his smile and grin are two of his trademarks. Danny Glover was cast as the bad guy, but - again - most viewers' imagination could not go far enough as to believe him in that role. Also, Jared Leto was not believable as the former medicine student. The tension was just not there, since the killer was known very early. The finale was, again, neither dramatic nor tense: nobo...",0,False
2,"This is a fantastic series first and foremost. It is very well done and very interesting. As a huge WWII buff, I had learned a lot before seeing this series. One of the best things this has going for it is all the interviews with past individuals back when the war was relatively fresh in their minds, comparatively speaking that is. It is nothing against the men that you see getting interviewed in the programs of today, it is just that most of these men weren't really involved in the upper echelons of what was happening then. One of the best parts is the narrating by Sir Laurence Oliver. I ...",1,False
3,Kurosawa really blew it on this one. Every genius is allowed a failure. The concept is fine but the execution is badly blurred.<br /><br />There is an air of fantasy about this film making it something of an art film. The poverty stricken of Tokyo deserve a fairer and more realistic portrayal. Many of them have interesting stories to tell. A very disappointing film.,0,False
4,"MGM were unsure of how to market Garbo when she first arrived in Hollywood. Mayer had a lot of faith in her and her appearance in ""Torrent"" justified that. She did not speak a word of English so she must have found it difficult to work, also Ricardo Cortez did not make it very easy for her.<br /><br />The torrent of the title is the river Juscar that winds through a sleepy little village in Spain. Leonora (Greta Garbo) hopes someday that her voice will bring great wealth and happiness to her struggling parents. Leonora and Don Rafael (Ricardo Cortez) are in love but he is under his mother'...",1,False


In [None]:
# Class names of the `label` feature, read from the training split's feature metadata
labels = raw_datasets[0].features["label"].names
labels


['neg', 'pos']

In [None]:
# The Hugging Face model class to build for this task
model_cls = AutoModelForSequenceClassification

pretrained_model_name = "roberta-base"  # "bert-base-multilingual-cased"
# One output per class name found in the dataset
n_labels = len(labels)

# Fetch the architecture name, config, tokenizer, and model in one call; `num_labels` is passed to the config
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(
    pretrained_model_name, model_cls=model_cls, config_kwargs={"num_labels": n_labels}
)

# Display what was returned (the architecture name plus the types of the other three objects)
hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)


('roberta',
 transformers.models.roberta.configuration_roberta.RobertaConfig,
 transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast,
 transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification)

## Preprocessing

Starting with version 2.0, BLURR provides a preprocessing base class that can be used to build task specific pre-processed datasets from pandas DataFrames or Hugging Face Datasets

In [None]:
# export
class Preprocessor:
    """
    Base class for building task specific, pre-processed datasets from pandas DataFrames or Hugging Face Datasets.

    It stores the tokenizer and tokenization settings, merges optional training/validation sets into a single
    dataset, and exposes `_tokenize_function` for subclasses to tokenize a batch of examples.
    """

    # The column used to flag validation rows when separate training and validation sets are merged.
    # Subclasses may override it per instance (e.g., in their `__init__`); set it to `None` to skip the flag.
    is_valid_attr = "is_valid"

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The attribute holding the text
        text_attr: str = "text",
        # The attribute holding the text_pair
        text_pair_attr: Optional[str] = None,
        # Tokenization kwargs that will be applied with calling the tokenizer
        tok_kwargs: dict = {},
    ):
        self.hf_tokenizer = hf_tokenizer
        self.batch_size = batch_size
        self.text_attr, self.text_pair_attr = text_attr, text_pair_attr
        # keep a private copy so neither the caller's dict nor the shared default is ever aliased
        self.tok_kwargs = dict(tok_kwargs)

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """
        Returns a copy of `training_df`; if `validation_df` is given, its rows are appended and (when
        `is_valid_attr` is set) both parts are flagged with that column (False = training, True = validation).
        The original index values are kept as-is.
        """
        df = training_df.copy()
        if validation_df is None:
            return df

        valid_df = validation_df.copy()
        # add the `is_valid_attr` column to both DataFrames to indicate what data is part of the validation set
        if self.is_valid_attr:
            valid_df[self.is_valid_attr] = True
            df[self.is_valid_attr] = False

        return pd.concat([df, valid_df])

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """
        Returns `training_ds`; if `validation_ds` is given, returns the concatenation of both, with each part
        flagged via the `is_valid_attr` column when that attribute is set.
        """
        if validation_ds is None:
            return training_ds

        # add the `is_valid_attr` column to both Datasets to indicate what data is part of the validation set
        if self.is_valid_attr:
            validation_ds = validation_ds.add_column(self.is_valid_attr, [True] * len(validation_ds))
            training_ds = training_ds.add_column(self.is_valid_attr, [False] * len(training_ds))

        return concatenate_datasets([training_ds, validation_ds])

    def _tokenize_function(self, example):
        """
        Tokenizes a batch of examples (a mapping of column name -> list of values) with `hf_tokenizer`.
        `truncation` defaults to True unless supplied in `tok_kwargs`.
        """
        # work on a copy: popping from `self.tok_kwargs` directly would drop a user supplied `truncation`
        # after the first batch, silently reverting every later batch to `truncation=True`
        tok_kwargs = dict(self.tok_kwargs)
        truncation = tok_kwargs.pop("truncation", True)

        txts = example[self.text_attr]
        txt_pairs = example[self.text_pair_attr] if self.text_pair_attr else None

        return self.hf_tokenizer(txts, txt_pairs, truncation=truncation, **tok_kwargs)


### `ClassificationPreprocessor`

Starting with version 2.0, BLURR provides a sequence classification preprocessing class that can be used to preprocess DataFrames or Hugging Face Datasets.

This class can be used for preprocessing both multiclass and multilabel classification datasets. It adds a `proc_{your_text_attr}` attribute and, optionally, a `proc_{your_text_pair_attr}` attribute containing your text as modified by tokenization (e.g., if you specify a `max_length`, `proc_{your_text_attr}` may contain truncated text).

**Note**: This class works for both slow and fast tokenizers

In [None]:
# export
class ClassificationPreprocessor(Preprocessor):
    """
    Preprocesses DataFrames or Hugging Face Datasets for multiclass or multilabel sequence classification.

    For the text attribute (and the text pair attribute, if one is configured) the result gains
    `{attr}_start_char_idx` / `{attr}_end_char_idx` columns holding the character span covered by the
    tokenized input, and a `proc_{attr}` column holding the text of that span.

    NOTE: this relies on `return_offsets_mapping` and `BatchEncoding.sequence_ids`, which Hugging Face
    documents as available on fast tokenizers.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # Whether the dataset should be processed for multi-label; if True, will ensure `label_attrs` are
        # converted to a value of either 0 or 1 indicating the existence of the class in the example
        is_multilabel: bool = False,
        # The unique identifier in the dataset
        id_attr: Optional[str] = None,
        # The attribute holding the text
        text_attr: str = "text",
        # The attribute holding the text_pair
        text_pair_attr: Optional[str] = None,
        # The attribute holding the label(s) of the example
        label_attrs: Union[str, List[str]] = "label",
        # The attribute that should be created if you are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # A list indicating the valid labels for the dataset (optional, defaults to the unique set of labels
        # found in the full dataset)
        label_mapping: Optional[List[str]] = None,
        # Tokenization kwargs that will be applied with calling the tokenizer
        tok_kwargs: dict = {},
    ):
        # the offsets are needed by `_process_df_batch` to recover the character span that was tokenized
        tok_kwargs = {**tok_kwargs, "return_offsets_mapping": True}
        super().__init__(hf_tokenizer, batch_size, text_attr, text_pair_attr, tok_kwargs)

        self.is_multilabel = is_multilabel
        self.id_attr = id_attr
        self.label_attrs = label_attrs
        self.is_valid_attr = is_valid_attr
        self.label_mapping = label_mapping

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """
        Merges the training/validation DataFrames, normalizes the label columns, and tokenizes the result in
        mini-batches of `batch_size` rows. Returns a new DataFrame with a fresh 0..n-1 index.
        """
        df = super().process_df(training_df, validation_df)

        # convert even single "labels" to a list to make things easier
        label_cols = listify(self.label_attrs)

        # if `is_multilabel`, convert all targets to an int, 0 or 1, rounding floats if necessary
        if self.is_multilabel:
            for label_col in label_cols:
                df[label_col] = df[label_col].apply(lambda v: int(bool(max(0, round(v)))))

        # if a `label_mapping` is included, add a "[label_col]_name" field with the label Ids converted to their label names
        if self.label_mapping:
            for label_col in label_cols:
                df[f"{label_col}_name"] = df[label_col].apply(lambda v: self.label_mapping[v])

        # process df in mini-batches; collect the pieces and concatenate once instead of growing a DataFrame
        # with `DataFrame.append` (quadratic, and removed in pandas 2.x)
        batch_ids = np.arange(len(df)) // self.batch_size
        processed = [self._process_df_batch(batch_df) for _, batch_df in df.groupby(batch_ids)]

        # `pd.concat` rejects an empty list, so an empty input yields an empty DataFrame
        if not processed:
            return pd.DataFrame()

        return pd.concat(processed, ignore_index=True)

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """Same as `process_df` but for Hugging Face Datasets; the work is done via a pandas DataFrame."""
        ds = super().process_hf_dataset(training_ds, validation_ds)
        return Dataset.from_pandas(self.process_df(pd.DataFrame(ds)))

    # ----- utility methods -----
    def _process_df_batch(self, batch_df):
        """
        Tokenizes one mini-batch and adds the `*_start_char_idx`, `*_end_char_idx`, and `proc_*` columns for
        the text attribute and, when configured, the text pair attribute.
        """
        # a 0..n-1 index is required so the column-wise `pd.concat` below aligns row for row; use the
        # non-inplace form so the caller's groupby slice is not mutated
        batch_df = batch_df.reset_index(drop=True)

        # grab our inputs
        inputs = self._tokenize_function(batch_df.to_dict(orient="list"))

        for txt_seq_idx, txt_attr in enumerate([self.text_attr, self.text_pair_attr]):
            if txt_attr is None:
                break

            start_col, end_col = f"{txt_attr}_start_char_idx", f"{txt_attr}_end_char_idx"

            char_idxs = []
            for idx, offset_mapping in enumerate(inputs["offset_mapping"]):
                # keep only the offsets of tokens belonging to this sequence (0 = text, 1 = text pair)
                text_offsets = [offset_mapping[i] for i, seq_id in enumerate(inputs.sequence_ids(idx)) if seq_id == txt_seq_idx]
                if text_offsets:
                    char_idxs.append([min(text_offsets)[0], max(text_offsets)[1]])
                else:
                    # no tokens for this sequence (e.g., an empty string): record an empty span
                    char_idxs.append([0, 0])

            batch_df = pd.concat([batch_df, pd.DataFrame(char_idxs, columns=[start_col, end_col])], axis=1)

            # token offsets are end-exclusive, so the span is `[start:end]`; adding 1 to the end would pull in
            # a character that was never tokenized
            batch_df.insert(0, f"proc_{txt_attr}", batch_df.apply(lambda r: r[txt_attr][r[start_col] : r[end_col]], axis=1))

        # return only after every configured text attribute has been handled; returning inside the loop
        # skips the text pair attribute entirely
        return batch_df


#### Using a `DataFrame`

In [None]:
# Preprocess the DataFrame, limiting each example to 24 tokens so the effect of truncation is visible in `proc_text`
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels, tok_kwargs={"max_length": 24})
proc_df = preprocessor.process_df(imdb_df)
# NOTE(review): the next line is not the cell's last expression, so its value is not displayed
proc_df.columns, len(proc_df)
proc_df.head(2)


Unnamed: 0,proc_text,text,label,is_valid,label_name,text_start_char_idx,text_end_char_idx
0,This movie was horrible. I swear they didn't even write a script they just kinda winged it through out,This movie was horrible. I swear they didn't even write a script they just kinda winged it through out the whole movie. Ice-T was annoying as hell. *SPOILERS Phht more like reasons not to watch it* They sit down and eat breakfast for 20 minutes. he coulda been long gone. The ground was hard it would of been close to impossible to to track him with out dogs. And when ICE-T is on that Hill and uses that Spaz-15 Assault SHOTGUN like its a sniper rifle (and then cuts down a tree with eight shells?? It would take 1000's of shells to cut down a tree that size.) Shotguns and hand guns are conside...,0,False,neg,0,102
1,"I have seen this movie at the cinema many years ago, and one thing surprised me so negatively that I could","I have seen this movie at the cinema many years ago, and one thing surprised me so negatively that I could not see any redeeming virtues in the movies: Dennis Quaid was cast as a policeman that never smiles or grin, while his smile and grin are two of his trademarks. Danny Glover was cast as the bad guy, but - again - most viewers' imagination could not go far enough as to believe him in that role. Also, Jared Leto was not believable as the former medicine student. The tension was just not there, since the killer was known very early. The finale was, again, neither dramatic nor tense: nobo...",0,False,neg,0,106


#### Using a Hugging Face `Dataset`

In [None]:
# Same preprocessing, this time starting from the Hugging Face `Dataset` and using default tokenization kwargs
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_ds = preprocessor.process_hf_dataset(final_ds)
proc_ds


Dataset({
    features: ['proc_text', 'text', 'label', 'is_valid', 'label_name', 'text_start_char_idx', 'text_end_char_idx'],
    num_rows: 1200
})

## Mid-level API

Base tokenization, batch transform, and DataBlock methods

### `TextInput`

A `TextInput` object is returned from the decodes method of `BatchDecodeTransform` as a means to customize `@typedispatch`ed functions like `DataLoaders.show_batch` and `Learner.show_results`. The value will be your "input_ids".

In [None]:
# export
class TextInput(TensorBase):
    """
    The base representation of your inputs; used by the various fastai `show` methods.

    Adds no behavior of its own; it is a distinct `TensorBase` subclass so that inputs can be told apart by type.
    """


### `BatchTokenizeTransform` 

Inspired by this [article](https://docs.fast.ai/tutorial.transformers.html), inputs can come in as raw **text**, **a list of words** (e.g., tasks like Named Entity Recognition (NER), where you want to predict the label of each token), or as a **dictionary** that includes extra information you want to use during post-processing.

**On-the-fly Batch-Time Tokenization**: 

Part of the inspiration for this derives from the mechanics of Hugging Face tokenizers, in particular it can return a collated mini-batch of data given a list of sequences. As such, the collating required for our inputs can be done during tokenization ***before*** our batch transforms run in a `before_batch_tfms` transform (where we get a list of examples)! This allows users of BLURR to have everything done dynamically at batch-time without prior preprocessing with at least four potential benefits:
1. Less code
2. Faster mini-batch creation
3. Less RAM utilization and time spent tokenizing beforehand (this really helps with very large datasets)
4. Flexibility

In [None]:
# export
class BatchTokenizeTransform(Transform):
    """
    Tokenizes a list of raw samples into a mini-batch at batch time.

    `encodes` replaces the first element of every sample with a dictionary of the tokenizer's outputs
    (plus any extra keys supplied with the sample and, optionally, the "labels").
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `BatchTokenizeTransform`
        **kwargs
    ):
        store_attr()
        self.kwargs = kwargs

    def encodes(self, samples, return_batch_encoding=False):
        """
        Tokenizes the raw inputs of a mini-batch on-the-fly, so the full dataset never needs to be tokenized
        ahead of time.

        Each sample is a tuple whose first element is the input (raw text, a pair of texts, a list of words,
        or a dictionary with a "text" key) and whose remaining elements are the targets. Returns the updated
        samples and, if `return_batch_encoding` is True, the tokenizer's output for the whole batch as well.
        """
        samples = L(samples)

        # the first sample decides how every input in the batch is interpreted
        first_inp = samples[0][0]
        is_dict = isinstance(first_inp, dict)
        probe = first_inp["text"] if is_dict else first_inp

        # a list-like input that is not pre-split into words is treated as a (text, text_pair) pair
        is_pair = is_listy(probe) and not self.is_split_into_words

        if is_pair and is_dict:
            texts = [(item["text"][0], item["text"][1]) for item in samples.itemgot(0).items]
        elif is_pair:
            texts = list(zip(samples.itemgot(0, 0), samples.itemgot(0, 1)))
        elif is_dict:
            texts = [item["text"] for item in samples.itemgot(0).items]
        else:
            texts = samples.itemgot(0).items

        inputs = self.hf_tokenizer(
            texts,
            max_length=self.max_length,
            padding=self.padding,
            truncation=self.truncation,
            is_split_into_words=self.is_split_into_words,
            return_tensors="pt",
            **self.tok_kwargs
        )

        enc_keys = inputs.keys()

        # rebuild each sample: tokenized inputs (e.g. input_ids, attention_mask, etc...) for that row, any extra
        # keys that came with a dictionary input, and the unchanged targets
        updated_samples = []
        for idx, sample in enumerate(samples):
            item = {k: inputs[k][idx] for k in enc_keys}
            if is_dict:
                item.update({k: v for k, v in sample[0].items() if k != "text"})

            targets = sample[1:]
            if self.include_labels and len(targets) > 0:
                item["labels"] = targets[0]

            updated_samples.append((item, *targets))

        return (updated_samples, inputs) if return_batch_encoding else updated_samples


### `BatchDecodeTransform`

As of fastai 2.1.5, before batch transforms no longer have a `decodes` method ... and so, I've introduced a standard batch transform here (one that occurs "after" the batch has been created) that will do the decoding for us.

In [None]:
# export
class BatchDecodeTransform(Transform):
    """Casts the "input_ids" of a batch to `input_return_type` so the fastai `show` methods can work with them"""

    def __init__(self, input_return_type: Type = TextInput, **kwargs):
        store_attr()

    def decodes(self, items: dict):
        """Wraps `items["input_ids"]` in `input_return_type` for the show related fastai methods"""
        input_ids = items["input_ids"]
        return self.input_return_type(input_ids)


### `TextBlock`

A basic `DataBlock` for our inputs, `TextBlock` is designed with sensible defaults to minimize user effort in defining their transforms pipeline. It handles setting up your `BatchTokenizeTransform` and `BatchDecodeTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever). 

**Note**: You must either pass in your own instance of a `BatchTokenizeTransform` class or the Hugging Face objects returned from `BLURR.get_hf_objects` (e.g., architecture, config, tokenizer, and model). The other args are optional.

We also include a `blurr_sort_func` that works with `SortedDL` to properly sort based on the number of tokens in each example.

In [None]:
# export
def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """
    Sort key used by `SortedDL`: the length of an example's input measured *after* tokenization.

    Returns the number of words when the input is already split into words, otherwise the number of tokens
    `hf_tokenizer.tokenize` produces for the text.
    """
    first = example[0]
    txt = first["text"] if isinstance(first, dict) else first

    if is_split_into_words:
        return len(txt)

    return len(hf_tokenizer.tokenize(txt, **tok_kwargs))


In [None]:
# export
class TextBlock(TransformBlock):
    """
    The core `TransformBlock` to prepare your inputs for training in Blurr with fastai's `DataBlock` API.

    It wires a `BatchTokenizeTransform` in as the `before_batch` transform, a `BatchDecodeTransform` in as the
    batch transform, and (by default) a `SortedDL` that sorts by tokenized length via `blurr_sort_func`.
    `text_gen_kwargs` and `**kwargs` are accepted but not used by this constructor.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_arch: Optional[str] = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_config: Optional[PretrainedConfig] = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_tokenizer: Optional[PreTrainedTokenizerBase] = None,
        # A Hugging Face model (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_model: Optional[PreTrainedModel] = None,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # The before_batch_tfm you want to use to tokenize your raw data on the fly
        # (defaults to an instance of `BatchTokenizeTransform`)
        batch_tokenize_tfm: Optional[BatchTokenizeTransform] = None,
        # The batch_tfm you want to decode your inputs into a type that can be used in the fastai show methods,
        # (defaults to BatchDecodeTransform)
        batch_decode_tfm: Optional[BatchDecodeTransform] = None,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: Optional[int] = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = TextInput,
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type: Optional[DataLoader] = None,
        # Any keyword arguments you want applied to your `batch_tokenize_tfm`
        batch_tokenize_kwargs: dict = {},
        # Any keyword arguments you want applied to your `batch_decode_tfm` (will be set as a fastai `batch_tfms`)
        batch_decode_kwargs: dict = {},
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs: dict = {},
        # Any keyword arguments you want to have applied with generating text
        text_gen_kwargs: dict = {},
        # Any keyword arguments you want applied to `TextBlock`
        **kwargs
    ):
        # either the four Hugging Face objects or a ready-made tokenize transform must be provided
        hf_objects_supplied = all([hf_arch, hf_config, hf_tokenizer, hf_model])
        if not hf_objects_supplied and batch_tokenize_tfm is None:
            raise ValueError("You must supply an hf_arch, hf_config, hf_tokenizer, hf_model -or- a BatchTokenizeTransform")

        # build the default before-batch tokenization transform; dict arguments are copied so the
        # transform never shares state with the caller's dictionaries
        if batch_tokenize_tfm is None:
            batch_tokenize_tfm = BatchTokenizeTransform(
                hf_arch,
                hf_config,
                hf_tokenizer,
                hf_model,
                include_labels=include_labels,
                ignore_token_id=ignore_token_id,
                max_length=max_length,
                padding=padding,
                truncation=truncation,
                is_split_into_words=is_split_into_words,
                tok_kwargs=tok_kwargs.copy(),
                **batch_tokenize_kwargs.copy()
            )

        # build the default decode transform used by the fastai show methods
        if batch_decode_tfm is None:
            batch_decode_tfm = BatchDecodeTransform(input_return_type=input_return_type, **batch_decode_kwargs.copy())

        # default to a `SortedDL` whose sort key uses the same tokenizer settings as the tokenize transform
        if dl_type is None:
            sort_key = partial(
                blurr_sort_func,
                hf_tokenizer=batch_tokenize_tfm.hf_tokenizer,
                is_split_into_words=batch_tokenize_tfm.is_split_into_words,
                tok_kwargs=batch_tokenize_tfm.tok_kwargs.copy(),
            )
            dl_type = partial(SortedDL, sort_func=sort_key)

        super().__init__(dl_type=dl_type, dls_kwargs={"before_batch": batch_tokenize_tfm}, batch_tfms=batch_decode_tfm)


## Low-level API

For working with PyTorch and/or fast.ai Datasets & DataLoaders, the low-level API allows you to get back fast.ai specific features such as `show_batch`, `show_results`, etc... when using plain ol' PyTorch Datasets, Hugging Face Datasets, etc...

In [None]:
# export
@dataclass
class BlurrBatchCreator:
    """
    A class that can be assigned to a `TfmdDL.create_batch` method; used in Blurr's low-level API
    to create batches that can be used in the Blurr library
    """

    def __init__(
        self,
        # Your Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # Defaults to use Hugging Face's DataCollatorWithPadding(tokenizer=hf_tokenizer)
        data_collator: Type = None,
    ):
        self.hf_tokenizer = hf_tokenizer
        self.data_collator = data_collator if (data_collator) else DataCollatorWithPadding(tokenizer=hf_tokenizer)

    def __call__(self, features):  # A mini-batch (list of examples to run through your model)
        """
        Collates `features` with `self.data_collator`.

        When the examples are dictionaries, returns `(inputs, targets)` if they define "labels" (as is the case
        with most Hugging Face datasets), or just the `inputs` dictionary if they do not. For any other kind
        of example the collated batch is returned unchanged.
        """
        batch = self.data_collator(features)
        if not isinstance(features[0], dict):
            return batch

        inputs = dict(batch)
        # Branch explicitly: in `a, b if cond else c` the conditional binds only to `b`, so the one-line
        # form returns `(inputs, inputs)` for label-free features rather than the inputs alone
        if "labels" in features[0]:
            return inputs, batch["labels"]

        return inputs


In [None]:
# export
class BlurrBatchDecodeTransform(BatchDecodeTransform):
    """A class used to cast your inputs into something understandable in fastai `show` methods

    The constructor arguments are stored on the instance via `store_attr()`, and any extra keyword
    arguments are kept in `self.kwargs` (e.g., `show_batch` looks for a `labels` entry there).
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_arch: Optional[str] = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_config: Optional[PretrainedConfig] = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_tokenizer: Optional[PreTrainedTokenizerBase] = None,
        # A Hugging Face model (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_model: Optional[PreTrainedModel] = None,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID to ignore when calculating loss/metrics
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Any text generation keyword arguments
        text_gen_kwargs: dict = {},
        # The return type your decoded inputs should be cast too (used by methods such as `show_batch`)
        input_return_type: Type = TextInput,
        # Any other keyword arguments you need to pass to `BatchDecodeTransform`
        **kwargs
    ):
        super().__init__(input_return_type=input_return_type)

        # The `{}` defaults above are created once, at definition time. Storing them directly would make
        # every instance share (and potentially mutate) the same dictionary, so each instance gets its
        # own shallow copy instead. `None` is tolerated and treated as "no extra arguments".
        tok_kwargs = dict(tok_kwargs or {})
        text_gen_kwargs = dict(text_gen_kwargs or {})

        # must run after the copies above so the per-instance dictionaries are the ones stored
        store_attr()
        self.kwargs = kwargs


In [None]:
# export
@delegates()
class BlurrDataLoader(TfmdDL):
    """A fast.ai `TfmdDL` pre-wired with the batch creator and batch decoding transform Blurr needs"""

    def __init__(
        self,
        # A standard PyTorch Dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_arch: str,
        # A Hugging Face configuration object (not required if passing in an instance of `BatchTokenizeTransform`
        # to `before_batch_tfm`)
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer (not required if passing in an instance of `BatchTokenizeTransform` to
        # `before_batch_tfm`)
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model (not required if passing in an instance of `BatchTokenizeTransform` to
        # `before_batch_tfm`)
        hf_model: PreTrainedModel,
        # An instance of `BlurrBatchCreator` or equivalent (defaults to `BlurrBatchCreator`)
        batch_creator: Optional[BlurrBatchCreator] = None,
        # The batch_tfm used to decode Blurr batches (defaults to `BlurrBatchDecodeTransform`)
        batch_decode_tfm: Optional[BlurrBatchDecodeTransform] = None,
        # (optional) A preprocessing function that will be applied to your dataset
        preproccesing_func: Callable[
            [Union[torch.utils.data.dataset.Dataset, Datasets], PreTrainedTokenizerBase, PreTrainedModel],
            Union[torch.utils.data.dataset.Dataset, Datasets],
        ] = None,
        # Keyword arguments to be applied to your `batch_decode_tfm`
        batch_decode_kwargs: dict = {},
        # Keyword arguments to be applied to `BlurrDataLoader`
        **kwargs,
    ):
        # optionally let the caller reshape the dataset before it is handed to fast.ai
        if preproccesing_func:
            dataset = preproccesing_func(dataset, hf_tokenizer, hf_model)

        # `create_batch` and `after_batch` are always supplied by this class (see the `super().__init__`
        # call below), so any values arriving through `kwargs` are dropped.
        # NOTE(review): `new()` forwards the current loader's attributes as keyword arguments, so a loader
        # rebuilt that way appears to get a default creator/decode transform rather than the current
        # ones - confirm this is intended.
        kwargs.pop("create_batch", None)
        kwargs.pop("after_batch", None)

        batch_creator = batch_creator or BlurrBatchCreator(hf_tokenizer=hf_tokenizer)
        batch_decode_tfm = batch_decode_tfm or BlurrBatchDecodeTransform(
            hf_arch, hf_config, hf_tokenizer, hf_model, **batch_decode_kwargs.copy()
        )

        super().__init__(dataset=dataset, create_batch=batch_creator, after_batch=batch_decode_tfm, **kwargs)
        store_attr(names="hf_arch, hf_config, hf_tokenizer, hf_model")

    def new(
        self,
        # A standard PyTorch and fastai dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets] = None,
        # The class you want to create an instance of (will be "self" if None)
        cls: Type = None,
        #  Any additional keyword arguments you want to pass to the __init__ method of `cls`
        **kwargs,
    ):
        """Create a new loader of type `cls` (this loader's type by default) over `dataset` (this
        loader's dataset by default), re-using this loader's settings.

        The override exists so the Hugging Face objects required by `__init__` are added back in by this
        factory method (called for example in places like `show_results`). According to the original
        author's note, the remaining logic is reproduced as is from the fast.ai `new` method.
        """
        dataset = self.dataset if dataset is None else dataset
        cls = type(self) if cls is None else cls

        cur_kwargs = dict(
            dataset=dataset,
            num_workers=self.fake_l.num_workers,
            pin_memory=self.pin_memory,
            timeout=self.timeout,
            bs=self.bs,
            shuffle=self.shuffle,
            drop_last=self.drop_last,
            indexed=self.indexed,
            device=self.device,
        )

        # carry over every entry of `_methods` that is stored as a plain attribute rather than a bound method
        candidates = ((name, getattr(self, name)) for name in self._methods)
        cur_kwargs.update({name: value for name, value in candidates if not isinstance(value, MethodType)})

        # we need to add these arguments back in; they always take this loader's values
        kwargs.update(
            hf_arch=self.hf_arch,
            hf_config=self.hf_config,
            hf_tokenizer=self.hf_tokenizer,
            hf_model=self.hf_model,
        )

        return cls(**merge(cur_kwargs, kwargs))


## Utility classes and methods 

These methods are used internally for getting the Blurr transforms associated with your `DataLoaders`

In [None]:
# export
def get_blurr_tfm(
    # A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)
    tfms_list: Pipeline,
    # The transform to find
    tfm_class: Transform = BatchTokenizeTransform,
):
    """
    Scan `tfms_list` in order and return the first transform whose type is `tfm_class` (or a subclass
    of it). Returns `None` when nothing matches.
    """
    for candidate in tfms_list:
        if issubclass(type(candidate), tfm_class):
            return candidate

    return None


In [None]:
show_doc(get_blurr_tfm)


<h4 id="get_blurr_tfm" class="doc_header"><code>get_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_blurr_tfm</code>(**`tfms_list`**:`Pipeline`, **`tfm_class`**:`Transform`=*`BatchTokenizeTransform`*)

Given a fastai DataLoaders batch transforms, this method can be used to get at a transform
instance used in your Blurr DataBlock

**Parameters:**


 - **`tfms_list`** : *`<class 'fastcore.transform.Pipeline'>`*	<p>A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)</p>


 - **`tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The transform to find</p>



In [None]:
# export
def first_blurr_tfm(
    # Your fast.ai `DataLoaders`
    dls: DataLoaders,
    # The Blurr transforms to look for in order
    tfms: List[Transform] = [BatchTokenizeTransform, BatchDecodeTransform, BlurrBatchDecodeTransform],
):
    """
    Find the Blurr transform that methods such as `show_batch` and `show_results` rely on to decode
    and 'show' your Hugging Face inputs/targets.

    The classes in `tfms` are tried in order; for each one, `dls.before_batch` is searched before
    `dls.after_batch`. The first truthy match is returned, or `None` if no class is found.
    """
    for tfm_cls in tfms:
        for pipeline_name in ("before_batch", "after_batch"):
            match = get_blurr_tfm(getattr(dls, pipeline_name), tfm_class=tfm_cls)
            if match:
                return match

    return None


In [None]:
show_doc(first_blurr_tfm)


<h4 id="first_blurr_tfm" class="doc_header"><code>first_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>first_blurr_tfm</code>(**`dls`**:`DataLoaders`, **`tfms`**:`List`\[`Transform`\]=*`[<class '__main__.BatchTokenizeTransform'>, <class '__main__.BatchDecodeTransform'>, <class '__main__.BlurrBatchDecodeTransform'>]`*)

This convenience method will find the first Blurr transform required for methods such as
`show_batch` and `show_results`. The returned transform should have everything you need to properly
decode and 'show' your Hugging Face inputs/targets

**Parameters:**


 - **`dls`** : *`<class 'fastai.data.core.DataLoaders'>`*	<p>Your fast.ai `DataLoaders</p>


 - **`tfms`** : *`typing.List[fastcore.transform.Transform]`*, *optional*	<p>The Blurr transforms to look for in order</p>



## `show_batch`

In [None]:
# export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `TextInput` typed inputs
    x: TextInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation your want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    """Decode up to `max_n` inputs back into text and display them, alongside their targets, as a DataFrame.

    The tokenizer, and the optional `labels` list used to turn target ids into names, are taken from the
    first Blurr transform found on `dataloaders`. Returns `ctxs` unchanged; if there is nothing to show,
    nothing is displayed.
    """
    # grab our tokenizer
    tfm = first_blurr_tfm(dataloaders)
    hf_tokenizer = tfm.hf_tokenizer

    # if we've included our labels list, we'll use it to look up the value of our target(s)
    trg_labels = tfm.kwargs["labels"] if ("labels" in tfm.kwargs) else None

    res = L()
    n_inp = dataloaders.n_inp

    for idx, (input_ids, label, sample) in enumerate(zip(x, y, samples)):
        if idx >= max_n:
            break

        # each row starts with exactly one text column, truncated only after decoding
        rets = [hf_tokenizer.decode(input_ids, skip_special_tokens=True)[:trunc_at]]
        for item in sample[n_inp:]:
            if not torch.is_tensor(item):
                trg = trg_labels[int(item)] if trg_labels else item
            elif is_listy(item.tolist()):
                # multi-label target: keep the names of the positions flagged with 1
                trg = (
                    [trg_labels[lbl_idx] for lbl_idx, val in enumerate(label.numpy().tolist()) if (val == 1)]
                    if (trg_labels)
                    else label.numpy()
                )
            else:
                trg = trg_labels[label.item()] if (trg_labels) else label.item()

            rets.append(trg)
        res.append(tuplify(rets))

    # nothing was collected (e.g., empty batch or `max_n=0`): avoid indexing into an empty result
    if len(res) == 0:
        return ctxs

    # every row holds one text column followed by its targets, so the number of target columns is the
    # row length minus one regardless of `n_inp`
    n_trg_cols = len(res[0]) - 1
    cols = ["text"] + ["target" if (i == 0) else f"target_{i}" for i in range(n_trg_cols)]
    display_df(pd.DataFrame(res, columns=cols)[:max_n])
    return ctxs


## Examples

The following examples demonstrate several approaches to constructing your `DataBlock` for sequence classification tasks using the mid-level API, as well as an example of how to accomplish the same using the low-level API and standard PyTorch/Hugging Face/fast.ai Datasets and DataLoaders.

### Using the mid-level API

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects.

There are a bunch of ways we can get at the four Hugging Face elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR`.

In [None]:
# hide_output
from transformers import AutoModelForSequenceClassification

model_cls = AutoModelForSequenceClassification

pretrained_model_name = "distilroberta-base"  # "distilbert-base-uncased" "bert-base-uncased"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


#####  Step 2: Create your `DataBlock`

In [None]:
blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, batch_tokenize_kwargs={"labels": labels}), CategoryBlock)
dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)


In [None]:
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])


(2, 4, torch.Size([4, 512]), 4)

In [None]:
b[0]

{'input_ids': tensor([[    0,  6142,    54,  ...,  6717,   619,     2],
         [    0,    38,   269,  ...,     4,   616,     2],
         [    0,   870,   659,  ...,    21, 35718,     2],
         [    0,    83,   333,  ...,    66, 35341,     2]], device='cuda:1'),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]], device='cuda:1'),
 'labels': TensorCategory([0, 0, 0, 0], device='cuda:1')}

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)


{tuple: [dict, fastai.torch_core.TensorCategory]}

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"Anyone who visited drive-ins in the 1950s, 60s, and 70s, must have seen a film or two by American International Pictures, a distributor that resembled 1980s giant Cannon Films. Wherever movie-goers ventured, AIP would be right there to supply the latest en vogue titles - in the 50s came horror movies like 'Voodoo Woman' and 'The Undead;' in the 60s were Frankie Avalon-Annette Funicello beach comedies and biker flicks like 'The Glory Stompers;' and into the 70s, AIP churned out grindhouse-level",neg
1,"(I'll indicate in this review the point where spoilers begin.) My dissatisfaction is split: 30% tone-deafness, 70% lackluster writing.<br /><br />The 30%: I agree with the first commenter's synopsis about the lack of diversity in the characters and scope of the stories. I was surprised how, this film, at best, woefully shortchanges the real NYC by presenting a collection of people and relationships so narrow as to come across as if it's inhabited only by the cast of Gossip Girl (this is coming",neg


#### Using a preprocessed dataset

Preprocessing your raw data is the more traditional approach to using Transformers. It is required, for example, when you want to work with documents longer than your model will allow. A preprocessed dataset is used in the same way a non-preprocessed dataset is.

##### Step 1a: Get your Hugging Face objects.

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


##### Step 1b. Preprocess dataset

In [None]:
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_ds = preprocessor.process_hf_dataset(final_ds)
proc_ds


Dataset({
    features: ['proc_text', 'text', 'label', 'is_valid', 'label_name', 'text_start_char_idx', 'text_end_char_idx'],
    num_rows: 1200
})

##### Step 2: Create your `DataBlock`

In [None]:
blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, batch_tokenize_kwargs={"labels": labels}), CategoryBlock)
dblock = DataBlock(blocks=blocks, get_x=ItemGetter("proc_text"), get_y=ItemGetter("label"), splitter=RandomSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(proc_ds, bs=4)


In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"This TV production of 1970 starring Susannah York and George C. Scott is another proof of how difficult it is to adopt ""Jane Eyre"" to the screen, and how much can go wrong in doing so. It is true that the movie suffered in the transfer to DVD - some scenes which were complete in the original were shortened and so badly edited that there are striking continuity gaps and that even one crucial scene between Jane and Rochester starts in the middle of a sentence! But even if the editing were better,",neg
1,"From the start, you know this is a Sam Sherman film more than an Al Adamson film because as the credits roll, ""A Sam Sherman Production"" appears in letters as big as the title credit. Not only that, Mr. Sherman co-wrote the screenplay and it was his idea to use Bob Livingstone, a washed-up, 69 year old Western star of the old Hollywood era to be his male lead in a picture that Sherman thought would capitalize on the recent success of ""Swinging Stewardesses"". <br /><br />Now why would you want t",neg


#### Passing extra information

As of v.2, BLURR now also allows you to pass extra information alongside your inputs in the form of a dictionary.  If you use this approach, you must assign your text(s) to the `text` attribute of the dictionary.  This is a useful approach when splitting long documents into chunks, but wanting to score/predict by example rather than chunk (for example in extractive question answering tasks).

**Note**: A good place to access this extra information during training/validation is in the `before_batch` method of a `Callback`.

In [None]:
blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, batch_tokenize_kwargs={"labels": labels}), CategoryBlock)

def get_x(item):
    """Wrap the row's text in a dictionary so an extra value can travel alongside the input."""
    return dict(text=item.text, another_val="testing123")

dblock = DataBlock(blocks=blocks, get_x=get_x, get_y=ColReader("label"), splitter=ColSplitter())

In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)

In [None]:
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])

(2, 4, torch.Size([4, 512]), 4)

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)

Unnamed: 0,text,target
0,"Anyone who visited drive-ins in the 1950s, 60s, and 70s, must have seen a film or two by American International Pictures, a distributor that resembled 1980s giant Cannon Films. Wherever movie-goers ventured, AIP would be right there to supply the latest en vogue titles - in the 50s came horror movies like 'Voodoo Woman' and 'The Undead;' in the 60s were Frankie Avalon-Annette Funicello beach comedies and biker flicks like 'The Glory Stompers;' and into the 70s, AIP churned out grindhouse-level",neg
1,"Cult of the Cobra is now available on DVD in a pristine print that does full justice to whatever merits it has as a movie. Unfortunately, that is not saying much.<br /><br />It has a competent cast of second-rankers that acquit themselves as well as could be expected under the circumstances. It is efficiently directed, entirely on sound stages and standing sets on the studio backlot. It looks OK, but is ponderously over-plotted and at a scant 80 minutes it is still heavily padded.<br /><br />Fo",neg


### Using the low-level API

#### Step 1: Build your datasets

In [None]:
raw_datasets = load_dataset("glue", "mrpc")


Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def tokenize_function(example):
    """Tokenize `sentence1` and `sentence2` together as a pair, with truncation enabled."""
    first, second = example["sentence1"], example["sentence2"]
    return hf_tokenizer(first, second, truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Loading cached processed dataset at /home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-6e3b62256f7dcee8.arrow
Loading cached processed dataset at /home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-a37d5e554eb61497.arrow
Loading cached processed dataset at /home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-83f60d5306f74c84.arrow


#### Step 2: Dataset pre-processing (optional)

In [None]:
# export
def preproc_hf_dataset(
    # A standard PyTorch Dataset or fast.ai Datasets
    dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A Hugging Face model
    hf_model: PreTrainedModel,
):
    """This method can be used to preprocess most Hugging Face Datasets for use in Blurr and other training
    libraries

    Steps: rename a `label` column to `labels`, drop every column that is not a parameter of
    `hf_model.forward`, and set the dataset's format to `"torch"`. The processed dataset is returned.

    `dataset` must provide the `column_names`, `rename_column`, `remove_columns`, and `set_format` members
    used below. `hf_tokenizer` is not used here; it is accepted so the function matches the three-argument
    `preproccesing_func` callback that `BlurrDataLoader` invokes.
    """
    if "label" in dataset.column_names:
        dataset = dataset.rename_column("label", "labels")

    hf_model_fwd_args = set(inspect.signature(hf_model.forward).parameters)

    # `remove_columns` is documented as taking a column name or a list of names, so hand it an ordered
    # list (in the dataset's own column order) instead of an unordered set
    bad_cols = [col for col in dataset.column_names if col not in hf_model_fwd_args]
    dataset = dataset.remove_columns(bad_cols)

    dataset.set_format("torch")
    return dataset


#### Step 3: Build your `DataLoaders`.

Use `BlurrDataLoader` to build Blurr friendly dataloaders from your datasets. Passing `{'labels': label_names}` to `batch_decode_kwargs` will ensure that your label/target names are displayed in methods like `show_batch` and `show_results` (just as it works with the mid-level API)

In [None]:
# the class names let `show_batch` display readable targets instead of label ids
label_names = raw_datasets["train"].features["label"].names

# training loader: shuffled, smaller batches
trn_dl = BlurrDataLoader(
    dataset=tokenized_datasets["train"],
    hf_arch=hf_arch,
    hf_config=hf_config,
    hf_tokenizer=hf_tokenizer,
    hf_model=hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_decode_kwargs={"labels": label_names},
    shuffle=True,
    batch_size=8,
)

# validation loader: same configuration, unshuffled, larger batches
val_dl = BlurrDataLoader(
    dataset=tokenized_datasets["validation"],
    hf_arch=hf_arch,
    hf_config=hf_config,
    hf_tokenizer=hf_tokenizer,
    hf_model=hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_decode_kwargs={"labels": label_names},
    batch_size=16,
)

dls = DataLoaders(trn_dl, val_dl)


In [None]:
b = dls.one_batch()
b[0]["input_ids"].shape


torch.Size([8, 71])

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=800)


Unnamed: 0,text,target
0,"It will also unveil a version of its Windows Server 2003 operating system tuned specifically for storage devices. It also unveiled an update to its Windows Server 2003 operating system, which is tuned specifically for storage devices.",equivalent
1,"Country-music station KKCS has suspended two disc jockeys for playing songs by the Dixie Chicks in violation of a ban imposed after one group member criticized President George Bush. A radio station has suspended two disc jockeys for locking themselves in the studio and continuously playing Dixie Chicks songs, violating the station's two-month-old ban on the group's music.",not_equivalent


## Tests

The tests below ensure the core DataBlock code above works for **all** pretrained sequence classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# hide
[model_type for model_type in BLURR.get_models(task="SequenceClassification") if (not model_type.startswith("TF"))]


['AlbertForSequenceClassification',
 'BartForSequenceClassification',
 'BertForSequenceClassification',
 'BigBirdForSequenceClassification',
 'BigBirdPegasusForSequenceClassification',
 'CTRLForSequenceClassification',
 'CamembertForSequenceClassification',
 'CanineForSequenceClassification',
 'ConvBertForSequenceClassification',
 'DebertaForSequenceClassification',
 'DebertaV2ForSequenceClassification',
 'DistilBertForSequenceClassification',
 'ElectraForSequenceClassification',
 'FNetForSequenceClassification',
 'FlaubertForSequenceClassification',
 'FunnelForSequenceClassification',
 'GPT2ForSequenceClassification',
 'GPTJForSequenceClassification',
 'GPTNeoForSequenceClassification',
 'HubertForSequenceClassification',
 'IBertForSequenceClassification',
 'LEDForSequenceClassification',
 'LayoutLMForSequenceClassification',
 'LayoutLMv2ForSequenceClassification',
 'LongformerForSequenceClassification',
 'MBartForSequenceClassification',
 'MPNetForSequenceClassification',
 'MegatronB

In [None]:
# hide
pretrained_model_names = [
    "hf-internal-testing/tiny-albert",
    "hf-internal-testing/tiny-random-bart",
    "hf-internal-testing/tiny-bert",
    "google/bigbird-roberta-base",
    "google/bigbird-pegasus-large-arxiv",
    "hf-internal-testing/tiny-random-ctrl",
    "camembert-base",
    "hf-internal-testing/tiny-random-canine",
    "YituTech/conv-bert-base",
    "hf-internal-testing/tiny-deberta",
    "hf-internal-testing/tiny-random-deberta-v2",
    "hf-internal-testing/tiny-random-distilbert",
    "hf-internal-testing/tiny-electra",
    "google/fnet-base",
    "hf-internal-testing/tiny-random-flaubert",
    "hf-internal-testing/tiny-random-funnel",
    "hf-internal-testing/tiny-random-gpt2",
    "anton-l/gpt-j-tiny-random",
    "hf-internal-testing/tiny-random-gpt_neo",
    "kssteven/ibert-roberta-base",
    "hf-internal-testing/tiny-random-led",
    "hf-internal-testing/tiny-random-longformer",
    "hf-internal-testing/tiny-random-mbart",
    "hf-internal-testing/tiny-random-mpnet",
    # "nvidia/megatron-bert-cased-345m",                 could not test
    "hf-internal-testing/tiny-random-mobilebert",
    "openai-gpt",
    "google/reformer-crime-and-punishment",
    "google/rembert",
    "junnyu/roformer_chinese_sim_char_ft_small",
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    "hf-internal-testing/tiny-random-transfo-xl",
    "xlm-mlm-en-2048",
    "xlm-roberta-base",
    "xlnet-base-cased",
]


In [None]:
# hide
# for model_name in pretrained_model_names:
#     tok = AutoTokenizer.from_pretrained(model_name)
#     print(f'=== {model_name} ===')
#     print(f'=== {tok.padding_side} ===')
#     print(f'=== {tok.pad_token_id} ===')
#     print(tok(['hi', 'hello everyone. its good to be here'], ['yo', 'yo'], padding='max_length', max_length=128))


In [None]:
# hide
raw_datasets = load_dataset("imdb", split=["train", "test"])
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

final_ds = concatenate_datasets([raw_datasets[0].shuffle().select(range(1000)), raw_datasets[1].shuffle().select(range(200))])
imdb_df = pd.DataFrame(final_ds)


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# hide
from transformers import RobertaTokenizer

model_cls = AutoModelForSequenceClassification
bsz = 2
seq_sz = 128

# Smoke test: for every checkpoint in `pretrained_model_names`, build DataLoaders via `TextBlock` and check
# the shape of one batch. One (architecture, tokenizer class, model name, status, error) tuple is recorded
# per checkpoint in `test_results`.
test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    # the "/ibert" checkpoint is loaded with an explicit RobertaTokenizer; all others pass tokenizer_cls=None
    tok_class = RobertaTokenizer if ("/ibert" in model_name) else None

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls, tokenizer_cls=tok_class)

    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        # the vocabulary grew by one token, so resize the embedding matrix to match
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    try:
        # padding="max_length" with max_length=seq_sz, so each batch of inputs is expected to be [bsz, seq_sz]
        blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz), CategoryBlock)

        dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())
        dls = dblock.dataloaders(imdb_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)

        dls.show_batch(dataloaders=dls, max_n=2, trunc_at=1000)

    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "FAILED", err))

    else:
        # record success only after every step (including show_batch) completed, so a checkpoint can never
        # end up with both a PASSED and a FAILED row
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", ""))


=== hf-internal-testing/tiny-albert ===

architecture:	albert
tokenizer:	AlbertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"some have praised atlantis:thelostempire as a disney adventure for adults. i don't think so--at least not for thinking adults.br /br /this script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. the ""crack staff"" of many older adventure movies has been done well before, (think the dir",0
1,"hollywood now has officially gone too far and i really hope that this travesty of a motion picture creates a genuine backlash against their crap machines, in spite of the good box office returns. if you are an industry person reading our comments looking for hints on what to do next, stop. stop making our tv shows into these repellent, stupid, money grubbing waste of time movies that suck. by doing so you are proving one thing: hollywood is out of ideas, and going to see the movies they chu",0


=== hf-internal-testing/tiny-random-bart ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to",0
1,"You cannot deny that we have an affinity for speed. That's why movies like Fast and the Furious, Dhoom, Rempit get made to play to the satisfaction of audiences, especially local ones. We live on a tiny island, and I cannot fathom why, for the relative efficiency of the public transportation system, most of",0


=== hf-internal-testing/tiny-bert ===

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"some have praised _ atlantis : _ the _ lost _ empire _ as a disney adventure for adults. i don't think so - - at least not for thinking adults. < br / > < br / > this script suggests a beginning as a live - action movie, that struck someone as the type of crap you cannot sell to adults anymore. the "" crack staff "" of many older adventure movies has been done well before, ( think _ the dirty dozen _ ) but _ atlantis _ represents one of the worse films in that motif. the characters are weak. even the background that each member trots out seems stock and awkward",0
1,"before i begin, you need to know that i am a huge fan of many of sonny chiba's films. his biographical series of the life of his master, mas oyama, were amazing and among the best martial arts films ever made, as were most of his street fighter films. the action was practically non - stop and with the possible exception of bruce lee ( depending on who you ask ), he was the greatest martial arts practitioner on film during the 1970s. because they are so good, i've seen at least 15 of his films and recently bought some more ( which i am in the process of watching ).",0


=== google/bigbird-roberta-base ===



normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


architecture:	big_bird
tokenizer:	BigBirdTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _The Dirty Dozen_) but _Atlantis_ represents one of the worse films in that motif. The characters are weak. Even the background that each member trots out seems stock and awkward",0
1,"Before I begin, you need to know that I am a huge fan of many of Sonny Chiba's films. His biographical series of the life of his master, Mas Oyama, were amazing and among the best martial arts films ever made, as were most of his Street Fighter films. The action was practically non-stop and with the possible exception of Bruce Lee (depending on who you ask), he was the greatest martial arts practitioner on film during the 1970s. Because they are so good, I've seen at least 15 of his films and recently bought some more (which I am in the process of watching).<",0


=== google/bigbird-pegasus-large-arxiv ===

architecture:	bigbird_pegasus
tokenizer:	PegasusTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.br />br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _The Dirty Dozen_) but _Atlantis_ represents one of the worse films in that motif. The characters are weak. Even the background that each member trots out seems stock",0
1,"An independent feature can now be seen as both a work of film art and a video resume. Enter Broken, and aggressively promoted, twenty minute short with style and enthusiams to spare. But is it any good as a film, or does it only work as a demo piece? Ah, there in lies the rub.br />br />Broken is the story of Bonnie Clayton who is abducted after awakening from a reoccurring nightmare one night by ""a sadistic stranger and his colorful entourage"" (quote from the video box). As she's held captive, it becomes obvious that her abductors know things about her that",1


=== hf-internal-testing/tiny-random-ctrl ===



  angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
Using pad_token, but it is not set yet.


architecture:	ctrl
tokenizer:	CTRLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _The Dirty Dozen_) but _Atlantis_ represents one of the worse films in that motif. The characters are weak. Even the background that each member trots out seems stock and awkward at best. An MD/Medicine Man, a tomboy mechanic whose father",0
1,"There is no doubt that during the decade of the 30s, the names of Boris Karloff and Bela Lugosi became a sure guarantee of excellent performances in high quality horror films. After being Universal's ""first monster"" in the seminal classic, ""Dracula"", Bela Lugosi became the quintessential horror villain thanks to his elegant style and his foreign accent (sadly, this last factor would also led him to be type-casted during the 40s). In the same way, Boris Karloff's performance in James Whale's ""Frankenstein"" transformed him into the man to look for when one wanted a good monster. Of course, it was only natural for these icons to end up sharing the screen, and the movie",1


=== camembert-base ===

architecture:	camembert
tokenizer:	CamembertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before",0
1,"Here's another movie that should be loaded into a satellite, fired into space and pointed in the direction of the galaxy Andromeda to show distant possible civilizations the best of humanity. This movie is so endearingly stupid and revealingly honest in being little more than a rip-off of the already bad movie classic KING KONG from 1976 that it not only manages to upstage that film in terms of sheer belly laugh idiot",1


=== hf-internal-testing/tiny-random-canine ===



Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


architecture:	canine
tokenizer:	CanineTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking ad,0
1,Okay. Who was it? Who gave Revolver 10 out of 10? Are you tripping of your head on Ecstasy pipes? There were so many of you. D,0


=== YituTech/conv-bert-base ===

architecture:	convbert
tokenizer:	ConvBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"some have praised _ atlantis : _ the _ lost _ empire _ as a disney adventure for adults. i don't think so - - at least not for thinking adults. < br / > < br / > this script suggests a beginning as a live - action movie, that struck someone as the type of crap you cannot sell to adults anymore. the "" crack staff "" of many older adventure movies has been done well before, ( think _ the dirty dozen _ ) but _ atlantis _ represents one of the worse films in that motif. the characters are weak. even the background that each member trots out seems stock and awkward",0
1,"the lady from shanghai is weird even by the standards of its eminent director, orson welles, whose last hollywood film this was for many a moon. it's a kind of post - modern film noir made during the period when more conventional films of this type were quite popular, and it concerns a happy go lucky irish sailor ( played by welles ) who falls in with a mysterious lady ( rita hayworth, who was married to welles at the time ), and her crippled, and probably impotent husband, played with a brainy, malevolent gusto by everett sloan. a long sea voyage follows, with welles in",1


=== hf-internal-testing/tiny-deberta ===

architecture:	deberta
tokenizer:	DebertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.br /br /This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies",0
1,"... And that's a bad thing, because at least if this had been a Troma film, it would have had wanton violence and a greater sense of anarchic abandon that might have brought my rating up a bit.br /br /So what we have instead is a very tame (rated PG), barely lukewarm, low budget (Roger Corman produced it with an un",0


=== hf-internal-testing/tiny-random-deberta-v2 ===

architecture:	deberta_v2
tokenizer:	DebertaV2Tokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _The Dirty Dozen_) but _Atlantis_ represents one of the worse films in that motif. The characters are weak. Even the background that each member trots out seems stock and",0
1,"(Rating: 21 by The Film Snob.) (See our blog What-To-See-Next for details on our rating system.)<br /><br />Here's a movie that will have you clawing at your own face in an attempt to earn release from the on-screen tedium. <br /><br />You'll not be wringing your hands, nor rolling your eyes, nor sighing into your popcorn. No indeed. For a movie of *this* averagousity, only clawing at your own face will do. <br /><br />When you begin to claw your own face --",0


=== hf-internal-testing/tiny-random-distilbert ===

architecture:	distilbert
tokenizer:	DistilBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,some have praised _ atlantis : _ the _ lost _ empire _ as a disney adventure for adults. i don't think so - - at least not for thinking adults. < br / > < br / > thi,0
1,to be a buster keaton fan is to have your heart broken on a regular basis. most of us first encounter keaton in one of the brilliant feature films from his g,0


=== hf-internal-testing/tiny-electra ===

architecture:	electra
tokenizer:	ElectraTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"some have praised _ atlantis : _ the _ lost _ empire _ as a disney adventure for adults. i don't think so - - at least not for thinking adults. < br / > < br / > this script suggests a beginning as a live - action movie, that struck someone as the type of crap you cannot sell to adults anymore. the "" crack staff "" of many older adventure movies has been done well before, ( think _",0
1,"okay. who was it? who gave revolver 10 out of 10? are you tripping of your head on ecstasy pipes? there were so many of you. did you do it for a dare? is this some kind of cult? or did guy richie himself sign up 788 times under different names? < br / > < br / > before i say anything else, i'll say this. just because you don't understand a film doesn't mean that it's not great. maybe you've had a",0


=== google/fnet-base ===

architecture:	fnet
tokenizer:	FNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _The Dirty Dozen_) but _Atlantis_ represents one of the worse films in that motif.",0
1,WAQT is a perfect example of a chicken soup not exactly for your soul. The broth unfortunately has lost its actual taste thanks to all the excess dilution and garnishing that went into its making.<br /><br />What's surprising and disappointing about WAQT is that it comes from a director who stayed away from the usual clichés of Hindi cinema in his first venture but who in his second outing gives in for all the stereotype film formulas. While Vipul Shah had the conviction to show something as implausible as blind men,1


=== hf-internal-testing/tiny-random-flaubert ===

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _ Atlantis : _ The _ Lost _ Empire _ as a Disney adventure for adults. I don' t think so--at least not for thinking adults. < br / > < br / > This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The "" crack staff "" of many older adventure movies has been done well before, ( think _ The Dirty Dozen _ ) but _ Atlantis _",0
1,""" Hollywood Hotel "" has relationships to many films like "" Ella Cinders "" and "" Merton of the Movies "" about someone winning a contest including a contract to make films in Hollywood, only to find the road to stardom either paved with pitfalls or non-existent. In fact, as I was watching it tonight, on Turner Classic Movies, I was considering whether or not the authors of the later musical classic "" Singing In The Rain "" may have taken some of their ideas from "" Hollywood",1


=== hf-internal-testing/tiny-random-funnel ===

architecture:	funnel
tokenizer:	FunnelTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,some have praised _ atlantis : _ the _ lost _ empire _ as a disney adventure for adults. i don't think so - - at least not for thinking adults. < br / > < br / > thi,0
1,"in this excellent twentieth - century fox film - noir, the metropolis is a labyrinth of despair in which scavengers and predators survive by living off",1


=== hf-internal-testing/tiny-random-gpt2 ===



Using pad_token, but it is not set yet.


architecture:	gpt2
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults",0
1,"Don't you just hate it when you order steak but the restaurant gives you chicken?<br /><br />Such is how I felt watching this so-called ""Battlestar Galactica"". Arguments can be made over its quality but the fact remains, it's NOT what the fans ordered.<br /><br />Imagine if",0


=== anton-l/gpt-j-tiny-random ===



Using pad_token, but it is not set yet.


architecture:	gptj
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _The Dirty Dozen_) but _Atlantis_ represents one of the worse films in that motif. The characters are weak. Even the background that each member trots out seems stock and awkward at best",0
1,"When a movie of a book seems pointless and incomprehensible, the cause can invariably be found in the book: either it was pointless to start with, or the point is one not easily conveyed to film, or the movie missed the point, which is the most frequent of these results, and the easiest to happen, especially when the point is one not easily defined. The book ""Morvern Callar"" has a point; every reader of the book must have felt this, and felt as if he had gotten it; but I suspect most of them could not state it in words. I'm not sure I can, myself, but perhaps it",0


=== hf-internal-testing/tiny-random-gpt_neo ===



Using pad_token, but it is not set yet.


architecture:	gpt_neo
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults",0
1,"More wide-eyed, hysterical 50s hyper-cheerfulness that gives new meaning to anti-social, pathological behaviour. Danza and Grayson will leave you begging for mercy.<br /><br />It's a shame that all the people involved in the making of this movie are now dead (or in nursing homes",0


=== kssteven/ibert-roberta-base ===

architecture:	ibert
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _The Dirty Dozen_) but _Atlantis_ represents one of the worse films in that motif. The characters are weak. Even the background that each member trots out seems stock and awkward",0
1,"Hollywood now has officially gone too far and I really hope that this travesty of a motion picture creates a genuine backlash against their crap machines, in spite of the good box office returns. If you are an industry person reading our comments looking for hints on what to do next, STOP. Stop making our TV shows into these repellent, stupid, money grubbing waste of time movies that suck. By doing so you are proving one thing: Hollywood is out of ideas, and going to see the movies they churn out only perpetuates the cycle of disgust. What's next -- You guys gonna go & ruin The B",0


=== hf-internal-testing/tiny-random-led ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to",0
1,WAQT is a perfect example of a chicken soup not exactly for your soul. The broth unfortunately has lost its actual taste thanks to all the excess dilution and garnishing that went into its making.<br /><br />What's surprising and disappointing about WAQT is that it comes from a director who stayed away from the,1


=== hf-internal-testing/tiny-random-longformer ===

architecture:	longformer
tokenizer:	LongformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to",0
1,"To be a Buster Keaton fan is to have your heart broken on a regular basis. Most of us first encounter Keaton in one of the brilliant feature films from his great period of independent production: 'The General', 'The Navigator', 'Sherlock Jnr'. We recognise him as the greatest figure in the entire history of film com",0


=== hf-internal-testing/tiny-random-mbart ===

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised Atlantis:TheLostEmpire as a Disney adventure for adults. I don't think so--at least not for thinking adults.br br This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymor",0
1,"Watching Cliffhanger makes me nostalgic for the early '0s, a time when virtually every new action movie could be described as ""Die Hard in a on a."" Cliffhanger is ""Die Hard on a mountain,"" and pretty good, for what it is.br br But unlike Passenger 5 and U",1


=== hf-internal-testing/tiny-random-mpnet ===

architecture:	mpnet
tokenizer:	MPNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,some have praised _ atlantis : _ the _ lost _ empire _ as a disney adventure for adults. i don't think so - - at least not for thinking adults. < br / > < br / > thi,0
1,"this one is considered a key pre - code film from the director who later made the musical biopic the jolson story ( 1946 ), but also the paranoid sci - fi inv",1


=== hf-internal-testing/tiny-random-mobilebert ===

architecture:	mobilebert
tokenizer:	MobileBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,some have praised _ atlantis : _ the _ lost _ empire _ as a disney adventure for adults. i don't think so - - at least not for thinking adults. < br / > < br / > thi,0
1,"when a movie of a book seems pointless and incomprehensible, the cause can invariably be found in the book : either it was pointless to start with, or the",0


=== openai-gpt ===



Using pad_token, but it is not set yet.


architecture:	openai
tokenizer:	OpenAIGPTTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"some have praised _ atlantis : _ the _ lost _ empire _ as a disney adventure for adults. i don't think so - - at least not for thinking adults. < br / > < br / > this script suggests a beginning as a live - action movie, that struck someone as the type of crap you cannot sell to adults anymore. the "" crack staff "" of many older adventure movies has been done well before, ( think _ the dirty dozen _ ) but _ atlantis _ represents one of the worse films in that motif. the characters are weak. even the background that each member trots out seems stock and awkward",0
1,"hollywood now has officially gone too far and i really hope that this travesty of a motion picture creates a genuine backlash against their crap machines, in spite of the good box office returns. if you are an industry person reading our comments looking for hints on what to do next, stop. stop making our tv shows into these repellent, stupid, money grubbing waste of time movies that suck. by doing so you are proving one thing : hollywood is out of ideas, and going to see the movies they churn out only perpetuates the cycle of disgust. what's next - - you guys gonna go & ruin the bionic man",0


=== google/reformer-crime-and-punishment ===



Using pad_token, but it is not set yet.


architecture:	reformer
tokenizer:	ReformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I dont think so--at least not for thinking adults.br br This script suggests a beginning as a live-action movie, that struck someone as the type",0
1,"Watching Cliffhanger makes me nostalgic for the early s, a time when virtually every new action movie could be described as Die Hard in a on a. Cliffhanger is Die Hard on a mountain, and pretty good, for what it is.br br",1


=== google/rembert ===

architecture:	rembert
tokenizer:	RemBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _The Dirty Dozen_) but _Atlantis_ represents one of the worse films in that motif. The characters are weak. Even the background that each member trots out seems",0
1,"The Lady From Shanghai is weird even by the standards of its eminent director, Orson Welles, whose last Hollywood film this was for many a moon. It's a kind of post-modern film noir made during the period when more conventional films of this type were quite popular, and it concerns a happy go lucky Irish sailor (played by Welles) who falls in with a mysterious lady (Rita Hayworth, who was married to Welles at the time), and her crippled, and probably impotent husband, played with a brainy, malevolent gusto by Everett Sloan. A long sea voyage follows",1


=== junnyu/roformer_chinese_sim_char_ft_small ===

architecture:	roformer
tokenizer:	RoFormerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"some have praised atlantis : the lost empire as a disney adventure for adults. i don't think so - - at least not for thinking adults. < br / > < br / > this script suggests a beginning as a live - action movie, that struck someone as the type of crap you cannot sell to adults anymore. the "" crack staff "" of many older adventure movies has been done well before",0
1,"neatly skipping over everything from the coup in cuba to his undercover entry into bolivia, part two of soderbergh's portrayal of che guevara is that of the tragic hero. as with che part one, this rather rambling guerrilla warfare escapade through the colourful mountains of bolivia is probably destined to disappoint more people than it will satisfy, so why was the film (",1


=== roberta-base ===

architecture:	roberta
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _The Dirty Dozen_) but _Atlantis_ represents one of the worse films in that motif. The characters are weak. Even the background that each member trots out seems stock and awkward",0
1,"WAQT is a perfect example of a chicken soup not exactly for your soul. The broth unfortunately has lost its actual taste thanks to all the excess dilution and garnishing that went into its making.<br /><br />What's surprising and disappointing about WAQT is that it comes from a director who stayed away from the usual clichés of Hindi cinema in his first venture but who in his second outing gives in for all the stereotype film formulas. While Vipul Shah had the conviction to show something as implausible as blind men robbing a bank in AANKHEN, he just fails to induce life in the",1


=== squeezebert/squeezebert-uncased ===

architecture:	squeezebert
tokenizer:	SqueezeBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"some have praised _ atlantis : _ the _ lost _ empire _ as a disney adventure for adults. i don't think so - - at least not for thinking adults. < br / > < br / > this script suggests a beginning as a live - action movie, that struck someone as the type of crap you cannot sell to adults anymore. the "" crack staff "" of many older adventure movies has been done well before, ( think _ the dirty dozen _ ) but _ atlantis _ represents one of the worse films in that motif. the characters are weak. even the background that each member trots out seems stock and awkward",0
1,""" hollywood hotel "" has relationships to many films like "" ella cinders "" and "" merton of the movies "" about someone winning a contest including a contract to make films in hollywood, only to find the road to stardom either paved with pitfalls or non - existent. in fact, as i was watching it tonight, on turner classic movies, i was considering whether or not the authors of the later musical classic "" singing in the rain "" may have taken some of their ideas from "" hollywood hotel "", most notably a temperamental leading lady star in a movie studio and a conclusion concerning one person singing a film score while another person",1


=== hf-internal-testing/tiny-random-transfo-xl ===



Using pad_token, but it is not set yet.


architecture:	transfo_xl
tokenizer:	TransfoXLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _ Atlantis: _ The _ Lost _ Empire _ as a Disney adventure for adults. I don't think least not for thinking adults. < br / > < br / > This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _ The Dirty Dozen _) but _ Atlantis _ represents one of the worse films in that motif. The characters are weak. Even the background that each member trots out seems stock and awkward at best. An MD / Medicine",0
1,"Don't you just hate it when you order steak but the restaurant gives you chicken? < br / > < br / > Such is how I felt watching this so-called ""Battlestar Galactica."" Arguments can be made over its quality but the fact remains, it's NOT what the fans ordered. < br / > < br / > Imagine if you were sitting down at that proverbial restaurant I mentioned. You have waited years for them to bring back their famous New York Strip steak which you loved. When your meal arrives, you find they've applied the name ""New York Strip"" to a chicken dish. You complain but the waiter merely",0


=== xlm-mlm-en-2048 ===

architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"some have praised _ atlantis : _ the _ lost _ empire _ as a disney adventure for adults. i don 't think so--at least not for thinking adults. < br / > < br / > this script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. the "" crack staff "" of many older adventure movies has been done well before, ( think _ the dirty dozen _ ) but _ atlantis _ represents one of the worse films in that motif. the characters are weak. even the background that each member trots out seems stock and awkward at be",0
1,"an independent feature can now be seen as both a work of film art and a video resume. enter broken, and aggressively promoted, twenty minute short with style and enthusiams to spare. but is it any good as a film, or does it only work as a demo piece? ah, there in lies the rub. < br / > < br / > broken is the story of bonnie clayton who is abducted after awakening from a reoccurring nightmare one night by "" a sadistic stranger and his colorful entourage "" ( quote from the video box ). as she's held captive, it becomes obvious that her",1


=== xlm-roberta-base ===

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _The Dirty Dozen_) but _Atlantis_ represents one of the worse films in that motif. The characters are weak. Even the background that",0
1,"In this excellent Twentieth-Century Fox film-noir, the metropolis is a labyrinth of despair in which scavengers and predators survive by living off one another. Brooding cityscapes lower over puny humanity in bleak expressionist symbolism.<br /><br />A prostitute has her purse snatched on the subway. It contains a microfilm, and a communist spy ring will go to any lengths to recover it. Two parallel investigations unfold as both spies and cops hunt down the",1


=== xlnet-base-cased ===

architecture:	xlnet
tokenizer:	XLNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Some have praised _Atlantis:_The_Lost_Empire_ as a Disney adventure for adults. I don't think so--at least not for thinking adults.<br /><br />This script suggests a beginning as a live-action movie, that struck someone as the type of crap you cannot sell to adults anymore. The ""crack staff"" of many older adventure movies has been done well before, (think _The Dirty Dozen_) but _Atlantis_ represents one of the worse films in that motif. The characters are weak.",0
1,"... And that's a bad thing, because at least if this had been a Troma film, it would have had wanton violence and a greater sense of anarchic abandon that might have brought my rating up a bit.<br /><br />So what we have instead is a very tame (rated PG), barely lukewarm, low budget (Roger Corman produced it with an unknown director who has subsequently remained unknown) Gremlins (1984)/Critters (1986)-wannabe with almost exclusively flat humor",0


In [None]:
# hide_input
# Tabulate the per-architecture outcomes gathered in `test_results` and render them.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=[
        "arch",
        "tokenizer",
        "model_name",
        "result",
        "error",
    ],
)
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizerFast,hf-internal-testing/tiny-albert,PASSED,
1,bart,BartTokenizerFast,hf-internal-testing/tiny-random-bart,PASSED,
2,bert,BertTokenizerFast,hf-internal-testing/tiny-bert,PASSED,
3,big_bird,BigBirdTokenizerFast,google/bigbird-roberta-base,PASSED,
4,bigbird_pegasus,PegasusTokenizerFast,google/bigbird-pegasus-large-arxiv,PASSED,
5,ctrl,CTRLTokenizer,hf-internal-testing/tiny-random-ctrl,PASSED,
6,camembert,CamembertTokenizerFast,camembert-base,PASSED,
7,canine,CanineTokenizer,hf-internal-testing/tiny-random-canine,PASSED,
8,convbert,ConvBertTokenizerFast,YituTech/conv-bert-base,PASSED,
9,deberta,DebertaTokenizerFast,hf-internal-testing/tiny-deberta,PASSED,


## Summary

The `blurr.data.core` module contains the fundamental building blocks for all data preprocessing tasks.

In [None]:
# hide
# Run nbdev's notebook export step. As the cell output shows, the call reports a
# "Converted <notebook>.ipynb." line for every notebook it processes.
from nbdev.export import notebook2script

notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
