In [None]:
# default_exp data.core


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by Hugging Face transformer implementations.

In [None]:
# export
import os, inspect
from dataclasses import dataclass
from functools import reduce, partial
from typing import Any, Callable, List, Optional, Union, Type

from fastcore.all import *
from fastai.data.block import TransformBlock
from fastai.data.core import Datasets, DataLoader, DataLoaders, TfmdDL
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import DataCollatorWithPadding, logging, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel

from blurr.utils import BLURR

logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from datasets import load_dataset, concatenate_datasets
from fastai.data.block import CategoryBlock, ColReader, ColSplitter, DataBlock, ItemGetter, RandomSplitter
from fastcore.test import *
from nbverbose.showdoc import show_doc

from blurr.utils import print_versions

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.3
transformers: 4.15.0


In [None]:
# hide
# cuda
# NOTE(review): the CUDA device index is hard-coded to 1 (see the "Using GPU #1" output below);
# presumably this needs adjusting on machines that expose a different number of GPUs — confirm before re-running.
torch.cuda.set_device(1)
# Report which device is now current so the generated docs record the hardware used
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: GeForce GTX 1080 Ti


## Mid-level API

Base tokenization, batch transform, and DataBlock methods

### `HF_BaseInput`

A `HF_BaseInput` object is returned from the decodes method of `HF_AfterBatchTransform` as a means to customize @typedispatched functions like `DataLoaders.show_batch` and `Learner.show_results`. It uses the "input_ids" of a Hugging Face object as the representative tensor for `show` methods

In [None]:
# export
class HF_BaseInput(TensorBase):
    """The base representation of your inputs; used by the various fastai `show` methods"""

    def show(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The "context" associated to the current `show_batch/results` call
        ctx=None,
        # Any truncation you want to apply to the decoded tokenized inputs
        trunc_at: int = None,
        # The value returned by `show_title` for the decoded (and optionally truncated) inputs
    ) -> str:
        """Decode this tensor's token ids with `hf_tokenizer` and pass the resulting text to `show_title`"""
        # the ids are handed to the tokenizer as a NumPy array taken from a CPU copy of this tensor
        token_ids = self.cpu().numpy()
        full_text = str(hf_tokenizer.decode(token_ids, skip_special_tokens=True))

        # slicing with `trunc_at=None` keeps the whole string
        shown_text = full_text[:trunc_at]
        return show_title(shown_text, ctx=ctx, label="text")


### `HF_BeforeBatchTransform` 

Inspired by this [article](https://docs.fast.ai/tutorial.transformers.html), inputs can come in as raw text, a list of words (e.g., tasks like Named Entity Recognition (NER), where you want to predict the label of each token), or pre-processed "input_ids"

**On-the-fly Batch-Time Tokenization**: 

The previous version of the library performed the tokenization/numericalization as a type transform when the raw data was read, and included a couple batch transforms to prepare the data for collation (e.g., to be made into a mini-batch). With this update, everything is done in a single batch transform.  

Why?  Part of the inspiration had to do with the mechanics of the Hugging Face tokenizer, in particular how by default it returns a collated mini-batch of data given a list of sequences. And where do we get a list of examples with fastai? In the batch transforms!  So I thought, hey, why not do everything dynamically at batch time?  And with a bit of tweaking, I got everything to work pretty well.  The result is *less code*, *faster mini-batch creation*, *less RAM utilization* and time spent tokenizing (really helps with very large datasets), and *more flexibility*.

In [None]:
# export
class HF_BeforeBatchTransform(Transform):
    """
    Tokenizes the inputs of a list of samples at batch time. The first element of every sample is replaced
    with a dictionary built from the tokenizer's output (one entry per key it returns); any remaining
    elements of the sample (e.g., targets) are passed through untouched.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # If you are passing in the "input_ids" as your inputs, set `is_numericalized` = True
        is_numericalized: bool = False,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Any other keyword arguments; they are stored as-is on the transform as `self.kwargs`
        **kwargs
    ):
        store_attr(self=self, names="hf_arch, hf_config, hf_tokenizer, hf_model, is_numericalized")
        store_attr(self=self, names="max_length, padding, truncation, is_split_into_words, ignore_token_id, tok_kwargs")
        store_attr(self=self, names="kwargs")

        # Keep a private copy of the tokenizer kwargs: the `{}` default is a single object shared by every
        # call that relies on it, so storing it by reference would let one instance's changes leak into
        # all the others. An explicit `None` is treated as "no extra kwargs".
        self.tok_kwargs = dict(tok_kwargs) if tok_kwargs else {}

    def encodes(self, samples, return_batch_encoding=False):
        """
        This method performs on-the-fly, batch-time tokenization of your data. In other words, your raw inputs
        are tokenized as needed for each mini-batch of data rather than requiring pre-tokenization of your full
        dataset ahead of time.

        Returns the updated list of samples, or a `(updated_samples, batch_encoding)` tuple when
        `return_batch_encoding` is True.
        """
        samples = L(samples)

        # grab inputs: when the first sample's input is itself list-like and the inputs are neither pre-split
        # into words nor numericalized, the first two elements of every input are paired up
        if is_listy(samples[0][0]) and not self.is_split_into_words and not self.is_numericalized:
            inps = list(zip(samples.itemgot(0, 0), samples.itemgot(0, 1)))
        else:
            inps = samples.itemgot(0).items

        # if passing "input_ids" as your inputs, build the other sequence attributes using `prepare_for_model` since
        # the inputs have already been tokenized/numericalized ... else we tokenize the raw text using `__call__`
        batch_encoding_func = self.hf_tokenizer.prepare_for_model if self.is_numericalized else self.hf_tokenizer

        batch_encoding = batch_encoding_func(
            inps,
            max_length=self.max_length,
            padding=self.padding,
            truncation=self.truncation,
            is_split_into_words=self.is_split_into_words,
            return_tensors="pt",
            **self.tok_kwargs
        )

        # update the samples with tokenized inputs (e.g. input_ids, attention_mask, etc...): row `idx` of every
        # entry in the batch encoding becomes the new first element of sample `idx`
        d_keys = batch_encoding.keys()
        updated_samples = [({k: batch_encoding[k][idx] for k in d_keys}, *sample[1:]) for idx, sample in enumerate(samples)]

        if return_batch_encoding:
            return updated_samples, batch_encoding

        return updated_samples


### `HF_AfterBatchTransform`

With fastai 2.1.5, before batch transforms no longer have a `decodes` method ... and so, I've introduced a standard batch transform here (one that occurs "after" the batch has been created) that will do the decoding for us.

In [None]:
# export
class HF_AfterBatchTransform(Transform):
    """Casts the Hugging Face inputs of a batch to a type that fastai `show` methods can work with"""

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
    ):
        store_attr(names="hf_tokenizer, input_return_type", self=self)

    def decodes(
        self,
        # The encoded samples for your batch. `input_ids` will be pulled out of your dictionary of Hugging Face
        # inputs, cast to `self.input_return_type` and returned for methods such as `show_batch`
        encoded_samples: Type,
    ):
        """Wrap the `input_ids` of a Hugging Face inputs dictionary in `self.input_return_type`"""
        # anything that is not a dictionary is handed back unchanged
        if not isinstance(encoded_samples, dict):
            return encoded_samples

        input_ids = encoded_samples["input_ids"]
        return self.input_return_type(input_ids, hf_tokenizer=self.hf_tokenizer)


### `HF_TextBlock`

A basic wrapper that links default transforms for the Data Block API, `HF_TextBlock` is designed with sensible defaults to minimize user effort in defining their transforms pipeline. It handles setting up your `HF_BeforeBatchTransform` and `HF_AfterBatchTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever). 

You must either pass in your own instance of a `HF_BeforeBatchTransform` class or the Hugging Face objects returned from `BLURR.get_hf_objects` (e.g., architecture, config, tokenizer, and model). The other args are optional.

We also include a `blurr_sort_func` that works with `SortedDL` to sort based on the number of tokens in each example.

In [None]:
# export
def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # If you are passing in the "input_ids" as your inputs, set `is_numericalized` = True
    is_numericalized: bool = False,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """Sort key used by `SortedDL`: the length of the first element of `example` *after* tokenization"""
    inp = example[0]
    needs_tokenizing = not (is_split_into_words or is_numericalized)

    # pre-split or numericalized inputs are measured as they are; everything else is tokenized first
    n_tokens = len(hf_tokenizer.tokenize(inp, **tok_kwargs)) if needs_tokenizing else len(inp)
    return n_tokens


In [None]:
# export
class HF_TextBlock(TransformBlock):
    """The core `TransformBlock` to prepare your data for training in Blurr with fastai's `DataBlock` API"""

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_arch: str = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_model: PreTrainedModel = None,
        # If you are passing in the "input_ids" as your inputs, set `is_numericalized` = True
        is_numericalized: bool = False,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # The before batch transform you want to use to tokenize your raw data on the fly
        # (defaults to an instance of `HF_BeforeBatchTransform` created using the Hugging Face objects defined above)
        before_batch_tfm: HF_BeforeBatchTransform = None,
        # The batch_tfms to apply to the creation of your DataLoaders,
        # (defaults to HF_AfterBatchTransform created using the Hugging Face objects defined above)
        after_batch_tfm: HF_AfterBatchTransform = None,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type: DataLoader = None,
        # Any keyword arguments you want applied to your before batch tfm
        before_batch_kwargs: dict = {},
        # Any keyword arguments you want applied to your after batch tfm (or referred to in fastai as `batch_tfms`)
        after_batch_kwargs: dict = {},
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs: dict = {},
        # Any keyword arguments you want to have applied when generating text (not used by this `__init__`)
        text_gen_kwargs: dict = {},
        # Any keyword arguments you want applied to `HF_TextBlock` (not used by this `__init__`)
        **kwargs
    ):
        # Raises a `ValueError` unless the four Hugging Face objects or a ready-made before batch tfm are given
        if (not all([hf_arch, hf_config, hf_tokenizer, hf_model])) and before_batch_tfm is None:
            raise ValueError("You must supply an hf_arch, hf_config, hf_tokenizer, hf_model -or- a HF_BeforeBatchTransform")

        if before_batch_tfm is None:
            # dictionaries are copied so the transform never shares state with the (mutable) defaults above
            before_batch_tfm = HF_BeforeBatchTransform(
                hf_arch,
                hf_config,
                hf_tokenizer,
                hf_model,
                is_numericalized=is_numericalized,
                ignore_token_id=ignore_token_id,
                max_length=max_length,
                padding=padding,
                truncation=truncation,
                is_split_into_words=is_split_into_words,
                tok_kwargs=tok_kwargs.copy(),
                **before_batch_kwargs.copy()
            )

        if after_batch_tfm is None:
            after_batch_tfm = HF_AfterBatchTransform(
                hf_tokenizer=before_batch_tfm.hf_tokenizer, input_return_type=input_return_type, **after_batch_kwargs.copy()
            )

        if dl_type is None:
            # Every setting of the sort function is taken from the before batch tfm so that sorting measures
            # the inputs the same way the transform will tokenize them. This matters when a caller supplies
            # their own `before_batch_tfm`: its `is_numericalized` is what applies, not this method's default
            # (the argument is only a fallback for transforms that do not expose the attribute).
            dl_sort_func = partial(
                blurr_sort_func,
                hf_tokenizer=before_batch_tfm.hf_tokenizer,
                is_numericalized=getattr(before_batch_tfm, "is_numericalized", is_numericalized),
                is_split_into_words=before_batch_tfm.is_split_into_words,
                tok_kwargs=before_batch_tfm.tok_kwargs.copy(),
            )

            dl_type = partial(SortedDL, sort_func=dl_sort_func)

        # set the TransformBlock's Hugging Face objects
        self.hf_arch = before_batch_tfm.hf_arch
        self.hf_config = before_batch_tfm.hf_config
        self.hf_tokenizer = before_batch_tfm.hf_tokenizer
        self.hf_model = before_batch_tfm.hf_model

        super().__init__(dl_type=dl_type, dls_kwargs={"before_batch": before_batch_tfm}, batch_tfms=after_batch_tfm)


## Low-level API

For working with PyTorch and/or fast.ai Datasets & DataLoaders, the low-level API allows you to get back fast.ai specific features such as `show_batch`, `show_results`, etc... when using plain ol' PyTorch Datasets, Hugging Face Datasets, etc...

In [None]:
# export
@dataclass
class BlurrBatchCreator:
    """
    A callable that can be assigned to a `TfmdDL.create_batch` method; used in Blurr's low-level API
    to create batches that can be used in the Blurr library
    """

    def __init__(
        self,
        # Your Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # Defaults to use Hugging Face's DataCollatorWithPadding(tokenizer=hf_tokenizer)
        data_collator: Type = None,
    ):
        self.hf_tokenizer = hf_tokenizer

        # any falsy `data_collator` (not only `None`) is replaced by the padding collator
        if not data_collator:
            data_collator = DataCollatorWithPadding(tokenizer=hf_tokenizer)
        self.data_collator = data_collator

    def __call__(self, features):  # A mini-batch (list of examples to run through your model)
        """Collate `features` with `self.data_collator`. When the examples are dictionaries a 2-tuple is
        returned whose first element is the collated batch as a plain `dict`; otherwise the collated
        batch is returned as-is.
        """
        batch = self.data_collator(features)

        first_example = features[0]
        if not isinstance(first_example, dict):
            return batch

        inputs = dict(batch)
        # NOTE(review): the second element is `batch["labels"]` when the examples carry a "labels" key and a
        # second `dict(batch)` otherwise (this is how the original one-line conditional is parsed);
        # confirm that a `(dict, dict)` result is really what is wanted for unlabeled data.
        targets = batch["labels"] if ("labels" in first_example) else dict(batch)
        return inputs, targets


In [None]:
# export
class BlurrBatchTransform(HF_AfterBatchTransform):
    """
    A `HF_AfterBatchTransform` that additionally carries the Hugging Face objects and tokenization settings
    on the instance, so code that only has access to this transform can still get at them
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch: str = None,
        # A Hugging Face configuration object
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model
        hf_model: PreTrainedModel = None,
        # If you are passing in the "input_ids" as your inputs, set `is_numericalized` = True
        is_numericalized: bool = False,
        # The token ID to ignore when calculating loss/metrics
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Any text generation keyword arguments
        text_gen_kwargs: dict = {},
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # Any other keyword arguments; they are stored as-is on the transform as `self.kwargs`
        **kwargs
    ):
        super().__init__(hf_tokenizer=hf_tokenizer, input_return_type=input_return_type)

        store_attr(self=self, names="hf_arch, hf_config, hf_model, hf_tokenizer, is_numericalized, ignore_token_id")
        store_attr(self=self, names="is_split_into_words, tok_kwargs, text_gen_kwargs, kwargs")

        # Keep private copies of the dictionary arguments: their `{}` defaults are single objects shared by
        # every call that relies on them, so storing them by reference would let a change made through one
        # instance show up in all the others. An explicit `None` is treated as "no extra kwargs".
        self.tok_kwargs = dict(tok_kwargs) if tok_kwargs else {}
        self.text_gen_kwargs = dict(text_gen_kwargs) if text_gen_kwargs else {}


In [None]:
# export
@delegates()
class BlurrDataLoader(TfmdDL):
    """A fast.ai `TfmdDL` wired up with Blurr's batch creator and batch transform"""

    def __init__(
        self,
        # A standard PyTorch Dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch: str,
        # A Hugging Face configuration object
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # An instance of `BlurrBatchCreator` or equivalent
        batch_creator: BlurrBatchCreator = None,
        # The batch_tfm used to decode Blurr batches (default: a `BlurrBatchTransform`)
        batch_tfm: BlurrBatchTransform = None,
        # (optional) A preprocessing function that will be applied to your dataset
        preproccesing_func: Callable[
            [Union[torch.utils.data.dataset.Dataset, Datasets], PreTrainedTokenizerBase, PreTrainedModel],
            Union[torch.utils.data.dataset.Dataset, Datasets],
        ] = None,
        # Keyword arguments to be applied to your `batch_tfm`
        batch_tfm_kwargs: dict = {},
        # Keyword arguments to be applied to `BlurrDataLoader`
        **kwargs,
    ):
        # the (optional) preprocessing function receives the dataset, the tokenizer and the model
        if preproccesing_func:
            dataset = preproccesing_func(dataset, hf_tokenizer, hf_model)

        # `create_batch` and `after_batch` are always supplied by this class, so any values that arrive
        # through `kwargs` are dropped.
        # NOTE(review): `new` forwards the current loader's attributes as keyword arguments; if those include
        # `create_batch`/`after_batch`, a customized batch creator/transform is not carried over — confirm.
        for supplied_here in ("create_batch", "after_batch"):
            kwargs.pop(supplied_here, None)

        batch_creator = batch_creator or BlurrBatchCreator(hf_tokenizer=hf_tokenizer)
        batch_tfm = batch_tfm or BlurrBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model, **batch_tfm_kwargs.copy())

        super().__init__(dataset=dataset, create_batch=batch_creator, after_batch=batch_tfm, **kwargs)
        store_attr(self=self, names="hf_arch, hf_config, hf_tokenizer, hf_model")

    def new(
        self,
        # A standard PyTorch and fastai dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets] = None,
        # The class you want to create an instance of (will be "self" if None)
        cls: Type = None,
        #  Any additional keyword arguments you want to pass to the __init__ method of `cls`
        **kwargs,
    ):
        """Create a new loader of type `cls` configured like this one. `new` is overridden so that the
        Hugging Face objects are added back to the keyword arguments (it is called, for example, in places
        like `show_results`); apart from those additions the logic follows the `new` method it overrides.
        """
        dataset = self.dataset if dataset is None else dataset
        cls = type(self) if cls is None else cls

        cur_kwargs = dict(
            dataset=dataset,
            num_workers=self.fake_l.num_workers,
            pin_memory=self.pin_memory,
            timeout=self.timeout,
            bs=self.bs,
            shuffle=self.shuffle,
            drop_last=self.drop_last,
            indexed=self.indexed,
            device=self.device,
        )

        # carry over every entry of `self._methods` that is not a bound method of this instance
        named_attrs = ((name, getattr(self, name)) for name in self._methods)
        cur_kwargs.update({name: attr for name, attr in named_attrs if not isinstance(attr, MethodType)})

        # we need to add these arguments back in (these, after_batch, and create_batch will go in as kwargs)
        kwargs.update(
            hf_arch=self.hf_arch,
            hf_config=self.hf_config,
            hf_tokenizer=self.hf_tokenizer,
            hf_model=self.hf_model,
        )

        return cls(**merge(cur_kwargs, kwargs))


## Utility methods 

These methods are used internally for getting the Blurr transforms associated with your `DataLoaders`

In [None]:
# export
def get_blurr_tfm(
    # A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)
    tfms_list: Pipeline,
    # The transform to find
    tfm_class: Transform = HF_BeforeBatchTransform,
):
    """
    Returns the first transform in `tfms_list` whose type is `tfm_class` (or a subclass of it), or `None`
    when there is no such transform. Use it to get at a transform instance used in your Blurr DataBlock
    """
    for candidate in tfms_list:
        if issubclass(type(candidate), tfm_class):
            return candidate

    return None


In [None]:
show_doc(get_blurr_tfm)


<h4 id="get_blurr_tfm" class="doc_header"><code>get_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_blurr_tfm</code>(**`tfms_list`**:`Pipeline`, **`tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*)

Given a fastai DataLoaders batch transforms, this method can be used to get at a transform
instance used in your Blurr DataBlock

**Parameters:**


 - **`tfms_list`** : *`<class 'fastcore.transform.Pipeline'>`*	<p>A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)</p>


 - **`tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The transform to find</p>



In [None]:
# export
def first_blurr_tfm(
    dls: DataLoaders,  # Your fast.ai `DataLoaders`
    before_batch_tfm_class: Transform = HF_BeforeBatchTransform,  # The before_batch transform to look for
    blurr_batch_tfm_class: Transform = BlurrBatchTransform,  # The after_batch (or batch_tfm) to look for
):
    """
    Finds the Blurr transform needed by methods such as `show_batch` and `show_results`: the
    `before_batch` transforms are searched first and the `after_batch` transforms are only consulted
    when that search comes back empty. The returned transform should have everything you need to
    properly decode and 'show' your Hugging Face inputs/targets
    """
    # a match here means the mid-level DataBlock API is in use
    before_batch_match = get_blurr_tfm(dls.before_batch, tfm_class=before_batch_tfm_class)

    # otherwise fall back to the after_batch tfms (used by the low-level Blurr data API)
    return before_batch_match or get_blurr_tfm(dls.after_batch, tfm_class=blurr_batch_tfm_class)


In [None]:
show_doc(first_blurr_tfm)


<h4 id="first_blurr_tfm" class="doc_header"><code>first_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>first_blurr_tfm</code>(**`dls`**:`DataLoaders`, **`before_batch_tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*, **`blurr_batch_tfm_class`**:`Transform`=*`BlurrBatchTransform`*)

This convenience method will find the first Blurr transform required for methods such as
`show_batch` and `show_results`. The returned transform should have everything you need to properly
decode and 'show' your Hugging Face inputs/targets

**Parameters:**


 - **`dls`** : *`<class 'fastai.data.core.DataLoaders'>`*	<p>Your fast.ai `DataLoaders</p>


 - **`before_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The before_batch transform to look for</p>


 - **`blurr_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The after_batch (or batch_tfm) to look for</p>



## `show_batch`

In [None]:
# export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `HF_BaseInput` typed inputs
    x: HF_BaseInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation you want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    """Displays up to `max_n` decoded inputs next to their target(s) as a DataFrame and returns `ctxs`"""
    # grab our tokenizer
    tfm = first_blurr_tfm(dataloaders)
    hf_tokenizer = tfm.hf_tokenizer

    # if we've included our labels list, we'll use it to look up the value of our target(s)
    trg_labels = tfm.kwargs["labels"] if ("labels" in tfm.kwargs) else None

    res = L()
    n_inp = dataloaders.n_inp

    for idx, (input_ids, label, sample) in enumerate(zip(x, y, samples)):
        if idx >= max_n:
            break

        rets = [hf_tokenizer.decode(input_ids, skip_special_tokens=True)[:trunc_at]]
        # NOTE(review): the tensor branches below read `label` (taken from `y`) rather than `item`;
        # presumably the two agree when there is a single target — confirm for multi-target setups.
        for item in sample[n_inp:]:
            if not torch.is_tensor(item):
                trg = trg_labels[int(item)] if trg_labels else item
            elif is_listy(item.tolist()):
                if trg_labels:
                    # one name per position that is switched on (== 1)
                    trg = [trg_labels[lbl_idx] for lbl_idx, val in enumerate(label.numpy().tolist()) if (val == 1)]
                else:
                    # `.item()` only works on one-element tensors and raises for anything larger, so
                    # targets with several elements are shown as a plain list of their values instead
                    trg = label.item() if label.numel() == 1 else label.tolist()
            else:
                trg = trg_labels[label.item()] if (trg_labels) else label.item()

            rets.append(trg)
        res.append(tuplify(rets))

    # one "text" column followed by one column per target
    cols = ["text"] + ["target" if (i == 0) else f"target_{i}" for i in range(len(res[0]) - n_inp)]
    display_df(pd.DataFrame(res, columns=cols)[:max_n])
    return ctxs


## Sequence classification

The following examples demonstrate several approaches to constructing your `DataBlock` for sequence classification tasks using the mid-level API, and also an example of how to accomplish the same using the low-level API and standard PyTorch/Hugging Face/fast.ai Datasets and DataLoaders.

### Using the mid-level API

In [None]:
# Load the IMDB train and test splits (returned as a two-element list in that order)
raw_datasets = load_dataset("imdb", split=["train", "test"])
# Tag every row with an `is_valid` flag: False for the train split, True for the test split
# (presumably this is the column `ColSplitter()` reads further down — confirm)
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

# Keep a small sample (1,000 train + 200 test rows) so the examples run quickly.
# NOTE(review): `shuffle()` is called without a seed, so the sampled rows (and the outputs shown
# below) presumably differ between runs — confirm whether a fixed seed is wanted.
final_ds = concatenate_datasets([raw_datasets[0].shuffle().select(range(1000)), raw_datasets[1].shuffle().select(range(200))])
imdb_df = pd.DataFrame(final_ds)
imdb_df.head()


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)


Unnamed: 0,label,text,is_valid
0,1,"While the original 1932 version, with Preston Foster, was good, there's no remake more worthy than this 1959 one, or more impossible to find anywhere, just as I strongly suspect Mickey Rooney to have had something to do with that. Never could a mere performance have ever been so masterfully brilliant, or a script more thought-provoking, as well as an improvement upon the original. Many years after the last of my several viewings of this film, in 1970, I read an article in which Mickey Rooney was recounting a visit he'd made to death row, and which had apparently very drastically eliminated...",False
1,1,"Ironically the most talked-about American film in the 2008 New York Film Festival is 98% in Spanish. The extra-long film's controversy began at the Cannes Festival. There were love-hate notices, and considerable doubts about commercial prospects. As consolation the star, Benicio Del Toro, got the Best Actor award there. I'm talking about Steven Soderbergh's 'Che,' of course. That's the name it's going by in this version, shown in New York as at Cannes in two 2-hour-plus segments without opening title or end credits. 'Che' is certainly appropriate since Ernesto ""Che"" Guevara is in almost ev...",False
2,0,"Bingo is the game, bullshit is the name. Rarely has the screen been smeared with such a blown-up hodgepodge of half-baked conspiracy theories, puritan prudery, and new-age gibberish. The bulk of the story is set at Viciente, a Cristian resort in the Peruvian jungle. Think Tolkien's Rivendell meets Star Trek's Planet Baku, inhabited by dimwitted followers of a not-so-mysterious, but surprisingly narrow-minded cult of love and peace. Thanks to gruesome acting and tacky production design (the rainbow-colored visualization of the mysterious all-healing ""energy"" is particularly hideous), ""The C...",False
3,1,"There are so many reasons as to why I rate the sopranos so highly, one of its biggest triumphs being the cast and character building. Each character unfolds more and more each series. Also each series has an array of different 'small time characters' as well as the main. A good example of a character (who was only in three episodes) who you can feel for is David the compulsive gambler played brilliantly by Robert Patrick. Every little detail builds the perfect TV series. The show revolves round mob boss Tony Soprano (James Gandolfini) who attempts to balance his life of crime with his role...",False
4,0,"I thought watching employment videos on corporate compliance was tedious. This movie went nowhere fast. What could have been a somewhat cheesy half hour twilight zone episode turned into a seemingly endless waste of film on people parking their cars, a picture of some dude's swimming pool (he really needs to answer his phone by the way) a dot matrix printer doing its job, and Heuy and Louey sitting in a yellow lighted control room repeating ""T minus 10 and counting"" as if something exciting is going to happen. It doesn't so don't get your hopes up. The best thing about this movie is to see...",False


In [None]:
labels = raw_datasets[0].features["label"].names
labels


['neg', 'pos']

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects.

There are a bunch of ways we can get at the four Hugging Face elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR`.

In [None]:
# hide_output
from transformers import AutoModelForSequenceClassification

model_cls = AutoModelForSequenceClassification

pretrained_model_name = "distilroberta-base"  # "distilbert-base-uncased" "bert-base-uncased"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


#####  Step 2: Create your `DataBlock`

In [None]:
blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, before_batch_kwargs={"labels": labels}), CategoryBlock)
dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)


In [None]:
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])


(2, 4, torch.Size([4, 512]), 4)

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)


{tuple: [dict, fastai.torch_core.TensorCategory]}

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"*!!- SPOILERS -!!*<br /><br />Before I begin this, let me say that I have had both the advantages of seeing this movie on the big screen and of having seen the ""Authorized Version"" of this movie, remade by Stephen King, himself, in 1997.<br /><br />Both advantages made me appreciate this version of ""The Shining,"" all the more.<br /><br />Also, let me say that I've read Mr. King's book, ""The Shining"" on many occasions over the years, and while I love the book and am a huge fan of his work, Stanl",pos
1,"*****WARNING, MAY CONTAIN SPOILERS WHICH WILL BE MORE ENTERTAINING THAN THIS TRIPE.**** <br /><br />Heres some good advise to anyone living in the U.K. Whenever Channel 5 has an old 80's comedy on late at night, read a book instead. I am currently in the process of recovering from a seizure, due to reading some of the comments on this film on here. I am actually shocked at the fact that someone actually said this film was realistic! All I can say is thank god the Cold War never escalated or els",neg


#### Pre-tokenized/numericalized

BLURR now also works with pre-processed datasets where your inputs are actually "input_ids".  Preprocessing your raw data is the more traditional approach to using Transformers, and is required when you are working with documents that may be longer than your model can handle.  In the latter case, in addition to task-specific preprocessing, you typically want to tell your tokenizer to create "chunks" of text from such documents by setting `"return_overflowing_tokens": True`.

Below is an example of how we can use pre-tokenized/numericalized inputs

In [None]:
# Load the IMDB train/test splits; every training row is flagged `is_valid=False`, every test row `is_valid=True`
raw_datasets = load_dataset("imdb", split=["train", "test"])
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

# Seed both shuffles so the same 1,000 training / 200 test reviews are sampled on every run
final_ds = concatenate_datasets(
    [
        raw_datasets[0].shuffle(seed=42).select(range(1000)),
        raw_datasets[1].shuffle(seed=42).select(range(200)),
    ]
)


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)


##### Step 1: Get your Hugging Face objects.

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


##### Step 1b. Preprocess dataset

In [None]:
def preprocess_data(examples, tokenizer):
    """Tokenize `examples["text"]` without truncation or padding and attach per-item character counts.

    The tokenizer's result is returned with an added `"n_chars"` entry: one value per entry of
    `"input_ids"`, each being the length of the text found at the same position.
    """
    texts = examples["text"]
    tokenized = tokenizer(texts, truncation=False, padding=False, return_length=True, verbose=False)

    # one character count per tokenized sequence, looked up by position
    tokenized["n_chars"] = [len(texts[idx]) for idx in range(len(tokenized["input_ids"]))]
    return tokenized


tokenized_ds = final_ds.map(partial(preprocess_data, tokenizer=hf_tokenizer), batched=True)
tokenized_ds


  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'is_valid', 'label', 'length', 'n_chars', 'text'],
    num_rows: 1200
})

##### Step 2: Create your `DataBlock`

In [None]:
blocks = (
    HF_TextBlock(
        hf_arch,
        hf_config,
        hf_tokenizer,
        hf_model,
        # the items handed to this block are already token ids (see `get_x` below), not raw text
        is_numericalized=True,
        before_batch_kwargs={"labels": labels},
        # NOTE(review): presumably disabled because `preprocess_data` already tokenized each text with the
        # tokenizer's default special-token handling — confirm
        tok_kwargs={"add_special_tokens": False},
    ),
    CategoryBlock,
)
# inputs are the pre-computed "input_ids"; the split is random (the dataset's "is_valid" column is not consulted)
dblock = DataBlock(blocks=blocks, get_x=ItemGetter("input_ids"), get_y=ItemGetter("label"), splitter=RandomSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(tokenized_ds, bs=4)


In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"Okay, so I'm not a big video game buff, but was the game House of the Dead really famous enough to make a movie from? Sure, they went as far as to actually put in quick video game clips throughout the movie, as though justifying any particular scene of violence, but there are dozens and dozens of games that look exactly the same, with the hand in the bottom on the screen, supposedly your own, holding whatever weapon and goo-ing all kinds of aliens or walking dead or snipers or whatever the case",neg
1,"Secret Sunshine (2007) is famous for its awards at the Festival de Cannes in 2007 and other film festivals. Jeon-Do Yeon, who played the newly widowed Shin-ae, won the best actress trophy at the 60th Cannes festival. Secret Sunshine was also a winner of best feature film and Jeon-Do Yeon received a best actress nod from Asia Pacific Screen Awards. In addition, this movie won the best film awards in virtually all Korean film festivals. Masterfully written and directed, and uniquely photographed,",pos


### Using the low-level API

#### Step 1: Build your datasets

In [None]:
raw_datasets = load_dataset("glue", "mrpc")


Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [None]:
raw_datasets["train"].features
raw_datasets["train"]


Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [None]:
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` pair(s) in `example` with the notebook's `hf_tokenizer` (`truncation=True`)."""
    first, second = example["sentence1"], example["sentence2"]
    return hf_tokenizer(first, second, truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

#### Step 2: Dataset pre-processing (optional)

In [None]:
# export
def preproc_hf_dataset(
    # A Hugging Face `Dataset` (must expose `column_names`, `rename_column`, `remove_columns` and `set_format`)
    dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A Hugging Face model
    hf_model: PreTrainedModel,
):
    """Prepare a Hugging Face dataset so that its columns can be passed directly to `hf_model.forward`.

    A "label" column, if present, is renamed to "labels"; every column whose name is not a parameter of
    `hf_model.forward` is removed; and the dataset's format is set to "torch". The processed dataset is
    returned. `hf_tokenizer` is not used by this implementation.
    """
    if "label" in dataset.column_names:
        dataset = dataset.rename_column("label", "labels")

    # keep only the columns that the model's `forward` signature can accept
    hf_model_fwd_args = set(inspect.signature(hf_model.forward).parameters)

    # `remove_columns` is documented to take a str or a list of str, so pass a list; sorting keeps the
    # argument order stable between runs (a set's iteration order is not)
    bad_cols = sorted(set(dataset.column_names) - hf_model_fwd_args)
    dataset = dataset.remove_columns(bad_cols)

    dataset.set_format("torch")
    return dataset


#### Step 3: Build your `DataLoaders`.

Use `BlurrDataLoader` to build Blurr friendly dataloaders from your datasets. Passing `{'labels': label_names}` as your `batch_tfm_kwargs` will ensure that your label/target names are displayed in methods like `show_batch` and `show_results` (just as it works with the mid-level API).

In [None]:
# names of the label classes, taken from the training split's "label" feature
label_names = raw_datasets["train"].features["label"].names

# training loader: shuffled, 8 examples per batch
trn_dl = BlurrDataLoader(
    tokenized_datasets["train"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    # NOTE(review): keyword is spelled `preproccesing_func` — presumably matches `BlurrDataLoader`'s signature; confirm
    preproccesing_func=preproc_hf_dataset,
    # lets `show_batch`/`show_results` display label names
    batch_tfm_kwargs={"labels": label_names},
    shuffle=True,
    batch_size=8,
)

# validation loader: same configuration, but no `shuffle` argument and 16 examples per batch
val_dl = BlurrDataLoader(
    tokenized_datasets["validation"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_tfm_kwargs={"labels": label_names},
    batch_size=16,
)

dls = DataLoaders(trn_dl, val_dl)


In [None]:
b = dls.one_batch()
b[0]["input_ids"].shape


torch.Size([8, 77])

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=800)


Unnamed: 0,text,target
0,"The proposal likely would result in the election of 22 Republicans and 10 Democrats to Congress, instead of the state's current 17 Democrats and 15 Republicans, officials say. The plan would likely result in the election of 22 Republicans and 10 Democrats from Texas, versus the current 17 Democrats and 15 Republicans, officials say.",equivalent
1,"Now, nearly two years later, Mallard prepares for trial on charges of murder and tampering with evidence. Chante Jawaon Mallard, 27, is charged with murder and tampering with evidence.",equivalent


## Tests

The tests below ensure the core DataBlock code above works for **all** pretrained sequence classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a GitHub issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# hide
[model_type for model_type in BLURR.get_models(task="SequenceClassification") if (not model_type.startswith("TF"))]


['AlbertForSequenceClassification',
 'BartForSequenceClassification',
 'BertForSequenceClassification',
 'BigBirdForSequenceClassification',
 'BigBirdPegasusForSequenceClassification',
 'CTRLForSequenceClassification',
 'CamembertForSequenceClassification',
 'CanineForSequenceClassification',
 'ConvBertForSequenceClassification',
 'DebertaForSequenceClassification',
 'DebertaV2ForSequenceClassification',
 'DistilBertForSequenceClassification',
 'ElectraForSequenceClassification',
 'FNetForSequenceClassification',
 'FlaubertForSequenceClassification',
 'FunnelForSequenceClassification',
 'GPT2ForSequenceClassification',
 'GPTJForSequenceClassification',
 'GPTNeoForSequenceClassification',
 'HubertForSequenceClassification',
 'IBertForSequenceClassification',
 'LEDForSequenceClassification',
 'LayoutLMForSequenceClassification',
 'LayoutLMv2ForSequenceClassification',
 'LongformerForSequenceClassification',
 'MBartForSequenceClassification',
 'MPNetForSequenceClassification',
 'MegatronB

In [None]:
# hide
pretrained_model_names = [
    "hf-internal-testing/tiny-albert",
    "hf-internal-testing/tiny-random-bart",
    "hf-internal-testing/tiny-bert",
    "google/bigbird-roberta-base",
    "google/bigbird-pegasus-large-arxiv",
    "hf-internal-testing/tiny-random-ctrl",
    "camembert-base",
    "hf-internal-testing/tiny-random-canine",
    "YituTech/conv-bert-base",
    "hf-internal-testing/tiny-deberta",
    "hf-internal-testing/tiny-random-deberta-v2",
    "hf-internal-testing/tiny-random-distilbert",
    "hf-internal-testing/tiny-electra",
    "google/fnet-base",
    "hf-internal-testing/tiny-random-flaubert",
    "hf-internal-testing/tiny-random-funnel",
    "hf-internal-testing/tiny-random-gpt2",
    "anton-l/gpt-j-tiny-random",
    "hf-internal-testing/tiny-random-gpt_neo",
    "kssteven/ibert-roberta-base",
    "hf-internal-testing/tiny-random-led",
    "hf-internal-testing/tiny-random-longformer",
    "hf-internal-testing/tiny-random-mbart",
    "hf-internal-testing/tiny-random-mpnet",
    # "nvidia/megatron-bert-cased-345m",                 could not test
    "hf-internal-testing/tiny-random-mobilebert",
    "openai-gpt",
    "google/reformer-crime-and-punishment",
    "google/rembert",
    "junnyu/roformer_chinese_sim_char_ft_small",
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    "hf-internal-testing/tiny-random-transfo-xl",
    "xlm-mlm-en-2048",
    "xlm-roberta-base",
    "xlnet-base-cased",
]


In [None]:
# hide
# for model_name in pretrained_model_names:
#     tok = AutoTokenizer.from_pretrained(model_name)
#     print(f'=== {model_name} ===')
#     print(f'=== {tok.padding_side} ===')
#     print(f'=== {tok.pad_token_id} ===')
#     print(tok(['hi', 'hello everyone. its good to be here'], ['yo', 'yo'], padding='max_length', max_length=128))


In [None]:
# hide
# Load the IMDB train/test splits; every training row is flagged `is_valid=False`, every test row `is_valid=True`
raw_datasets = load_dataset("imdb", split=["train", "test"])
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

# Seed both shuffles so the same 1,000 training / 200 test reviews are sampled on every run
final_ds = concatenate_datasets(
    [
        raw_datasets[0].shuffle(seed=42).select(range(1000)),
        raw_datasets[1].shuffle(seed=42).select(range(200)),
    ]
)
imdb_df = pd.DataFrame(final_ds)


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)


In [None]:
# hide
from transformers import RobertaTokenizer

model_cls = AutoModelForSequenceClassification
bsz = 2
seq_sz = 128

# one (architecture, tokenizer class name, model name, status, error) tuple is recorded per pretrained model
test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    # the I-BERT checkpoint is given an explicit tokenizer class; `None` is passed for every other model
    tok_class = RobertaTokenizer if ("/ibert" in model_name) else None

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls, tokenizer_cls=tok_class)

    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    try:
        blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz), CategoryBlock)

        dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())
        dls = dblock.dataloaders(imdb_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)

        dls.show_batch(dataloaders=dls, max_n=2, trunc_at=1000)

    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "FAILED", err))

    else:
        # record success only after every check *and* `show_batch` have run, so that a failure in
        # `show_batch` can never leave both a PASSED and a FAILED row for the same model
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", ""))


=== hf-internal-testing/tiny-albert ===

architecture:	albert
tokenizer:	AlbertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa: castle in the sky. during late summer of 1998, disney released ""kiki's delivery service"" on video which included a preview of the laputa dub saying it was due out in ""1999"". it's obviously way past that year now, but the dub has been finally completed. and it's not ""laputa: castle in the sky"", just ""castle in the sky""",1
1,"sunday july 16, 8:00pm the castro, san franciscobr /br /""as a southern colonel your make-up is very indiana""br /br /the yammering gossips of hollywood have managed to sling more than a little mud in marion davies direction over the years. that she had fame handed to her and was undeserving, is often speculated. considering the mawkish dramas she was so often pushed into, this criticism would",1


=== hf-internal-testing/tiny-random-bart ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obv",1
1,"The late 30s and early 40s were a golden age for adventure movies, what with the rise in budgets during the economic recovery, the changes to screen entertainment since the production code became enforced and the general carefree optimism of the times. While most of these were rip-roaring swashbucklers about the wild, superhuman",1


=== hf-internal-testing/tiny-bert ===

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa : castle in the sky. during late summer of 1998, disney released "" kiki's delivery service "" on video which included a preview of the laputa dub saying it was due out in "" 1999 "". it's obviously way past that year now, but the dub has been finally completed. and it's not "" laputa : castle in the sky "", just "" castle in the sky "" for the dub, since laputa is not such a nice word in spanish ( even though",1
1,"going down as the most expensive film in finnish history, to date, "" dark floors "" is a horror film with an extremely lynchian narrative that recounts an ever increasingly decrepit series of "" floors "" ( ironically enough ) in an abandoned hospital, in which our protagonists are trapped. lead by an autistic daughter and her father, himself disenchanted with the hospitals apparent lack of medical progress with his daughter, make their way into an elevator debating the issue with one of the hospitals nurses. accompanied by a security guard, a businessman and a seemingly intoxicated tramp the collective soon find the complex abandoned, but",0


=== google/bigbird-roberta-base ===



normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


architecture:	big_bird
tokenizer:	BigBirdTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky"", just ""Castle In The Sky"" for the dub, since Laputa is not such a nice word in Spanish (even though they use the word Laputa",1
1,"Polish film maker Walerian Borowczyk's La Bête (French, 1975, aka The Beast) is among the most controversial and brave films ever made and a very excellent one too. This film tells everything that's generally been hidden and denied about our nature and our sexual nature in particular with the symbolism and silence of its images. The images may look wild, perverse, ""sick"" or exciting, but they are all in relation with the lastly mentioned. Sex, desire and death are very strong and primary things and dominate all the flesh that has a human soul inside it. They interest and temptate",1


=== google/bigbird-pegasus-large-arxiv ===

architecture:	bigbird_pegasus
tokenizer:	PegasusTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky"", just ""Castle In The Sky"" for the dub, since Laputa is not such a nice word in Spanish (even though they use the word Lap",1
1,"I rented the dubbed-English version of Lensman, hoping that since it came from well-known novels it would have some substance. While there were hints of substance in the movie, it mostly didn't rise above the level of kiddie cartoon. Maybe the movie was a bad adaptation of the book, or it lost a lot in the dubbed version. Or maybe even the source novels were lightweight. But for whatever reason, there wasn't much there.br />br />I noticed lots of details that were derivative, sloppy, poorly dramatized, or otherwise deficient. Some examples: The opening scenes looked borrowed from the",0


=== hf-internal-testing/tiny-random-ctrl ===



  angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
Using pad_token, but it is not set yet.


architecture:	ctrl
tokenizer:	CTRLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky"", just ""Castle In The Sky"" for the dub, since Laputa is not such a nice word in Spanish (even though they use the word Laputa many times throughout the dub). You've also probably heard that world renowned composer, Joe Hisaishi, who scored the",1
1,"Was this a comedy or was it a drama? I begin this review by asking this question because the film that I just witnessed, Hollywood Shuffle, was neither funny or rather dramatic. While it tried so hard to make a point, because of this lack of definition (comedy or drama), the clever themes and pointed remarks were lost. While I am a strong believer that there is too much racial profiling happening in Hollywood, even today, I do not believe that Townsend's directorial debut did much to stop it. Instead, I feel it only added more fuel to the fire. Townsend's comic timing in this film was disastrous due to the fact that the elements he was supposed to be making fun of, he",0


=== camembert-base ===

architecture:	camembert
tokenizer:	CamembertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""La",1
1,"Watching Cliffhanger makes me nostalgic for the early '90s, a time when virtually every new action movie could be described as ""Die Hard in a /on a."" Cliffhanger is ""Die Hard on a mountain,"" and pretty good, for what it is.<br /><br />But unlike Passenger 57 and Under Siege, which are decent Die Hard clones on their own terms, Cliffhanger dispenses with the enclosed feeling of many",1


=== hf-internal-testing/tiny-random-canine ===



Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


architecture:	canine
tokenizer:	CanineTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late",1
1,The effects of job related stress and the pressures born of a moral dilemma that pits conscience against the obligations of a,1


=== YituTech/conv-bert-base ===

architecture:	convbert
tokenizer:	ConvBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa : castle in the sky. during late summer of 1998, disney released "" kiki's delivery service "" on video which included a preview of the laputa dub saying it was due out in "" 1999 "". it's obviously way past that year now, but the dub has been finally completed. and it's not "" laputa : castle in the sky "", just "" castle in the sky "" for the dub, since laputa is not such a nice word in spanish ( even though",1
1,"the late 30s and early 40s were a golden age for adventure movies, what with the rise in budgets during the economic recovery, the changes to screen entertainment since the production code became enforced and the general carefree optimism of the times. while most of these were rip - roaring swashbucklers about the wild, superhuman and often frankly misogynistic exploits of heartthrobs like errol flynn and tyrone power, gunga din is very different in its focus, scope and tone. < br / > < br / > part of gunga din's secret is the division of labour in its writing team.",1


=== hf-internal-testing/tiny-deberta ===

architecture:	deberta
tokenizer:	DebertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the d",1
1,"This could be well have been THE definitive film noir of all time, had not the Columbia Studios cut so much of Orson Welles's original. What we are left with is a flawed, yet brilliant film that showcases the overwhelming talent of Welles as an actor/director and Rita Hayworth as a serious dramatic talent.br /br /'The Lady Fr",1


=== hf-internal-testing/tiny-random-deberta-v2 ===

architecture:	deberta_v2
tokenizer:	DebertaV2Tokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky"", just ""Castle In The Sky"" for the dub, since Laputa is not such a nice word in Spanish (even though they use the word",1
1,"Attack Force has a horrendous title, and can almost certainly be judged by it's awful cover, because the film is horrible! A mish-mash of plot lines, a choppy mess, and a horribly stagnated pace, make the film hard to watch start to finish. I managed this and I'm proud. As a fan of Seagal's work (mostly of his old days), it's painful to see him star in such tripe. True Seagal's last half dozen movies or so, have sucked a lot, but some of them at least had some redeeming features. Attack Force is a mess.",0


=== hf-internal-testing/tiny-random-distilbert ===

architecture:	distilbert
tokenizer:	DistilBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa : castle in the sky. during late summer of 1998, disney rel",1
1,"polish film maker walerian borowczyk's la bete ( french, 1975, aka the beast ) is among the most controversial and brave films ever made and a very excel",1


=== hf-internal-testing/tiny-electra ===

architecture:	electra
tokenizer:	ElectraTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa : castle in the sky. during late summer of 1998, disney released "" kiki's delivery service "" on video which included a preview of the laputa dub saying it was due out in "" 1999 "". it's obviously way past that year now, but the dub has been finally completed. and it's not "" laputa : castle in the sky "", just",1
1,"the late 30s and early 40s were a golden age for adventure movies, what with the rise in budgets during the economic recovery, the changes to screen entertainment since the production code became enforced and the general carefree optimism of the times. while most of these were rip - roaring swashbucklers about the wild, superhuman and often frankly misogynistic exploits of heartthrobs like er",1


=== google/fnet-base ===

architecture:	fnet
tokenizer:	FNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky"", just ""Castle In The Sky"" for the dub, since Laputa",1
1,"***SPOILERS*** Like some evil Tinkers-to-Evers-to-Chance double-play combination we have in ""Omen IV"" the evil seed of the deceased AntiChrist Damien Thorn come back. Terrorizing his parents his schoolmates his neighbors and finally the entire world as a she named Delia York, Asia Vieila. After being given to a ""deserving"" couple the Yorks Karen & Gene, Fay Grant & Michael Woods,by the Catholic Church's St. Francis orphanage.<br /><br />",0


=== hf-internal-testing/tiny-random-flaubert ===

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you' ve probably heard a bit about the new Disney dub of Miyazaki' s classic film, Laputa : Castle In The Sky. During late summer of 1998, Disney released "" Kiki' s Delivery Service "" on video which included a preview of the Laputa dub saying it was due out in "" 1999 "". It' s obviously way past that year now, but the dub has been finally completed. And it' s not "" Laputa : Castle In The Sky "", just",1
1,"The best film on the battle of San Antonio, Texas in March 1836, was John Wayne' s 1960 epic THE ALAMO. In a one shot job as director producer, that temporarily financially strapped him, Wayne demonstrated that he was talented in movie making outside of his icon-like acting ability personifying the West. < br / > < br / > I have commented on that film in a review the other night, and I pointed out that Wayne and James Edward Grant ( the screenwri",1


=== hf-internal-testing/tiny-random-funnel ===

architecture:	funnel
tokenizer:	FunnelTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa : castle in the sky. during late summer of 1998, disney rel",1
1,"the shining is a weird example of adaptation : it has very little in common with the source novel, written by stephen king, yet it is widely remembered as",1


=== hf-internal-testing/tiny-random-gpt2 ===



Using pad_token, but it is not set yet.


architecture:	gpt2
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously",1
1,"Polish film maker Walerian Borowczyk's La Bête (French, 1975, aka The Beast) is among the most controversial and brave films ever made and a very excellent one too. This film tells everything that's generally been hidden and denied about our nature and our sexual nature in particular with the symbolism and sil",1


=== anton-l/gpt-j-tiny-random ===



Using pad_token, but it is not set yet.


architecture:	gptj
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky"", just ""Castle In The Sky"" for the dub, since Laputa is not such a nice word in Spanish (even though they use the word Laputa many times",1
1,"Chris Rock deserves better than he gives himself in ""Down To Earth."" As directed by brothers Chris & Paul Weitz of ""American Pie"" fame, this uninspired remake of Warren Beatty's 1978 fantasy ""Heaven Can Wait,"" itself a rehash of 1941's ""Here Comes Mr. Jordan,"" lacks the abrasively profane humor that won Chris Rock an Emmy for his first HBO special. Predictably, he spouts swear words from A to Z, but he consciously avoids the F-word. Anybody who saw this gifted African-American comic in ""Lethal Weapon 4,"" ""Dogma,"" or ""Nurse",0


=== hf-internal-testing/tiny-random-gpt_neo ===



Using pad_token, but it is not set yet.


architecture:	gpt_neo
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously",1
1,"Something about the 40 Year Old Virgin and the other comedy hit of the summer, Wedding Crashers, is similar, but they are two different films in some respects. Both are romantic comedies that have that kind of over-the-top, crazy sensibility that keeps the teens and guys in their 20's along with the usual dating crowd to go se",1


=== kssteven/ibert-roberta-base ===

architecture:	ibert
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky"", just ""Castle In The Sky"" for the dub, since Laputa is not such a nice word in Spanish (even though they use the word Laputa",1
1,"Chris Rock deserves better than he gives himself in ""Down To Earth."" As directed by brothers Chris & Paul Weitz of ""American Pie"" fame, this uninspired remake of Warren Beatty's 1978 fantasy ""Heaven Can Wait,"" itself a rehash of 1941's ""Here Comes Mr. Jordan,"" lacks the abrasively profane humor that won Chris Rock an Emmy for his first HBO special. Predictably, he spouts swear words from A to Z, but he consciously avoids the F-word. Anybody who saw this gifted African-American comic in ""Lethal Weapon 4,"" ""Dogma,"" or """,0


=== hf-internal-testing/tiny-random-led ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obv",1
1,"Polish film maker Walerian Borowczyk's La Bête (French, 1975, aka The Beast) is among the most controversial and brave films ever made and a very excellent one too. This film tells everything that's generally been hidden and denied about our nature and our sexual nature in particular with the symbolism and",1


=== hf-internal-testing/tiny-random-longformer ===

architecture:	longformer
tokenizer:	LongformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obv",1
1,"WWE Armageddon, December 17, 2006 -- Live from Richmond Coliseum, Richmond, VA <br /><br />Kane vs. MVP in an Inferno match: So this is the fourth ever inferno match in the WWE and it is Kane vs. MVP (wonder why was it the first match on the card). I only viewed the end",1


=== hf-internal-testing/tiny-random-mbart ===

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 18, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was d",1
1,"Going down as the most expensive film in Finnish history, to date, ""Dark Floors"" is a horror film with an extremely Lynchian narrative that recounts an ever increasingly decrepit series of ""Floors"" (ironically enough) in an abandoned hospital, in which our protagonis",0


=== hf-internal-testing/tiny-random-mpnet ===

architecture:	mpnet
tokenizer:	MPNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa : castle in the sky. during late summer of 1998, disney rel",1
1,"was this a comedy or was it a drama? i begin this review by asking this question because the film that i just witnessed, hollywood shuffle, was neither fu",0


=== hf-internal-testing/tiny-random-mobilebert ===

architecture:	mobilebert
tokenizer:	MobileBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa : castle in the sky. during late summer of 1998, disney rel",1
1,"chris rock deserves better than he gives himself in "" down to earth. "" as directed by brothers chris & paul weitz of "" american pie "" fame, this uninspired",0


=== openai-gpt ===



Using pad_token, but it is not set yet.


architecture:	openai
tokenizer:	OpenAIGPTTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa : castle in the sky. during late summer of 1998, disney released "" kiki's delivery service "" on video which included a preview of the laputa dub saying it was due out in "" 1999 "". it's obviously way past that year now, but the dub has been finally completed. and it's not "" laputa : castle in the sky "", just "" castle in the sky "" for the dub, since laputa is not such a nice word in spanish (",1
1,"if you want to see a movie that terribly mixes up one latin country with any other latin country, "" the celestine prophecy "" is a good example : 1. peru, not even in its most violent times, has not shown polices or soldiers as much as in this film. this showed a country like el salvador when civil war. since i'm a peruvian who lives in lima ( the capital of peru ), it was too funny to me seeing the police guards here, there and everywhere. 2. if you have a car in peru, and you want ( or need ) to be a taxi driver,",0


=== google/reformer-crime-and-punishment ===



Using pad_token, but it is not set yet.


architecture:	reformer
tokenizer:	ReformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now youve probably heard a bit about the new Disney dub of Miyazakis classic film, Laputa: Castle In The Sky. During late summer of, Disney released Kikis Delivery Service on video which included a previ",1
1,"The effects of job related stress and the pressures born of a moral dilemma that pits conscience against the obligations of a family business (albeit a unique one all brought to a head by-- or perhaps the catalyst of-- a midlife crisis, are examined in the",1


=== google/rembert ===

architecture:	rembert
tokenizer:	RemBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky"", just ""Castle In The Sky"" for the dub, since Laputa is not such a nice word in Spanish (even though they use the word",1
1,"The Choke starts as a rock band known as The Choke prepare for a gig at a nightclub called 'Club 905' owned & run by Guy Johnson (Andrew Parker). Lead singer Dylan (Sean Cook) & guitar player Mike (Jason McKee) plan to tell the other band members, bass player London (Brooke Bailey) & drummer Nancy (Tom Olson), that they are both going solo & their services won't be needed any longer. Once at the club Dylan prepares but Mike doesn't show up & the gig turns into a disaster. Then just as the band think things",0


=== junnyu/roformer_chinese_sim_char_ft_small ===

architecture:	roformer
tokenizer:	RoFormerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa : castle in the sky. during late summer of 1998, disney released "" kiki's delivery service "" on video which included a preview of the laputa dub saying it was due out in "" 1999 "". it's obviously way past that year now, but the dub has been finally completed. and it's not "" laputa",1
1,as we all know the sub - genre of sex comedies is pretty crowded. simply being excessively raunchy isn't enough anymore. i've seen and heard so many disgusting jokes and actions that a sex comedy really needs to have other positive points to appeal to me these days. < br / > < br / > coming into the 40 year old virgin i knew basically what to expect ;,1


=== roberta-base ===

architecture:	roberta
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky"", just ""Castle In The Sky"" for the dub, since Laputa is not such a nice word in Spanish (even though they use the word Laputa",1
1,"Chris Rock deserves better than he gives himself in ""Down To Earth."" As directed by brothers Chris & Paul Weitz of ""American Pie"" fame, this uninspired remake of Warren Beatty's 1978 fantasy ""Heaven Can Wait,"" itself a rehash of 1941's ""Here Comes Mr. Jordan,"" lacks the abrasively profane humor that won Chris Rock an Emmy for his first HBO special. Predictably, he spouts swear words from A to Z, but he consciously avoids the F-word. Anybody who saw this gifted African-American comic in ""Lethal Weapon 4,"" ""Dogma,"" or """,0


=== squeezebert/squeezebert-uncased ===

architecture:	squeezebert
tokenizer:	SqueezeBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa : castle in the sky. during late summer of 1998, disney released "" kiki's delivery service "" on video which included a preview of the laputa dub saying it was due out in "" 1999 "". it's obviously way past that year now, but the dub has been finally completed. and it's not "" laputa : castle in the sky "", just "" castle in the sky "" for the dub, since laputa is not such a nice word in spanish ( even though",1
1,"polish film maker walerian borowczyk's la bete ( french, 1975, aka the beast ) is among the most controversial and brave films ever made and a very excellent one too. this film tells everything that's generally been hidden and denied about our nature and our sexual nature in particular with the symbolism and silence of its images. the images may look wild, perverse, "" sick "" or exciting, but they are all in relation with the lastly mentioned. sex, desire and death are very strong and primary things and dominate all the flesh that has a human soul inside it. they interest and temptate",1


=== hf-internal-testing/tiny-random-transfo-xl ===



Using pad_token, but it is not set yet.


architecture:	transfo_xl
tokenizer:	TransfoXLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999."" It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky,"" just ""Castle In The Sky"" for the dub, since Laputa is not such a nice word in Spanish (even though they use the word Laputa many times throughout the dub). You've",1
1,"The best film on the battle of San Antonio, Texas in March 1836, was John Wayne's 1960 epic THE. In a one shot job as director producer, that temporarily financially strapped him, Wayne demonstrated that he was talented in movie making outside of his icon-like acting ability personifying the West. < br / > < br / > I have commented on that film in a review the other night, and I pointed out that Wayne and James Edward Grant (the screenwriter) tackled some points that were barely mentioned in earlier films about the battle. They did bring in the issue of slavery. They also finally discussed the contribution of local Mexican land",1


=== xlm-mlm-en-2048 ===

architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"by now you've probably heard a bit about the new disney dub of miyazaki's classic film, laputa : castle in the sky. during late summer of 1998, disney released "" kiki's delivery service "" on video which included a preview of the laputa dub saying it was due out in "" 1999. "" it's obviously way past that year now, but the dub has been finally completed. and it's not "" laputa : castle in the sky, "" just "" castle in the sky "" for the dub, since laputa is not such a nice word in spanish ( even though",1
1,"what we've got here is a situation. a man is found to be in distress and people want to help him -- in contrasting ways. at the end they are forced to let it go. you can 't fix people. and though in various aspects reign over me is conventionally hollywood, that message isn 't. < br / > < br / > this story is not about charlie fineman ( adam sandler ), a man who lost his wife and three daughters in a 9 / 11 plane who's gone into a nearly psychotic state of ptss since. it's about what meeting charlie does to alan johnson",1


=== xlm-roberta-base ===

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky"", just ""Castle In The Sky"" for the dub, since Laputa is not such a nice word in Spanish (even though",1
1,"Preminger's adaptation of G. B. Shaw's ''Saint Joan''(screenplay by Graham Greene) received one of the worst critical reactions in it's day. It was vilified by the pseudo-elite, the purists and the audiences was unresponsive to a film that lacked the piety and glamour expected of a historical pageant. As in ''Peeping Tom'', the reaction was malicious and unjustified. Preminger's adaptation of Shaw's intellectual exploration of the effects and actions surrounding Joan of Arc(her actual name",1


=== xlnet-base-cased ===

architecture:	xlnet
tokenizer:	XLNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"By now you've probably heard a bit about the new Disney dub of Miyazaki's classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released ""Kiki's Delivery Service"" on video which included a preview of the Laputa dub saying it was due out in ""1999"". It's obviously way past that year now, but the dub has been finally completed. And it's not ""Laputa: Castle In The Sky"", just ""Castle In The Sky"" for the dub, since Laputa is not such a nice word",1
1,"Watching Cliffhanger makes me nostalgic for the early '90s, a time when virtually every new action movie could be described as ""Die Hard in a /on a."" Cliffhanger is ""Die Hard on a mountain,"" and pretty good, for what it is.<br /><br />But unlike Passenger 57 and Under Siege, which are decent Die Hard clones on their own terms, Cliffhanger dispenses with the enclosed feeling of many action movies and embraces breathtaking landscapes that, in their immensity, threaten to overwhelm and trivialize the conflicts",1


In [None]:
# hide_input
# Tabulate the collected `test_results` under named columns, then render the table.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=["arch", "tokenizer", "model_name", "result", "error"],
)
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizerFast,hf-internal-testing/tiny-albert,PASSED,
1,bart,BartTokenizerFast,hf-internal-testing/tiny-random-bart,PASSED,
2,bert,BertTokenizerFast,hf-internal-testing/tiny-bert,PASSED,
3,big_bird,BigBirdTokenizerFast,google/bigbird-roberta-base,PASSED,
4,bigbird_pegasus,PegasusTokenizerFast,google/bigbird-pegasus-large-arxiv,PASSED,
5,ctrl,CTRLTokenizer,hf-internal-testing/tiny-random-ctrl,PASSED,
6,camembert,CamembertTokenizerFast,camembert-base,PASSED,
7,canine,CanineTokenizer,hf-internal-testing/tiny-random-canine,PASSED,
8,convbert,ConvBertTokenizerFast,YituTech/conv-bert-base,PASSED,
9,deberta,DebertaTokenizerFast,hf-internal-testing/tiny-deberta,PASSED,


## Summary

The `blurr.data.core` module contains the fundamental bits for all data preprocessing tasks.

In [None]:
# hide
# Invoke nbdev's `notebook2script` export over the project's notebooks.
import nbdev.export

nbdev.export.notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
