In [None]:
# default_exp data.core


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by Hugging Face transformer implementations.

In [None]:
# export
import os, inspect
from dataclasses import dataclass
from functools import reduce, partial
from typing import Any, Callable, List, Optional, Union, Type

from datasets import Dataset,load_dataset, concatenate_datasets
from fastcore.all import *
from fastai.data.block import TransformBlock
from fastai.data.core import Datasets, DataLoader, DataLoaders, TfmdDL
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    PretrainedConfig,
    PreTrainedTokenizerBase,
    PreTrainedModel,
    logging,
)

from blurr.utils import BLURR

# set the `transformers` logging verbosity to the error level for this module
logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from fastai.data.block import CategoryBlock, ColReader, ColSplitter, DataBlock, ItemGetter, RandomSplitter
from fastcore.test import *
from nbverbose.showdoc import show_doc

from blurr.utils import print_versions

# presumably silences the Hugging Face tokenizers parallelism/fork warnings — TODO confirm
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# record the library versions used to generate this documentation
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.3
transformers: 4.15.0


In [None]:
# hide
# cuda
# NOTE(review): the GPU index is hard-coded to device 1; this call presumably fails on machines with
# fewer than two GPUs — confirm before running elsewhere
torch.cuda.set_device(1)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `imdb` to demonstrate how to configure your blurr code for sequence classification tasks

In [None]:
# load the "train" and "test" splits of `imdb` and flag which examples belong to the validation set
raw_datasets = load_dataset("imdb", split=["train", "test"])
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

# keep 1,000 shuffled training examples and 200 shuffled validation examples
# NOTE(review): no explicit seed is passed to `shuffle()` — TODO confirm whether the subset is reproducible
final_ds = concatenate_datasets([raw_datasets[0].shuffle().select(range(1000)), raw_datasets[1].shuffle().select(range(200))])
# build a DataFrame version of the same subset for the DataFrame based examples
imdb_df = pd.DataFrame(final_ds)
imdb_df.head()


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-e351882882fc6e06.arrow


Unnamed: 0,text,label,is_valid
0,"Think of this pilot as ""Hawaii Five-O Lite"". It's set in Hawaii, it's an action/adventure crime drama, lots of scenes feature boats and palm trees and polyester fabrics and garish shirts...it even stars the character actor ""Zulu"" in a supporting role. Oh, there are some minor differences - Roy Thinnes is supposed to be some front-line undercover agent, and the supporting cast is much smaller (and less interesting), but basically the atmosphere is still the same. Problem is, ""Hawaii Five-O"" (another QM product) already existed at the time and had run for years. It filled the market demand f...",0,False
1,"I watched this movie at a Sneak Preview screening and I'm glad I didn't pay for it. This movie is just disgusting. Its full of dick and fart jokes and takes no pride in the action sequences(such as the shootout in ""Little Germany""). I made a little list of things I enjoyed in the movie.. and a lot of which I didn't agree of.<br /><br />1. Dave Foley's penis. 2. The fart jokes. 3. The Poop jokes. 4. The Dude was a pussy. 5. No Gary Coleman. 6. The Talibans 7. Again making fun of Bush.. WE GET IT HE'S AN IDIOT.. move on. 8. The Dude has blonde hair. 9. The Plot. 10. The killing of minors 11....",0,False
2,"Renown writer Mark Redfield (as Edgar Allen Poe) tries to conquer old addictions and start a new life for himself, as a Baltimore, Maryland magazine publisher. However, blackouts, delirium, and rejection threaten to thwart his efforts. He would also like to rekindle romance with an old sweetheart, a significantly flawed prospect, as things turns out. Mr. Redfield also directed this dramatization of the mysterious last days of Edgar Allen Poe. Redfield employs a lot of black and white, color, and trick photography to create mood. Kevin G. Shinnick (as Dr. John Moran) performs well, relative...",0,False
3,"David Mamet's film debut has been hailed by many as a real thinking-man's movie, a movie that makes you question everybody and everything. I saw it for the first time recently and couldn't understand what was supposed to be so great about it.<br /><br />The movie is about a female psychologist named Margaret who is also a best-selling author. Margaret has become disillusioned by her profession and her inability to really help anyone. She tries to rectify this by helping settle her patient's gambling debt to a shark named Mike (played by Joe Mantegna, who is the only reason to watch this fi...",0,False
4,This is one of the unusual cases in which a movie and the novel on which it is based are both great. Maybe this is because Gorris' takes Nabokov's initial ideas and gives them a different interpretation. The final consequence is a point of view over Luzhin which dignifies him more than the Nabokov's one.<br /><br />The only thing in the movie which I don't like is the influence of Valentinov's on Luzhin's destiny. I can't imagine Nabokov creating a person like Valentinov and giving him so great influence on novel's argument.,1,False


In [None]:
# the class names of the "label" feature, taken from the training split
labels = raw_datasets[0].features["label"].names
labels


['neg', 'pos']

In [None]:
# the Hugging Face model class to build for this sequence classification example
model_cls = AutoModelForSequenceClassification

pretrained_model_name = "roberta-base" # "bert-base-multilingual-cased"
# the number of labels is passed to the model configuration via `config_kwargs` below
n_labels = len(labels)

# fetch the architecture name, config, tokenizer, and model for the pretrained checkpoint
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(
    pretrained_model_name, model_cls=model_cls, config_kwargs={"num_labels": n_labels}
)

hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)

('roberta',
 transformers.models.roberta.configuration_roberta.RobertaConfig,
 transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast,
 transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification)

## Preprocessing

Starting with version 2.0, Blurr provides a preprocessing base class that can be used to build task specific, pre-processed datasets, from either DataFrames or Hugging Face Datasets.

In [None]:
#export
class Preprocessor:
    """
    Base class for building pre-processed datasets from either pandas DataFrames or Hugging Face Datasets.

    It merges an optional validation set into the training set (flagging the validation examples via
    `is_valid_attr`) and provides the tokenization function used by task specific subclasses.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer; it is called with the text (and optional text pair) of your examples
        hf_tokenizer,
        # The batch size stored for subclasses to use when tokenizing in batches
        batch_size: int = 1000,
        # The attribute holding your text, or a list with the text and text pair attributes
        text_attrs: Union[str, List[str]] = "text",
        # Any keyword arguments you want passed to `hf_tokenizer` during tokenization
        tok_kwargs={},
        # The attribute used to flag validation examples when a validation set is merged in
        # (set to `None` to skip adding it)
        is_valid_attr: Optional[str] = "is_valid",
    ):
        self.hf_tokenizer = hf_tokenizer
        self.batch_size = batch_size
        self.text_attrs = text_attrs
        # copy so that instances never share (or mutate) the default/caller supplied dictionary
        self.tok_kwargs = dict(tok_kwargs) if tok_kwargs else {}
        # defined here so `process_df`/`process_hf_dataset` also work on the base class itself
        self.is_valid_attr = is_valid_attr

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """
        Returns a copy of `training_df` with the rows of `validation_df` (if provided) appended to it.
        Neither of the DataFrames passed in is modified.
        """
        df = training_df.copy()

        # concatenate the validation dataset if it is included
        if validation_df is not None:
            valid_df = validation_df.copy()
            # add an `is_valid_attr` column to both training/validation DataFrames to indicate what data is
            # part of the validation set
            if self.is_valid_attr:
                valid_df[self.is_valid_attr] = True
                df[self.is_valid_attr] = False

            df = pd.concat([df, valid_df])

        return df

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """
        Returns `training_ds` as is, or the concatenation of `training_ds` and `validation_ds` when a
        validation dataset is provided.
        """
        ds = training_ds

        # concatenate the validation dataset if it is included
        if validation_ds is not None:
            # add an `is_valid_attr` column to both training/validation Datasets to indicate what data is
            # part of the validation set
            if self.is_valid_attr:
                validation_ds = validation_ds.add_column(self.is_valid_attr, [True] * len(validation_ds))
                training_ds = training_ds.add_column(self.is_valid_attr, [False] * len(training_ds))

            ds = concatenate_datasets([training_ds, validation_ds])

        return ds

    def _text_attr_names(self):
        """Returns `text_attrs` as a list, whether it was given as a single name or as a collection"""
        attrs = self.text_attrs
        if isinstance(attrs, str):
            return [attrs]
        try:
            return list(attrs)
        except TypeError:
            # a single, non-iterable key (e.g., an integer column label)
            return [attrs]

    def _tokenize_function(self, example):
        """
        Tokenizes the text (and text pair, if two attributes are configured) of `example` with
        `hf_tokenizer`. Truncation is enabled unless `tok_kwargs` overrides it.
        """
        text_attrs = self._text_attr_names()
        if not text_attrs:
            raise ValueError("`text_attrs` must include at least one attribute")

        # only the first two attributes are used: the text and the (optional) text pair
        texts = [example[attr] for attr in text_attrs[:2]]

        # merging (rather than passing `truncation=True` alongside `**tok_kwargs`) lets `tok_kwargs`
        # override truncation without triggering a duplicate keyword argument error
        call_kwargs = {"truncation": True, **self.tok_kwargs}
        return self.hf_tokenizer(*texts, **call_kwargs)


### `ClassificationPreprocessor`

Starting with version 2.0, blurr provides a sequence classification preprocessing class that can be used to preprocess DataFrames or Hugging Face Datasets.

The resulting pre-processed data can also be used with the Hugging Face `Trainer` API, `Accelerate`, or your own custom training loop should you want to use one of those options instead of using blurr/fast.ai for training your models.

In [None]:
#export
class ClassificationPreprocessor(Preprocessor):
    """
    A `Preprocessor` for sequence classification tasks. It optionally binarizes multi-label targets, adds
    "[label]_name" fields from a `label_mapping`, and tokenizes the text of a DataFrame or Hugging Face Dataset.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer,
        # The number of examples to tokenize at a time
        batch_size: int = 1000,
        # Whether the targets are multi-label (if so, each label attribute is converted to 0 or 1)
        is_multilabel: bool = False,
        # The unique identifier attribute of your examples (stored only; not used by this class)
        id_attr: Optional[str] = None,
        # The attribute holding your text, or a list with the text and text pair attributes
        text_attrs: Union[str, List[str]] = "text",
        # The attribute, or list of attributes, holding your labels
        label_attrs: Union[str, List[str]] = "label",
        # The attribute used to flag validation examples when a validation set is merged in
        is_valid_attr: Optional[str] = "is_valid",
        # The label names, indexable by the label Ids found in `label_attrs`
        label_mapping: Optional[List[str]] = None,
        # Any keyword arguments you want passed to `hf_tokenizer` during tokenization
        tok_kwargs={},
    ):
        super().__init__(hf_tokenizer, batch_size, text_attrs, tok_kwargs)

        self.is_multilabel = is_multilabel
        self.id_attr = id_attr
        self.label_attrs = label_attrs
        self.is_valid_attr = is_valid_attr
        self.label_mapping = label_mapping

    @staticmethod
    def _binarize_label(v):
        """Converts a target to an int, 0 or 1, rounding floats if necessary (negative values become 0)"""
        return int(bool(max(0, round(v))))

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """
        Merges the training/validation DataFrames and returns a new DataFrame that includes the processed
        labels and one column for each item returned by the tokenizer (e.g., "input_ids").
        """
        df = super().process_df(training_df, validation_df)

        # convert even single "labels" to a list to make things easier
        label_cols = listify(self.label_attrs)

        # if `is_multilabel`, convert all targets to an int, 0 or 1, rounding floats if necessary
        if self.is_multilabel:
            for label_col in label_cols:
                df[label_col] = df[label_col].apply(self._binarize_label)

        # if a `label_mapping` is included, add a "[label_col]_name" field with the label Ids converted to their label names
        if self.label_mapping:
            for label_col in label_cols:
                df[f"{label_col}_name"] = df[label_col].apply(lambda v: self.label_mapping[v])

        # tokenize in batches; the processed batches are collected and concatenated once at the end because
        # `DataFrame.append` is quadratic when used in a loop and no longer exists in pandas >= 2.0
        proc_batches = []
        for _, batch_df in df.groupby(np.arange(len(df)) // self.batch_size):
            inputs_df = batch_df.apply(lambda r: pd.Series(self._tokenize_function(r)), axis=1)
            proc_batches.append(pd.concat([batch_df, inputs_df], axis=1))

        # return the pre-processed DataFrame (there is nothing to concatenate if `df` is empty)
        return pd.concat(proc_batches) if proc_batches else df

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """
        Merges the training/validation Datasets and returns a Dataset that includes the processed labels
        and the items returned by the tokenizer (e.g., "input_ids").
        """
        ds = super().process_hf_dataset(training_ds, validation_ds)

        # convert even single "labels" to a list to make things easier
        label_attrs = listify(self.label_attrs)

        # if `is_multilabel`, convert all targets to an int, 0 or 1, rounding floats if necessary
        # (the function given to `map` has to return a dictionary of the columns to add/update)
        if self.is_multilabel:
            for label_attr in label_attrs:
                ds = ds.map(lambda example: {label_attr: self._binarize_label(example[label_attr])})

        # if a `label_mapping` is included, add a "[label_col]_name" field with the label Ids converted to their label names
        if self.label_mapping:
            for label_attr in label_attrs:
                ds = ds.map(lambda example: {f"{label_attr}_name": self.label_mapping[example[label_attr]]})

        # tokenize in batches
        ds = ds.map(self._tokenize_function, batched=True, batch_size=self.batch_size)

        # return the pre-processed Dataset
        return ds


#### Using a `DataFrame`

In [None]:
# pre-process the raw DataFrame; passing `label_mapping` adds a "label_name" column next to the tokenizer outputs
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_df = preprocessor.process_df(imdb_df)
# NOTE(review): only the last expression of a cell is displayed, so this tuple is evaluated but never shown
proc_df.columns, len(proc_df)
proc_df.head(2)

Unnamed: 0,text,label,is_valid,label_name,input_ids,attention_mask
0,"Think of this pilot as ""Hawaii Five-O Lite"". It's set in Hawaii, it's an action/adventure crime drama, lots of scenes feature boats and palm trees and polyester fabrics and garish shirts...it even stars the character actor ""Zulu"" in a supporting role. Oh, there are some minor differences - Roy Thinnes is supposed to be some front-line undercover agent, and the supporting cast is much smaller (and less interesting), but basically the atmosphere is still the same. Problem is, ""Hawaii Five-O"" (another QM product) already existed at the time and had run for years. It filled the market demand f...",0,False,neg,"[0, 9387, 9, 42, 4792, 25, 22, 32152, 36729, 4934, 12, 673, 25191, 845, 85, 18, 278, 11, 6467, 6, 24, 18, 41, 814, 73, 625, 33289, 1846, 4149, 6, 3739, 9, 5422, 1905, 8934, 8, 14262, 3980, 8, 11424, 8939, 26348, 8, 15475, 1173, 15331, 734, 405, 190, 2690, 5, 2048, 2701, 22, 1301, 12709, 113, 11, 10, 3117, 774, 4, 5534, 6, 89, 32, 103, 3694, 5550, 111, 5470, 31747, 4977, 16, 3518, 7, 28, 103, 760, 12, 1902, 17814, 2936, 6, 8, 5, 3117, 2471, 16, 203, 2735, 36, 463, 540, 2679, 238, 53, 5072, 5, 5466, ...]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]"
1,"I watched this movie at a Sneak Preview screening and I'm glad I didn't pay for it. This movie is just disgusting. Its full of dick and fart jokes and takes no pride in the action sequences(such as the shootout in ""Little Germany""). I made a little list of things I enjoyed in the movie.. and a lot of which I didn't agree of.<br /><br />1. Dave Foley's penis. 2. The fart jokes. 3. The Poop jokes. 4. The Dude was a pussy. 5. No Gary Coleman. 6. The Talibans 7. Again making fun of Bush.. WE GET IT HE'S AN IDIOT.. move on. 8. The Dude has blonde hair. 9. The Plot. 10. The killing of minors 11....",0,False,neg,"[0, 38, 3996, 42, 1569, 23, 10, 46702, 24005, 7231, 8, 38, 437, 7785, 38, 399, 75, 582, 13, 24, 4, 152, 1569, 16, 95, 21096, 4, 3139, 455, 9, 38594, 8, 36762, 11248, 8, 1239, 117, 7040, 11, 5, 814, 26929, 1640, 16918, 25, 5, 13818, 11, 22, 23675, 1600, 18653, 38, 156, 10, 410, 889, 9, 383, 38, 3776, 11, 5, 1569, 7586, 8, 10, 319, 9, 61, 38, 399, 75, 2854, 9, 49069, 3809, 1589, 49007, 3809, 48709, 134, 4, 4475, 20291, 18, 25128, 4, 132, 4, 20, 36762, 11248, 4, 155, 4, 20, 6002, 1517, 11248, ...]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]"


#### Using a Hugging Face `Dataset`

In [None]:
# same pre-processing as above, applied to the Hugging Face `Dataset` instead of the DataFrame
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_ds = preprocessor.process_hf_dataset(final_ds)
proc_ds

  0%|          | 0/1200 [00:00<?, ?ex/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'is_valid', 'label', 'label_name', 'text'],
    num_rows: 1200
})

## Mid-level API

Base tokenization, batch transform, and DataBlock methods

### `HF_BaseInput`

A `HF_BaseInput` object is returned from the decodes method of `HF_AfterBatchTransform` as a means to customize @typedispatched functions like `DataLoaders.show_batch` and `Learner.show_results`. It uses the "input_ids" of a Hugging Face object as the representative tensor for `show` methods

In [None]:
# export
class HF_BaseInput(TensorBase):
    """The base representation of your inputs; `show` is what the various fastai `show` methods call"""

    def show(
        self,
        # The Hugging Face tokenizer used to decode the token Ids back into text
        hf_tokenizer: PreTrainedTokenizerBase,
        # The "context" associated to the current `show_batch/results` call
        ctx=None,
        # The number of characters to keep from the decoded text (`None` keeps everything)
        trunc_at: int = None,
        # The result of `show_title` for the decoded (and optionally truncated) text
    ) -> str:
        # the tensor itself holds the token Ids; decoding works on a CPU numpy array
        token_ids = self.cpu().numpy()
        full_text = hf_tokenizer.decode(token_ids, skip_special_tokens=True)
        shown_text = str(full_text)[:trunc_at]

        return show_title(shown_text, ctx=ctx, label="text")


### `HF_BeforeBatchTransform` 

Inspired by this [article](https://docs.fast.ai/tutorial.transformers.html), inputs can come in as raw text, a list of words (e.g., tasks like Named Entity Recognition (NER), where you want to predict the label of each token), or pre-processed "input_ids"

**On-the-fly Batch-Time Tokenization**: 

The previous version of the library performed the tokenization/numericalization as a type transform when the raw data was read, and included a couple batch transforms to prepare the data for collation (e.g., to be made into a mini-batch). With this update, everything is done in a single batch transform.  

Why?  Part of the inspiration had to do with the mechanics of the Hugging Face tokenizer, in particular how by default it returns a collated mini-batch of data given a list of sequences. And where do we get a list of examples with fastai? In the batch transforms!  So I thought, hey, why not do everything dynamically at batch time?  And with a bit of tweaking, I got everything to work pretty well.  The result is *less code*, *faster mini-batch creation*, *less RAM utilization* and time spent tokenizing (really helps with very large datasets), and *more flexibility*.

In [None]:
# export
class HF_BeforeBatchTransform(Transform):
    """
    A before batch transform that tokenizes the inputs of a list of samples at batch time, replacing each
    sample's input with a dictionary of its tokenizer outputs (e.g., input_ids, attention_mask, etc...)
    while leaving the remaining items of the sample (e.g., the targets) untouched.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # If you are passing in the "input_ids" as your inputs, set `is_pretokenized` = True
        is_pretokenized: bool = False,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `HF_BeforeBatchTransform`
        **kwargs
    ):
        # every argument (including the extra `kwargs` dictionary) is kept as an attribute of the same name
        store_attr(self=self, names="hf_arch, hf_config, hf_tokenizer, hf_model, is_pretokenized")
        store_attr(self=self, names="max_length, padding, truncation, is_split_into_words, ignore_token_id, tok_kwargs")
        store_attr(self=self, names="kwargs")

    def encodes(self, samples, return_batch_encoding=False):
        """
        Performs the on-the-fly, batch-time tokenization of your data: the inputs of `samples` are run through
        the tokenizer together, one mini-batch at a time, so the full dataset never has to be tokenized ahead
        of time. Set `return_batch_encoding=True` to also get back what the tokenizer returned.
        """
        samples = L(samples)

        # a list-like first input means we are working with text pairs ... unless that list is really a
        # list of words (`is_split_into_words`) or a list of token Ids (`is_pretokenized`)
        first_inp = samples[0][0]
        has_text_pairs = is_listy(first_inp) and not (self.is_split_into_words or self.is_pretokenized)

        if has_text_pairs:
            inps = list(zip(samples.itemgot(0, 0), samples.itemgot(0, 1)))
        else:
            inps = samples.itemgot(0).items

        # "input_ids" have already been tokenized/numericalized, so `prepare_for_model` is used to build the
        # other sequence attributes for them ... raw text is tokenized by calling the tokenizer itself
        if self.is_pretokenized:
            batch_encoding_func = self.hf_tokenizer.prepare_for_model
        else:
            batch_encoding_func = self.hf_tokenizer

        tokenize_kwargs = dict(
            max_length=self.max_length,
            padding=self.padding,
            truncation=self.truncation,
            is_split_into_words=self.is_split_into_words,
            return_tensors="pt",
        )
        batch_encoding = batch_encoding_func(inps, **tokenize_kwargs, **self.tok_kwargs)

        # swap the input of each sample for a dictionary with its slice of every tokenizer output
        d_keys = batch_encoding.keys()
        updated_samples = []
        for idx, sample in enumerate(samples):
            hf_inputs = {k: batch_encoding[k][idx] for k in d_keys}
            updated_samples.append((hf_inputs, *sample[1:]))

        if not return_batch_encoding:
            return updated_samples

        return updated_samples, batch_encoding


### `HF_AfterBatchTransform`

With fastai 2.1.5, before batch transforms no longer have a `decodes` method ... and so, I've introduced a standard batch transform here (one that occurs "after" the batch has been created) that will do the decoding for us.

In [None]:
# export
class HF_AfterBatchTransform(Transform):
    """An after batch transform whose `decodes` casts your inputs into a type the fastai `show` methods understand"""

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
    ):
        # both arguments are kept as attributes of the same name
        store_attr(self=self, names="hf_tokenizer, input_return_type")

    def decodes(
        self,
        # The encoded samples for your batch. If this is a dictionary of Hugging Face inputs, its `input_ids`
        # are cast to `self.input_return_type`; anything else is returned as is
        encoded_samples: Type,
    ):
        """Returns the object the show related fastai methods should work with"""
        if not isinstance(encoded_samples, dict):
            return encoded_samples

        input_ids = encoded_samples["input_ids"]
        return self.input_return_type(input_ids, hf_tokenizer=self.hf_tokenizer)


### `HF_TextBlock`

A basic wrapper that links default transforms for the Data Block API, `HF_TextBlock` is designed with sensible defaults to minimize user effort in defining their transforms pipeline. It handles setting up your `HF_BeforeBatchTransform` and `HF_AfterBatchTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever).

You must either pass in your own instance of a `HF_BeforeBatchTransform` class or the Hugging Face objects returned from `BLURR.get_hf_objects` (e.g., architecture, config, tokenizer, and model). The other args are optional.

We also include a `blurr_sort_func` that works with `SortedDL` to sort based on the number of tokens in each example.

In [None]:
# export
def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # If you are passing in the "input_ids" as your inputs, set `is_pretokenized` = True
    is_pretokenized: bool = False,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """Returns the sort key used by `SortedDL`: the length of the example's input *after* tokenization"""
    inp = example[0]

    # a list of words or of token Ids is measured as is; raw text has to be tokenized first
    already_tokenized = is_split_into_words or is_pretokenized
    tokens = inp if already_tokenized else hf_tokenizer.tokenize(inp, **tok_kwargs)

    return len(tokens)


In [None]:
# export
class HF_TextBlock(TransformBlock):
    """The core `TransformBlock` to prepare your data for training in Blurr with fastai's `DataBlock` API"""

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_arch: str = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_model: PreTrainedModel = None,
        # If you are passing in the "input_ids" as your inputs, set `is_pretokenized` = True
        # (if you pass in your own `before_batch_tfm`, its `is_pretokenized` setting is the one used)
        is_pretokenized: bool = False,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # The before batch transform you want to use to tokenize your raw data on the fly
        # (defaults to an instance of `HF_BeforeBatchTransform` created using the Hugging Face objects defined above)
        before_batch_tfm: HF_BeforeBatchTransform = None,
        # The batch_tfms to apply to the creation of your DataLoaders,
        # (defaults to HF_AfterBatchTransform created using the Hugging Face objects defined above)
        after_batch_tfm: HF_AfterBatchTransform = None,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type: DataLoader = None,
        # Any keyword arguments you want applied to your before batch tfm
        before_batch_kwargs: dict = {},
        # Any keyword arguments you want applied to your after batch tfm (or referred to in fastai as `batch_tfms`)
        after_batch_kwargs: dict = {},
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs: dict = {},
        # Any keyword arguments you want to have applied with generating text (not used by this block)
        text_gen_kwargs: dict = {},
        # Any keyword arguments you want applied to `HF_TextBlock` (not used by this block)
        **kwargs
    ):
        if (not all([hf_arch, hf_config, hf_tokenizer, hf_model])) and before_batch_tfm is None:
            raise ValueError("You must supply an hf_arch, hf_config, hf_tokenizer, hf_model -or- a HF_BeforeBatchTransform")

        # the dictionaries passed in are copied so the transforms never share them with the caller
        if before_batch_tfm is None:
            before_batch_tfm = HF_BeforeBatchTransform(
                hf_arch,
                hf_config,
                hf_tokenizer,
                hf_model,
                is_pretokenized=is_pretokenized,
                ignore_token_id=ignore_token_id,
                max_length=max_length,
                padding=padding,
                truncation=truncation,
                is_split_into_words=is_split_into_words,
                tok_kwargs=tok_kwargs.copy(),
                **before_batch_kwargs.copy()
            )

        if after_batch_tfm is None:
            after_batch_tfm = HF_AfterBatchTransform(
                hf_tokenizer=before_batch_tfm.hf_tokenizer, input_return_type=input_return_type, **after_batch_kwargs.copy()
            )

        if dl_type is None:
            # the sort function has to measure the inputs the same way the before batch transform will
            # tokenize them, so every setting is read from the transform itself (this matters when a custom
            # `before_batch_tfm` is passed in); the `is_pretokenized` argument is only a fallback for
            # transforms that don't define that attribute
            dl_sort_func = partial(
                blurr_sort_func,
                hf_tokenizer=before_batch_tfm.hf_tokenizer,
                is_pretokenized=getattr(before_batch_tfm, "is_pretokenized", is_pretokenized),
                is_split_into_words=before_batch_tfm.is_split_into_words,
                tok_kwargs=before_batch_tfm.tok_kwargs.copy(),
            )

            dl_type = partial(SortedDL, sort_func=dl_sort_func)

        # set the TransformBlock's Hugging Face objects
        self.hf_arch = before_batch_tfm.hf_arch
        self.hf_config = before_batch_tfm.hf_config
        self.hf_tokenizer = before_batch_tfm.hf_tokenizer
        self.hf_model = before_batch_tfm.hf_model

        super().__init__(dl_type=dl_type, dls_kwargs={"before_batch": before_batch_tfm}, batch_tfms=after_batch_tfm)


## Low-level API

For working with PyTorch and/or fast.ai Datasets & DataLoaders, the low-level API allows you to get back fast.ai specific features such as `show_batch`, `show_results`, etc... when using plain ol' PyTorch Datasets, Hugging Face Datasets, etc...

In [None]:
# export
@dataclass
class BlurrBatchCreator:
    """
    A callable that can be assigned to a `TfmdDL.create_batch` method; used in Blurr's low-level API
    to collate a list of examples into a batch that can be used in the Blurr library
    """

    def __init__(
        self,
        # Your Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # Defaults to use Hugging Face's DataCollatorWithPadding(tokenizer=hf_tokenizer)
        data_collator: Type = None,
    ):
        self.hf_tokenizer = hf_tokenizer

        if not data_collator:
            data_collator = DataCollatorWithPadding(tokenizer=hf_tokenizer)
        self.data_collator = data_collator

    def __call__(self, features):  # A mini-batch (list of examples to run through your model)
        """Collates `features` with `self.data_collator`. If the examples are dictionaries, a tuple of
        (inputs, targets) is returned where the targets are the collated `labels` when the examples
        define them, else a second dictionary of the inputs; any other batch is returned as collated
        """
        batch = self.data_collator(features)
        if not isinstance(features[0], dict):
            return batch

        inputs = dict(batch)
        if "labels" in features[0]:
            targets = batch["labels"]
        else:
            # without `labels`, the second element is another dictionary built from the same batch
            targets = dict(batch)

        return inputs, targets


In [None]:
# export
class BlurrBatchTransform(HF_AfterBatchTransform):
    """A class used to cast your inputs into something understandable in fastai `show` methods.

    Besides what `HF_AfterBatchTransform` sets up from `hf_tokenizer` and `input_return_type`, the
    instance keeps the Hugging Face objects and tokenization settings as attributes; any extra keyword
    arguments are kept as the `kwargs` dict attribute.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_arch: str = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_model: PreTrainedModel = None,
        # If you are passing in the "input_ids" as your inputs, set `is_pretokenized` = True
        is_pretokenized: bool = False,
        # The token ID to ignore when calculating loss/metrics
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Any text generation keyword arguments
        text_gen_kwargs: dict = {},
        # The return type your decoded inputs should be cast too (used by methods such as `show_batch`)
        input_return_type: Type = HF_BaseInput,
        # Any other keyword arguments; they are stored on the instance as `self.kwargs`
        **kwargs
    ):
        super().__init__(hf_tokenizer=hf_tokenizer, input_return_type=input_return_type)

        # Store private copies: without this, every instance built with the `{}` defaults would share (and
        # could mutate) the same dict objects, and caller-owned dicts would be aliased by the instance.
        tok_kwargs = {} if tok_kwargs is None else dict(tok_kwargs)
        text_gen_kwargs = {} if text_gen_kwargs is None else dict(text_gen_kwargs)

        # `store_attr` reads the current values of these local names, so the copies above are what get stored
        store_attr(self=self, names="hf_arch, hf_config, hf_model, hf_tokenizer, is_pretokenized, ignore_token_id")
        store_attr(self=self, names="is_split_into_words, tok_kwargs, text_gen_kwargs, kwargs")


In [None]:
# export
@delegates()
class BlurrDataLoader(TfmdDL):
    """A class that makes creating a fast.ai `DataLoader` that works with Blurr.

    The loader always passes a batch creator as `create_batch` and a batch transform as `after_batch`
    to `TfmdDL`, and keeps the four Hugging Face objects as attributes so `new` can rebuild a copy.
    """

    def __init__(
        self,
        # A standard PyTorch Dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_arch: str,
        # A Hugging Face configuration object (not required if passing in an instance of `HF_BeforeBatchTransform`
        # to `before_batch_tfm`)
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer (not required if passing in an instance of `HF_BeforeBatchTransform` to
        # `before_batch_tfm`)
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model (not required if passing in an instance of `HF_BeforeBatchTransform` to
        # `before_batch_tfm`)
        hf_model: PreTrainedModel,
        # An instance of `BlurrBatchCreator` or equivalent
        batch_creator: BlurrBatchCreator = None,
        # The batch_tfm used to decode Blurr batches (default: a `BlurrBatchTransform` built from the
        # Hugging Face objects and `batch_tfm_kwargs`)
        batch_tfm: BlurrBatchTransform = None,
        # (optional) A preprocessing function that will be applied to your dataset
        preproccesing_func: Callable[
            [Union[torch.utils.data.dataset.Dataset, Datasets], PreTrainedTokenizerBase, PreTrainedModel],
            Union[torch.utils.data.dataset.Dataset, Datasets],
        ] = None,
        # Keyword arguments to be applied to your `batch_tfm`
        batch_tfm_kwargs: dict = {},
        # Keyword arguments to be applied to `BlurrDataLoader`
        **kwargs,
    ):
        if preproccesing_func:
            dataset = preproccesing_func(dataset, hf_tokenizer, hf_model)

        # `create_batch`/`after_batch` are always supplied explicitly to `TfmdDL` below, so any copies that
        # arrive through kwargs (as happens via `new`) are dropped to avoid passing the keyword twice
        kwargs.pop("create_batch", None)
        if not batch_creator:
            batch_creator = BlurrBatchCreator(hf_tokenizer=hf_tokenizer)

        kwargs.pop("after_batch", None)
        if not batch_tfm:
            # copy so the shared default dict (or the caller's dict) is never handed out directly
            batch_tfm = BlurrBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model, **batch_tfm_kwargs.copy())

        super().__init__(dataset=dataset, create_batch=batch_creator, after_batch=batch_tfm, **kwargs)
        store_attr(self=self, names="hf_arch, hf_config, hf_tokenizer, hf_model")

    def new(
        self,
        # A standard PyTorch and fastai dataset
        dataset: Union[torch.utils.data.dataset.Dataset, Datasets] = None,
        # The class you want to create an instance of (will be "self" if None)
        cls: Type = None,
        #  Any additional keyword arguments you want to pass to the __init__ method of `cls`
        **kwargs,
    ):
        """We have to override the new method in order to add back the Hugging Face objects in this factory
        method (called for example in places like `show_results`). With the exception of the additions to the kwargs
        dictionary, the code below mirrors fastai's `DataLoader.new`.

        The current batch creator and batch transform(s) are forwarded as `batch_creator` / `batch_tfm`
        (unless the caller supplies their own), so the copy keeps any custom data collator and the
        `batch_tfm_kwargs` (e.g., `labels`) the original loader was built with.
        """
        if dataset is None:
            dataset = self.dataset
        if cls is None:
            cls = type(self)

        cur_kwargs = dict(
            dataset=dataset,
            num_workers=self.fake_l.num_workers,
            pin_memory=self.pin_memory,
            timeout=self.timeout,
            bs=self.bs,
            shuffle=self.shuffle,
            drop_last=self.drop_last,
            indexed=self.indexed,
            device=self.device,
        )

        # carry over every callback-style attribute that was supplied as a plain callable (not a bound method)
        for n in self._methods:
            o = getattr(self, n)
            if not isinstance(o, MethodType):
                cur_kwargs[n] = o

        # we need to add these arguments back in since they are required by `__init__`
        kwargs["hf_arch"] = self.hf_arch
        kwargs["hf_config"] = self.hf_config
        kwargs["hf_tokenizer"] = self.hf_tokenizer
        kwargs["hf_model"] = self.hf_model

        # `__init__` discards the `create_batch`/`after_batch` entries placed in `cur_kwargs` above, so hand
        # the existing objects over through the named parameters; otherwise the copy would silently fall
        # back to a default `BlurrBatchCreator` and a `BlurrBatchTransform` without any `batch_tfm_kwargs`
        kwargs.setdefault("batch_creator", self.create_batch)
        kwargs.setdefault("batch_tfm", self.after_batch)

        return cls(**merge(cur_kwargs, kwargs))


## Utility classes and methods 

These methods are used internally to get the blurr transforms associated with your `DataLoaders`.

In [None]:
# export
def get_blurr_tfm(
    # A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)
    tfms_list: Pipeline,
    # The transform to find
    tfm_class: Transform = HF_BeforeBatchTransform,
):
    """
    Walk `tfms_list` in order and return the first transform whose type is `tfm_class` (or a subclass
    of it); returns `None` when no transform matches
    """
    for candidate in tfms_list:
        if issubclass(type(candidate), tfm_class):
            return candidate

    return None


In [None]:
show_doc(get_blurr_tfm)


<h4 id="get_blurr_tfm" class="doc_header"><code>get_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_blurr_tfm</code>(**`tfms_list`**:`Pipeline`, **`tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*)

Given a fastai DataLoaders batch transforms, this method can be used to get at a transform
instance used in your Blurr DataBlock

**Parameters:**


 - **`tfms_list`** : *`<class 'fastcore.transform.Pipeline'>`*	<p>A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)</p>


 - **`tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The transform to find</p>



In [None]:
# export
def first_blurr_tfm(
    dls: DataLoaders,  # Your fast.ai `DataLoaders`
    before_batch_tfm_class: Transform = HF_BeforeBatchTransform,  # The before_batch transform to look for
    blurr_batch_tfm_class: Transform = BlurrBatchTransform,  # The after_batch (or batch_tfm) to look for
):
    """
    Locate the Blurr transform that methods such as `show_batch` and `show_results` rely on to decode
    and 'show' your Hugging Face inputs/targets. `dls.before_batch` is searched first; `dls.after_batch`
    is only searched when that yields nothing (in which case the result may be `None`)
    """
    # before_batch tfms are where the mid-level DataBlock API puts its transform; after_batch tfms are
    # where the low-level Blurr data API puts its transform
    before_batch_match = get_blurr_tfm(dls.before_batch, tfm_class=before_batch_tfm_class)
    return before_batch_match or get_blurr_tfm(dls.after_batch, tfm_class=blurr_batch_tfm_class)


In [None]:
show_doc(first_blurr_tfm)


<h4 id="first_blurr_tfm" class="doc_header"><code>first_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>first_blurr_tfm</code>(**`dls`**:`DataLoaders`, **`before_batch_tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*, **`blurr_batch_tfm_class`**:`Transform`=*`BlurrBatchTransform`*)

This convenience method will find the first Blurr transform required for methods such as
`show_batch` and `show_results`. The returned transform should have everything you need to properly
decode and 'show' your Hugging Face inputs/targets

**Parameters:**


 - **`dls`** : *`<class 'fastai.data.core.DataLoaders'>`*	<p>Your fast.ai `DataLoaders</p>


 - **`before_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The before_batch transform to look for</p>


 - **`blurr_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The after_batch (or batch_tfm) to look for</p>



## `show_batch`

In [None]:
# export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `HF_BaseInput` typed inputs
    x: HF_BaseInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation your want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    """Decode up to `max_n` inputs with the Hugging Face tokenizer and display them, alongside their
    target(s), as a DataFrame. Returns `ctxs` unchanged.
    """
    # grab our tokenizer
    tfm = first_blurr_tfm(dataloaders)
    hf_tokenizer = tfm.hf_tokenizer

    # if we've included our labels list, we'll use it to look up the value of our target(s)
    trg_labels = tfm.kwargs["labels"] if ("labels" in tfm.kwargs) else None

    res = L()
    n_inp = dataloaders.n_inp

    for idx, (input_ids, label, sample) in enumerate(zip(x, y, samples)):
        if idx >= max_n:
            break

        rets = [hf_tokenizer.decode(input_ids, skip_special_tokens=True)[:trunc_at]]
        for item in sample[n_inp:]:
            if not torch.is_tensor(item):
                trg = trg_labels[int(item)] if trg_labels else item
            elif is_listy(item.tolist()):
                # multi-label target: `label` is treated as a multi-hot vector
                if trg_labels:
                    trg = [trg_labels[lbl_idx] for lbl_idx, val in enumerate(label.numpy().tolist()) if (val == 1)]
                else:
                    # `.item()` only works for single-element tensors, so fall back to the raw list of
                    # values when there is more than one element and no label names to map to
                    trg = label.item() if label.numel() == 1 else label.tolist()
            else:
                trg = trg_labels[label.item()] if (trg_labels) else label.item()

            rets.append(trg)
        res.append(tuplify(rets))

    # nothing was collected (e.g., `max_n=0` or an empty batch), so there is nothing to display
    if len(res) == 0:
        return ctxs

    cols = ["text"] + ["target" if (i == 0) else f"target_{i}" for i in range(len(res[0]) - n_inp)]
    display_df(pd.DataFrame(res, columns=cols)[:max_n])
    return ctxs


## Sequence classification

The following examples demonstrate several approaches to constructing your `DataBlock` for sequence classification tasks using the mid-level API, as well as an example of how to accomplish the same using the low-level API and standard PyTorch/Hugging Face/fast.ai Datasets and DataLoaders.

### Using the mid-level API

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects.

There are a bunch of ways we can get at the four Hugging Face elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR`.

In [None]:
# hide_output
from transformers import AutoModelForSequenceClassification

model_cls = AutoModelForSequenceClassification

pretrained_model_name = "distilroberta-base"  # "distilbert-base-uncased" "bert-base-uncased"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


#####  Step 2: Create your `DataBlock`

In [None]:
blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, before_batch_kwargs={"labels": labels}), CategoryBlock)
dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)


In [None]:
b = dls.one_batch()
len(b), len(b[0]["input_ids"]), b[0]["input_ids"].shape, len(b[1])


(2, 4, torch.Size([4, 512]), 4)

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)


{tuple: [dict, fastai.torch_core.TensorCategory]}

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"""A Damsel in Distress"" is definitely not one of Fred Astaire's better musicals. But even Astaire's bad films always had some good moments.<br /><br />In ""Damsel,"" Astaire is Jerry Halliday, an American musical star who is in London on a personal appearance tour. He meets Lady Alice Marshmorton (19-year-old Joan Fontaine), a beautiful English heiress, who hops into the back of a cab he is taking to escape a mob of admirers.<br /><br />Jerry believes that Alice is being forced into a marriage by",neg
1,"One of the most important artistic movements in the history of cinema was without a doubt German expressionism, the highly atmospheric style of film-making developed during the 20s in Berlin. Classic movies like ""Das Cabinet Des Dr. Caligari."" (1920) and ""Nosferatu, Eine Symphonie Des Grauens"" (1922) were the most famous direct results of this movement, and while the movement didn't have a long life, its enormous influence over cinema can still be felt today, specially in the horror genre. One",pos


#### Pre-tokenized/numericalized

BLURR now also works with pre-processed datasets where your inputs are actually "input_ids".  Preprocessing your raw data is the more traditional approach to using Transformers, and is required when you are working with documents that may be longer than your model can handle.  In the latter case, in addition to task specific preprocessing, you typically want to tell your tokenizer to create "chunks" of text from such documents by setting `"return_overflowing_tokens": True`.

Below is an example of how we can use pre-tokenized/numericalized inputs

##### Step 1: Get your Hugging Face objects.

In [None]:
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)


##### Step 1b. Preprocess dataset

In [None]:
preprocessor = ClassificationPreprocessor(hf_tokenizer, label_mapping=labels)
proc_ds = preprocessor.process_hf_dataset(final_ds)
proc_ds


  0%|          | 0/1200 [00:00<?, ?ex/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'is_valid', 'label', 'label_name', 'text'],
    num_rows: 1200
})

##### Step 2: Create your `DataBlock`

In [None]:
blocks = (
    HF_TextBlock(
        hf_arch,
        hf_config,
        hf_tokenizer,
        hf_model,
        is_pretokenized=True,
        before_batch_kwargs={"labels": labels},
        tok_kwargs={"add_special_tokens": False},
    ),
    CategoryBlock,
)
dblock = DataBlock(blocks=blocks, get_x=ItemGetter("input_ids"), get_y=ItemGetter("label"), splitter=RandomSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(proc_ds, bs=4)


In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


Unnamed: 0,text,target
0,"It says that a girl named Susan Montford both wrote and directed this ""movie."" No wonder she has no other credits to her name for writing or directing. She made a severe vocational error in choosing this as her career. This is one of the worst human creations of this millennium.<br /><br />The fundamental thing wrong with this movie other than its ridiculous story of a woman running away from four weak thugs, is the blatant and complete lack of LOGIC.<br /><br />**After she leaves the mall, she",neg
1,"David Mamet's film debut has been hailed by many as a real thinking-man's movie, a movie that makes you question everybody and everything. I saw it for the first time recently and couldn't understand what was supposed to be so great about it.<br /><br />The movie is about a female psychologist named Margaret who is also a best-selling author. Margaret has become disillusioned by her profession and her inability to really help anyone. She tries to rectify this by helping settle her patient's gam",neg


### Using the low-level API

#### Step 1: Build your datasets

In [None]:
raw_datasets = load_dataset("glue", "mrpc")


Downloading:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
raw_datasets["train"].features
raw_datasets["train"]


Dataset({
    features: ['idx', 'label', 'sentence1', 'sentence2'],
    num_rows: 3668
})

In [None]:
# Use MRPC's own class names for the label mapping rather than the `labels` list used for the IMDB
# examples above, so the mapped label names describe this dataset's classes
label_names = raw_datasets["train"].features["label"].names

preprocessor = ClassificationPreprocessor(hf_tokenizer, text_attrs=["sentence1", "sentence2"], label_mapping=label_names)
proc_dataests = preprocessor.process_hf_dataset(raw_datasets)
proc_dataests

  0%|          | 0/3668 [00:00<?, ?ex/s]

  0%|          | 0/408 [00:00<?, ?ex/s]

  0%|          | 0/1725 [00:00<?, ?ex/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'label_name', 'sentence1', 'sentence2'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'label_name', 'sentence1', 'sentence2'],
        num_rows: 408
    })
    test: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'label_name', 'sentence1', 'sentence2'],
        num_rows: 1725
    })
})

#### Step 2: Dataset pre-processing (optional)

In [None]:
# export
def preproc_hf_dataset(
    # A standard PyTorch Dataset or fast.ai Datasets
    dataset: Union[torch.utils.data.dataset.Dataset, Datasets],
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A Hugging Face model
    hf_model: PreTrainedModel,
):
    """Prepare a Hugging Face Dataset for use in Blurr and other training libraries.

    A "label" column is renamed to "labels", every column whose name is not a parameter of
    `hf_model.forward` is removed, and the dataset's format is set to "torch". `hf_tokenizer` is not
    used here; it is accepted so the function matches the `preproccesing_func` call made by
    `BlurrDataLoader`.
    """
    has_label_col = "label" in dataset.column_names
    if has_label_col:
        dataset = dataset.rename_column("label", "labels")

    # only columns the model's forward method can accept by name are kept
    fwd_arg_names = set(inspect.signature(hf_model.forward).parameters)
    unused_cols = {col for col in dataset.column_names if col not in fwd_arg_names}
    dataset = dataset.remove_columns(unused_cols)

    dataset.set_format("torch")
    return dataset


#### Step 3: Build your `DataLoaders`.

Use `BlurrDataLoader` to build Blurr friendly dataloaders from your datasets. Passing `{'labels': label_names}` to your `batch_tfm_kwargs` will ensure that your label/target names are displayed in methods like `show_batch` and `show_results` (just as it works with the mid-level API).

In [None]:
label_names = raw_datasets["train"].features["label"].names

trn_dl = BlurrDataLoader(
    proc_dataests["train"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_tfm_kwargs={"labels": label_names},
    shuffle=True,
    batch_size=8,
)

val_dl = BlurrDataLoader(
    proc_dataests["validation"],
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_tfm_kwargs={"labels": label_names},
    batch_size=16,
)

dls = DataLoaders(trn_dl, val_dl)


In [None]:
b = dls.one_batch()
b[0]["input_ids"].shape


torch.Size([8, 76])

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=800)


Unnamed: 0,text,target
0,"Duque will return to Earth Oct. 27 with the station's current crew, U.S. astronaut Ed Lu and Russian cosmonaut Yuri Malenchenko. Currently living onboard the space station are American astronaut Ed Lu and Russian cosmonaut Yuri Malenchenko.",not_equivalent
1,"Also Tuesday, the United States also released more Iraqi prisoners of war, and officials announced that all would soon be let go. Meanwhile in southern Iraq, the United States released more Iraqi prisoners of war, and officials announced that all would be let go soon.",equivalent


## Tests

The tests below ensure the core DataBlock code above works for **all** pretrained sequence classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# hide
[model_type for model_type in BLURR.get_models(task="SequenceClassification") if (not model_type.startswith("TF"))]


['AlbertForSequenceClassification',
 'BartForSequenceClassification',
 'BertForSequenceClassification',
 'BigBirdForSequenceClassification',
 'BigBirdPegasusForSequenceClassification',
 'CTRLForSequenceClassification',
 'CamembertForSequenceClassification',
 'CanineForSequenceClassification',
 'ConvBertForSequenceClassification',
 'DebertaForSequenceClassification',
 'DebertaV2ForSequenceClassification',
 'DistilBertForSequenceClassification',
 'ElectraForSequenceClassification',
 'FNetForSequenceClassification',
 'FlaubertForSequenceClassification',
 'FunnelForSequenceClassification',
 'GPT2ForSequenceClassification',
 'GPTJForSequenceClassification',
 'GPTNeoForSequenceClassification',
 'HubertForSequenceClassification',
 'IBertForSequenceClassification',
 'LEDForSequenceClassification',
 'LayoutLMForSequenceClassification',
 'LayoutLMv2ForSequenceClassification',
 'LongformerForSequenceClassification',
 'MBartForSequenceClassification',
 'MPNetForSequenceClassification',
 'MegatronB

In [None]:
# hide
pretrained_model_names = [
    "hf-internal-testing/tiny-albert",
    "hf-internal-testing/tiny-random-bart",
    "hf-internal-testing/tiny-bert",
    "google/bigbird-roberta-base",
    "google/bigbird-pegasus-large-arxiv",
    "hf-internal-testing/tiny-random-ctrl",
    "camembert-base",
    "hf-internal-testing/tiny-random-canine",
    "YituTech/conv-bert-base",
    "hf-internal-testing/tiny-deberta",
    "hf-internal-testing/tiny-random-deberta-v2",
    "hf-internal-testing/tiny-random-distilbert",
    "hf-internal-testing/tiny-electra",
    "google/fnet-base",
    "hf-internal-testing/tiny-random-flaubert",
    "hf-internal-testing/tiny-random-funnel",
    "hf-internal-testing/tiny-random-gpt2",
    "anton-l/gpt-j-tiny-random",
    "hf-internal-testing/tiny-random-gpt_neo",
    "kssteven/ibert-roberta-base",
    "hf-internal-testing/tiny-random-led",
    "hf-internal-testing/tiny-random-longformer",
    "hf-internal-testing/tiny-random-mbart",
    "hf-internal-testing/tiny-random-mpnet",
    # "nvidia/megatron-bert-cased-345m",                 could not test
    "hf-internal-testing/tiny-random-mobilebert",
    "openai-gpt",
    "google/reformer-crime-and-punishment",
    "google/rembert",
    "junnyu/roformer_chinese_sim_char_ft_small",
    "roberta-base",
    "squeezebert/squeezebert-uncased",
    "hf-internal-testing/tiny-random-transfo-xl",
    "xlm-mlm-en-2048",
    "xlm-roberta-base",
    "xlnet-base-cased",
]


In [None]:
# hide
# for model_name in pretrained_model_names:
#     tok = AutoTokenizer.from_pretrained(model_name)
#     print(f'=== {model_name} ===')
#     print(f'=== {tok.padding_side} ===')
#     print(f'=== {tok.pad_token_id} ===')
#     print(tok(['hi', 'hello everyone. its good to be here'], ['yo', 'yo'], padding='max_length', max_length=128))


In [None]:
# hide
# Load the IMDB train/test splits as a two-element list and tag every row with an `is_valid` flag
# (False for the train split, True for the test split). Note this rebinds `raw_datasets`.
raw_datasets = load_dataset("imdb", split=["train", "test"])
raw_datasets[0] = raw_datasets[0].add_column("is_valid", [False] * len(raw_datasets[0]))
raw_datasets[1] = raw_datasets[1].add_column("is_valid", [True] * len(raw_datasets[1]))

# Keep a small sample (1000 train + 200 test rows) and expose it as both a Dataset and a DataFrame.
# NOTE(review): no seed is passed to `shuffle()`, so the sampled rows presumably differ between runs.
final_ds = concatenate_datasets([raw_datasets[0].shuffle().select(range(1000)), raw_datasets[1].shuffle().select(range(200))])
imdb_df = pd.DataFrame(final_ds)


Reusing dataset imdb (/home/wgilliam/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# hide
from transformers import RobertaTokenizer

model_cls = AutoModelForSequenceClassification
bsz = 2
seq_sz = 128

# Smoke-test `HF_TextBlock` + `DataBlock` against every checkpoint in `pretrained_model_names`.
# Exactly one tuple is recorded per checkpoint:
#   (architecture, tokenizer class name, checkpoint name, "PASSED" | "FAILED", error or "")
test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    # the I-BERT checkpoint is loaded with an explicit `RobertaTokenizer`; all others pass `tokenizer_cls=None`
    tok_class = RobertaTokenizer if ("/ibert" in model_name) else None

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls, tokenizer_cls=tok_class)

    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        # the vocabulary grew by one token, so the embedding matrix has to be resized to match
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    try:
        blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz), CategoryBlock)

        dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader("label"), splitter=ColSplitter())
        dls = dblock.dataloaders(imdb_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        # a batch is an (inputs, targets) pair; inputs are padded to exactly `seq_sz` tokens
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)

        dls.show_batch(dataloaders=dls, max_n=2, trunc_at=1000)

    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "FAILED", err))

    else:
        # record success only once every check *and* `show_batch` has completed, so a failure in
        # `show_batch` cannot leave both a PASSED and a FAILED row for the same checkpoint
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", ""))


=== hf-internal-testing/tiny-albert ===

architecture:	albert
tokenizer:	AlbertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"director douglas sirk once said there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art'. this statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of course, his famous melodramas.br /br /sirk's melodramas were, as the very word signifies, dramas",1
1,"""a wrong-doer is often a man that has left something undone, not always he that has done something.""--emperor marcus aurelius br /br /the dvd release of ""watch on the rhine"" could not come at a better moment. it restores to us a major lillian hellman play stirringly adapted to the screen by dashiell hammett (hellman scholar bernard f. dick's audio commentary a",1


=== hf-internal-testing/tiny-random-bart ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible sc",1
1,"Film can be a looking glass to see the world in a new light. Good Night and Good Luck, for instance, offered parallels to modern judgement-without-evidence and encroachments on freedom. It is easier to examine a moral problem when it is not too close to home: by putting it in a fictional or historic context rem",1


=== hf-internal-testing/tiny-bert ===

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as",1
1,"riff randell is a wildly, obsessed fan of the rock group ; the ramones and so are most of the students in the school. but a new tyrant of a principal, ms. togar thinks rock'n'roll is a bad influence on the students, especially the music from the ramones. so, when riff finds out they're performing in town, she skips class for a couple of days to get tickets for herself and her friends. but when ms. togar discovers why she really took those days off she confiscates the tickets. while, this is happening tom roberts is totally love struck",1


=== google/bigbird-roberta-base ===



normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


architecture:	big_bird
tokenizer:	BigBirdTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a love story, as beautiful as any other",1
1,"Who knew? Dowdy Queen Victoria, the plump Monarch who was a virtual recluse for 40 years after the death of her husband, Prince Albert, actually led a life fraught with drama and intrigue in her younger days. 'The Young Victoria' not only chronicles the young Queen's romance with her husband-to-be but also does a pretty good job of detailing the political machinations surrounding her ascent to the throne.<br /><br />The Act I'set-up' draws you in right away. Following the death of Victoria's father, the Duke of Kent in 1820, less than a year after",1


=== google/bigbird-pegasus-large-arxiv ===

architecture:	bigbird_pegasus
tokenizer:	PegasusTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a love story, as beautiful as any other ever told",1
1,"Director Douglas Sirk once said there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art'. This statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of course, his famous melodramas.br />br />Sirk's melodramas were, as the very word signifies, dramas with music. The music sets the tone for his masterful style, and every stroke of his brush (Sirk was also a painter) leaves a powerful image on the screen-",1


=== hf-internal-testing/tiny-random-ctrl ===



  angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
Using pad_token, but it is not set yet.


architecture:	ctrl
tokenizer:	CTRLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Director Douglas Sirk once said `there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art'. This statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of course, his famous melodramas.<br /><br />Sirk's melodramas were, as the very word signifies, dramas with music. The music sets the tone for his masterful style, and every stroke of his brush (Sirk was also a painter) leaves a powerful image on the screen-turned-canvas. But this ain't life but its representation, an imitation of life. Sirk",1
1,"Who knew? Dowdy Queen Victoria, the plump Monarch who was a virtual recluse for 40 years after the death of her husband, Prince Albert, actually led a life fraught with drama and intrigue in her younger days. 'The Young Victoria' not only chronicles the young Queen's romance with her husband-to-be but also does a pretty good job of detailing the political machinations surrounding her ascent to the throne.<br /><br />The Act I'set-up' draws you in right away. Following the death of Victoria's father, the Duke of Kent in 1820, less than a year after Victoria's birth, the Duchess of Kent eventually hooked up with former Army Officer John Conroy, who offered",1


=== camembert-base ===

architecture:	camembert
tokenizer:	CamembertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Director Douglas Sirk once said there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art'. This statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of course, his famous melodramas.<br /><br />Sirk's melodramas were, as the very word signifies",1
1,"THE HOUSE THAT DRIPPED BLOOD is the third in a series of seven Amicus horror anthologies. If THE MONSTER CLUB is included as part of the series, this would make eight movies. Although, that movie is very different from the others.<br /><br />I look upon the Amicus anthologies with great memories as I used to love them when I was in my teens. My feelings for them today are just as",1


=== hf-internal-testing/tiny-random-canine ===



Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


architecture:	canine
tokenizer:	CanineTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Director Douglas Sirk once said `there's a very short distance between high art and trash, and trash that contains craziness i",1
1,To be a Buster Keaton fan is to have your heart broken on a regular basis. Most of us first encounter Keaton in one of the bri,0


=== YituTech/conv-bert-base ===

architecture:	convbert
tokenizer:	ConvBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as",1
1,"who knew? dowdy queen victoria, the plump monarch who was a virtual recluse for 40 years after the death of her husband, prince albert, actually led a life fraught with drama and intrigue in her younger days.'the young victoria'not only chronicles the young queen's romance with her husband - to - be but also does a pretty good job of detailing the political machinations surrounding her ascent to the throne. < br / > < br / > the act i'set - up'draws you in right away. following the death of victoria's father, the duke of kent in 1820,",1


=== hf-internal-testing/tiny-deberta ===

architecture:	deberta
tokenizer:	DebertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Director Douglas Sirk once said there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art'. This statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of cour",1
1,"To be a Buster Keaton fan is to have your heart broken on a regular basis. Most of us first encounter Keaton in one of the brilliant feature films from his great period of independent production: 'The General', 'The Navigator', 'Sherlock Jnr'. We recognise him as the greatest figure in the entire history of f",0


=== hf-internal-testing/tiny-random-deberta-v2 ===

architecture:	deberta_v2
tokenizer:	DebertaV2Tokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a love story, as beautiful as any other ever told",1
1,"Director Douglas Sirk once said `there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art'. This statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of course, his famous melodramas.<br /><br />Sirk's melodramas were, as the very word signifies, dramas with music. The music sets the tone for his masterful style, and every stroke of his brush (Sirk was also a painter) leaves a powerful image on the screen-turned",1


=== hf-internal-testing/tiny-random-distilbert ===

architecture:	distilbert
tokenizer:	DistilBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"director douglas sirk once said ` there's a very short distance between high art and trash, and trash that contains craziness is by this very quality ne",1
1,"the movie "" holly "" is the story of a young girl who has been sold by her poor family and smuggled across the border to cambodia to work as a prostitute in the",1


=== hf-internal-testing/tiny-electra ===

architecture:	electra
tokenizer:	ElectraTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"director douglas sirk once said ` there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art '. this statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of course, his famous melodramas. < br / > < br / > sirk's melodramas were, as the",1
1,""" a wrong - doer is often a man that has left something undone, not always he that has done something. "" - - emperor marcus aurelius < br / > < br / > the dvd release of "" watch on the rhine "" could not come at a better moment. it restores to us a major lillian hellman play stirringly adapted to the screen by dashiell hammett ( hellman scholar bernard f. dick's audio comment",1


=== google/fnet-base ===

architecture:	fnet
tokenizer:	FNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to",1
1,"Terry Gilliam's and David Peoples' teamed up to create one of the most intelligent and creative science fiction movies of the '90's. People's proved a screenplay with bizarre twists and fantastic ideas about the nature of time  I especially love the idea one can't change the past; it's a nice counterpoint to so many time-travelling movies which say otherwise  biological holocausts and the thin line between sanity and madness. Gilliam visualized his ideas with unique quirkiness, perfection and originality.<br",1


=== hf-internal-testing/tiny-random-flaubert ===

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Director Douglas Sirk once said'there' s a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art '. This statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of course, his famous melodramas. < br / > < br / > Sirk' s melodramas were, as the very word signifies, dramas with music. The music sets the tone",1
1,"This film ( like Astaire' s ROYAL WEDDING - which was shown after it on Turner Classic Network last night ) is famous for a single musical sequence that has gained a place in Gene Kelly' s record : Like Fred Astaire dancing with a clothing rack and later dancing around a room' s walls and ceiling, this film had Gene Kelly dancing in a cartoon sequence with Jerry Mouse. The sequence is nicely done. What is forgotten is that Kelly is telling the story",1


=== hf-internal-testing/tiny-random-funnel ===

architecture:	funnel
tokenizer:	FunnelTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"director douglas sirk once said ` there's a very short distance between high art and trash, and trash that contains craziness is by this very quality ne",1
1,""" a wrong - doer is often a man that has left something undone, not always he that has done something. "" - - emperor marcus aurelius < br / > < br / > the dvd rele",1


=== hf-internal-testing/tiny-random-gpt2 ===



Using pad_token, but it is not set yet.


architecture:	gpt2
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenar",1
1,"The movie ""Holly"" is the story of a young girl who has been sold by her poor family and smuggled across the border to Cambodia to work as a prostitute in the infamous ""K11"" red light village. In the movie, Holly is waiting to be sold at a premium for her virginity when she meets Patrick who is losing money and friends",1


=== anton-l/gpt-j-tiny-random ===



Using pad_token, but it is not set yet.


architecture:	gptj
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a love story, as beautiful as any other ever told,",1
1,"Every now and then a movie advertises itself as scary or frightening, though they usually aren't. Most modern horror movies fit into this category.<br /><br />Then there are those movies that don't simply cause the tension and adrenaline to pump through your veins harder than usual. They actually frighten you to a level that you've never experienced.<br /><br />""Halloween"" is such a film. It takes so many risks that would make most movie producers cringe. But nearly all of them work. ""Halloween"" is awe-inspiring in its simplicity, and terrifying as a whole.<br /><br",1


=== hf-internal-testing/tiny-random-gpt_neo ===



Using pad_token, but it is not set yet.


architecture:	gpt_neo
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenar",1
1,"Director Douglas Sirk once said `there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art'. This statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of course, his fam",1


=== kssteven/ibert-roberta-base ===

architecture:	ibert
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a love story, as beautiful as any other ever",1
1,"This film (like Astaire's ROYAL WEDDING - which was shown after it on Turner Classic Network last night) is famous for a single musical sequence that has gained a place in Gene Kelly's record: Like Fred Astaire dancing with a clothing rack and later dancing around a room's walls and ceiling, this film had Gene Kelly dancing in a cartoon sequence with Jerry Mouse. The sequence is nicely done. What is forgotten is that Kelly is telling the story behind the cartoon sequence to Dean Stockwell and his fellow child students at school during a break in the day, and sets the stage for the sequence by having",1


=== hf-internal-testing/tiny-random-led ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible sc",1
1,"Colleges, High Schools, Fraternities and Sororities have been the most popular stalking grounds for maniacal madmen since the slasher cycle first became a popular cinema culture throughout the late seventies. Even backwoods cabins and campsites have rode shotgun to the amount of massacres that have taken place on campuses since H",0


=== hf-internal-testing/tiny-random-longformer ===

architecture:	longformer
tokenizer:	LongformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible sc",1
1,"The freedom of having your own Sea Going Power Boat, the excitement of going on underwater adventures a rugged,an's man of an adventurer and lovely(and so well endowed!) assistants in fine Bikinis were all definite selling points for ""SEA HUNT""(1958-61).<br /><br />Just what",1


=== hf-internal-testing/tiny-random-mbart ===

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Director Douglas Sirk once said there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art'. This statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war film",1
1,"The movie ""Holly"" is the story of a young girl who has been sold by her poor family and smuggled across the border to Cambodia to work as a prostitute in the infamous ""K11"" red light village. In the movie, Holly is waiting to be sold at a premium for her virginity when she meet",1


=== hf-internal-testing/tiny-random-mpnet ===

architecture:	mpnet
tokenizer:	MPNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"director douglas sirk once said ` there's a very short distance between high art and trash, and trash that contains craziness is by this very quality ne",1
1,"scooby doo is undoubtedly one of the most simple, successful and beloved cartoon characters in the world. so, what happens when you've been everywher",0


=== hf-internal-testing/tiny-random-mobilebert ===

architecture:	mobilebert
tokenizer:	MobileBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"director douglas sirk once said ` there's a very short distance between high art and trash, and trash that contains craziness is by this very quality ne",1
1,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to a",1


=== openai-gpt ===



Using pad_token, but it is not set yet.


architecture:	openai
tokenizer:	OpenAIGPTTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as beautiful as any",1
1,"the freedom of having your own sea going power boat, the excitement of going on underwater adventures a rugged, an's man of an adventurer and lovely ( and so well endowed! ) assistants in fine bikinis were all definite selling points for "" sea hunt "" ( 1958 - 61 ). < br / > < br / > just what was the reason for producing a sort of sea going "" gun for hire "" * series. let's look closely now. there must be a some clues around. < br / > < br / > if we were to look back just a little, we see the rko radio pictures production of",1


=== google/reformer-crime-and-punishment ===



Using pad_token, but it is not set yet.


architecture:	reformer
tokenizer:	ReformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Director Douglas Sirk once said theres a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art. This statement defines his cinema perfectly, a very unique body of work that includes",1
1,"THE SHOP AROND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. Theres just no getting around that, and its hard to actually put ones feeling for this film into words. Its not one of those films that tries too ha",1


=== google/rembert ===

architecture:	rembert
tokenizer:	RemBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a",1
1,"If one sits down to watch Unhinged, it is probably because its advertisements, video boxes, whatever, scream that it was banned in the UK for over 20 years (as virtually every video nasty does). It's true; exploitation and taboo excites people and draws them in with their promise of controversy. Being an exploitation fan, however, none of this was new to me. The advertisements that scream that the film was banned in the UK don't necessarily make me want to watch it; in fact, the first thing that usually pops into my head is how disgustingly para",0


=== junnyu/roformer_chinese_sim_char_ft_small ===

architecture:	roformer
tokenizer:	RoFormerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"director douglas sirk once said there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art '. this statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of course, his famous melodramas. < br / > < br / > sirk",1
1,"to be a buster keaton fan is to have your heart broken on a regular basis. most of us first encounter keaton in one of the brilliant feature films from his great period of independent production :'the general ','the navigator ','sherlock jnr '. we recognise him as the greatest figure in the entire history of film comedy, and we want to see more of his movies. here the heartbreak be",0


=== roberta-base ===

architecture:	roberta
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a love story, as beautiful as any other ever",1
1,"Terry Gilliam's and David Peoples' teamed up to create one of the most intelligent and creative science fiction movies of the '90's. People's proved a screenplay with bizarre twists and fantastic ideas about the nature of time  I especially love the idea one can't change the past; it's a nice counterpoint to so many time-travelling movies which say otherwise  biological holocausts and the thin line between sanity and madness. Gilliam visualized his ideas with unique quirkiness, perfection and originality.<br /><br />The story itself is engaging: one man, James Cole (played",1


=== squeezebert/squeezebert-uncased ===

architecture:	squeezebert
tokenizer:	SqueezeBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as",1
1,"the house that dripped blood is the third in a series of seven amicus horror anthologies. if the monster club is included as part of the series, this would make eight movies. although, that movie is very different from the others. < br / > < br / > i look upon the amicus anthologies with great memories as i used to love them when i was in my teens. my feelings for them today are just as strong. < br / > < br / > i spent many years trying to track down this movie. the synopses of the stories was so appealing that i went as far as paying a substantial",1


=== hf-internal-testing/tiny-random-transfo-xl ===



Using pad_token, but it is not set yet.


architecture:	transfo_xl
tokenizer:	TransfoXLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Director Douglas once said 'there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art'. This statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of course, his famous melodramas. < br / > < br / >'s melodramas were, as the very word signifies, dramas with music. The music sets the tone for his masterful style, and every stroke of his brush (was also a painter) leaves a powerful image on the screen-turned-canvas. But this ain't",1
1,"I saw most of the episodes of as a teenager on ""Cliffhanger Theater"" running after midnight on a local station some years ago, and then again when Mystery Science Theatre on it in the early 90's. Time has not been kind to it. < br / > < br / > I can certainly make allowances for the special effects, which were quite impressive for a low budget 50's serial (IMO Commando Cody's flying scenes were better than George Reeves / Superman's in his TV show). And I can also make allowances for the ahem, ""acting,"" and fight choreography except for the guy who plays the ruler of",0


=== xlm-mlm-en-2048 ===

architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"the shop around the corner is one of the sweetest and most feel-good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as beautiful as any other ever told,",1
1,"this film, which is based on a true story, comes from first time director and long time actor, denzel washington. denzel washington has given us some of the best performances of the last decade, as a black soldier in the civil war in glory, and a lawyer in the acclaimed philadelphia. and of course, he made special notoriety last year when he won the academy award for best actor in training day, in which denzel washington became the first african american to receive the award for best actor. i guess denzel wanted a change of pace, so he chose to direct antwone fisher, in",1


=== xlm-roberta-base ===

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Director Douglas Sirk once said `there's a very short distance between high art and trash, and trash that contains craziness is by this very quality nearer to art'. This statement defines his cinema perfectly, a very unique body of work that includes classic stage adaptations, adventure and war films, westerns and of course, his famous melodramas.<br /><br />Sirk's melodramas were, as the very word signifies, dramas with music. The music sets the tone for his masterful style, and every stroke of his brush (",1
1,"The freedom of having your own Sea Going Power Boat, the excitement of going on underwater adventures a rugged,an's man of an adventurer and lovely(and so well endowed!) assistants in fine Bikinis were all definite selling points for ""SEA HUNT""(1958-61).<br /><br />Just what was the reason for producing a sort of sea going ""gun for hire""* series. Let's look closely now. There must be a some clues around.<br /><br />If we were to look back just",1


=== xlnet-base-cased ===

architecture:	xlnet
tokenizer:	XLNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think",1
1,"Riff Randell is a wildly, obsessed fan of the rock group; The Ramones and so are most of the students in the school. But a new tyrant of a principal, Ms. Togar thinks rock 'n' roll is a bad influence on the students, especially the music from The Ramones. So, when Riff finds out they're performing in town, she skips class for a couple of days to get tickets for herself and her friends. But when Ms. Togar discovers why she really took those days off she confiscates the tickets. While, this is happening Tom",1


In [None]:
# hide_input
# Tabulate the outcomes accumulated in `test_results` (one row per entry) and render the table.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=["arch", "tokenizer", "model_name", "result", "error"],
)
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizerFast,hf-internal-testing/tiny-albert,PASSED,
1,bart,BartTokenizerFast,hf-internal-testing/tiny-random-bart,PASSED,
2,bert,BertTokenizerFast,hf-internal-testing/tiny-bert,PASSED,
3,big_bird,BigBirdTokenizerFast,google/bigbird-roberta-base,PASSED,
4,bigbird_pegasus,PegasusTokenizerFast,google/bigbird-pegasus-large-arxiv,PASSED,
5,ctrl,CTRLTokenizer,hf-internal-testing/tiny-random-ctrl,PASSED,
6,camembert,CamembertTokenizerFast,camembert-base,PASSED,
7,canine,CanineTokenizer,hf-internal-testing/tiny-random-canine,PASSED,
8,convbert,ConvBertTokenizerFast,YituTech/conv-bert-base,PASSED,
9,deberta,DebertaTokenizerFast,hf-internal-testing/tiny-deberta,PASSED,


## Summary

The `blurr.data.core` module contains the fundamental building blocks used by all of the data preprocessing tasks.

In [None]:
# hide
# Run nbdev's `notebook2script` with no arguments; the cell output reports each project notebook as "Converted".
from nbdev.export import notebook2script

notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
