In [None]:
# default_exp data.core

In [None]:
#all_slow

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by Hugging Face transformer implementations.

In [None]:
#export
import os, inspect
from dataclasses import dataclass
from functools import reduce, partial
from typing import Any, Callable, List, Optional, Union, Type

from fastcore.all import *
from fastai.data.block import TransformBlock
from fastai.data.core import Datasets, DataLoader, DataLoaders, TfmdDL
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import (
    DataCollatorWithPadding, logging,
    PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel
)

from blurr.utils import BLURR

logging.set_verbosity_error()

In [None]:
#hide_input
import pdb

from datasets import load_dataset
from fastai.data.block import CategoryBlock, ColReader, ColSplitter, DataBlock
from fastai.data.external import untar_data, URLs
from fastcore.test import *
from nbverbose.showdoc import show_doc

from blurr.utils import print_versions

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions('torch fastai transformers')

What we're running with at the time this documentation was generated:
torch: 1.7.1
fastai: 2.5.0
transformers: 4.9.2


In [None]:
#hide
#cuda
# NOTE(review): this pins the notebook to CUDA device index 1, so it presumably needs a machine with at
# least two GPUs - adjust the index to match your hardware before running
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


## Mid-level API: Base tokenization, batch transform, and DataBlock methods

In [None]:
#export
class HF_BaseInput(TensorBase):
    """The base representation of your inputs; used by the various fastai `show` methods"""
    def show(
        self,
        # A Hugging Face tokenizer used to turn the token ids back into text
        hf_tokenizer:PreTrainedTokenizerBase,
        # The "context" associated to the current `show_batch/results` call
        ctx=None,
        # How many characters of the decoded text to keep (`None` keeps everything)
        trunc_at:int=None,
    ) -> str:
        """Decodes this tensor of token ids with `hf_tokenizer` (special tokens are skipped), keeps the
        first `trunc_at` characters, and passes the text on to `show_title` under the 'text' label
        """
        token_ids = self.cpu().numpy()
        full_text = str(hf_tokenizer.decode(token_ids, skip_special_tokens=True))
        shown_text = full_text[:trunc_at]

        return show_title(shown_text, ctx=ctx, label='text')

A `HF_BaseInput` object is returned from the decodes method of `HF_AfterBatchTransform` as a means to customize @typedispatched functions like `DataLoaders.show_batch` and `Learner.show_results`. It uses the "input_ids" of a Hugging Face object as the representative tensor for `show` methods

In [None]:
#hide
# played with the idea of writing my own collation routine, but opted to do everything in a batch transform
# and let fastai handle the rest.  keeping this here for the interested :)

# def create_blurr_batch(samples): 
#     """Supports passing in one or two input sequences, or a list[str] (the later is common for token 
#     classification tasks where you should also set `is_split_into_words=True`).
#     Returns all the tensors for the input sequence(s) in a dictionary."""
#     samples = L(samples)
#     inps = samples.itemgot(0).items if (n_seqs == 1 ) else list(zip(samples.itemgot(0,0), samples.itemgot(0,1)))

#     res = hf_tokenizer(inps, max_length=512, padding=True, truncation=True, is_split_into_words=False,return_tensors='pt')

#     fin = (res, retain_type(torch.stack(samples.itemgot(1).items), samples.itemgot(1)[0]))
#     return fin

In [None]:
#export
class HF_BeforeBatchTransform(Transform):
    """Tokenizes the raw inputs of a mini-batch at batch time and rebuilds every sample so that its first
    element is the dictionary of tokenizer outputs for that sample, followed by its original targets.
    """
    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch:str,
        # A specific configuration instance you want to use
        hf_config:PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer:PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model:PreTrainedModel,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length:int=None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding:Union[bool, str]=True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation:Union[bool, str]=True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words:bool=False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs={},
        # Any further keyword arguments; they are only stored on the instance as `self.kwargs`
        **kwargs
    ):
        # every constructor argument is kept on the instance under its own name
        store_attr(
            self=self,
            names='hf_arch, hf_config, hf_tokenizer, hf_model, '
                  'max_length, padding, truncation, is_split_into_words, tok_kwargs, '
                  'kwargs'
        )

    def encodes(
        self,
        samples  # A subset of data to put into a mini-batch
    ):
        """This method performs on-the-fly, batch-time tokenization of your data. In other words, your raw
        inputs are tokenized as needed for each mini-batch of data rather than requiring pre-tokenization of
        your full dataset ahead of time. Returns a list with one tuple per sample.
        """
        samples = L(samples)

        # a list-like first input that is *not* pre-split into words is treated as a pair of sequences
        has_paired_inputs = is_listy(samples[0][0]) and not self.is_split_into_words
        if (has_paired_inputs):
            inps = list(zip(samples.itemgot(0, 0), samples.itemgot(0, 1)))
        else:
            inps = samples.itemgot(0).items

        # tokenize the whole batch in a single call
        tok_d = self.hf_tokenizer(
            inps,
            max_length=self.max_length,
            padding=self.padding,
            truncation=self.truncation,
            is_split_into_words=self.is_split_into_words,
            return_tensors='pt',
            **self.tok_kwargs
        )

        # swap each sample's raw input for its slice of the tokenizer outputs (e.g. input_ids,
        # attention_mask, etc...) and leave everything after the input untouched
        d_keys = tok_d.keys()
        updated_samples = []
        for idx, sample in enumerate(samples):
            sample_inputs = {k: tok_d[k][idx] for k in d_keys}
            updated_samples.append((sample_inputs, *sample[1:]))

        return updated_samples

`HF_BeforeBatchTransform` was inspired by this [article](http://dev.fast.ai/tutorial.transformers).

Inputs can come in as a string or a list of tokens, the latter being for tasks like Named Entity Recognition (NER), where you want to predict the label of each token.

**Notes re: on-the-fly batch-time tokenization**: The previous version of the library performed the tokenization/numericalization as a type transform when the raw data was read, and included a couple batch transforms to prepare the data for collation (e.g., to be made into a mini-batch). With this update, everything is done in a single batch transform.  Why?  Part of the inspiration had to do with the mechanics of the Hugging Face tokenizer, in particular how by default it returns a collated mini-batch of data given a list of sequences. And where do we get a list of examples with fastai? In the batch transforms!  So I thought, hey, why not do everything dynamically at batch time?  And with a bit of tweaking, I got everything to work pretty well.  The result is less code, faster mini-batch creation, less RAM utilization and time spent tokenizing (really helps with very large datasets), and more flexibility.

In [None]:
#export
class HF_AfterBatchTransform(Transform):
    """A batch transform whose `decodes` casts dictionaries of Hugging Face inputs into
    `input_return_type` objects, i.e. something the fastai `show` methods can work with
    """
    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type:Type=HF_BaseInput
    ):
        store_attr(self=self, names='hf_tokenizer, input_return_type')

    def decodes(
        self,
        # The encoded samples for your batch. When this is a dictionary, its `input_ids` are pulled out and
        # cast to `self.input_return_type`; anything else is handed back unchanged
        encoded_samples:Type
    ):
        """Returns the proper object and data for show related fastai methods"""
        if (not isinstance(encoded_samples, dict)):
            return encoded_samples

        input_ids = encoded_samples['input_ids']
        return self.input_return_type(input_ids, hf_tokenizer=self.hf_tokenizer)

With fastai 2.1.5, before batch transforms no longer have a `decodes` method ... and so, I've introduced a standard batch transform here, `HF_AfterBatchTransform`, that will do the decoding for us.

In [None]:
#export
def blurr_sort_func(
    # A single example; only its first element (the raw input) is looked at
    example,
    # A Hugging Face tokenizer
    hf_tokenizer:PreTrainedTokenizerBase,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words:bool=False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs={}
):
    """The sort key handed to `SortedDL`: the length of the example's input *after* tokenization. Inputs
    that are already split into words are measured as they are; otherwise the input is run through
    `hf_tokenizer.tokenize` and the number of resulting tokens is returned.
    """
    raw_input = example[0]
    if (not is_split_into_words):
        return len(hf_tokenizer.tokenize(raw_input, **tok_kwargs))
    return len(raw_input)

In [None]:
#export
class HF_TextBlock(TransformBlock):
    """The core `TransformBlock` to prepare your data for training in Blurr with fastai's `DataBlock` API.
    It wires a before-batch (tokenizing) transform, an after-batch (decoding) transform and a `DataLoader`
    type into the block, and exposes the Hugging Face objects of the before-batch transform as attributes.
    """
    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_arch:str=None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_config:PretrainedConfig=None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_tokenizer:PreTrainedTokenizerBase=None,
        # A Hugging Face model (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_model:PreTrainedModel=None,
        # The before batch transform you want to use to tokenize your raw data on the fly
        # (defaults to an instance of `HF_BeforeBatchTransform` created using the Hugging Face objects defined above)
        before_batch_tfm:HF_BeforeBatchTransform=None,
        # The batch_tfms to apply to the creation of your DataLoaders,
        # (defaults to HF_AfterBatchTransform created using the tokenizer of the before batch transform)
        after_batch_tfm:HF_AfterBatchTransform=None,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length:int=None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding:Union[bool, str]=True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation:Union[bool, str]=True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words:bool=False,
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type=HF_BaseInput,
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type:DataLoader=None,
        # Any keyword arguments you want applied to your before batch tfm
        before_batch_kwargs={},
        # Any keyword arguments you want applied to your after batch tfm (or referred to in fastai as `batch_tfms`)
        after_batch_kwargs={},
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs={},
        # Any keyword arguments you want to have applied with generating text (not used inside this method)
        text_gen_kwargs={},
        # Any keyword arguments you want applied to `HF_TextBlock` (not used inside this method)
        **kwargs
    ):
        # without a ready-made before batch transform, all four Hugging Face objects are mandatory
        have_all_hf_objects = all([hf_arch, hf_config, hf_tokenizer, hf_model])
        if (not have_all_hf_objects and before_batch_tfm is None):
            raise ValueError(
                """You must supply the Hugging Face architecture, config, tokenizer, and model
                - or - an instances of HF_BeforeBatchTransform"""
            )

        # default tokenizing transform; the dictionaries are copied so the transform never shares the
        # caller's (or the default) dict objects
        if (before_batch_tfm is None):
            before_batch_tfm = HF_BeforeBatchTransform(
                hf_arch, hf_config, hf_tokenizer, hf_model,
                max_length=max_length,
                padding=padding,
                truncation=truncation,
                is_split_into_words=is_split_into_words,
                tok_kwargs=tok_kwargs.copy(),
                **before_batch_kwargs.copy()
            )

        # default decoding transform, built around the tokenizer of the before batch transform
        if (after_batch_tfm is None):
            after_batch_tfm = HF_AfterBatchTransform(
                hf_tokenizer=before_batch_tfm.hf_tokenizer,
                input_return_type=input_return_type,
                **after_batch_kwargs.copy()
            )

        # default DataLoader: a `SortedDL` keyed on `blurr_sort_func`, configured like the before batch tfm
        if (dl_type is None):
            sort_by_tokenized_len = partial(
                blurr_sort_func,
                hf_tokenizer=before_batch_tfm.hf_tokenizer,
                is_split_into_words=before_batch_tfm.is_split_into_words,
                tok_kwargs=before_batch_tfm.tok_kwargs.copy()
            )
            dl_type = partial(SortedDL, sort_func=sort_by_tokenized_len)

        # set the TransformBlock's Hugging Face objects
        self.hf_arch = before_batch_tfm.hf_arch
        self.hf_config = before_batch_tfm.hf_config
        self.hf_tokenizer = before_batch_tfm.hf_tokenizer
        self.hf_model = before_batch_tfm.hf_model

        super().__init__(
            dl_type=dl_type,
            dls_kwargs={ 'before_batch': before_batch_tfm },
            batch_tfms=after_batch_tfm
        )

A basic wrapper that links the default transforms for the data block API.

`HF_TextBlock` has been dramatically simplified from its predecessor. It handles setting up your `HF_BeforeBatchTransform` and `HF_AfterBatchTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever). You must either pass in your own instance of a `HF_BeforeBatchTransform` class via `before_batch_tfm`, or supply all four Hugging Face objects (`hf_arch`, `hf_config`, `hf_tokenizer`, and `hf_model`); every other argument is optional.

## Low-level API: For working with PyTorch and/or fast.ai Datasets & DataLoaders

Below is a low-level API for working with basic PyTorch Datasets (e.g., a dataset from the Hugging Face datasets library) and DataLoaders. Use the approach detailed below if you already have, or want to use, a plain ol' PyTorch `Dataset` instead of the fast.ai `DataBlock` API.

In [None]:
#export
@dataclass
class BlurrBatchCreator():
    """A callable that can be assigned to a `TfmdDL.create_batch` method; used in Blurr's low-level API
    to collate a list of examples into a batch that can be used in the Blurr library
    """
    def __init__(
        self,
        # Your Hugging Face tokenizer
        hf_tokenizer:PreTrainedTokenizerBase,
        # The collation callable; a falsy value selects Hugging Face's
        # DataCollatorWithPadding(tokenizer=hf_tokenizer)
        data_collator:Type=None
    ):
        self.hf_tokenizer = hf_tokenizer
        if (data_collator):
            self.data_collator = data_collator
        else:
            self.data_collator = DataCollatorWithPadding(tokenizer=hf_tokenizer)

    def __call__(
        self,
        features # A mini-batch (list of examples to run through your model)
    ):
        """Collates `features` with `self.data_collator`. Non-dictionary examples: the collated batch is
        returned as is. Dictionary examples: a 2-tuple is returned whose first element is the collated batch
        as a plain dict and whose second element is `batch['labels']` when the examples carry a `labels`
        key, or otherwise a second plain-dict copy of the batch.
        """
        batch = self.data_collator(features)
        if (not isinstance(features[0], dict)):
            return batch

        inputs = dict(batch)
        if ('labels' in features[0]):
            return inputs, batch['labels']

        # NOTE(review): this mirrors how the original one-line `return a, b if cond else c` parses, i.e.
        # unlabeled examples yield `(dict, dict)`. A batch without a target element may have been the
        # intent - confirm before relying on the second element
        return inputs, dict(batch)

In [None]:
#export
class BlurrBatchTransform(HF_AfterBatchTransform):
    """An `HF_AfterBatchTransform` that additionally carries the Hugging Face objects and the
    tokenization/generation settings, so they can be looked up from a `DataLoader`'s batch transforms
    """
    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch:str=None,
        # A Hugging Face configuration object
        hf_config:PretrainedConfig=None,
        # A Hugging Face tokenizer (this is what the inherited `decodes` hands to `input_return_type`)
        hf_tokenizer:PreTrainedTokenizerBase=None,
        # A Hugging Face model
        hf_model:PreTrainedModel=None,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words:bool=False,
        # The token ID to ignore when calculating loss/metrics
        ignore_token_id:int = CrossEntropyLossFlat().ignore_index,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs={},
        # Any text generation keyword arguments
        text_gen_kwargs={},
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type=HF_BaseInput,
        # Any other keyword arguments; they are only stored on the instance as `self.kwargs`
        **kwargs
    ):
        # the parent keeps the tokenizer and the return type ...
        super().__init__(hf_tokenizer=hf_tokenizer, input_return_type=input_return_type)

        # ... everything else is stored here under its own name
        store_attr(
            self=self,
            names='hf_arch, hf_config, hf_model, tok_kwargs, text_gen_kwargs, '
                  'is_split_into_words, ignore_token_id, kwargs'
        )

In [None]:
#export
@delegates()
class BlurrDataLoader(TfmdDL):
    """A fast.ai `TfmdDL` that works with Blurr: batches are collated by a `BlurrBatchCreator` (or
    equivalent) and decoded by a `BlurrBatchTransform`, and the Hugging Face objects are kept on the loader
    so that copies created through `new` receive them as well
    """
    def __init__(
        self,
        # A standard PyTorch Dataset
        dataset:Union[torch.utils.data.dataset.Dataset, Datasets],
        # The abbreviation/name of your Hugging Face transformer architecture
        hf_arch:str,
        # A Hugging Face configuration object
        hf_config:PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer:PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model:PreTrainedModel,
        # An instance of `BlurrBatchCreator` or equivalent
        batch_creator:BlurrBatchCreator=None,
        # The batch_tfm used to decode Blurr batches (default: a `BlurrBatchTransform`)
        batch_tfm:BlurrBatchTransform=None,
        # (optional) A preprocessing function that will be applied to your dataset; it is called with
        # (dataset, hf_tokenizer, hf_model) and its return value replaces `dataset`
        preproccesing_func:Callable[[Union[torch.utils.data.dataset.Dataset, Datasets], PreTrainedTokenizerBase, PreTrainedModel], Union[torch.utils.data.dataset.Dataset, Datasets]]=None,
        # Keyword arguments to be applied to your `batch_tfm`
        batch_tfm_kwargs={},
        # Keyword arguments to be applied to `BlurrDataLoader`
        **kwargs
    ):
        if (preproccesing_func): dataset = preproccesing_func(dataset, hf_tokenizer, hf_model)

        # `create_batch` and `after_batch` are always supplied by this class, so they are removed from kwargs
        # to avoid handing them to `TfmdDL.__init__` twice. `new` forwards the current loader's objects under
        # exactly these names; they are used as the fallback (instead of fresh defaults) so that a copy of
        # the loader keeps a customized collator / batch transform.
        inherited_batch_creator = kwargs.pop('create_batch', None)
        inherited_batch_tfm = kwargs.pop('after_batch', None)

        if (not batch_creator):
            batch_creator = inherited_batch_creator
            if (batch_creator is None): batch_creator = BlurrBatchCreator(hf_tokenizer=hf_tokenizer)

        if (not batch_tfm):
            # explicitly passed `batch_tfm_kwargs` still produce a freshly configured transform
            if (inherited_batch_tfm is not None and not batch_tfm_kwargs):
                batch_tfm = inherited_batch_tfm
            else:
                batch_tfm = BlurrBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model,
                                                **batch_tfm_kwargs.copy())

        super().__init__(dataset=dataset, create_batch=batch_creator, after_batch=batch_tfm, **kwargs)
        store_attr(self=self, names='hf_arch, hf_config, hf_tokenizer, hf_model')

    def new(
        self,
        # A standard PyTorch and fastai dataset (defaults to this loader's dataset)
        dataset:Union[torch.utils.data.dataset.Dataset]=None,
        # The class you want to create an instance of (will be the type of "self" if None)
        cls:Type=None,
        #  Any additional keyword arguments you want to pass to the __init__ method of `cls`
        **kwargs
    ):
        """We have to override the new method in order to add back the Hugging Face objects in this factory
        method (called for example in places like `show_results`). With the exception of the additions to the
        kwargs dictionary, the code below follows fastai's `DataLoader.new`.
        """
        if dataset is None: dataset = self.dataset
        if cls is None: cls = type(self)

        cur_kwargs = dict(dataset=dataset, num_workers=self.fake_l.num_workers, pin_memory=self.pin_memory,
                          timeout=self.timeout, bs=self.bs, shuffle=self.shuffle, drop_last=self.drop_last,
                          indexed=self.indexed, device=self.device)

        # carry over every customizable hook that was replaced with a non-method object
        for n in self._methods:
            o = getattr(self, n)
            if not isinstance(o, MethodType): cur_kwargs[n] = o

        # the Hugging Face objects are required positional arguments of `__init__`, so they always come from
        # this loader (`after_batch` and `create_batch` travel along inside `cur_kwargs`)
        kwargs.update(hf_arch=self.hf_arch, hf_config=self.hf_config,
                      hf_tokenizer=self.hf_tokenizer, hf_model=self.hf_model)

        return cls(**merge(cur_kwargs, kwargs))

## Utility & base `show_batch` methods

In [None]:
#export
def get_blurr_tfm(
    # A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)
    tfms_list:Pipeline,
    # The transform class to look for (subclasses match as well)
    tfm_class:Transform=HF_BeforeBatchTransform
):
    """Walks through `tfms_list` and returns the first transform whose type is `tfm_class` or a subclass
    of it; returns `None` when there is no such transform
    """
    for candidate in tfms_list:
        if issubclass(type(candidate), tfm_class):
            return candidate
    return None

In [None]:
show_doc(get_blurr_tfm)

<h4 id="get_blurr_tfm" class="doc_header"><code>get_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_blurr_tfm</code>(**`tfms_list`**:`Pipeline`, **`tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*)

Given a fastai DataLoaders batch transforms, this method can be used to get at a transform
instance used in your Blurr DataBlock

**Parameters:**


 - **`tfms_list`** : *`<class 'fastcore.transform.Pipeline'>`*	<p>A list of transforms (e.g., dls.after_batch, dls.before_batch, etc...)</p>


 - **`tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The transform to find</p>



In [None]:
#export
def first_blurr_tfm(
    dls:DataLoaders,                                          # Your fast.ai `DataLoaders`
    before_batch_tfm_class:Transform=HF_BeforeBatchTransform, # The before_batch transform to look for
    blurr_batch_tfm_class:Transform=BlurrBatchTransform       # The after_batch (or batch_tfm) to look for
):
    """This convenience method will find the first Blurr transform required for methods such as
    `show_batch` and `show_results`. The `before_batch` transforms are searched first (this is where the
    transform lives when you use the mid-level DataBlock API); if nothing truthy is found there, the result
    of searching the `after_batch` transforms (low-level Blurr data API) is returned, which may be `None`.
    """
    from_before_batch = get_blurr_tfm(dls.before_batch, tfm_class=before_batch_tfm_class)
    return from_before_batch or get_blurr_tfm(dls.after_batch, tfm_class=blurr_batch_tfm_class)

In [None]:
show_doc(first_blurr_tfm)

<h4 id="first_blurr_tfm" class="doc_header"><code>first_blurr_tfm</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>first_blurr_tfm</code>(**`dls`**:`DataLoaders`, **`before_batch_tfm_class`**:`Transform`=*`HF_BeforeBatchTransform`*, **`blurr_batch_tfm_class`**:`Transform`=*`BlurrBatchTransform`*)

This convenience method will find the first Blurr transform required for methods such as 
`show_batch` and `show_results`. The returned transform should have everything you need to properly
decode and 'show' your Hugging Face inputs/targets

**Parameters:**


 - **`dls`** : *`<class 'fastai.data.core.DataLoaders'>`*	<p>Your fast.ai `DataLoaders</p>


 - **`before_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The before_batch transform to look for</p>


 - **`blurr_batch_tfm_class`** : *`<class 'fastcore.transform.Transform'>`*, *optional*	<p>The after_batch (or batch_tfm) to look for</p>



In [None]:
#export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `HF_BaseInput` typed inputs
    x:HF_BaseInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context (returned untouched)
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation you want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs
):
    """Displays up to `max_n` examples as a DataFrame with one 'text' column (the decoded `input_ids`) and
    one column per target
    """
    # the Blurr transform gives us the tokenizer needed for decoding
    hf_tokenizer = first_blurr_tfm(dataloaders).hf_tokenizer

    rows = L()
    n_inp = dataloaders.n_inp

    for idx, (input_ids, label, sample) in enumerate(zip(x, y, samples)):
        if (idx >= max_n): break

        decoded_text = hf_tokenizer.decode(input_ids, skip_special_tokens=True)[:trunc_at]
        # tensor-valued targets are rendered through `label.item()`, anything else is shown as is
        targets = [label.item() if (torch.is_tensor(item)) else item for item in sample[n_inp:]]
        rows.append(tuplify([decoded_text] + targets))

    n_target_cols = len(rows[0]) - n_inp
    cols = ['text'] + [ 'target' if (i == 0) else f'target_{i}' for i in range(n_target_cols) ]
    display_df(pd.DataFrame(rows, columns=cols)[:max_n])
    return ctxs

## Sequence classification

Below demonstrates both how to construct your `DataBlock` for a sequence classification task (e.g., a model that requires a single text input) using the mid-level API, and also with the low-level API should you wish to work with standard PyTorch or fast.ai Datasets and DataLoaders.

### Using the mid-level API

In [None]:
# Fetch the IMDB sample dataset with fastai's `untar_data` and read its `texts.csv` into a DataFrame
path = untar_data(URLs.IMDB_SAMPLE)

# NOTE(review): `model_path` is not referenced again in the part of the notebook shown here
model_path = Path('models')
imdb_df = pd.read_csv(path/'texts.csv')

In [None]:
imdb_df.head()

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False
2,negative,"Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li...",False
3,positive,"Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie ""Duty, Honor, Country"" are not just mere words blathered from the lips of a high-brassed offic...",False
4,negative,"This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr...",False


There are a bunch of ways we can get at the four Hugging Face elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR`.

In [None]:
#hide_output
# Task-specific auto class; it is handed to BLURR below as `model_cls`
from transformers import AutoModelForSequenceClassification
model_cls = AutoModelForSequenceClassification

# Checkpoint to load (the trailing comment lists alternatives that can be swapped in)
pretrained_model_name = "distilroberta-base" # "distilbert-base-uncased" "bert-base-uncased"
# Unpacked in this order: architecture name, config, tokenizer, model
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)

Once you have those elements, you can create your `DataBlock` as simply as shown below.

In [None]:
# Inputs go through `HF_TextBlock` (built from the four Hugging Face objects); targets through `CategoryBlock`
blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), CategoryBlock)
# x is read from the 'text' column, y from the 'label' column; `ColSplitter()` is used with its default arguments
dblock = DataBlock(blocks=blocks, get_x=ColReader('text'), get_y=ColReader('label'), splitter=ColSplitter())

In [None]:
# Build the DataLoaders from `imdb_df` with a batch size of 4
dls = dblock.dataloaders(imdb_df, bs=4)

In [None]:
# Draw one batch, then show: number of items in the batch tuple, length of `input_ids`, its shape, and the number of targets
b = dls.one_batch(); len(b), len(b[0]['input_ids']), b[0]['input_ids'].shape, len(b[1]) 

(2, 4, torch.Size([4, 512]), 4)

Let's take a look at the actual types represented by our batch

In [None]:
# Show the container/tensor types nested inside the batch
explode_types(b)

{tuple: [dict, fastai.torch_core.TensorCategory]}

In [None]:
# Display up to two examples from a batch (`trunc_at=500` presumably caps the displayed text length — confirm against `show_batch`)
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)

Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower",negative
1,"Many neglect that this isn't just a classic due to the fact that it's the first 3D game, or even the first shoot-'em-up. It's also one of the first stealth games, one of the only(and definitely the first) truly claustrophobic games, and just a pretty well-rounded gaming experience in general. With graphics that are terribly dated today, the game thrusts you into the role of B.J.(don't even *think* I'm going to attempt spelling his last name!), an American P.O.W. caught in an underground bunker.",positive


### Using the low-level API

Step 1: Grab your datasets

In [None]:
# Load the "mrpc" configuration of the "glue" dataset with Hugging Face `datasets`
raw_datasets = load_dataset("glue", "mrpc")

Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [None]:
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` fields together as sentence pairs, with truncation enabled.

    It is applied through `map(..., batched=True)` below, so `example` holds a batch of rows rather
    than a single row.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return hf_tokenizer(first_sentences, second_sentences, truncation=True)

# Run the tokenizer over the loaded datasets in batches
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Step 2: Define any pre-processing that needs to be done to your datasets (optional)

In [None]:
#export
def preproc_hf_dataset(
    # A Hugging Face `datasets` dataset; it must provide `column_names`, `rename_column`, `remove_columns` and `set_format`
    dataset:Union[torch.utils.data.dataset.Dataset, Datasets],  
    # A Hugging Face tokenizer (not used by this implementation)
    hf_tokenizer:PreTrainedTokenizerBase,   
    # A Hugging Face model; the parameters of its `forward` method decide which columns are kept
    hf_model:PreTrainedModel                                    
):
    """This method can be used to preprocess most Hugging Face Datasets for use in Blurr and other training
    libraries.

    It renames a `label` column to `labels`, drops every column that is not a parameter of `hf_model.forward`,
    sets the dataset's format to "torch", and returns the resulting dataset.
    """
    if "label" in dataset.column_names: dataset = dataset.rename_column("label", "labels")

    fwd_arg_names = set(inspect.signature(hf_model.forward).parameters)

    # `remove_columns` is documented to accept a str or a list of str, so pass a list rather than a set;
    # walking `column_names` in order also keeps the argument identical from run to run
    unused_cols = [col for col in dataset.column_names if col not in fwd_arg_names]
    dataset = dataset.remove_columns(unused_cols)

    # called without re-assignment: it changes the format on the dataset object returned above
    dataset.set_format("torch")
    return dataset

Step 3: Use `BlurrDataLoader` to build Blurr friendly dataloaders from your datasets

In [None]:
# Training loader: shuffled, batches of 8, with `preproc_hf_dataset` supplied as the preprocessing function.
# NOTE(review): the keyword is spelled `preproccesing_func` here — presumably matching `BlurrDataLoader`'s
# signature; verify before "correcting" the spelling
trn_dl = BlurrDataLoader(tokenized_datasets["train"], 
                         hf_arch, hf_config, hf_tokenizer, hf_model,
                         preproccesing_func=preproc_hf_dataset, shuffle=True, batch_size=8)

# Validation loader: batches of 16; no `shuffle` argument is passed
val_dl = BlurrDataLoader(tokenized_datasets["validation"],
                         hf_arch, hf_config, hf_tokenizer, hf_model,
                         preproccesing_func=preproc_hf_dataset, batch_size=16)

# Bundle both loaders into a fastai `DataLoaders`
dls = DataLoaders(trn_dl, val_dl)

In [None]:
# Draw a single batch and check the shape of its `input_ids`
b = dls.one_batch()
b[0]['input_ids'].shape

torch.Size([8, 71])

In [None]:
# Display up to two examples from a batch (`trunc_at=500` presumably caps the displayed text length — confirm against `show_batch`)
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)

Unnamed: 0,text,target
0,"The U.S. government and private technology experts warned Wednesday that hackers plan to attack thousands of Web sites Sunday in a loosely co-ordinated "" contest "" that could disrupt Internet traffic. THE US government and private technology experts have warned that hackers plan to attack thousands of websites on Sunday in a loosely co-ordinated "" contest "" that could disrupt Internet traffic.",1
1,"What's more, Mr. O 'Neill said that he hoped Hyundai would sell one million vehicles annually in the United States by 2010. That wasn 't all : by 2010, Mr. O 'Neill said, he hoped Hyundai would sell 1 million vehicles annually in the United States.",1


## Tests

The tests below ensure that the core DataBlock code above works for **all** pretrained sequence classification models available in Hugging Face.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a GitHub issue *(or a PR if you'd like to fix it yourself)*

In [None]:
#hide
# Sequence classification model class names reported by BLURR, leaving out the ones whose name starts
# with 'TF' (the prefix Hugging Face uses for its TensorFlow implementations)
list(filter(lambda cls_name: not cls_name.startswith('TF'),
            BLURR.get_models(task='SequenceClassification')))

['AlbertForSequenceClassification',
 'BartForSequenceClassification',
 'BertForSequenceClassification',
 'BigBirdForSequenceClassification',
 'BigBirdPegasusForSequenceClassification',
 'CTRLForSequenceClassification',
 'CamembertForSequenceClassification',
 'CanineForSequenceClassification',
 'ConvBertForSequenceClassification',
 'DebertaForSequenceClassification',
 'DebertaV2ForSequenceClassification',
 'DistilBertForSequenceClassification',
 'ElectraForSequenceClassification',
 'FlaubertForSequenceClassification',
 'FunnelForSequenceClassification',
 'GPT2ForSequenceClassification',
 'GPTNeoForSequenceClassification',
 'IBertForSequenceClassification',
 'LEDForSequenceClassification',
 'LayoutLMForSequenceClassification',
 'LongformerForSequenceClassification',
 'MBartForSequenceClassification',
 'MPNetForSequenceClassification',
 'MegatronBertForSequenceClassification',
 'MobileBertForSequenceClassification',
 'OpenAIGPTForSequenceClassification',
 'ReformerForSequenceClassificatio

In [None]:
#hide
# Checkpoints exercised by the test loop that iterates over `pretrained_model_names`;
# commented-out entries are skipped for the reason noted beside them
pretrained_model_names = [
    'albert-base-v1',
    'facebook/bart-base',
    'bert-base-uncased',
    'google/bigbird-roberta-base',
    'sshleifer/tiny-ctrl',
    'camembert-base',
    'sarnikowski/convbert-medium-small-da-cased',
    'microsoft/deberta-base',
    'microsoft/deberta-v2-xlarge',
    'distilbert-base-uncased',
    'monologg/electra-small-finetuned-imdb',
    'flaubert/flaubert_small_cased', 
    'huggingface/funnel-small-base',
    'gpt2',
    'kssteven/ibert-roberta-base',
    'allenai/led-base-16384',
    'microsoft/layoutlm-base-uncased',
    'allenai/longformer-base-4096',
    'sshleifer/tiny-mbart', 
    'microsoft/mpnet-base',
    'google/mobilebert-uncased',
    'openai-gpt',
    #'reformer-enwik8',                  # (see model card; does not work with/require a tokenizer so no bueno here)
    'roberta-base',
    'squeezebert/squeezebert-uncased',
    #'google/tapas-base',                # (requires pip install torch-scatter)
    'transfo-xl-wt103', 
    'xlm-mlm-en-2048',
    'xlm-roberta-base',
    'xlnet-base-cased'
]

In [None]:
#hide
# for model_name in pretrained_model_names:
#     tok = AutoTokenizer.from_pretrained(model_name)
#     print(f'=== {model_name} ===')
#     print(f'=== {tok.padding_side} ===')
#     print(f'=== {tok.pad_token_id} ===')
#     print(tok(['hi', 'hello everyone. its good to be here'], ['yo', 'yo'], padding='max_length', max_length=128))

In [None]:
#hide
# Fetch the IMDB sample dataset and get its local path
path = untar_data(URLs.IMDB_SAMPLE)

# NOTE(review): `model_path` is not referenced in the cells visible here — confirm it is still needed
model_path = Path('models')
# The sample's `texts.csv` as a DataFrame; the DataBlocks in this notebook read its 'text' and 'label' columns
imdb_df = pd.read_csv(path/'texts.csv')

In [None]:
#hide
from transformers import RobertaTokenizer

model_cls = AutoModelForSequenceClassification
bsz = 2
seq_sz = 128

# One (architecture, tokenizer class name, checkpoint, status, error) tuple is recorded per checkpoint
test_results = []
for model_name in pretrained_model_names:
    print(f'=== {model_name} ===\n')
    
    # checkpoints whose name contains '/ibert' get an explicit RobertaTokenizer; all others pass None
    tok_class = RobertaTokenizer if ('/ibert' in model_name) else None

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, 
                                                                      model_cls=model_cls, 
                                                                      tokenizer_cls=tok_class)   
    
    print(f'architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n')
    
    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if (hf_tokenizer.pad_token is None): 
        hf_tokenizer.add_special_tokens({'pad_token': '<pad>'})  
        hf_config.pad_token_id = hf_tokenizer.get_vocab()['<pad>']
        hf_model.resize_token_embeddings(len(hf_tokenizer))   
    
    try:
        # Building the DataLoaders and drawing a batch happen inside the `try` so that a checkpoint whose
        # pipeline cannot be constructed is recorded as FAILED instead of aborting the remaining checkpoints
        blocks = (
            HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, padding='max_length', max_length=seq_sz), 
            CategoryBlock
        )

        dblock = DataBlock(blocks=blocks, get_x=ColReader('text'), get_y=ColReader('label'), splitter=ColSplitter())
        dls = dblock.dataloaders(imdb_df, bs=bsz) 
        b = dls.one_batch()

        print('*** TESTING DataLoaders ***\n')
        test_eq(len(b), 2)
        test_eq(len(b[0]['input_ids']), bsz)
        test_eq(b[0]['input_ids'].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)

        dls.show_batch(dataloaders=dls, max_n=2, trunc_at=1000)
        
    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, 'FAILED', err))
    else:
        # Reached only when every check and `show_batch` succeeded, so a checkpoint never ends up
        # with both a PASSED and a FAILED entry
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, 'PASSED', ''))

=== albert-base-v1 ===

architecture:	albert
tokenizer:	AlbertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"now that che(2008) has finished its relatively short australian cinema run (extremely limited release:1 screen in sydney, after 6wks), i can guiltlessly join both hosts of ""at the movies"" in taking steven soderbergh to task.br /br /it's usually satisfying to watch a film director change his style/subject, but soderbergh's most recent stinker, the girlfriend experience(2009), was also missing a story, so narrative (and editing?) seem to suddenly be soderbergh's main challenge",negative
1,"the shop around the corner is one of the sweetest and most feel-good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story",positive


=== facebook/bart-base ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic",negative
1,"I watched Grendel the other night and am compelled to put together a Public Service Announcement.<br /><br />Grendel is another version of Beowulf, the thousand-year-old Anglo-Saxon epic poem. The SciFi channel has a growing catalog of inoffensive and uninteresting movies, and the previews promised an inauthentic low-budget mini-epic, but this one refused to let me switch channels. It was staggeringly, overwhelmingly, bad. I watched in fascination and horror at the train wreck you couldn't tear your eyes away from. I reached for a notepad and",negative


=== bert-base-uncased ===

architecture:	bert
tokenizer:	BertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic",negative
1,"the year 2005 saw no fewer than 3 filmed productions of h. g. wells'great novel, "" war of the worlds "". this is perhaps the least well - known and very probably the best of them. no other version of wotw has ever attempted not only to present the story very much as wells wrote it, but also to create the atmosphere of the time in which it was supposed to take place : the last year of the 19th century, 1900 using wells'original setting, in and near woking, england. < br / > < br / > imdb seems unfriendly to what they regard",positive


=== google/bigbird-roberta-base ===

architecture:	big_bird
tokenizer:	BigBirdTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic",negative
1,"This is the last of four swashbucklers from France I've scheduled for viewing during this Christmas season: the others (in order of viewing) were the uninspired THE BLACK TULIP (1964; from the same director as this one but not nearly as good), the surprisingly effective LADY Oscar (1979; which had originated as a Japanese manga!) and the splendid CARTOUCHE (1962). Actually, I had watched this one not too long ago on late-night Italian TV and recall not being especially bowled over by it, so that I was genuinely surprised by how much I enjoyed it this time",positive


=== sshleifer/tiny-ctrl ===



Using pad_token, but it is not set yet.


architecture:	ctrl
tokenizer:	CTRLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic storyline would make the film critic proof. He was right, but it didn't fool me. Raising Victor Vargas is the story about a seventeen-year old",negative
1,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a love story, as beautiful as any other ever told, *could* happen to you... a feeling you don't often get from other romantic comedies,",positive


=== camembert-base ===

architecture:	camembert
tokenizer:	CamembertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas",negative
1,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In",positive


=== sarnikowski/convbert-medium-small-da-cased ===

architecture:	convbert
tokenizer:	ConvBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas : A Review < br / > < br / > You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from",negative
1,"Many neglect that this isn't just a classic due to the fact that it's the first 3D game, or even the first shoot -'em - up. It's also one of the first stealth games, one of the only ( and definitely the first ) truly claustrophobic games, and just a pretty well - rounded gaming experience in general. With graphics that are terribly dated today, the game thrusts you",positive


=== microsoft/deberta-base ===

architecture:	deberta
tokenizer:	DebertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic",negative
1,"This film sat on my Tivo for weeks before I watched it. I dreaded a self-indulgent yuppie flick about relationships gone bad. I was wrong; this was an engrossing excursion into the screwed-up libidos of New Yorkers.<br /><br />The format is the same as Max Ophuls' ""La Ronde,"" based on a play by Arthur Schnitzler, who is given an ""inspired by"" credit. It starts from one person, a prostitute, standing on a street corner in Brooklyn. She is picked up by a home contractor, who has sex with",positive


=== microsoft/deberta-v2-xlarge ===

architecture:	deberta_v2
tokenizer:	DebertaV2Tokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic storyline would make the film critic proof. He was right,",negative
1,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a love story, as beautiful as any other ever told",positive


=== distilbert-base-uncased ===

architecture:	distilbert
tokenizer:	DistilBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic",negative
1,"the blob starts with one of the most bizarre theme songs ever, sung by an uncredited burt bacharach of all people! you really have to hear it to believe it, the blob may be worth watching just for this song alone & my user comment summary is just a little taste of the classy lyrics... after this unnerving opening credits sequence the blob introduces us, the viewer that is, to steve andrews ( steve mcqueen as steven mcqueen ) & his girlfriend jane martin ( aneta corsaut ) who are parked on their own somewhere & witness what looks like a meteorite falling",negative


=== monologg/electra-small-finetuned-imdb ===

architecture:	electra
tokenizer:	ElectraTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic",negative
1,"i watched grendel the other night and am compelled to put together a public service announcement. < br / > < br / > grendel is another version of beowulf, the thousand - year - old anglo - saxon epic poem. the scifi channel has a growing catalog of inoffensive and uninteresting movies, and the previews promised an inauthentic low - budget mini - epic, but this one refused to let me switch channels. it was staggeringly, overwhelmingly, bad. i watched in fascination and horror at the train wreck you couldn't tear your eyes away from.",negative


=== flaubert/flaubert_small_cased ===

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas : A Review < br / > < br / > You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It' s warm and gooey, but you' re not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn' t quite feel right. Victor Vargas suffers",negative
1,"Although recognized as the best film treatment of the difficulties of having a house in the country built ( or bought ) to your specifications, it is not the first, nor the last. In 1940 Jack Benny and Ann Sheridan were the leads in the film version of the comedy GEORGE WASHINGTON SLEPT HERE by George S. Kaufman and Moss Hart. And about fifteen years ago Shelly Long and Tom Hanks had the lead in THE MONEY PIT. The former was about moving into an 18th Century",positive


=== huggingface/funnel-small-base ===

architecture:	funnel
tokenizer:	FunnelTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic",negative
1,"there's a legion of mick garris haters out there who feel he couldn't direct a horror film of quality if he had to. and, sleepwalkers (.. screenplay written by stephen king ) is often used as an example of this. i like sleepwalkers, though i fully am aware that garris just says f # ck it and lets all hell break loose about fifteen or so minutes into the movie. forget character or plot development, who needs them anyway. it's about violent mayhem and bloody carnage as a mother and son pair of "" sleepwalkers "" (.. fe",positive


=== gpt2 ===



Using pad_token, but it is not set yet.


architecture:	gpt2
tokenizer:	GPT2TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic storyline would",negative
1,"Now that Che(2008) has finished its relatively short Australian cinema run (extremely limited release:1 screen in Sydney, after 6wks), I can guiltlessly join both hosts of ""At The Movies"" in taking Steven Soderbergh to task.<br /><br />It's usually satisfying to watch a film director change his style/subject, but Soderbergh's most recent stinker, The Girlfriend Experience(2009), was also missing a story, so narrative (and editing?) seem to suddenly be Soderbergh's main challenge. Strange, after 20-odd years in the business. He was probably",negative


=== kssteven/ibert-roberta-base ===

architecture:	ibert
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic",negative
1,"""Look, I know this may suck right now, but pain is temporary, film is forever. Whatever you do right now is burned into celluloid for all time and for thousands of years to come.""  Robert De Niro<br /><br />This was initially a film for Steven Spielberg, the director hiring several screenwriters to adjust the screenplay so that it more suited his themes. And so we have a dysfunctional family that is threatened by a deranged monster in the form of a recently released from prison Robert De Niro. Like ""Jurassic Park"", ""Poltergeist"" and ""War of the",negative


=== allenai/led-base-16384 ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic",negative
1,"I had read many good things about this adaptation of my favorite novel...so invariably my expectations were crushed. But they were crushed more than should be expected. The movie would have been a decent movie if I had not read the novel beforehand, which perhaps ruined it for me.<br /><br />In any event, for some reason they changed the labor camp at Toulon to a ship full of galley slaves. The scene at Bishop Myriel's was fine. In fact, other than the galleys, things survived up until the dismissal of Fantine. Because we do not want to have bad things happen to",negative


=== microsoft/layoutlm-base-uncased ===

architecture:	layoutlm
tokenizer:	LayoutLMTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic",negative
1,"before sunrise has many remarkable things going on, almost too many to fit into one review like this, but it's suffice to say that it's one of the most observant character studies of the nineties, maybe even in all of contemporary cinema, to be observant not about love, per - say, so much as it's about a human connection. how does one fall in love at first sight? no one does, at least that's deep down the consensus that linklater wants to show with his film. and * yet * there is the possibility of as intense a connection",positive


=== allenai/longformer-base-4096 ===

architecture:	longformer
tokenizer:	LongformerTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic",negative
1,"Many neglect that this isn't just a classic due to the fact that it's the first 3D game, or even the first shoot-'em-up. It's also one of the first stealth games, one of the only(and definitely the first) truly claustrophobic games, and just a pretty well-rounded gaming experience in general. With graphics that are terribly dated today, the game thrusts you into the role of B.J.(don't even *think* I'm going to attempt spelling his last name!), an American P.O.W. caught in an underground bunker. You fight and search",positive


=== sshleifer/tiny-mbart ===

architecture:	mbart
tokenizer:	MBartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic back",negative
1,"Now that Che(2008) has finished its relatively short Australian cinema run (extremely limited release:1 screen in Sydney, after 6wks), I can guiltlessly join both hosts of ""At The Movies"" in taking Steven Soderbergh to task.<br /><br />It's usually satisfying to watch a film director change his style/subject, but Soderbergh's most recent stinker, The Girlfriend Experience(2009), was also missing a story, so narrative (and editing?) seem to suddenly be Soderbergh's main challenge.",negative


=== microsoft/mpnet-base ===

architecture:	mpnet
tokenizer:	MPNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic",negative
1,"i really wanted to love this show. i truly, honestly did. < br / > < br / > for the first time, gay viewers get their own version of the "" the bachelor "". with the help of his obligatory "" hag "" andra, james, a good looking, well - to - do thirty - something has the chance of love with 15 suitors ( or "" mates "" as they are referred to in the show ). the only problem is half of them are straight and james doesn't know this. if james picks a gay one, they get a trip to new zealand, and if he",negative


=== google/mobilebert-uncased ===

architecture:	mobilebert
tokenizer:	MobileBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic",negative
1,"although recognized as the best film treatment of the difficulties of having a house in the country built ( or bought ) to your specifications, it is not the first, nor the last. in 1940 jack benny and ann sheridan were the leads in the film version of the comedy george washington slept here by george s. kaufman and moss hart. and about fifteen years ago shelly long and tom hanks had the lead in the money pit. the former was about moving into an 18th century country house that... err, needs work. the latter was about building your dream house - in the late 1980s. although the two films have their",positive


=== openai-gpt ===



Using pad_token, but it is not set yet.


architecture:	openai
tokenizer:	OpenAIGPTTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic",negative
1,"many neglect that this isn't just a classic due to the fact that it's the first 3d game, or even the first shoot -'em - up. it's also one of the first stealth games, one of the only ( and definitely the first ) truly claustrophobic games, and just a pretty well - rounded gaming experience in general. with graphics that are terribly dated today, the game thrusts you into the role of b. j. ( don't even * think * i'm going to attempt spelling his last name! ), an american p. o. w. caught in an underground bunker. you",positive


=== roberta-base ===

architecture:	roberta
tokenizer:	RobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic",negative
1,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a love story, as beautiful as any other ever",positive


=== squeezebert/squeezebert-uncased ===

architecture:	squeezebert
tokenizer:	SqueezeBertTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic",negative
1,"i really wanted to love this show. i truly, honestly did. < br / > < br / > for the first time, gay viewers get their own version of the "" the bachelor "". with the help of his obligatory "" hag "" andra, james, a good looking, well - to - do thirty - something has the chance of love with 15 suitors ( or "" mates "" as they are referred to in the show ). the only problem is half of them are straight and james doesn't know this. if james picks a gay one, they get a trip to new zealand, and if he",negative


=== transfo-xl-wt103 ===



Using pad_token, but it is not set yet.


architecture:	transfo_xl
tokenizer:	TransfoXLTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review < br / > < br / > You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic storyline would make the film critic proof. He was right, but it didn't",negative
1,"THE THE is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a love story, as beautiful as any other ever told, * could * happen to you",positive


=== xlm-mlm-en-2048 ===

architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn 't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof",negative
1,"the shop around the corner is one of the sweetest and most feel-good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as beautiful as any other ever told,",positive


=== xlm-roberta-base ===

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic back",negative
1,"Now that Che(2008) has finished its relatively short Australian cinema run (extremely limited release:1 screen in Sydney, after 6wks), I can guiltlessly join both hosts of ""At The Movies"" in taking Steven Soderbergh to task.<br /><br />It's usually satisfying to watch a film director change his style/subject, but Soderbergh's most recent stinker, The Girlfriend Experience(2009), was also missing a story, so narrative (and editing?) seem to suddenly be Soderbergh's main challenge.",negative


=== xlnet-base-cased ===

architecture:	xlnet
tokenizer:	XLNetTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Now that Che(2008) has finished its relatively short Australian cinema run (extremely limited release:1 screen in Sydney, after 6wks), I can guiltlessly join both hosts of ""At The Movies"" in taking Steven Soderbergh to task.<br /><br />It's usually satisfying to watch a film director change his style/subject, but Soderbergh's most recent stinker, The Girlfriend Experience(2009), was also missing a story, so narrative (and editing?) seem to suddenly be Soderbergh'",negative
1,"""The True Story Of The Friendship That Shook South Africa And Awakened The World."" <br /><br />Richard Attenborough, who directed ""A Bridge Too Far"" and ""Gandhi"", wanted to bring the story of Steve Biko to life, and the journey and trouble that journalist Donald Woods went through in order to get his story told. The films uses Wood's two books for it's information and basis - ""Biko"" and ""Asking for Trouble"".<br /><br /",positive


In [None]:
#hide_input
# Gather the per-model outcomes held in `test_results` into a single table and render it.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=['arch', 'tokenizer', 'model_name', 'result', 'error'],
)
display_df(test_results_df)

Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizerFast,albert-base-v1,PASSED,
1,bart,BartTokenizerFast,facebook/bart-base,PASSED,
2,bert,BertTokenizerFast,bert-base-uncased,PASSED,
3,big_bird,BigBirdTokenizerFast,google/bigbird-roberta-base,PASSED,
4,ctrl,CTRLTokenizer,sshleifer/tiny-ctrl,PASSED,
5,camembert,CamembertTokenizerFast,camembert-base,PASSED,
6,convbert,ConvBertTokenizerFast,sarnikowski/convbert-medium-small-da-cased,PASSED,
7,deberta,DebertaTokenizerFast,microsoft/deberta-base,PASSED,
8,deberta_v2,DebertaV2Tokenizer,microsoft/deberta-v2-xlarge,PASSED,
9,distilbert,DistilBertTokenizerFast,distilbert-base-uncased,PASSED,


## Summary

The `blurr.data.core` module contains the fundamental bits for all data preprocessing tasks.

In [None]:
#hide
# Run nbdev's `notebook2script` over the project's notebooks. Called with no
# arguments, it reports each notebook it converts (one "Converted <name>.ipynb."
# line per notebook in the cell output).
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted index.ipynb.
