In [None]:
# default_exp data.core

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by huggingface transformer implementations.

In [None]:
#export
from functools import reduce

import torch, nlp
from transformers import *
from fastai2.text.all import *

from blurr.utils import *

In [None]:
#hide
import pdb

from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#cuda
# Prefer GPU #1 when the machine has more than one device; otherwise fall back to GPU #0
# so this cell does not raise on single-GPU boxes.
torch.cuda.set_device(1 if torch.cuda.device_count() > 1 else 0)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


## Base tokenization, batch transform, and DataBlock methods

In [None]:
#export
class HF_BaseInput(list):
    """List container for the tensors handed to a huggingface model (e.g., `input_ids`, `attention_mask`,
    `token_type_ids`).  Having a dedicated type lets `@typedispatch`ed functions such as `show_batch`
    target these inputs specifically."""

The `HF_BaseInput` object is used to encapsulate all the inputs required by whatever huggingface model we are using. We use it as a container for the `input_ids`, `token_type_ids`, and `attention_mask` tensors required by most models, and also as a means to customize `@typedispatch`ed functions like `DataLoaders.show_batch` and `Learner.show_results`.

In [None]:
#export
class HF_TokenizerTransform(Transform):
    """huggingface friendly tokenization transform.

    Tokenizes *and* numericalizes its input with the supplied huggingface tokenizer, so no separate
    numericalization step is needed afterwards."""
    def __init__(self, hf_arch, hf_tokenizer, title='text', **kwargs):
        # NOTE: `**kwargs` is accepted but not used by this transform
        store_attr(self, 'hf_arch, hf_tokenizer, title')
        # these architectures are tokenized with `add_prefix_space=True`
        self.add_prefix_space = hf_arch in ['gpt2', 'roberta', 'bart']

    def encodes(self, inp):
        """Supports both string and list[str] inputs (the latter is common for token classification tasks).
        Returns the numericalized (token_ids) of the input so no need to run this through a Numericalization
        transform."""
        if (isinstance(inp, str)):
            toks = self.hf_tokenizer.tokenize(inp, add_prefix_space=self.add_prefix_space)
        else:
            # tokenize each entity on its own and flatten the resulting sub-tokens into a single list
            toks = [sub_tok for entity in inp
                    for sub_tok in self.hf_tokenizer.tokenize(entity, add_prefix_space=self.add_prefix_space)]

        return tensor(self.hf_tokenizer.convert_tokens_to_ids(toks))

    def decodes(self, encoded_inp):
        """This will get called multiple times for a given encoded input because our batch transform will add
        other elements to it (e.g., attention_mask, token_type_ids, etc...) as required by the defined huggingface
        tokenizer and model.  If it can't decode it, return None."""
        try: return TitledStr(self.hf_tokenizer.decode(encoded_inp.cpu().numpy()))
        # best-effort by design, but don't swallow KeyboardInterrupt/SystemExit like a bare `except:` would
        except Exception: return None
    

`HF_TokenizerTransform` was inspired by [this article](http://dev.fast.ai/tutorial.transformers).  It handles both the tokenization and numericalization traditionally split apart in the fastai text DataBlock API.  Additionally, it's been updated to add a prefix space for the huggingface architectures that need it.

You can pass a string or list into this Transform, the latter being common in token classification tasks like named entity recognition.

In [None]:
#export
@typedispatch
def build_hf_input(task, tokenizer, a_tok_ids, b_tok_ids=None, targets=None,
                   max_length=512, pad_to_max_length=True, truncation_strategy='longest_first'):
    """Assemble the model inputs for one sample from its token ids.

    Runs `tokenizer.prepare_for_model` on `a_tok_ids` (and `b_tok_ids` when given) and returns a tuple of
    (`HF_BaseInput([input_ids, attention_mask, token_type_ids])`, `targets`).  `targets` is passed through
    untouched.  `task` is not used here; per the decorator it exists so overrides can dispatch on its type."""
    encoded = tokenizer.prepare_for_model(a_tok_ids, b_tok_ids,
                                          max_length=max_length,
                                          pad_to_max_length=pad_to_max_length,
                                          truncation_strategy=truncation_strategy,
                                          return_tensors='pt')

    def _first_or_placeholder(key):
        # keep three entries in the result even when the tokenizer does not produce `key`
        if key in encoded: return encoded[key][0]
        return torch.tensor([-9999])

    model_inputs = [encoded['input_ids'][0],
                    _first_or_placeholder('attention_mask'),
                    _first_or_placeholder('token_type_ids')]

    return HF_BaseInput(model_inputs), targets

`build_hf_input` uses fastai's `@typedispatch` decorator to provide complete flexibility in terms of how your numericalized tokens are assembled, and also what you return via `HF_BaseInput` and as your targets.  You can override this implementation as needed by assigning a type to the `task` argument (and optionally the `tokenizer` argument as well).

What you return here is what will be fed into your huggingface model.

In [None]:
#export
class HF_BatchTransform(Transform):
    """Handles everything you need to assemble a mini-batch of inputs and targets"""
    def __init__(self, hf_arch, hf_tokenizer, max_seq_len=512, truncation_strategy='longest_first', task=None,
                 **kwargs):
        # `kwargs` is kept as a dict and forwarded to `build_hf_input` for every sample
        store_attr(self, 'hf_arch, hf_tokenizer, max_seq_len, truncation_strategy, task, kwargs')

    def encodes(self, samples):
        """Turn each `(input, *targets)` sample into `(HF_BaseInput, *targets)` via `build_hf_input`."""
        def _encode_one(sample):
            first = sample[0]
            # a tuple input means a pair of sequences; anything else is treated as a single sequence
            if isinstance(first, tuple):
                a_ids, b_ids = first[0].tolist(), first[1].tolist()
            else:
                a_ids, b_ids = first.tolist(), None

            # the bare `True` fills the `pad_to_max_length` slot of `build_hf_input`
            hf_input, tgts = build_hf_input(self.task, self.hf_tokenizer,
                                            a_ids, b_ids, sample[1:],
                                            self.max_seq_len, True, self.truncation_strategy, **self.kwargs)
            return (hf_input, *tgts)

        return [_encode_one(sample) for sample in samples]

In [None]:
#export
class HF_TextBlock(TransformBlock):
    """`TransformBlock` that tokenizes items with `HF_TokenizerTransform` and assembles mini-batches with
    `hf_batch_tfm` (an `HF_BatchTransform` is created from the other arguments when none is supplied)."""
    def __init__(self, hf_arch, hf_tokenizer, hf_batch_tfm=None, max_seq_len=512, task=None, **kwargs):
        if hf_batch_tfm is None:
            hf_batch_tfm = HF_BatchTransform(hf_arch, hf_tokenizer, max_seq_len, task=task, **kwargs)

        # conditional generation tasks do not use `SortedDL`
        dl_type = None if isinstance(task, ConditionalGenerationTask) else SortedDL

        super().__init__(type_tfms=HF_TokenizerTransform(hf_arch, hf_tokenizer),
                         dl_type=dl_type,
                         dls_kwargs={'before_batch': hf_batch_tfm})

`HF_TextBlock` has been dramatically simplified from its predecessor.  It handles setting up your `HF_TokenizerTransform` transforms and `HF_BatchTransform` transform regardless of data source (e.g., this will work with files, DataFrames, whatever).

In [None]:
#export
@typedispatch
def show_batch(x:HF_BaseInput, y, samples, hf_tokenizer, skip_special_tokens=True, ctxs=None, max_n=6, **kwargs):
    """Display up to `max_n` samples of a batch whose inputs are an `HF_BaseInput`.

    Each entry of `x[0]` is decoded back to text with `hf_tokenizer` (pad tokens stripped) and replaces the
    first element of the matching sample; the remaining elements of each sample are left as they are.  The
    rows are then rendered by the `show_batch` implementation registered for `object`, displayed as a
    DataFrame, and returned."""
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))

    # Tokenizers without a pad token (e.g., gpt2 out of the box) report `pad_token` as None, and
    # `str.replace(None, '')` raises a TypeError, so only strip it when one is actually defined.
    pad_token = hf_tokenizer.pad_token

    def _decode(inp):
        txt = hf_tokenizer.decode(inp, skip_special_tokens=skip_special_tokens)
        return txt.replace(pad_token, '') if pad_token else txt

    samples = L((TitledStr(_decode(inp)), *s[1:]) for inp, s in zip(x[0], samples))

    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)

    display_df(pd.DataFrame(ctxs))
    return ctxs

## Example usage - Multi-class classification

Below demonstrates how to construct your `DataBlock` for a sequence classification task (e.g., a model that requires a single text input).

In [None]:
# Fetch the IMDB sample dataset and load its `texts.csv` into a DataFrame
path = untar_data(URLs.IMDB_SAMPLE)

model_path = Path('models')
imdb_df = pd.read_csv(path/'texts.csv')

In [None]:
# Columns used below: `text` (input), `label` (target), `is_valid` (train/validation split)
imdb_df.head()

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False
2,negative,"Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li...",False
3,positive,"Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie ""Duty, Honor, Country"" are not just mere words blathered from the lips of a high-brassed offic...",False
4,negative,"This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr...",False


There are a bunch of ways we can get at the four huggingface elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR_MODEL_HELPER`.

In [None]:
# The task is passed to the helper along with the pretrained model name
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base" # "distilbert-base-uncased" "bert-base-uncased"
# One call returns the four huggingface elements: architecture name, config, tokenizer, and model
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, task=task)

Once you have those elements, you can create your `DataBlock` as simply as shown below.

In [None]:
# single input: one text block for the inputs and a category block for the targets
blocks = (HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer), CategoryBlock)

# inputs come from the `text` column, targets from `label`, and `is_valid` drives the train/validation split
dblock = DataBlock(blocks=blocks, 
                   get_x=ColReader('text'), get_y=ColReader('label'), 
                   splitter=ColSplitter(col='is_valid'))

In [None]:
# dblock.summary(imdb_df)

In [None]:
# Build the DataLoaders from the DataFrame with a batch size of 4
dls = dblock.dataloaders(imdb_df, bs=4)

In [None]:
# A batch is (inputs, targets): report its length, the number of input entries, and the number of targets
b = dls.one_batch()
len(b), len(b[0]), len(b[1])

(2, 3, 4)

In [None]:
# b[0] follows the order built by `build_hf_input`: input_ids, attention_mask, token_type_ids.
# An entry the tokenizer does not return is a one-element placeholder, hence a trailing dimension of 1.
b[0][0].shape, b[0][1].shape, b[0][2].shape, b[1].shape

(torch.Size([4, 512]),
 torch.Size([4, 512]),
 torch.Size([4, 1]),
 torch.Size([4]))

In [None]:
# `hf_tokenizer` is needed by the `HF_BaseInput` version of `show_batch` to decode token ids back into text
dls.show_batch(hf_tokenizer=hf_tokenizer, max_n=2)

Unnamed: 0,text,category
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic storyline would make the film critic proof. He was right, but it didn't fool me. Raising Victor Vargas is the story about a seventeen-year old boy called, you guessed it, Victor Vargas (Victor Rasuk) who lives his teenage years chasing more skirt than the Rolling Stones could do in all the years they've toured. The movie starts off in `Ugly Fat' Donna's bedroom where Victor is sure to seduce her, but a cry from outside disrupts his plans when his best-friend Harold (Kevin Rivera) comes-a-looking for him. Caught in the attempt by Harold and his sister, Victor Vargas runs off for damage control. Yet even with the embarrassing implication that he's been boffing the homeliest girl in the neighborhood, nothing dissuades young Victor from going off on the hunt for more fresh meat. On a hot, New York City day they make way to the local public swimming pool where Victor's eyes catch a glimpse of the lovely young nymph Judy (Judy Marte), who's not just pretty, but a strong and independent too. The relationship that develops between Victor and Judy becomes the focus of the film. The story also focuses on Victor's family that is comprised of his grandmother or abuelita (Altagracia Guzman), his brother Nino (also played by real life brother to Victor, Silvestre Rasuk) and his sister Vicky (Krystal Rodriguez). The action follows Victor between scenes with Judy and scenes with his family. 
Victor tries to cope with being an oversexed pimp-daddy, his feelings for Judy and his grandmother's conservative Catholic upbringing.<br /><br />The problems that arise from Raising Victor Vargas are a few, but glaring errors. Throughout the film you get to know certain characters like Vicky, Nino, Grandma,",negative
1,"I really wanted to love this show. I truly, honestly did.<br /><br />For the first time, gay viewers get their own version of the ""The Bachelor"". With the help of his obligatory ""hag"" Andra, James, a good looking, well-to-do thirty-something has the chance of love with 15 suitors (or ""mates"" as they are referred to in the show). The only problem is half of them are straight and James doesn't know this. If James picks a gay one, they get a trip to New Zealand, and If he picks a straight one, straight guy gets $25,000. How can this not be fun?! Take my hand, lets stroll: <br /><br />The most glaring problem with this show is the bachelor himself. James is your typical young and successful gay guy with a nice smile and body, the one you'd probably give two glances towards at your local bar before grazing for greener pastures. Why they chose to cast James as the leading man is beyond me. God knows there's so many other hotter and vivacious homosexual men out there dying to be on TV.<br /><br />Aside from his rather average physical appearance, James is about as interesting and exciting as a piece of chalk. Even as such, he has this arrogant, smugly condescending aura about him. However, if James were standing up against a blank, white wall he'd meld right into in it. I honestly can't recall a single interesting or noteworthy thing James said during the course of the show. He is THAT boring and forgettable. In fact, one of the mates flat out advised him he wasn't feeling a connection. I thought that was the best part of the show. Also, James speaks with an excruciatingly annoying lilt. Sound feminine or sound masculine, but don't ****ing segue tones in the middle of sentences...so painful to sit through. I hated him so much all throughout the show I kept thinking, ""Please choose a straight guy and humiliate yourself and your unfortunate looking hag""<br /><br />Then we have the suitors. 
A remarkably bland bunch of men who don't seem to care either way what is happening. Equally vapid, they seem to be indistinguishable from one guy to the next except, ""Hey that guy has blond highlights or oh that one has curly hair"" Again, astoundingly inept casting decisions seem to be the aim of this show. While",negative


## Example usage - Multi-label classification

Below demonstrates how to construct your `DataBlock` for a multi-label classification task.

In [None]:
# creates a dataset with the first 1% of the training set
raw_data = nlp.load_dataset('civil_comments', split='train[:1%]') 
len(raw_data)

Using custom data configuration default


18049

In [None]:
# Convert the loaded dataset into a DataFrame: a `text` column followed by the label score columns
toxic_df = pd.DataFrame(raw_data)
toxic_df.head()

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Is this something I'll be able to install on my site? When will you be releasing it?,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,haha you guys are a bunch of losers.,0.893617,0.021277,0.0,0.0,0.87234,0.021277,0.0


In [None]:
# Every column after the first (`text`) is a label column
lbl_cols = list(toxic_df.columns[1:])
lbl_cols

['toxicity',
 'severe_toxicity',
 'obscene',
 'threat',
 'insult',
 'identity_attack',
 'sexual_explicit']

In [None]:
# Round each label column to 0 decimal places, turning the fractional scores into 0.0/1.0 targets
toxic_df = toxic_df.round({col: 0 for col in lbl_cols})
toxic_df.head()

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Is this something I'll be able to install on my site? When will you be releasing it?,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,haha you guys are a bunch of losers.,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base" # "distilbert-base-uncased" "bert-base-uncased"
# one label per label column
n_labels = len(lbl_cols)

# `num_labels` is passed through `config_kwargs` so the config matches the number of label columns
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, 
                                                                               task=task, 
                                                                               config_kwargs={'num_labels': n_labels})

In [None]:
# single input; the targets are the label columns, which already hold 0.0/1.0 values (hence `encoded=True`)
blocks = (
    HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer), 
    MultiCategoryBlock(encoded=True, vocab=lbl_cols)
)

# no `is_valid` column in this DataFrame, so the train/validation split is random
dblock = DataBlock(blocks=blocks, 
                   get_x=ColReader('text'), 
                   get_y=ColReader(lbl_cols), 
                   splitter=RandomSplitter())

In [None]:
# Build the DataLoaders from the rounded DataFrame with a batch size of 4
dls = dblock.dataloaders(toxic_df, bs=4)

In [None]:
# Targets now have one column per label, so b[1] is (batch size, number of labels)
b = dls.one_batch()
len(b), b[0][0].shape, b[1].shape

(2, torch.Size([4, 512]), torch.Size([4, 7]))

In [None]:
# `hf_tokenizer` is needed by the `HF_BaseInput` version of `show_batch` to decode token ids back into text
dls.show_batch(hf_tokenizer=hf_tokenizer, max_n=2)

Unnamed: 0,text,None
0,"I have had a question about Einstein's Special Theory of Relativity for some time which scientists all seem to run away from. Until 1887 the equations used for Relativity were the Galilean transformation equations.\n\n x'=x-vt\n y'=y\n z'=z\n t'=t\n\nAfter 1887, scientists threw away the Galilean transformation equations and substituted the Lorentz equations. Some time back I began to ask scientists why they had not used two sets of Galilean transformation equations instead of the Lorentz equations.\n \n x = x' - (v2)(t2)'\n y = y'\n z = z'\n",
1,"What’s not on the table: Legislator Salary padding #3\n\nAsk Sen. McGuire why she and Majority colleagues like Sen. Giessel and Reps. Neuman and Keller looted the state treasury by claiming per diem at the federal rate of $292 per day for lodging and food while living at home during the 2015 Anchorage special session- with each of them putting thousands of dollars into their pockets: McGuire ($3,504), Giessel ($1,971), Keller ($1,971), and Neuman ($4,964). If McGuire and the Gang are willing to stuff their own pockets like this, imagine what they could do if they start raiding the Permanent Fund.\n\nhttp://www.alaskapublic.org/2015/06/12/legislatures-per-diem-expenses-approach-200k/\n\nAlso, since ""conservatives"" like Neuman, Giessel, McGuire, Reinbold, and Vasquez always holding up the private sector as a model, be sure to ask them to identify one company in Alaska that pays its employees thousands of dollars in excess of expenses. I bet they can't.\n\nBut they will cut teachers and police.",


## Cleanup

In [None]:
#hide
# Convert the project's notebooks (this one included) with nbdev
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01a_data-language-modeling.ipynb.
Converted 01c_data-question-answering.ipynb.
Converted 01d_data-token-classification.ipynb.
Converted 01e_data-text-generation.ipynb.
Converted 02_modeling-core.ipynb.
Converted 02a_modeling-language-modeling.ipynb.
Converted 02c_modeling-question-answering.ipynb.
Converted 02d_modeling-token-classification.ipynb.
Converted 02e_modeling-text-generation.ipynb.
Converted index.ipynb.
