## Imports

In [1]:
import datasets
import pandas as pd

from datasets import load_dataset

In [1]:
from onnxruntime import  get_all_providers
print(get_all_providers())

['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'OpenVINOExecutionProvider', 'DnnlExecutionProvider', 'TvmExecutionProvider', 'VitisAIExecutionProvider', 'QNNExecutionProvider', 'NnapiExecutionProvider', 'VSINPUExecutionProvider', 'JsExecutionProvider', 'CoreMLExecutionProvider', 'ArmNNExecutionProvider', 'ACLExecutionProvider', 'DmlExecutionProvider', 'RknpuExecutionProvider', 'WebNNExecutionProvider', 'XnnpackExecutionProvider', 'CANNExecutionProvider', 'AzureExecutionProvider', 'CPUExecutionProvider']


## Dataset

In [2]:
cola_dataset = load_dataset('glue', 'cola')

Downloading data:   0%|          | 0.00/251k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [3]:
cola_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [4]:
train_dataset = cola_dataset['train']
val_dataset = cola_dataset['validation']
test_dataset = cola_dataset['test']

len(train_dataset), len(val_dataset), len(test_dataset)

(8551, 1043, 1063)

In [5]:
train_dataset[0]

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}

In [14]:
val_dataset[0]

{'sentence': 'The sailors rode the breeze clear of the rocks.',
 'label': 1,
 'idx': 0}

In [15]:
test_dataset[0]

{'sentence': 'Bill whistled past the house.', 'label': -1, 'idx': 0}

In [16]:
train_dataset.features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['unacceptable', 'acceptable'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [17]:
train_dataset.filter(lambda example: example['label'] == train_dataset.features['label'].str2int('acceptable'))[:5]


Filter:   0%|          | 0/8551 [00:00<?, ? examples/s]

{'sentence': ["Our friends won't buy this analysis, let alone the next one we propose.",
  "One more pseudo generalization and I'm giving up.",
  "One more pseudo generalization or I'm giving up.",
  'The more we study verbs, the crazier they get.',
  'Day by day the facts are getting murkier.'],
 'label': [1, 1, 1, 1, 1],
 'idx': [0, 1, 2, 3, 4]}

In [18]:
train_dataset.filter(lambda example: example['label'] == train_dataset.features['label'].str2int('unacceptable'))[:5]


Filter:   0%|          | 0/8551 [00:00<?, ? examples/s]

{'sentence': ['They drank the pub.',
  'The professor talked us.',
  'We yelled ourselves.',
  'We yelled Harry hoarse.',
  'Harry coughed himself.'],
 'label': [0, 0, 0, 0, 0],
 'idx': [18, 20, 22, 23, 25]}

## Tokenizing

In [19]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")
tokenizer

config.json:   0%|          | 0.00/382 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



BertTokenizerFast(name_or_path='google/bert_uncased_L-2_H-128_A-2', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

This is the configuration of a `BertTokenizerFast`, which is part of the Hugging Face Transformers library.

### 1. **BertTokenizerFast**:
This is a tokenizer specifically designed for BERT models. It uses a "fast" implementation that is highly optimized for speed (based on the `tokenizers` library by Hugging Face). 

Here’s a breakdown of its components:

- **name_or_path**: 
   - `'google/bert_uncased_L-2_H-128_A-2'` refers to the name or path of the BERT model for which this tokenizer is designed. This specific model name suggests it is a variant of BERT that is uncased (doesn't differentiate between uppercase and lowercase characters), with only **2 layers (L-2)**, **128 hidden units (H-128)**, and **2 attention heads (A-2)**. This is a smaller BERT model, typically used for fast, lightweight tasks.
   
- **vocab_size=30522**:
   - This refers to the size of the vocabulary used by the tokenizer. In this case, the vocabulary has 30,522 unique tokens (words or subword units).

- **model_max_length=1000000000000000019884624838656**:
   - This number seems unusually large. Typically, `model_max_length` specifies the maximum length of tokens the model can handle (for BERT, it's often 512). This value could be a placeholder or could indicate no predefined limit is set.

- **is_fast=True**:
   - This indicates that the fast version of the tokenizer is used, optimized for speed.

- **padding_side='right'** and **truncation_side='right'**:
   - These specify that padding and truncation should occur on the right side of the input sequence. This means if the input sequence is shorter than expected, padding tokens will be added to the right. If it’s longer than allowed, it will be truncated from the right side.

- **special_tokens**:
   - This dictionary defines the special tokens used by BERT. These tokens serve specific purposes in the model:
     - `'unk_token': '[UNK]'`: Represents unknown tokens (words not in the tokenizer’s vocabulary).
     - `'sep_token': '[SEP]'`: Used to separate segments in BERT (especially in sentence pair tasks).
     - `'pad_token': '[PAD]'`: Used for padding short sequences.
     - `'cls_token': '[CLS]'`: Added at the beginning of every input sequence for classification tasks.
     - `'mask_token': '[MASK]'`: Used in masked language modeling tasks, where some tokens are masked and the model tries to predict them.

- **clean_up_tokenization_spaces=True**:
   - This is a flag that indicates the tokenizer should clean up extra spaces in the tokenized output. It ensures there aren’t unnecessary spaces between tokens, which can be useful for readability when decoding the tokens back into text.

### 2. **added_tokens_decoder**:
This part defines how specific tokens are added to the tokenizer's vocabulary, and their characteristics:

- **AddedToken**: This is a special token added to the tokenizer’s vocabulary. The properties of each token define how it behaves:
   - **"[PAD]"** (token ID 0): Represents padding.
   - **"[UNK]"** (token ID 100): Represents unknown tokens.
   - **"[CLS]"** (token ID 101): Used for classification.
   - **"[SEP]"** (token ID 102): Used for separating sentences.
   - **"[MASK]"** (token ID 103): Used for masked language modeling tasks.

Each of these tokens has additional attributes:
- **rstrip=False**, **lstrip=False**: These indicate that the token should not strip spaces to the right or left when added to the tokenized sequence.
- **single_word=False**: This means that the token is not constrained to a single word but can appear within larger sequences.
- **normalized=False**: Indicates that the token should not undergo additional normalization (such as lowercasing or other transformations).
- **special=True**: This flag marks the token as special, so it has a unique role in processing (e.g., padding, masking, etc.).

### Summary:
In essence, this tokenizer is designed to work with a small, uncased version of BERT (2 layers, 128 hidden units), optimized for speed, with a vocabulary size of 30,522. It defines how special tokens like `[PAD]`, `[CLS]`, `[SEP]`, etc., should be handled and includes additional configurations for padding, truncation, and token cleaning.


In [20]:
print(train_dataset[0]['sentence'])
tokenizer(train_dataset[0]['sentence'])

Our friends won't buy this analysis, let alone the next one we propose.


{'input_ids': [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Process of tokenizing a sentence using a BERT tokenizer and explains how the tokenized input is structured when using the Hugging Face `BertTokenizerFast`. 

### 1. **`print(train_dataset[0]['sentence'])`**:
   This line prints the first sentence from the training dataset. In your example, the sentence is:

   ```
   Our friends won't buy this analysis, let alone the next one we propose.
   ```

### 2. **`tokenizer(train_dataset[0]['sentence'])`**:
   This line tokenizes the same sentence using the BERT tokenizer, which outputs a dictionary containing the following fields:
   
   - **`input_ids`**: This is the list of token IDs corresponding to each word or subword in the sentence. BERT uses a predefined vocabulary to map words and subwords to numeric values. The tokenizer breaks the sentence down into tokens, converts them into IDs, and adds special tokens like `[CLS]` and `[SEP]`.
   
   Here is the mapping for each token:

   | Token | Input ID |
   |-------|----------|
   | [CLS] | 101      |
   | Our   | 2256     |
   | friends | 2814    |
   | won | 2180       |
   | 't | 1005        |
   | buy | 4965       |
   | this | 2023      |
   | analysis | 4106  |
   | , | 1010         |
   | let | 2292       |
   | alone | 2894     |
   | the | 1996       |
   | next | 2279      |
   | one | 2028       |
   | we | 2057        |
   | propose | 16599  |
   | . | 1012         |
   | [SEP] | 102      |

   - **Special Tokens**: `[CLS]` (ID: 101) is added at the beginning of the sentence and `[SEP]` (ID: 102) is added at the end. These are standard tokens for BERT models:
     - `[CLS]`: BERT uses this token at the beginning for classification tasks or sentence understanding.
     - `[SEP]`: BERT uses this token to signify the end of a sentence, or between two sentences in sentence-pair tasks.

   - **`token_type_ids`**: These IDs specify whether a token belongs to the first sentence (marked as 0) or the second sentence (marked as 1). In this case, the sentence is not part of a pair, so all the tokens are marked with `0`.
     ```
     'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
     ```

   - **`attention_mask`**: This mask indicates which tokens should be attended to by the model. The value `1` means the token is a real token (not padding), and `0` would mean it's a padding token (in case the sentence was padded to a certain length). Since there are no padding tokens in this example, all the values are `1`.
     ```
     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
     ```

### What does each part mean?
Let’s break it down based on each key of the dictionary:

#### 1. **`input_ids`**:
   These are the token IDs that represent the tokenized sentence. Each word, or subword (since BERT uses subword tokenization), is converted into its corresponding numeric ID from the BERT vocabulary. In this case, the sentence "Our friends won't buy this analysis, let alone the next one we propose." is tokenized, with special tokens `[CLS]` and `[SEP]` added at the beginning and end, respectively.

#### 2. **`token_type_ids`**:
   These indicate which segment the token belongs to. Since BERT can handle sentence pairs, it needs to know which tokens belong to the first sentence (assigned 0) and which belong to the second sentence (assigned 1). Here, since we only have a single sentence, all the tokens are marked with 0.

#### 3. **`attention_mask`**:
   This mask is used to tell the model which tokens should be attended to and which tokens are padding. A value of 1 means that the model should pay attention to this token, and a value of 0 means it should ignore it (usually for padding tokens). In this case, all the tokens are relevant, so the entire attention mask is filled with 1s.

### Summary of the tokenization process:
- The sentence is split into tokens, each represented by an `input_id`.
- Special tokens `[CLS]` and `[SEP]` are added to indicate the start and end of the sentence.
- Each token is given a `token_type_id`, which is 0 for this single sentence.
- An `attention_mask` is created to inform the model which tokens to focus on during processing.



In [21]:
tokenizer.decode(tokenizer(train_dataset[0]['sentence'])['input_ids'])

"[CLS] our friends won't buy this analysis, let alone the next one we propose. [SEP]"

In [22]:
def encode(examples):
    return tokenizer(
            examples["sentence"],
            truncation=True,
            padding="max_length",
            max_length=512,
        )

train_dataset = train_dataset.map(encode, batched=True)

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

This defines a function `encode` and applies it to a dataset (`train_dataset`) using the `map` function from the Hugging Face datasets library. 

### 1. **The `encode` Function**:
   ```python
   def encode(examples):
       return tokenizer(
           examples["sentence"],
           truncation=True,
           padding="max_length",
           max_length=512,
       )
   ```

This function takes in a batch of examples from the dataset and tokenizes them using the `tokenizer`. Here's how it works:

- **`examples["sentence"]`**: This extracts the `"sentence"` field from the input batch. Presumably, `train_dataset` contains a column named `"sentence"` that holds the text data you want to tokenize.

- **`truncation=True`**: This tells the tokenizer to truncate the sentence if its tokenized form exceeds a specified maximum length. In this case, it ensures that if the sentence is longer than `max_length=512`, it will be truncated to fit the limit.

- **`padding="max_length"`**: This ensures that each tokenized sequence is padded to the maximum length (`512` in this case). Padding tokens (`[PAD]`) will be added to the end of the sequence (or wherever specified) if it is shorter than 512 tokens. This makes sure that all tokenized examples are the same length.

- **`max_length=512`**: This sets the maximum length of the tokenized sequences to 512 tokens. If a sentence is longer than 512 tokens, it will be truncated. If it’s shorter, padding tokens will be added to make it exactly 512 tokens long.

- **Return value**: The tokenizer will return a dictionary of tokenized representations, including fields such as:
  - `input_ids`: The list of token IDs corresponding to the input text.
  - `attention_mask`: A mask that indicates which tokens are real and which are padding (1 for real tokens, 0 for padding).
  - `token_type_ids`: If applicable, this distinguishes between different input sequences (but for single-sentence tasks, it’s all zeros).

### 2. **Mapping the `encode` Function to the Dataset**:
   ```python
   train_dataset = train_dataset.map(encode, batched=True)
   ```

Here’s what this line does:

- **`train_dataset.map(encode, batched=True)`**: This applies the `encode` function to each example (or batch of examples) in the `train_dataset`.
  - **`batched=True`**: This tells the `map` function to process examples in batches rather than one at a time, which is more efficient. The `encode` function will receive a batch of sentences instead of a single sentence.
  - The `map` function will apply the `encode` function to every example in the dataset, transforming each sentence into its tokenized form (with `input_ids`, `attention_mask`, etc.).

After the `map` operation, `train_dataset` will contain the tokenized representations of the sentences, which are ready to be used in a model like BERT.

### Step-by-Step Process:

1. **Batch of Examples**: The dataset is passed in batches (thanks to `batched=True`). For each batch, the `"sentence"` field is extracted.

2. **Tokenization**: The `tokenizer` converts each sentence into token IDs and applies:
   - **Truncation** if the sentence exceeds 512 tokens.
   - **Padding** if the sentence is shorter than 512 tokens.
   
3. **Returned Tokenized Output**: The `tokenizer` returns a dictionary containing:
   - `input_ids`: The tokenized form of the sentence.
   - `attention_mask`: Indicates which tokens are real and which are padding.
   - (Optional) `token_type_ids`: If needed, distinguishes between sentence segments.

4. **Dataset Update**: After applying `map`, the `train_dataset` now contains the tokenized versions of each sentence, ready to be passed to a BERT model or another transformer-based model for training or inference.

### Example Output:
For a single sentence, the dataset will now look something like this:

```python
{
  'input_ids': [101, 2026, 2814, ..., 102, 0, 0, ..., 0],  # Padded/truncated to 512 tokens
  'attention_mask': [1, 1, 1, ..., 1, 0, 0, ..., 0],  # 1s for real tokens, 0s for padding
  'token_type_ids': [0, 0, 0, ..., 0],  # All 0s if single sentence task
}
```

### Summary:
- The `encode` function tokenizes the sentences from the dataset.
- It applies truncation if the sentence is longer than 512 tokens.
- It pads the sentence to 512 tokens if it is shorter.
- The `map` function applies this transformation to every sentence in the dataset in batches, converting the raw text into a format that can be fed into a transformer model.



## Formating

In [24]:
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


## Data Loader

In [25]:
import torch
dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
next(iter(dataloader))

{'label': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
         1, 0, 0, 1, 1, 1, 1, 1]),
 'input_ids': tensor([[  101,  2256,  2814,  ...,     0,     0,     0],
         [  101,  2028,  2062,  ...,     0,     0,     0],
         [  101,  2028,  2062,  ...,     0,     0,     0],
         ...,
         [  101,  5965, 12808,  ...,     0,     0,     0],
         [  101,  2198, 10948,  ...,     0,     0,     0],
         [  101,  3021, 24471,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [26]:
for batch in dataloader:
    print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['label'].shape)

torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) to