### Hugging Face Datasets Library
 The names of the datasets provided by the GLUE benchmark are covered in videos 22 and 23, and are also listed at the link below:
 
  https://huggingface.co/docs/datasets/glue.html

MRPC is one of the datasets provided by this benchmark; it tests whether
two sentences are paraphrases of each other.

In [2]:
from datasets import load_dataset

# Load the dataset
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

# The output is a DatasetDict object, which contains each split of the Dataset.

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [3]:
# Connect to each split by indexing!
raw_datasets['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [4]:
# The number of training examples is shown as num_rows in the output above: 3668.

In [5]:
# Access a given element by its index:
raw_datasets['train'][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [7]:
# Access a slice of your dataset:
raw_datasets['train'][:5]

{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
  'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
  'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
  'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .'],
 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
  "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
  'Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at 

In [8]:
# Access the sentence1 of the first element:
raw_datasets['train'][0]['sentence1']

'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .'

In [9]:
# Access the first 5 sentences of sentence1
raw_datasets['train'][:5]['sentence1']

['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .']

In [10]:
# Use the features attribute to see the information your dataset contains:
raw_datasets['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [11]:
'''Remember features are the input variables to your model.'''

'Remember features are the input variables to your model.'

In [14]:
# To preprocess all the elements of the dataset we need to tokenize them!

from transformers import AutoTokenizer

checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the 'sentence1'/'sentence2' pair of `example` with the notebook's tokenizer.

    The pair is padded and truncated to a fixed maximum length of 128, and the
    tokenizer's result is returned unchanged.
    """
    first_sentence = example['sentence1']
    second_sentence = example['sentence2']
    # Fixed-length encoding: pad up to max_length and cut anything longer.
    encoding_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(first_sentence, second_sentence, **encoding_options)
tokenized_datasets = raw_datasets.map(tokenize_function)
print(tokenized_datasets.column_names)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}


In [15]:
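'''To speed up tokenization, pass batched=True so the tokenizer receives many examples per call.
The map method can also use multiprocessing via its num_proc argument.'''

'To speed up tokenization, pass batched=True so the tokenizer receives many examples per call.\nThe map method can also use multiprocessing via its num_proc argument.'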

In [17]:
# `tokenizer`, `tokenize_function` and `raw_datasets` are already defined by the
# earlier cells, so they are reused here rather than imported/defined a second time.
# batched=True passes a batch of examples to tokenize_function on each call
# (every column arrives as a list of values), instead of one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [18]:
'''Once done, we are almost ready for training!
Just remove the columns we don't need anymore by the remove columns function
Rename the column label to labels
and use the format torch'''

"Once done, we are almost ready for training!\nJust remove the columns we don't need anymore by the remove columns function\nRename the column label to labels\nand use the format torch"

In [20]:
# Drop the raw text/index columns, rename 'label' to 'labels', and request
# torch-formatted outputs (the format could also be tensorflow or numpy).
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
    .with_format("torch")
)
tokenized_datasets["train"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [21]:
'''If needed we can also generate a short sample of the dataset using the select method!'''


'If needed we can also generate a short sample of the dataset using the select method!'

In [23]:
small_train_dataset = tokenized_datasets["train"].select(range(10))
small_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10
})

In [None]:
'''As you can see, the number of training examples has now reduced to 10'''