In [1]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset. The result is a
# DatasetDict holding the train / validation / test splits.
raw_datasets = load_dataset(path="glue", name="mrpc")

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [2]:
# Show the test split on its own: its column names and row count.
raw_datasets["test"]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 1725
})

In [3]:
# Integer indexing returns one example as a dict of column name -> value.
raw_datasets["test"][3]

{'sentence1': 'A tropical storm rapidly developed in the Gulf of Mexico Sunday and was expected to hit somewhere along the Texas or Louisiana coasts by Monday night .',
 'sentence2': 'A tropical storm rapidly developed in the Gulf of Mexico on Sunday and could have hurricane-force winds when it hits land somewhere along the Louisiana coast Monday night .',
 'label': 0,
 'idx': 3}

In [4]:
# Slicing returns the last ten rows as a dict of column name -> list of values.
raw_datasets["test"][-10:]

{'sentence1': ['The New York Yankees took third baseman Eric Duncan from Seton Hall Prep in New Jersey with the 27th pick .',
  'Gehring waived extradition Monday during a hearing in San Jose , and authorities said they expected him back in New Hampshire on Tuesday .',
  '" I am advised that certain allegations of criminal conduct have been interposed against my counsel , " said Silver .',
  'Crews worked to install a new culvert and prepare the highway so motorists could use the eastbound lanes for travel as storm clouds threatened to dump more rain .',
  "The deal , approved by both companies ' board of directors , is expected to be completed in the third quarter of Nvidia 's fiscal third quarter .",
  'After Hughes refused to rehire Hernandez , he complained to the Equal Employment Opportunity Commission .',
  'There are 103 Democrats in the Assembly and 47 Republicans .',
  'Bethany Hamilton remained in stable condition Saturday after the attack Friday morning .',
  'Last week the 

In [5]:
# `features` gives the type of every column; `label` is a ClassLabel whose
# `names` list the label classes.
raw_datasets["test"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [7]:
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the sentence pair stored in ``example``.

    ``example["sentence1"]`` and ``example["sentence2"]`` are passed to the
    tokenizer together as a pair, padded to a fixed length of 128 and
    truncated when longer. The tokenizer output is returned as-is so that
    ``map`` can add it to the dataset as new columns.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(
        first,
        second,
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# No `batched` argument here: the function is called once per example.
tokenized_datasets = raw_datasets.map(tokenize_function)
print(tokenized_datasets.column_names)
tokenized_datasets

{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [8]:
from transformers import AutoTokenizer

# Reuses `tokenizer` and `tokenize_function` from the earlier tokenization
# cell instead of reloading the checkpoint and redefining the same function.
#
# batched=True makes `map` hand `tokenize_function` a batch of examples per
# call (each column arrives as a list) rather than a single row. The function
# needs no change for this.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [10]:
# Columns that are not model inputs: the raw sentences and the example index.
unused_columns = ["sentence1", "sentence2", "idx"]

# On a DatasetDict, `column_names` maps each split name to its column list.
split_columns = tokenized_datasets.column_names

# This cell rebinds `tokenized_datasets`, so on a second run the columns below
# are already gone. Only drop / rename what is still present in every split so
# that re-running the cell is a no-op instead of an error.
columns_to_drop = [
    column
    for column in unused_columns
    if all(column in columns for columns in split_columns.values())
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

if all("label" in columns for columns in split_columns.values()):
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Return rows as TensorFlow tensors when the dataset is indexed.
tokenized_datasets = tokenized_datasets.with_format("tensorflow")
tokenized_datasets["test"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1725
})

In [11]:
# Keep only the first 100 rows of the training split as a small subset.
train_split = tokenized_datasets["train"]
small_train_dataset = train_split.select(range(0, 100))
small_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [None]:
# Done.