In [8]:
import warnings

import datasets
import transformers

# Reduce library log noise: transformers only reports CRITICAL messages,
# datasets only reports ERROR and above.
transformers.logging.set_verbosity(transformers.logging.CRITICAL)
datasets.logging.set_verbosity(datasets.logging.ERROR)

# Hide FutureWarning messages raised from this point on.
warnings.filterwarnings("ignore", category=FutureWarning)

### Get information about the dataset

In [10]:
from datasets import load_dataset_builder

# Inspect the dataset's metadata (description, features) through its builder.
ds_builder = load_dataset_builder(path="rotten_tomatoes")
builder_info = ds_builder.info

print("Type of ds_builder.info: ", type(builder_info))
print("\nds_builder.info.description: ", builder_info.description)
print("\nds_builder.info.features: ", builder_info.features)

Type of ds_builder.info:  <class 'datasets.info.DatasetInfo'>

ds_builder.info.description:  Movie Review Dataset.
This is a dataset of containing 5,331 positive and 5,331 negative processed
sentences from Rotten Tomatoes movie reviews. This data was first used in Bo
Pang and Lillian Lee, ``Seeing stars: Exploiting class relationships for
sentiment categorization with respect to rating scales.'', Proceedings of the
ACL, 2005.


ds_builder.info.features:  {'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


#### Splits

In [12]:
from datasets import get_dataset_split_names

# The list of split names is shown as the cell's result.
get_dataset_split_names(path="rotten_tomatoes")

['train', 'validation', 'test']

#### Load the dataset

In [14]:
from datasets import load_dataset

# Requesting a single split gives back a Dataset
dataset = load_dataset(path="rotten_tomatoes", split="train")
print(f"type of dataset: {type(dataset)}")

# Without `split`, the result is a DatasetDict
# NOTE(review): the name `datasets` rebinds the `datasets` module imported in
# the first cell, so module attributes (e.g. datasets.logging) are no longer
# reachable through this name afterwards. Consider renaming it (for example
# to `dataset_dict`) once usages in later cells have been checked.
datasets = load_dataset(path="rotten_tomatoes")
print(f"type of datasets: {type(datasets)}")

type of dataset: <class 'datasets.arrow_dataset.Dataset'>


100%|██████████| 3/3 [00:00<00:00, 300.02it/s]

type of datasets: <class 'datasets.dataset_dict.DatasetDict'>





#### Datasets with configurations

When a dataset has several subsets, they are called configurations.  
You must explicitly select one configuration when loading the dataset.

In [18]:
from datasets import get_dataset_config_names, load_dataset

# List the configurations available for this dataset
configs = get_dataset_config_names(path="PolyAI/minds14")
print(configs)

# Example of loading one configuration by name (left disabled):
# mindsFR = load_dataset("PolyAI/minds14", "fr-FR", split="train")

['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN', 'all']


#### Streaming option

In [27]:
from datasets import load_dataset

# With streaming=True the result is an iterable dataset
iterable_dataset = load_dataset(path="food101", split="train", streaming=True)

# Peek at the first element only
for example in iterable_dataset.take(1):
    print("This is an example: ", example)
print("type(iterable_dataset): ", type(iterable_dataset))

# One can get a subset with take; the list is shown as the cell's result
list(iterable_dataset.take(3))

This is an example:  {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=384x512 at 0x29B6A72D6D0>, 'label': 6}
type(iterable_dataset):  <class 'datasets.iterable_dataset.IterableDataset'>


[{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=384x512>,
  'label': 6},
 {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512>,
  'label': 6},
 {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x383>,
  'label': 6}]

#### Set format 

In [30]:
# Switch the output format of `dataset` to "torch" for the listed columns.
# NOTE: set_format changes `dataset` in place (no new object is assigned), so
# any cell that reads `dataset` after this one sees the new format.
dataset.set_format("torch", columns=["text", "label"])
dataset.format["type"]

'torch'