In [1]:
from datasets import load_dataset_builder, load_dataset, get_dataset_split_names, get_dataset_config_names
import time

In [2]:
_PATH_WIKI_TEXT_ = "wikitext"

# 1. Accessing MetaData

## 1.1. configs available

In [3]:
print(get_dataset_config_names(_PATH_WIKI_TEXT_, trust_remote_code=True))

['wikitext-103-raw-v1', 'wikitext-103-v1', 'wikitext-2-raw-v1', 'wikitext-2-v1']


In [4]:
from datasets import load_dataset_builder
ds_builder = load_dataset_builder(_PATH_WIKI_TEXT_, "wikitext-2-v1")

In [5]:
# inspecting dataset description
ds_builder.info.description

''

In [6]:
# Inspect dataset features
ds_builder.info.features

{'text': Value(dtype='string', id=None)}

# 2. Loading Data

In [7]:
# loading data, train split
dataset_trn = load_dataset(_PATH_WIKI_TEXT_, "wikitext-2-v1", split="train")

In [8]:
# if no split, is given it loads all splits
splits_available = get_dataset_split_names(_PATH_WIKI_TEXT_, "wikitext-2-v1")
print(splits_available)

['test', 'train', 'validation']


## 2.1. Dataset Configurations/Subsets

Many datasets contain serverl sub-datasets. Like `wikitext` has severla sub-datasets, like `['wikitext-103-raw-v1', 'wikitext-103-v1', 'wikitext-2-raw-v1', 'wikitext-2-v1']
`

In [9]:
wiki_text_configs = get_dataset_config_names(_PATH_WIKI_TEXT_, trust_remote_code=True)

In [10]:
wiki_text_configs

['wikitext-103-raw-v1',
 'wikitext-103-v1',
 'wikitext-2-raw-v1',
 'wikitext-2-v1']

In [11]:
wikitext_2_v1 = load_dataset(_PATH_WIKI_TEXT_, 'wikitext-2-raw-v1', split="train")
wikitext_103_v1 = load_dataset(_PATH_WIKI_TEXT_, 'wikitext-103-v1', split="train")

In [12]:
wikitext_2_v1

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [13]:
wikitext_103_v1

Dataset({
    features: ['text'],
    num_rows: 1801350
})

# 3. Know Your Data

## 3.1. Indexing

In [14]:
# indexing by column then row
## slower than indexing by row then column
start_time = time.time()
wikitext_103_v1['text'][3]
end_time = time.time()
print(f"Elapsed time: {end_time - start_time:.4f} seconds")

start_time = time.time()
wikitext_103_v1[3]['text']
end_time = time.time()
print(f"Elapsed time: {end_time - start_time:.4f} seconds")

Elapsed time: 1.1650 seconds
Elapsed time: 0.0001 seconds


## 3.2. Slicing

This returns a subset of the dateset, works like pandas operation

In [15]:
wikitext_2_v1[3:6]

{'text': [' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n',
  " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers . Char

# 4. Iterable dataset
It loads/download dataset one example at a time, so we don't have to wait for the whole dataset to download before we can use it

Iterable dataset is loaded, when `streaming` parameter is `True` in `load_dataset()`


We are loading a image dataset in this example

In [16]:
iterable_ds = load_dataset("ethz/food101", split="train", streaming=True)

In [17]:
next(iter(iterable_ds))

{'image': <PIL.Image.Image image mode=RGB size=384x512>, 'label': 6}

In [18]:
## loading specific number of examples
list(iterable_ds.take(3))

[{'image': <PIL.Image.Image image mode=RGB size=384x512>, 'label': 6},
 {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512>,
  'label': 6},
 {'image': <PIL.Image.Image image mode=RGB size=512x383>, 'label': 6}]