# Tutorial

## Datasets import

In [1]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [75]:
from pprint import pprint

## Loading Dataset

In [17]:
print(datasets.__version__)

3.6.0


In [18]:
from datasets import load_dataset

In [19]:
imdb_dataset = load_dataset("stanfordnlp/imdb")
print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


# Train Dataset

In [20]:
imdb_train_dataset  = imdb_dataset['train']
imdb_train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [21]:
imdb_train_dataset[:5]

{'text': ['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far b

# Remove the unsupervised

In [22]:
_ = imdb_dataset.pop('unsupervised')
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

# Just download the one split

In [23]:
train_split = load_dataset('stanfordnlp/imdb', split = 'train')
print(train_split)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


# Spliting train and test

In [24]:
small_ds = train_split.train_test_split(test_size=0.2)
small_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

# Dataset with local data/files

In [25]:
data_files = ['G:\Work\sachin\gdac.broadinstitute.org_BRCA.mRNA_Preprocess_Median.Level_3.2016012800.0.0\BRCA.medianexp.txt']
local_dataset = load_dataset('text', data_files=data_files)
local_dataset

  data_files = ['G:\Work\sachin\gdac.broadinstitute.org_BRCA.mRNA_Preprocess_Median.Level_3.2016012800.0.0\BRCA.medianexp.txt']


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 17816
    })
})

In [26]:
local_dataset['train'][:5]

{'text': ['Hybridization REF\tTCGA-A1-A0SD-01A-11R-A115-07\tTCGA-A1-A0SE-01A-11R-A084-07\tTCGA-A1-A0SH-01A-11R-A084-07\tTCGA-A1-A0SJ-01A-11R-A084-07\tTCGA-A1-A0SK-01A-12R-A084-07\tTCGA-A1-A0SM-01A-11R-A084-07\tTCGA-A1-A0SO-01A-22R-A084-07\tTCGA-A1-A0SP-01A-11R-A084-07\tTCGA-A2-A04N-01A-11R-A115-07\tTCGA-A2-A04P-01A-31R-A034-07\tTCGA-A2-A04Q-01A-21R-A034-07\tTCGA-A2-A04R-01A-41R-A109-07\tTCGA-A2-A04T-01A-21R-A034-07\tTCGA-A2-A04U-01A-11R-A115-07\tTCGA-A2-A04V-01A-21R-A034-07\tTCGA-A2-A04W-01A-31R-A115-07\tTCGA-A2-A04X-01A-21R-A034-07\tTCGA-A2-A04Y-01A-21R-A034-07\tTCGA-A2-A0CL-01A-11R-A115-07\tTCGA-A2-A0CM-01A-31R-A034-07\tTCGA-A2-A0CP-01A-11R-A034-07\tTCGA-A2-A0CQ-01A-21R-A034-07\tTCGA-A2-A0CS-01A-11R-A115-07\tTCGA-A2-A0CT-01A-31R-A056-07\tTCGA-A2-A0CU-01A-12R-A034-07\tTCGA-A2-A0CV-01A-31R-A115-07\tTCGA-A2-A0CW-01A-21R-A115-07\tTCGA-A2-A0CX-01A-21R-A00Z-07\tTCGA-A2-A0CY-01A-12R-A034-07\tTCGA-A2-A0CZ-01A-11R-A034-07\tTCGA-A2-A0D0-01A-11R-A00Z-07\tTCGA-A2-A0D1-01A-11R-A034-07\tTCGA-A2-A0

# Saving to local disk as in arrow format

In [28]:
local_dataset.save_to_disk('local_dataset')

Saving the dataset (1/1 shards): 100%|██████████| 17816/17816 [00:00<00:00, 179601.50 examples/s]


In [29]:
from datasets import load_from_disk

# Import the saved local dataset

In [30]:
local_dataset_disk = load_from_disk('local_dataset')
local_dataset_disk

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 17816
    })
})

# Accessing the samples using index

In [31]:
idx = 1000
sample = imdb_dataset['train'][idx]
sample

{'text': 'Although I have to admit I laughed more watching this movie than the last few comedies I saw.<br /><br />The budget must have consisted of pocket change from the actors. The production values are so low that they actual made it kind of fun to watch. Reminds me of the Robot Monster made up of a guy in a gorilla suit with a cardboard diving helmet on.<br /><br />In one scene a hapless victim gets their arm and leg cut off. Geez, hard to believe but the Black Knight scene from Holy Grail was more realistic. I kept wondering why the victim didn\'t start shouting " None Shall Pass" and " It\'s only a flesh wound, I\'ve had worse". It was one of the funniest scenes I\'ve seen in the past year.<br /><br />The "gladiator/demon" was a stitch too. Between the horribly cheap costume and the geeky look of the guy in it the end result was hysterical.<br /><br />Truly a movie that is bad enough to be watchable. Kind of like seeing a slow motion auto accident on film.<br /><br />',
 'label'

# Select subset of a dataset

In [68]:
example = imdb_dataset['train'].select([idx])
example

Dataset({
    features: ['text', 'label'],
    num_rows: 1
})

# Select a range of samples

In [69]:
idx = range(0, 100, 2) # take even indices
example = imdb_dataset['train'].select(idx)
example

Dataset({
    features: ['text', 'label'],
    num_rows: 50
})

# WMT dataset

In [70]:
from datasets import get_dataset_config_names, get_dataset_split_names
print(get_dataset_config_names('wmt/wmt14'))

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


['cs-en', 'de-en', 'fr-en', 'hi-en', 'ru-en']


In [72]:
get_dataset_split_names('wmt/wmt14', 'hi-en')

['train', 'validation', 'test']

# Now load the dataset

In [73]:
translation_dataset = load_dataset(path='wmt/wmt14', name='hi-en')
translation_dataset

Generating train split: 100%|██████████| 32863/32863 [00:00<00:00, 1973560.50 examples/s]
Generating validation split: 100%|██████████| 520/520 [00:00<00:00, 95958.38 examples/s]
Generating test split: 100%|██████████| 2507/2507 [00:00<00:00, 341677.34 examples/s]


DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 32863
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

# Managing splits

In [74]:
raw_dataset = load_dataset(path='wmt/wmt14', name='hi-en', split='train+test+validation')
print(raw_dataset)
print(len(raw_dataset))

Dataset({
    features: ['translation'],
    num_rows: 35890
})
35890


# Features

In [76]:
pprint(translation_dataset['train'].features)

{'translation': Translation(languages=['hi', 'en'], id=None)}


# Glue

In [77]:
mrpc_dataset = load_dataset('glue', 'mrpc', split='train') # Load MRPC from benchmark
pprint(mrpc_dataset)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 511170.78 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 90171.57 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 318393.52 examples/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})





In [78]:
pprint(mrpc_dataset.features)

{'idx': Value(dtype='int32', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None)}


# COMMON METHODS

In [79]:
import multiprocessing

In [81]:
print(multiprocessing.cpu_count())

8


# Filtering

In [82]:
print('Before Filtering')
print(20*'-')
print(imdb_dataset)

Before Filtering
--------------------
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


In [83]:
num_words = 100
imdb_filtered_dataset = imdb_dataset.filter(lambda example:len(example['text'].split())>=num_words)
print('After Filtering')
print(20*'-')
print(imdb_filtered_dataset)

Filter: 100%|██████████| 25000/25000 [00:00<00:00, 38208.15 examples/s]
Filter: 100%|██████████| 25000/25000 [00:00<00:00, 43494.65 examples/s]

After Filtering
--------------------
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 22074
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 21909
    })
})





# MAP

In [None]:
def add_prefix(example):
  example['text']="IMDB:" + example['text']
  return example

In [85]:
imdb_prefixed_dataset = imdb_dataset.map(add_prefix)
print(imdb_prefixed_dataset)

Map: 100%|██████████| 25000/25000 [00:01<00:00, 12738.47 examples/s]
Map: 100%|██████████| 25000/25000 [00:02<00:00, 11251.95 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})





In [86]:
print(imdb_prefixed_dataset['train']['text'][1000])

IMDBAlthough I have to admit I laughed more watching this movie than the last few comedies I saw.<br /><br />The budget must have consisted of pocket change from the actors. The production values are so low that they actual made it kind of fun to watch. Reminds me of the Robot Monster made up of a guy in a gorilla suit with a cardboard diving helmet on.<br /><br />In one scene a hapless victim gets their arm and leg cut off. Geez, hard to believe but the Black Knight scene from Holy Grail was more realistic. I kept wondering why the victim didn't start shouting " None Shall Pass" and " It's only a flesh wound, I've had worse". It was one of the funniest scenes I've seen in the past year.<br /><br />The "gladiator/demon" was a stitch too. Between the horribly cheap costume and the geeky look of the guy in it the end result was hysterical.<br /><br />Truly a movie that is bad enough to be watchable. Kind of like seeing a slow motion auto accident on film.<br /><br />


# CONCATENATE DATASETS

In [87]:
imdb_dataset_whole = load_dataset('stanfordnlp/imdb', split='train+test') # split="all" will load duplicates in this case
print(imdb_dataset_whole)
print(imdb_dataset_whole.features)

Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


In [88]:
rt_dataset_whole = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='all')
print(rt_dataset_whole)
print(rt_dataset_whole.features)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 8530/8530 [00:00<00:00, 1047931.02 examples/s]
Generating validation split: 100%|██████████| 1066/1066 [00:00<00:00, 216571.96 examples/s]
Generating test split: 100%|██████████| 1066/1066 [00:00<00:00, 242705.90 examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 10662
})
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}





In [89]:
concat_dataset = datasets.concatenate_datasets([imdb_dataset_whole, rt_dataset_whole], axis=0)
concat_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 60662
})

# Inter-leave datasets

In [90]:
from datasets import interleave_datasets
inter_datasets = interleave_datasets([imdb_dataset_whole, rt_dataset_whole], probabilities=[0.6, 0.4])
inter_datasets

Dataset({
    features: ['text', 'label'],
    num_rows: 26906
})

# ITERABLE DATASET

In [91]:
imdb_iter_dataset = load_dataset('stanfordnlp/imdb',split='train', streaming=True)
imdb_filtered_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 22074
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 21909
    })
})

In [93]:
for example in imdb_iter_dataset.map(add_prefix):
  print(example)
  break

{'text': 'IMDBI rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and fa

# 1.

# Datasets

In [32]:
import datasets

# Loading a Dataset

In [33]:
from datasets import get_dataset_config_names
get_dataset_config_names('ai4bharat/naamapadam')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


['as', 'bn', 'gu', 'hi', 'kn', 'ml', 'mr', 'or', 'pa', 'ta', 'te']

In [34]:
from datasets import load_dataset

hi_dataset = load_dataset('ai4bharat/naamapadam', 'hi')

Generating train split: 100%|██████████| 985787/985787 [00:01<00:00, 577065.24 examples/s]
Generating test split: 100%|██████████| 867/867 [00:00<00:00, 164129.88 examples/s]
Generating validation split: 100%|██████████| 13460/13460 [00:00<00:00, 487928.96 examples/s]


In [35]:
hi_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 985787
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 867
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 13460
    })
})

In [36]:
ta_dataset = load_dataset('ai4bharat/naamapadam', 'ta')

Generating train split: 100%|██████████| 497882/497882 [00:00<00:00, 813451.57 examples/s]
Generating test split: 100%|██████████| 758/758 [00:00<00:00, 159658.64 examples/s]
Generating validation split: 100%|██████████| 2795/2795 [00:00<00:00, 382108.20 examples/s]


In [37]:
ta_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 497882
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 758
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2795
    })
})

In [38]:
ta_dataset['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [39]:
hi_dataset['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

# 2.

In [40]:
cache_files = ta_dataset.cache_files
cache_files

{'train': [{'filename': 'C:\\Users\\smlab\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\naamapadam-train.arrow'}],
 'test': [{'filename': 'C:\\Users\\smlab\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\naamapadam-test.arrow'}],
 'validation': [{'filename': 'C:\\Users\\smlab\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\naamapadam-validation.arrow'}]}

In [41]:
for key in cache_files:
  print(cache_files[key][0]['filename'].split('/')[-1])

C:\Users\smlab\.cache\huggingface\datasets\ai4bharat___naamapadam\ta\1.0.0\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\naamapadam-train.arrow
C:\Users\smlab\.cache\huggingface\datasets\ai4bharat___naamapadam\ta\1.0.0\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\naamapadam-test.arrow
C:\Users\smlab\.cache\huggingface\datasets\ai4bharat___naamapadam\ta\1.0.0\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\naamapadam-validation.arrow


# 3.

In [42]:
from datasets import load_dataset_builder

ds_builder = load_dataset_builder('ai4bharat/naamapadam', 'ta')
# this return the value in bytes
print(f"the size of the dataset is: {ds_builder.info.dataset_size /(1024*2):.2f} MB")

the size of the dataset is: 92080.66 MB


# 4.

In [43]:
ta_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 497882
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 758
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2795
    })
})

# 5.

In [44]:
def num_of_tokens(example):
  return {'num_tokens': len(example['tokens'])}

In [45]:
ta_dataset['train'][0]['tokens']

['பைரவருக்கு',
 'தேய்பிறை',
 'அஷ்டமியில்',
 'விசேஷ',
 'அபிஷேக',
 'ஆராதனைகள்',
 'நடைபெறுகின்றன',
 '.']

In [46]:
ta_dataset = ta_dataset.map(num_of_tokens)

Map: 100%|██████████| 497882/497882 [00:55<00:00, 8902.58 examples/s]
Map: 100%|██████████| 758/758 [00:00<00:00, 7468.59 examples/s]
Map: 100%|██████████| 2795/2795 [00:00<00:00, 8879.44 examples/s]


In [47]:
ta_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'num_tokens'],
        num_rows: 497882
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'num_tokens'],
        num_rows: 758
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'num_tokens'],
        num_rows: 2795
    })
})

In [48]:
ta_dataset.column_names

{'train': ['tokens', 'ner_tags', 'num_tokens'],
 'test': ['tokens', 'ner_tags', 'num_tokens'],
 'validation': ['tokens', 'ner_tags', 'num_tokens']}

In [49]:
total_num_of_tokens = 0
for split_name, dataset in ta_dataset.items():
  col_len = sum(row['num_tokens'] for row in dataset)
  total_num_of_tokens +=col_len
print(total_num_of_tokens)

6001876


# 6.

In [50]:

ds_builder = load_dataset_builder("ai4bharat/naamapadam", 'ta')
print(f"the size of the dataset is: {ds_builder.info.dataset_size /(1024*2):.2f} MB")

the size of the dataset is: 92080.66 MB


# 7.

In [51]:
def convert_to_word(example):
  return {'text': ' '.join(example['tokens']).strip()}

In [52]:
ta_dataset = ta_dataset.map(convert_to_word)

Map: 100%|██████████| 497882/497882 [01:04<00:00, 7693.37 examples/s]
Map: 100%|██████████| 758/758 [00:00<00:00, 6223.38 examples/s]
Map: 100%|██████████| 2795/2795 [00:00<00:00, 7230.73 examples/s]


In [53]:
ta_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'num_tokens', 'text'],
        num_rows: 497882
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'num_tokens', 'text'],
        num_rows: 758
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'num_tokens', 'text'],
        num_rows: 2795
    })
})

In [54]:
ta_dataset = ta_dataset.remove_columns(['tokens', 'ner_tags'])

In [55]:
ta_dataset

DatasetDict({
    train: Dataset({
        features: ['num_tokens', 'text'],
        num_rows: 497882
    })
    test: Dataset({
        features: ['num_tokens', 'text'],
        num_rows: 758
    })
    validation: Dataset({
        features: ['num_tokens', 'text'],
        num_rows: 2795
    })
})

In [56]:
ta_dataset.shape

{'train': (497882, 2), 'test': (758, 2), 'validation': (2795, 2)}

In [57]:
from datasets import concatenate_datasets

ta_dataset = concatenate_datasets([split for split in ta_dataset.values()])

In [58]:
ta_dataset

Dataset({
    features: ['num_tokens', 'text'],
    num_rows: 501435
})

# 8.

In [59]:
ta_dataset.cache_files

[{'filename': 'C:\\Users\\smlab\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\cache-a060572b50599f3c.arrow'},
 {'filename': 'C:\\Users\\smlab\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\cache-675e5c06ddc51875.arrow'},
 {'filename': 'C:\\Users\\smlab\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\cache-9a262ed8f2235737.arrow'}]

# 9.

In [60]:
ta_dataset[1]

{'num_tokens': 19,
 'text': 'தெய்வீக ஏவுதலினால் அறிவிக்கப்பட்ட மற்றொரு செய்தியினால் கடுங்கோபம் கொண்ட அரசன் , உரியாவை படுகொலை செய்தான் . - எரேமியா 26:21 - 24 .'}

In [61]:
# Apply filter
ta_dataset = ta_dataset.filter(lambda example: example['num_tokens']>5)

Filter: 100%|██████████| 501435/501435 [00:03<00:00, 151315.20 examples/s]


In [62]:
ta_dataset

Dataset({
    features: ['num_tokens', 'text'],
    num_rows: 370495
})

# 10.

In [63]:
from datasets import get_dataset_config_names
print(get_dataset_config_names("ai4bharat/indic_glue"))

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


['actsa-sc.te', 'bbca.hi', 'copa.en', 'copa.gu', 'copa.hi', 'copa.mr', 'csqa.as', 'csqa.bn', 'csqa.gu', 'csqa.hi', 'csqa.kn', 'csqa.ml', 'csqa.mr', 'csqa.or', 'csqa.pa', 'csqa.ta', 'csqa.te', 'cvit-mkb-clsr.en-bn', 'cvit-mkb-clsr.en-gu', 'cvit-mkb-clsr.en-hi', 'cvit-mkb-clsr.en-ml', 'cvit-mkb-clsr.en-mr', 'cvit-mkb-clsr.en-or', 'cvit-mkb-clsr.en-ta', 'cvit-mkb-clsr.en-te', 'cvit-mkb-clsr.en-ur', 'iitp-mr.hi', 'iitp-pr.hi', 'inltkh.gu', 'inltkh.ml', 'inltkh.mr', 'inltkh.ta', 'inltkh.te', 'md.hi', 'sna.bn', 'wiki-ner.as', 'wiki-ner.bn', 'wiki-ner.gu', 'wiki-ner.hi', 'wiki-ner.kn', 'wiki-ner.ml', 'wiki-ner.mr', 'wiki-ner.or', 'wiki-ner.pa', 'wiki-ner.ta', 'wiki-ner.te', 'wnli.en', 'wnli.gu', 'wnli.hi', 'wnli.mr', 'wstp.as', 'wstp.bn', 'wstp.gu', 'wstp.hi', 'wstp.kn', 'wstp.ml', 'wstp.mr', 'wstp.or', 'wstp.pa', 'wstp.ta', 'wstp.te']


In [64]:
tamil_datasets  = load_dataset("ai4bharat/indic_glue", "inltkh.ta", split='all')

Generating train split: 100%|██████████| 5346/5346 [00:00<00:00, 570727.68 examples/s]
Generating validation split: 100%|██████████| 669/669 [00:00<00:00, 157119.06 examples/s]
Generating test split: 100%|██████████| 669/669 [00:00<00:00, 136028.18 examples/s]


In [65]:
tamil_datasets

Dataset({
    features: ['text', 'label'],
    num_rows: 6684
})

In [66]:
filter_tamil_datasets =  tamil_datasets.filter(lambda x: len(x['text'].split())>=6)

Filter: 100%|██████████| 6684/6684 [00:00<00:00, 79515.35 examples/s]


In [67]:
from datasets import interleave_datasets

inter_datasets = interleave_datasets([ta_dataset, filter_tamil_datasets], probabilities=[0.8, 0.2], seed=42)
inter_datasets

Dataset({
    features: ['num_tokens', 'text', 'label'],
    num_rows: 32290
})