In [20]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
from datasets import load_dataset
import numpy as np

from collections import Counter

def show_dataset_info(dataset, y_field: str = "label", max_samples: int = 15000):
  """Print the size and the label distribution of every split in ``dataset``.

  Three things are printed, in this order:
    1. The number of samples per split, flagging splits larger than
       ``max_samples`` as needing subsampling.
    2. A warning when fewer than three splits are present (a further
       partition of the data is then required).
    3. The relative frequency of each label value, one group of lines per
       split, in the order the splits are iterated.

  Args:
    dataset: Mapping-like collection of splits. Iterating it must yield the
      split names; ``dataset[name]`` must support ``len()`` and return the
      label column when indexed with ``y_field``.
    y_field: Name of the column that stores the class labels.
    max_samples: Size above which a split is reported as exceeding the
      computation limit.

  Returns:
    None. All information is written to standard output.
  """
  split_names = list(dataset)

  print("Nº of samples per split:")
  for split in split_names:
    n_samples = len(dataset[split])
    if n_samples > max_samples:
      aux_cad = " (exceeds computation limit! subsampling needed!)"
    else:
      aux_cad = ""
    print(f"\t{split}: {n_samples}{aux_cad}")

  # Fewer than three splits means at least one partition is missing. The
  # 2-split message used to be unreachable (it was nested under `< 2`).
  n_splits = len(split_names)
  if n_splits < 3:
    if n_splits == 2:
      cad = "\nOnly 2 splits were found. A train_test_partition is required"
    else:
      cad = "\nOnly 1 split was found. A train_test_partition is required"
    print(cad)

  print("\nLabel distribution per split:")
  for split in split_names:
    n_samples = len(dataset[split])
    # Counter keeps first-seen order, so labels are listed as encountered.
    label_dist = Counter(dataset[split][y_field])
    for label, count in label_dist.items():
      print(f"\tClass {label} : {count / n_samples:.2%}")
    print()

## Semantic Similarity (SS): **PAWS**

In [32]:
# Load the "labeled_final" configuration of the PAWS dataset, then print the
# size of each split and its label distribution.
paws = load_dataset("paws", "labeled_final")
show_dataset_info(paws)



  0%|          | 0/3 [00:00<?, ?it/s]

Nº of samples per split:
	train: 49401 (exceeds computation limit! subsampling needed!)
	test: 8000
	validation: 8000

Label distribution per split:
	Class 0 : 55.81%
	Class 1 : 44.19%

	Class 0 : 55.80%
	Class 1 : 44.20%

	Class 0 : 55.76%
	Class 1 : 44.24%



## Natural Language Inference (NLI): **MNLI**

In [33]:
# Load the "mnli" configuration of GLUE, then print the size of each split and
# its label distribution.
# NOTE(review): in the recorded output both test splits contain only label -1,
# presumably because their gold labels are withheld — confirm before using them
# for evaluation.
mnli = load_dataset("glue", "mnli")
show_dataset_info(mnli)



  0%|          | 0/5 [00:00<?, ?it/s]

Nº of samples per split:
	train: 392702 (exceeds computation limit! subsampling needed!)
	validation_matched: 9815
	validation_mismatched: 9832
	test_matched: 9796
	test_mismatched: 9847

Label distribution per split:
	Class 1 : 33.33%
	Class 0 : 33.33%
	Class 2 : 33.33%

	Class 1 : 31.82%
	Class 2 : 32.74%
	Class 0 : 35.45%

	Class 2 : 32.95%
	Class 0 : 35.22%
	Class 1 : 31.82%

	Class -1 : 100.00%

	Class -1 : 100.00%



## Grammatical Coherence (GC): **COLA**

In [34]:
# Load the "cola" configuration of GLUE, then print the size of each split and
# its label distribution.
# NOTE(review): in the recorded output the test split contains only label -1,
# presumably because its gold labels are withheld — confirm.
cola = load_dataset("glue", "cola")
show_dataset_info(cola)



  0%|          | 0/3 [00:00<?, ?it/s]

Nº of samples per split:
	train: 8551
	validation: 1043
	test: 1063

Label distribution per split:
	Class 1 : 70.44%
	Class 0 : 29.56%

	Class 1 : 69.13%
	Class 0 : 30.87%

	Class -1 : 100.00%



## Sentiment Analysis (SA): **SST2**

In [35]:
# Load the "sst2" configuration of GLUE, then print the size of each split and
# its label distribution.
# NOTE(review): in the recorded output the test split contains only label -1,
# presumably because its gold labels are withheld — confirm.
sst2 = load_dataset("glue", "sst2")
show_dataset_info(sst2)



  0%|          | 0/3 [00:00<?, ?it/s]

Nº of samples per split:
	train: 67349 (exceeds computation limit! subsampling needed!)
	validation: 872
	test: 1821

Label distribution per split:
	Class 0 : 44.22%
	Class 1 : 55.78%

	Class 1 : 50.92%
	Class 0 : 49.08%

	Class -1 : 100.00%



## Hate Speech and Offensive Language (HSOL): **HSOL**

In [36]:
# Load the "hate_speech_offensive" dataset, then print the size of each split
# and its label distribution. The labels are read from the "class" column
# instead of the default "label".
hsol = load_dataset("hate_speech_offensive")
show_dataset_info(hsol, y_field = "class")



  0%|          | 0/1 [00:00<?, ?it/s]

Nº of samples per split:
	train: 24783 (exceeds computation limit! subsampling needed!)

Only 1 split was found. A train_test_partition is required

Label distribution per split:
	Class 2 : 16.80%
	Class 1 : 77.43%
	Class 0 : 5.77%



In [39]:
# Exploratory: list the attribute names available on the loaded HSOL object.
# NOTE(review): looks like a leftover inspection cell; consider removing it
# from the final notebook.
hsol.__dir__()

['__module__',
 '__doc__',
 '_check_values_type',
 '_check_values_features',
 '__getitem__',
 'data',
 'cache_files',
 'num_columns',
 'num_rows',
 'column_names',
 'shape',
 'flatten',
 'unique',
 'cleanup_cache_files',
 '__repr__',
 'cast',
 'cast_column',
 'remove_columns',
 'rename_column',
 'rename_columns',
 'select_columns',
 'class_encode_column',
 'formatted_as',
 'set_format',
 'reset_format',
 'set_transform',
 'with_format',
 'with_transform',
 'map',
 'filter',
 'sort',
 'shuffle',
 'save_to_disk',
 'load_from_disk',
 'from_csv',
 'from_json',
 'from_parquet',
 'from_text',
 'prepare_for_task',
 'align_labels_with_mapping',
 'push_to_hub',
 '__dict__',
 '__weakref__',
 '__new__',
 '__hash__',
 '__getattribute__',
 '__lt__',
 '__le__',
 '__eq__',
 '__ne__',
 '__gt__',
 '__ge__',
 '__iter__',
 '__init__',
 '__or__',
 '__ror__',
 '__ior__',
 '__len__',
 '__setitem__',
 '__delitem__',
 '__contains__',
 '__sizeof__',
 'get',
 'setdefault',
 'pop',
 'popitem',
 'keys',
 'items',