In [1]:
import sys
# Add ../assets/ to the module search path so modules stored there can be imported.
# Presumably this is where `data_shuffling_split` and `preprocess_text` live — verify.
# The relative path is resolved against the kernel's current working directory.
sys.path.append('../assets/')

In [2]:
# Main libraries
from data_shuffling_split import *
from preprocess_text import *
from datasets import load_dataset, list_datasets, Dataset, DatasetDict, ClassLabel
from transformers import AutoModel, AutoTokenizer

# The Dataset

The Hugging Face `datasets` library can be used either to load datasets that are available on the Hub or to work with your own dataset.

### Load from available data

In [3]:
# Fetch the names of the datasets listed on the Hub, then report how many there
# are and preview the first five.
# NOTE(review): `list_datasets` appears to be deprecated in newer `datasets`
# releases in favour of `huggingface_hub.list_datasets` — confirm against the
# installed version.
avaliable_dataset = list_datasets()
print(len(avaliable_dataset))
print(avaliable_dataset[:5])

4865
['acronym_identification', 'ade_corpus_v2', 'adversarial_qa', 'aeslc', 'afrikaans_ner_corpus']


In [4]:
# Load the `emotion` dataset by name; the result is a DatasetDict holding
# train / validation / test splits (displayed by the bare expression below).
emtions_data = load_dataset('emotion')
emtions_data

Downloading builder script:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Using custom data configuration default
Reusing dataset emotion (/home/abdelrahman/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [5]:
# Show the column schema of the train split: `text` is a string column and
# `label` is a ClassLabel with six named classes.
print(emtions_data['train'].features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}


In [6]:
# Slicing a split returns a dict of column name -> list of values for those rows.
emtions_data['train'][:5]

{'text': ['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'im grabbing a minute to post i feel greedy wrong',
  'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'i am feeling grouchy'],
 'label': [0, 0, 3, 2, 3]}

## Use our datasets

In [None]:
# Read the three stratified splits from disk.
strat_train_set = read_file("../dataset/dialect_data/train/strat_train_set.csv")
strat_dev_set = read_file("../dataset/dialect_data/validation/strat_dev_set.csv")
strat_test_set = read_file("../dataset/dialect_data/test/strat_test_set.csv")

# Give every split the same column names.
for split_frame in (strat_train_set, strat_dev_set, strat_test_set):
    split_frame.columns = ['id', 'label', 'text']

print(type(strat_train_set))

# Convert each frame into an Arrow-backed `Dataset`, in train / dev / test order.
ds_strat_train_set, ds_strat_dev_set, ds_strat_test_set = (
    Dataset.from_pandas(frame)
    for frame in (strat_train_set, strat_dev_set, strat_test_set)
)
print("="*50)

print(type(ds_strat_train_set))

# Convert dialect string to class label

In this case we can easily convert back to the corresponding dialect whenever we need to.

In [None]:
# Collect the distinct label strings present in the training split.
# `sorted` pins the order: iterating a bare `set` of strings can yield a different
# order on every kernel start (string hashing is randomised per process), which
# would silently change the label -> id mapping from one run to the next.
labels = sorted(set(ds_strat_train_set['label']))
print(labels)
print("="*50)
print(len(labels))
print("="*50)
# Build the ClassLabel feature used to translate between label names and ids.
ClassLabels = ClassLabel(num_classes=len(labels), names=labels)
print(ClassLabels)

In [None]:
# Before encoding: print the train split's features to see the current type of
# the `label` column (the labels are still plain strings at this point).
print(ds_strat_train_set.features)

In [None]:
# Mapping Labels to IDs
def map_dialect_str2int(data):
    """Replace the label names in ``data['label']`` with their integer ids.

    Used as the callback for ``Dataset.map``; the name -> id lookup is delegated
    to the notebook-level ``ClassLabels`` object.

    Args:
        data: Mapping with a ``'label'`` entry holding the label name(s) to encode.

    Returns:
        The same ``data`` object, with ``'label'`` overwritten by the encoded ids.
    """
    encoded_labels = ClassLabels.str2int(data['label'])
    data['label'] = encoded_labels
    return data

# For each split: encode the label names as ids in batches, then cast the
# `label` column to the shared ClassLabel feature.
ds_strat_train_set = (
    ds_strat_train_set
    .map(map_dialect_str2int, batched=True)
    .cast_column('label', ClassLabels)
)

ds_strat_dev_set = (
    ds_strat_dev_set
    .map(map_dialect_str2int, batched=True)
    .cast_column('label', ClassLabels)
)

ds_strat_test_set = (
    ds_strat_test_set
    .map(map_dialect_str2int, batched=True)
    .cast_column('label', ClassLabels)
)

In [None]:
# After encoding: print the features again; the `label` column has been cast to
# the ClassLabel feature by the previous cell.
print(ds_strat_train_set.features)

# Compare 

In [None]:
# Pair each original frame with its encoded Dataset, in train / dev / test order.
split_pairs = (
    (strat_train_set, ds_strat_train_set),
    (strat_dev_set, ds_strat_dev_set),
    (strat_test_set, ds_strat_test_set),
)

print("==================== Check our conversation ====================")
# Round-trip check: decoding the ids must give back the original label strings.
for frame, encoded in split_pairs:
    print(list(frame['label']) == ClassLabels.int2str(encoded['label']))

# Show the first five original and decoded labels per split, with a separator
# line between splits.
for position, (frame, encoded) in enumerate(split_pairs):
    if position:
        print("="*50)
    print(list(frame['label'])[:5])
    print(ClassLabels.int2str(encoded['label'][:5]))


In [None]:
# Bundle the three encoded splits into a single DatasetDict keyed by split name.
dialect_datasets = DatasetDict({
    'train': ds_strat_train_set,
    'validation': ds_strat_dev_set,
    'test': ds_strat_test_set,
})

dialect_datasets

In [None]:
# Print the feature schema of every split, with a separator line between splits.
for position, split_name in enumerate(('train', 'validation', 'test')):
    if position:
        print("="*50)
    print(dialect_datasets[split_name].features)

# Push the data into the hub

In [None]:
from huggingface_hub import notebook_login

# Start the Hugging Face login flow inside the notebook.
# Presumably required before the `push_to_hub` call in the next cell — verify.
notebook_login()

In [None]:
# Upload all splits of `dialect_datasets` to the Hub under this repository id.
dialect_datasets.push_to_hub('Abdelrahman-Rezk/Arabic_Dialect_Identification')

In [8]:
# Load the dataset back from the Hub. This rebinds `dialect_datasets`, replacing
# the DatasetDict that was assembled locally earlier in the notebook.
dialect_datasets = load_dataset('Abdelrahman-Rezk/Arabic_Dialect_Identification')
dialect_datasets

Using custom data configuration Abdelrahman-Rezk--Arabic_Dialect_Identification-dd1894ee74477eb1


Downloading and preparing dataset None/None (download: 47.36 MiB, generated: 74.21 MiB, post-processed: Unknown size, total: 121.57 MiB) to /home/abdelrahman/.cache/huggingface/datasets/Abdelrahman-Rezk___parquet/Abdelrahman-Rezk--Arabic_Dialect_Identification-dd1894ee74477eb1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/975k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /home/abdelrahman/.cache/huggingface/datasets/Abdelrahman-Rezk___parquet/Abdelrahman-Rezk--Arabic_Dialect_Identification-dd1894ee74477eb1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['id', 'label', 'text'],
        num_rows: 8981
    })
    train: Dataset({
        features: ['id', 'label', 'text'],
        num_rows: 440052
    })
    validation: Dataset({
        features: ['id', 'label', 'text'],
        num_rows: 9164
    })
})

In [None]:
# Preview the first three rows of the test split.
dialect_datasets['test'][:3]

In [None]:
# Keep a reference to the test split's feature schema for the inspection cells below.
dicst =dialect_datasets['test'].features

In [None]:
import json

In [None]:
# Check the concrete type of the features object.
type(dicst)

In [None]:
# Serialise the test split's feature schema to JSON.
# `dict(features)` keeps the feature objects (e.g. Value / ClassLabel) as values,
# which `json.dumps` cannot encode; `Features.to_dict()` converts them to plain
# dicts, lists and strings first.
# `ensure_ascii=False` keeps any non-ASCII characters readable in the output.
res = json.dumps(dialect_datasets['test'].features.to_dict(), ensure_ascii=False)

In [None]:
# Check what type a row slice of a split evaluates to.
type(dialect_datasets['test'][:3])

In [None]:
# Display the JSON string stored in `res`.
res

In [None]:
# Copy the test split's features into a plain dict (the values are still feature objects).
s = dict(dialect_datasets['test'].features)

In [None]:
# Rebind `s` to the string representation of that dict, then display it.
# NOTE(review): `s` changes type here (dict -> str); re-running this cell alone
# will stringify the string again.
s = str(s)
s

In [None]:
# `s` is a plain string at this point, so this produces a JSON string literal
# wrapping that text rather than a JSON object describing the features.
res = json.dumps(s, ensure_ascii=False)

In [None]:
import datasets
import transformers

In [None]:
# Record the installed library versions for reproducibility.
print(datasets.__version__)
print(transformers.__version__)

In [None]:
# Display the imported `datasets` module object (its repr shows where it was loaded from).
datasets

In [None]:
!pip3 install datasets