In [None]:
import sys
sys.path.append('../assets/')

In [None]:
# Main libraries
from data_shuffling_split import *

from datasets import load_dataset, list_datasets, Dataset, DatasetDict, ClassLabel
from transformers import AutoModel, AutoTokenizer

# The Dataset

The Hugging Face `datasets` library can be used either with the datasets available on the Hub or with your own dataset.

### Load from available data

In [None]:
# List the datasets available on the Hub, then report how many there are and
# preview the first five entries.
# NOTE(review): `datasets.list_datasets` appears to be deprecated in newer
# `datasets` releases in favour of `huggingface_hub.list_datasets` - confirm
# against the installed version.
avaliable_dataset = list_datasets()
print(len(avaliable_dataset), avaliable_dataset[:5], sep="\n")

In [None]:
# Load the 'emotion' dataset by name.
emtions_data = load_dataset('emotion')
# Bare expression so the notebook displays the loaded object.
emtions_data

In [None]:
# Print the feature schema of the 'train' split.
print(emtions_data['train'].features)

In [None]:
# Preview the first five entries of the 'train' split.
emtions_data['train'][:5]

## Use our datasets

In [None]:
# Read the stratified train / validation / test splits from disk.
# NOTE(review): `read_file` comes from the star import of `data_shuffling_split`;
# it presumably returns a pandas DataFrame - confirm.
strat_train_set = read_file("../dataset/dialect_data/train/strat_train_set.csv")
strat_dev_set = read_file("../dataset/dialect_data/validation/strat_dev_set.csv")
strat_test_set = read_file("../dataset/dialect_data/test/strat_test_set.csv")

# Give every split the same three column names.
column_names = ['id', 'label', 'text']
for split_frame in (strat_train_set, strat_dev_set, strat_test_set):
    split_frame.columns = list(column_names)

print(type(strat_train_set))

# Wrap each frame in an Arrow-backed `Dataset`.
ds_strat_train_set, ds_strat_dev_set, ds_strat_test_set = (
    Dataset.from_pandas(split_frame)
    for split_frame in (strat_train_set, strat_dev_set, strat_test_set)
)
print("="*50)

print(type(ds_strat_train_set))

# Convert dialect string to class label

This way we can easily convert a class id back to the corresponding dialect name whenever we need to.

In [None]:
# Collect the distinct dialect names found in the training split.
# They are sorted so that the name -> id mapping is deterministic: iterating a
# bare `set` of strings can yield a different order in every kernel session
# (string hash randomisation), which would silently change the ids assigned to
# each dialect from one run to the next.
labels = sorted(set(ds_strat_train_set['label']))
print(labels)
print("="*50)
print(len(labels))
print("="*50)
# The position of a name in `labels` is the integer id it is encoded as.
ClassLabels = ClassLabel(num_classes=len(labels), names=labels)
print(ClassLabels)

In [None]:
# Before encoding: inspect the feature schema, where 'label' still holds the
# dialect names as strings.
print(ds_strat_train_set.features)

In [None]:
# Mapping Labels to IDs
def map_dialect_str2int(data):
    """Replace the dialect names under ``data['label']`` with their class ids.

    The conversion is delegated to the module-level ``ClassLabels.str2int``.
    ``data`` is updated in place and the same object is returned, which is the
    shape ``Dataset.map`` expects from its callback (used here with
    ``batched=True``).
    """
    encoded = ClassLabels.str2int(data['label'])
    data['label'] = encoded
    return data

def _encode_label_column(split):
    """Map the string labels of ``split`` to ids, then cast 'label' to ``ClassLabels``."""
    with_ids = split.map(map_dialect_str2int, batched=True)
    # Casting attaches the ClassLabel feature to the column.
    return with_ids.cast_column('label', ClassLabels)


# Apply the same two-step encoding to every split.
ds_strat_train_set = _encode_label_column(ds_strat_train_set)
ds_strat_dev_set = _encode_label_column(ds_strat_dev_set)
ds_strat_test_set = _encode_label_column(ds_strat_test_set)

In [None]:
# After encoding: inspect the feature schema again, where 'label' has been cast
# to the ClassLabel feature.
print(ds_strat_train_set.features)

# Compare 

In [None]:
# Pair each original frame with its encoded counterpart, in train / dev / test order.
split_pairs = (
    (strat_train_set, ds_strat_train_set),
    (strat_dev_set, ds_strat_dev_set),
    (strat_test_set, ds_strat_test_set),
)

print("==================== Check our conversation ====================")
# Decoding the ids should reproduce the original label strings; prints True/False per split.
for original_frame, encoded_split in split_pairs:
    print(list(original_frame['label']) == ClassLabels.int2str(encoded_split['label']))

# Show the first five original labels next to their decoded counterparts.
for position, (original_frame, encoded_split) in enumerate(split_pairs):
    if position > 0:
        print("="*50)
    print(list(original_frame['label'])[:5])
    print(ClassLabels.int2str(encoded_split['label'][:5]))


In [None]:
# Bundle the three encoded splits into a single DatasetDict keyed by split name.
dialect_datasets = DatasetDict({
    'train': ds_strat_train_set,
    'validation': ds_strat_dev_set,
    'test': ds_strat_test_set,
})

dialect_datasets

In [None]:
# Print the feature schema of each split, separated by a rule.
for position, split_name in enumerate(('train', 'validation', 'test')):
    if position > 0:
        print("="*50)
    print(dialect_datasets[split_name].features)

# Push the data into the hub

In [None]:
# Imported here, directly before its use in this section.
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from within the notebook
# (presumably required before the push that follows - confirm).
notebook_login()

In [None]:
# Upload `dialect_datasets` to this Hub repository.
dialect_datasets.push_to_hub('Abdelrahman-Rezk/Arabic_Dialect_Identification')

In [None]:
# Load the dataset back by its repository id; note that this rebinds
# `dialect_datasets` to the freshly loaded object.
dialect_datasets = load_dataset('Abdelrahman-Rezk/Arabic_Dialect_Identification')
# Bare expression so the notebook displays the loaded object.
dialect_datasets