In [1]:
import sys
sys.path.append('../assets/')

In [2]:
# Main libraries
from data_shuffling_split import *
from preprocess_text import *
from datasets import load_dataset, list_datasets, Dataset, DatasetDict, ClassLabel
from transformers import AutoModel, AutoTokenizer

# The Dataset

The hugginface datasets library used either to use avaliable datasets on the hub or either to use your own dataset.

### Load from avaliable data

In [3]:
avaliable_dataset = list_datasets()
print(len(avaliable_dataset))
print(avaliable_dataset[:5])

4869
['acronym_identification', 'ade_corpus_v2', 'adversarial_qa', 'aeslc', 'afrikaans_ner_corpus']


In [4]:
emtions_data = load_dataset('emotion')
emtions_data

Using custom data configuration default
Reusing dataset emotion (/home/abdelrahman/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [5]:
print(emtions_data['train'].features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}


In [6]:
emtions_data['train'][:5]

{'text': ['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'im grabbing a minute to post i feel greedy wrong',
  'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'i am feeling grouchy'],
 'label': [0, 0, 3, 2, 3]}

## Use our datasets

In [7]:
strat_train_set = read_file("../dataset/dialect_data/train/strat_train_set.csv")
strat_dev_set   = read_file("../dataset/dialect_data/validation/strat_dev_set.csv")
strat_test_set  = read_file("../dataset/dialect_data/test/strat_test_set.csv")

strat_train_set.columns = ['id', 'label', 'text']
strat_dev_set.columns   = ['id', 'label', 'text']
strat_test_set.columns  = ['id', 'label', 'text']

print(type(strat_train_set))

# Convert to Dataset Apache arrow
ds_strat_train_set = Dataset.from_pandas(strat_train_set)
ds_strat_dev_set   = Dataset.from_pandas(strat_dev_set)
ds_strat_test_set  = Dataset.from_pandas(strat_test_set)
print("="*50)

print(type(ds_strat_train_set))

<class 'pandas.core.frame.DataFrame'>
<class 'datasets.arrow_dataset.Dataset'>


# Convert dialect string to class label

In this case we can easily convert to the correspond dialect when we need.

In [8]:
labels = list(set(ds_strat_train_set['label']))
print(labels)
print("="*50)
print(len(labels))
print("="*50)
ClassLabels = ClassLabel(num_classes=len(labels), names=labels)
print(ClassLabels)

['BH', 'JO', 'SA', 'QA', 'AE', 'DZ', 'EG', 'KW', 'TN', 'SY', 'LB', 'MA', 'OM', 'LY', 'SD', 'IQ', 'YE', 'PL']
18
ClassLabel(num_classes=18, names=['BH', 'JO', 'SA', 'QA', 'AE', 'DZ', 'EG', 'KW', 'TN', 'SY', 'LB', 'MA', 'OM', 'LY', 'SD', 'IQ', 'YE', 'PL'], id=None)


In [9]:
# Look how the labels are string
print(ds_strat_train_set.features)

{'id': Value(dtype='int64', id=None), 'label': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None)}


In [10]:
# Mapping Labels to IDs
def map_dialect_str2int(data):
    data['label'] = ClassLabels.str2int(data['label'])
    return data

ds_strat_train_set = ds_strat_train_set.map(map_dialect_str2int, batched=True)
# Casting label column to ClassLabel Object
ds_strat_train_set = ds_strat_train_set.cast_column('label', ClassLabels)



ds_strat_dev_set = ds_strat_dev_set.map(map_dialect_str2int, batched=True)
# Casting label column to ClassLabel Object
ds_strat_dev_set = ds_strat_dev_set.cast_column('label', ClassLabels)


ds_strat_test_set = ds_strat_test_set.map(map_dialect_str2int, batched=True)
# Casting label column to ClassLabel Object
ds_strat_test_set = ds_strat_test_set.cast_column('label', ClassLabels)

  0%|          | 0/441 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/45 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
# Look how the labels are now ClassLabels
print(ds_strat_train_set.features)

{'id': Value(dtype='int64', id=None), 'label': ClassLabel(num_classes=18, names=['BH', 'JO', 'SA', 'QA', 'AE', 'DZ', 'EG', 'KW', 'TN', 'SY', 'LB', 'MA', 'OM', 'LY', 'SD', 'IQ', 'YE', 'PL'], id=None), 'text': Value(dtype='string', id=None)}


# Compare 

In [12]:
print("==================== Check our conversation ====================")
print(list(strat_train_set['label']) == ClassLabels.int2str(ds_strat_train_set['label']))
print(list(strat_dev_set['label'])   == ClassLabels.int2str(ds_strat_dev_set['label']))
print(list(strat_test_set['label']) == ClassLabels.int2str(ds_strat_test_set['label']))

print(list(strat_train_set['label'])[:5])
print(ClassLabels.int2str(ds_strat_train_set['label'][:5]))

print("="*50)

print(list(strat_dev_set['label'])[:5])
print(ClassLabels.int2str(ds_strat_dev_set['label'][:5]))


print("="*50)

print(list(strat_test_set['label'])[:5])
print(ClassLabels.int2str(ds_strat_test_set['label'][:5]))


True
True
True
['MA', 'LY', 'OM', 'PL', 'PL']
['MA', 'LY', 'OM', 'PL', 'PL']
['KW', 'JO', 'EG', 'AE', 'KW']
['KW', 'JO', 'EG', 'AE', 'KW']
['EG', 'DZ', 'SA', 'DZ', 'EG']
['EG', 'DZ', 'SA', 'DZ', 'EG']


In [13]:
dialect_datasets = DatasetDict()

dialect_datasets['train']      = ds_strat_train_set
dialect_datasets['validation'] = ds_strat_dev_set
dialect_datasets['test']       = ds_strat_test_set

dialect_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'text'],
        num_rows: 440052
    })
    validation: Dataset({
        features: ['id', 'label', 'text'],
        num_rows: 9164
    })
    test: Dataset({
        features: ['id', 'label', 'text'],
        num_rows: 8981
    })
})

In [14]:
print(dialect_datasets['train'].features)
print("="*50)
print(dialect_datasets['validation'].features)
print("="*50)
print(dialect_datasets['test'].features)

{'id': Value(dtype='int64', id=None), 'label': ClassLabel(num_classes=18, names=['BH', 'JO', 'SA', 'QA', 'AE', 'DZ', 'EG', 'KW', 'TN', 'SY', 'LB', 'MA', 'OM', 'LY', 'SD', 'IQ', 'YE', 'PL'], id=None), 'text': Value(dtype='string', id=None)}
{'id': Value(dtype='int64', id=None), 'label': ClassLabel(num_classes=18, names=['BH', 'JO', 'SA', 'QA', 'AE', 'DZ', 'EG', 'KW', 'TN', 'SY', 'LB', 'MA', 'OM', 'LY', 'SD', 'IQ', 'YE', 'PL'], id=None), 'text': Value(dtype='string', id=None)}
{'id': Value(dtype='int64', id=None), 'label': ClassLabel(num_classes=18, names=['BH', 'JO', 'SA', 'QA', 'AE', 'DZ', 'EG', 'KW', 'TN', 'SY', 'LB', 'MA', 'OM', 'LY', 'SD', 'IQ', 'YE', 'PL'], id=None), 'text': Value(dtype='string', id=None)}


# Push the data into the hub

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
dialect_datasets.push_to_hub('Abdelrahman-Rezk/Arabic_Dialect_Identification')

In [None]:
dialect_datasets = load_dataset('Abdelrahman-Rezk/Arabic_Dialect_Identification')
dialect_datasets