In [1]:
# !pip install scikit-learn
# !pip install emojis
# !pip install datasets

In [2]:
import sys
sys.path.append('../assets/')

In [3]:
# Main libraries
from data_shuffling_split import *
from preprocess_text import *
from datasets import load_dataset, list_datasets, Dataset, DatasetDict, ClassLabel
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm
2023-03-08 05:41:31.613450: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-08 05:41:31.947437: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-08 05:41:31.947474: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-08 05:41:32.767417: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open s

# The Dataset

The huggingface datasets library is used either to use available datasets on the hub or either to use your own dataset, either this dataset on your machine or on remote access.

### Load from avaliable data

First of all, we have listed out how many datasets are available in huggingface.

One of the datasets we have used before, is the *emotion dataset* that was mentioned by details in second chapter of book *nlp with transformers*, and we have apply on this dataset in this repo:
https://github.com/Abdelrahmanrezk/nlp_with_transformers/blob/main/chapter_2/chapter%202%20Text%20Classification%20with%20transformers.ipynb

In [4]:
avaliable_dataset = list_datasets()
print(len(avaliable_dataset))
print(avaliable_dataset[:5])

23908
['acronym_identification', 'ade_corpus_v2', 'adversarial_qa', 'aeslc', 'afrikaans_ner_corpus']


## load_dataset

Now to use any of the hugging face datasets we have to load this dataset and it will be directly cashed on your machine for the next time you will use it easily without downloading it again.

**what if the dataset is not on the hugging face Hub ??**
In this case, you will also use the load_dataset function with its available scripts that provide you with a way to load your own dataset either from your machine or from an online remote.


In [5]:
emtions_data = load_dataset('emotion')
emtions_data

No config specified, defaulting to: emotion/split
Found cached dataset emotion (/home/abdelrahman/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)
100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 420.12it/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

## DatasetDict
It looks like a python dictionary each part of the data is split into keys which defines it as either a train or validation or test, and the value is the split part of the data itself, and we can deal with that as it's a python dictionary.

In [6]:
train_ds = emtions_data['train']
print(type(train_ds))
train_ds

<class 'datasets.arrow_dataset.Dataset'>


Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

## Dataset
Each of the splited parts in the datasetdict itself is a dataset object which is one of the core data structures in the hugging face dataset library, and we can work with as ordinary Python array or list.

In [7]:
print(len(train_ds))
print(train_ds[0])

16000
{'text': 'i didnt feel humiliated', 'label': 0}


In [8]:
print(train_ds[:5])
print("="*50)
print(train_ds.features)

{'text': ['i didnt feel humiliated', 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'im grabbing a minute to post i feel greedy wrong', 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property', 'i am feeling grouchy'], 'label': [0, 0, 3, 2, 3]}
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}


## Result

We can see that the output of the train_ds when we have printed out the first index looks like a python dictionary with keys corresponding to the names of the columns in the training dataset, and the values of these keys is the text and labels of that text.

Actually the hugging face dataset is built over the *apache spark* which is more memory efficient than native python.

Also, we can see that the features of the training dataset printing out the data type of each column in the dataset, and we can see that the label column is a Class Label object which defines the corresponding classes to each label we have.

In [9]:
print(train_ds['text'][:5])
print("="*50)
print(train_ds['label'][:5])
print("="*50)
print(train_ds.features['label']._str2int)

['i didnt feel humiliated', 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'im grabbing a minute to post i feel greedy wrong', 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property', 'i am feeling grouchy']
[0, 0, 3, 2, 3]
{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}


## Other Dataset

We will use another dataset from this paper :
https://arxiv.org/pdf/2005.06557.pdf

We already got this dataset from another project:

https://github.com/Abdelrahmanrezk/Arabic-Dialect-Identification

So now we are going to know how to load this data set and push it into huggingface hub, but you need to have an account on huggingface.

## Use our datasets
First of all we have splited out the dataset into:
- train
- test
- validation

In [17]:
strat_train_set = read_file("../dataset/train/strat_train_set.csv")
strat_dev_set   = read_file("../dataset/validation/strat_dev_set.csv")
strat_test_set  = read_file("../dataset/test/strat_test_set.csv")

# strat_train_set.columns = ['id', 'dialect', 'dialect_l_encoded', 'text']
# strat_dev_set.columns   = ['id', 'dialect', 'dialect_l_encoded', 'text']
# strat_test_set.columns  = ['id', 'dialect', 'dialect_l_encoded', 'text']

print(type(strat_train_set))

strat_test_set.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,id,dialect,text
0,839595752783560704,OM,@fahad_laporta يا خي ما حد مبرد قلوبنا مثل قرا...
1,631897776033427456,LB,@Kalamennas @Marcel_Ghanem الشعب ماخمل الشعب ق...
2,903332474683580416,KW,@AyoubKw يقول قرار تخصيصك تستلمه بعد العيد بس ...
3,768492742733340672,LB,#ما_رح_انسى الحلو ما بينتسى
4,1023998295033741184,IQ,@PrideOfMUFC هذي اخلاقك العالية تخليك تشتم \nب...


In [None]:
# Convert to Dataset Apache arrow
ds_strat_train_set = Dataset.from_pandas(strat_train_set)
ds_strat_dev_set   = Dataset.from_pandas(strat_dev_set)
ds_strat_test_set  = Dataset.from_pandas(strat_test_set)
print("="*50)

print(type(ds_strat_train_set))

# Convert dialect string to class label

In this case we can easily convert to the correspond dialect when we need.

In [None]:
labels = list(set(ds_strat_train_set['label']))
print(labels)
print("="*50)
print(len(labels))
print("="*50)
ClassLabels = ClassLabel(num_classes=len(labels), names=labels)
print(ClassLabels)

In [None]:
# Look how the labels are string
print(ds_strat_train_set.features)

In [None]:
# Mapping Labels to IDs
def map_dialect_str2int(data):
    data['label'] = ClassLabels.str2int(data['label'])
    return data

ds_strat_train_set = ds_strat_train_set.map(map_dialect_str2int, batched=True)
# Casting label column to ClassLabel Object
ds_strat_train_set = ds_strat_train_set.cast_column('label', ClassLabels)



ds_strat_dev_set = ds_strat_dev_set.map(map_dialect_str2int, batched=True)
# Casting label column to ClassLabel Object
ds_strat_dev_set = ds_strat_dev_set.cast_column('label', ClassLabels)


ds_strat_test_set = ds_strat_test_set.map(map_dialect_str2int, batched=True)
# Casting label column to ClassLabel Object
ds_strat_test_set = ds_strat_test_set.cast_column('label', ClassLabels)

In [None]:
# Look how the labels are now ClassLabels
print(ds_strat_train_set.features)

# Compare 

In [None]:
print("==================== Check our conversation ====================")
print(list(strat_train_set['label']) == ClassLabels.int2str(ds_strat_train_set['label']))
print(list(strat_dev_set['label'])   == ClassLabels.int2str(ds_strat_dev_set['label']))
print(list(strat_test_set['label']) == ClassLabels.int2str(ds_strat_test_set['label']))

print(list(strat_train_set['label'])[:5])
print(ClassLabels.int2str(ds_strat_train_set['label'][:5]))

print("="*50)

print(list(strat_dev_set['label'])[:5])
print(ClassLabels.int2str(ds_strat_dev_set['label'][:5]))


print("="*50)

print(list(strat_test_set['label'])[:5])
print(ClassLabels.int2str(ds_strat_test_set['label'][:5]))


In [None]:
dialect_datasets = DatasetDict()

dialect_datasets['train']      = ds_strat_train_set
dialect_datasets['validation'] = ds_strat_dev_set
dialect_datasets['test']       = ds_strat_test_set

dialect_datasets

In [None]:
print(dialect_datasets['train'].features)
print("="*50)
print(dialect_datasets['validation'].features)
print("="*50)
print(dialect_datasets['test'].features)

# Push the data into the hub

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
dialect_datasets.push_to_hub('Abdelrahman-Rezk/Arabic_Dialect_Identification')

In [None]:
dialect_datasets = load_dataset('Abdelrahman-Rezk/Arabic_Dialect_Identification')
dialect_datasets