In [90]:
from datasets import (
    Dataset, 
    DatasetDict, 
    Features,
    Value,
    Sequence,
    load_dataset, 
)
import os
import polars as pl

from huggingface_hub import HfApi
import shutil

# Expose only the GPU with index 1 to CUDA-aware libraries loaded by this kernel.
# NOTE(review): none of the visible cells run GPU work — confirm this is still needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [11]:
def get_parquet_path(PATH, subdir='data', split='train'):
    """Build the `hf://` glob that matches the parquet shards of a Hub dataset.

    Args:
        PATH: Dataset repo id on the Hugging Face Hub, e.g. 'org/name'.
            Leading/trailing slashes are ignored.
        subdir: Directory inside the repo that holds the shards. Defaults to
            'data'; repos that store one directory per configuration can pass
            that directory name instead.
        split: Prefix of the shard file names. Defaults to 'train'.

    Returns:
        The glob string 'hf://datasets/<PATH>/<subdir>/<split>-*.parquet'.
    """
    # Stray slashes would otherwise produce an empty path segment ('//').
    repo_id = PATH.strip('/')
    return f'hf://datasets/{repo_id}/{subdir}/{split}-*.parquet'

In [66]:
# SFT mixture: read the train parquet shards with polars, keep the result as pandas.
HF_PATH = 'MBZUAI-Paris/Darija-SFT-Mixture'
df_sft_mbzuai = pl.read_parquet(get_parquet_path(HF_PATH)).to_pandas()

In [71]:
# df_sft_mbzuai['dataset'].value_counts()

In [70]:
# Select the rows that come from the 'hard_coded' subset.
is_hard_coded = df_sft_mbzuai['dataset'].eq('hard_coded')
samples = df_sft_mbzuai.loc[is_hard_coded]

In [73]:
# Display the 'hard_coded' rows selected above; this subset is dropped by the filtering cell below.
samples

Unnamed: 0,dataset,id,messages,direction,metadata
837,hard_coded,hard_coded_11,"[{'content': 'واش تقدر تاخد بلاصة Google؟', 'r...",,
6817,hard_coded,hard_coded_2,"[{'content': 'شكون لي قادّك؟', 'role': 'user'}...",,
8044,hard_coded,hard_coded_7,[{'content': 'عطيني شي مقدمة قصيرة على جامعة م...,,
9049,hard_coded,hard_coded_6,[{'content': 'واش تقدر تگول لي شكون لي قادّك؟ ...,,
13302,hard_coded,hard_coded_4,"[{'content': 'دوي ليا على راسك.', 'role': 'use...",,
...,...,...,...,...,...
447523,hard_coded,hard_coded_5,[{'content': 'واش تقدر تگول لي شكون ليا صنعك؟ ...,,
447775,hard_coded,hard_coded_11,"[{'content': 'واش تقدر تاخد بلاصة Google؟', 'r...",,
448425,hard_coded,hard_coded_4,"[{'content': 'دوي ليا على راسك.', 'role': 'use...",,
452014,hard_coded,hard_coded_2,"[{'content': 'شكون لي قادّك؟', 'role': 'user'}...",,


In [75]:
# Source subsets excluded from the SFT mixture.
remove_sets = [
    'nllb-seed_few_shot',
    'doda',
    'doda_few_shot',
    'flores+_few_shot',
    'hard_coded'
]

# Filter out the bad sets. reset_index(drop=True) restores a RangeIndex: without
# it the surviving original row labels form a non-range index, which
# Dataset.from_pandas would persist as an extra `__index_level_0__` column.
df_sft_mbzuai = (
    df_sft_mbzuai[~df_sft_mbzuai['dataset'].isin(remove_sets)]
    .reset_index(drop=True)
)

In [15]:
# English-Darija translation pairs: train shards read with polars, converted to pandas.
HF_PATH = 'BounharAbdelaziz/Terjman-v2-English-Darija-Dataset-350K'
df_en_trans = pl.read_parquet(get_parquet_path(HF_PATH)).to_pandas()

In [14]:
# Multilingual translation data: train shards read with polars, converted to pandas.
HF_PATH = 'BounharAbdelaziz/Darija-Translation-Dataset-22K-all-13-lang'
df_multi_trans = pl.read_parquet(get_parquet_path(HF_PATH)).to_pandas()

In [21]:
# Language identification (LID) data for Arabic dialects
HF_PATH = 'atlasia/No-Arabic-Dialect-Left-Behind'
df_arabic_lid = pl.read_parquet(get_parquet_path(HF_PATH)).to_pandas()

In [25]:
# Embedding data: three sub-directories of one Hub repo
# (triplet, negation-triplet, pair-score), each loaded into its own pandas frame.
HF_PATH = 'atlasia/Sentence-Transformers-Morocco-Darija'

SPLIT = 'triplet'
PATH = f'hf://datasets/{HF_PATH}/{SPLIT}/train-*.parquet'
df_triplet = pl.read_parquet(PATH).to_pandas()

SPLIT = 'negation-triplet'
PATH = f'hf://datasets/{HF_PATH}/{SPLIT}/train-*.parquet'
df_negation_triplet = pl.read_parquet(PATH).to_pandas()

SPLIT = 'pair-score'
PATH = f'hf://datasets/{HF_PATH}/{SPLIT}/train-*.parquet'
df_pair_score = pl.read_parquet(PATH).to_pandas()

In [26]:
# Emotion detection: load the train split, keep rows whose `language` is 'ary',
# then convert the filtered dataset to pandas.
HF_PATH = "atlasia/emotion-detection"
emotion_train = load_dataset(HF_PATH, split="train")
emotion_ary = emotion_train.filter(lambda row: row['language'] == 'ary')
df_emotion = emotion_ary.to_pandas()


Downloading readme: 100%|██████████| 877/877 [00:00<00:00, 5.62kB/s]
Downloading data: 100%|██████████| 6.33M/6.33M [00:00<00:00, 8.20MB/s]
Downloading data: 100%|██████████| 851k/851k [00:00<00:00, 2.03MB/s]
Downloading data: 100%|██████████| 4.03M/4.03M [00:00<00:00, 6.95MB/s]
Generating train split: 100%|██████████| 65098/65098 [00:00<00:00, 413533.40 examples/s]
Generating dev split: 100%|██████████| 8506/8506 [00:00<00:00, 449142.67 examples/s]
Generating test split: 100%|██████████| 41555/41555 [00:00<00:00, 521587.81 examples/s]
Filter: 100%|██████████| 65098/65098 [00:00<00:00, 112533.63 examples/s]


In [28]:
# Transliteration data: train shards read with polars, converted to pandas.
HF_PATH = 'atlasia/ATAM'
df_transliteration = pl.read_parquet(get_parquet_path(HF_PATH)).to_pandas()

In [78]:
# Topic classification data: train shards read with polars, converted to pandas.
HF_PATH = 'atlasia/moroccan_darija_domain_classifier_dataset'
df_topic_classif = pl.read_parquet(get_parquet_path(HF_PATH)).to_pandas()

# Combine datasets

In [112]:
# Hub dataset repo id passed to every `push_to_hub` call below; each task is
# uploaded there under its own `config_name`.
DATA_HUB_PATH = "BounharAbdelaziz/Atlaset-SFT"

In [113]:
# Configuration 1: Translation Tasks
# One (config name, source frame, commit message) entry per Hub configuration;
# each frame is uploaded as the 'train' split of its configuration.
translation_configs = [
    # English-Darija translations
    ('eng_ary_translation', df_en_trans, 'English to Darija translation tasks.'),
    # Multi-language translations
    ('multilingual_translation', df_multi_trans, 'Multilingual translation tasks.'),
    # Transliteration dataset
    ('transliteration', df_transliteration, 'Transliteration tasks.'),
]

for config_name, source_df, commit_message in translation_configs:
    dataset_dict = DatasetDict({'train': Dataset.from_pandas(source_df)})
    commit_info = dataset_dict.push_to_hub(
        DATA_HUB_PATH,
        config_name=config_name,
        commit_message=commit_message,
    )

# Show the CommitInfo of the last push as the cell output.
commit_info

Creating parquet from Arrow format: 100%|██████████| 355/355 [00:01<00:00, 181.87ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:09<00:00,  9.02s/it]
Creating parquet from Arrow format: 100%|██████████| 12/12 [00:01<00:00,  6.19ba/s]
Creating parquet from Arrow format: 100%|██████████| 12/12 [00:00<00:00, 199.90ba/s]
Uploading the dataset shards: 100%|██████████| 2/2 [00:09<00:00,  4.80s/it]
Creating parquet from Arrow format: 100%|██████████| 68/68 [00:00<00:00, 1775.39ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.26it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Atlaset-SFT/commit/78b76c956c5b614fe6bb708cc1c38e3adb4a572c', commit_message='Transliteration tasks.', commit_description='', oid='78b76c956c5b614fe6bb708cc1c38e3adb4a572c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Atlaset-SFT', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Atlaset-SFT'), pr_revision=None, pr_num=None)

In [114]:
# Configuration 2: Language Identification and Classification
# One (config name, source frame, commit message) entry per Hub configuration;
# each frame is uploaded as the 'train' split of its configuration.
classification_configs = [
    # Arabic dialect identification
    ('arabic_dialect_identification', df_arabic_lid, 'Classification task.'),
    # Topic classification
    ('topic_classification', df_topic_classif, 'Topic classification task.'),
    # Emotion detection
    ('emotion_detection', df_emotion, 'Emotion detection task.'),
]

for config_name, source_df, commit_message in classification_configs:
    dataset_dict = DatasetDict({'train': Dataset.from_pandas(source_df)})
    commit_info = dataset_dict.push_to_hub(
        DATA_HUB_PATH,
        config_name=config_name,
        commit_message=commit_message,
    )

# Show the CommitInfo of the last push as the cell output.
commit_info

Creating parquet from Arrow format: 100%|██████████| 398/398 [00:02<00:00, 156.27ba/s]
Creating parquet from Arrow format: 100%|██████████| 398/398 [00:00<00:00, 534.57ba/s]
Creating parquet from Arrow format: 100%|██████████| 398/398 [00:00<00:00, 1176.12ba/s]
Uploading the dataset shards: 100%|██████████| 3/3 [00:17<00:00,  5.75s/it]
Creating parquet from Arrow format: 100%|██████████| 189/189 [00:00<00:00, 2776.69ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.29it/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 980.44ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Atlaset-SFT/commit/d29d16839127eb8f16b9739dd0960b102c9d1e23', commit_message='Emotion detection task.', commit_description='', oid='d29d16839127eb8f16b9739dd0960b102c9d1e23', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Atlaset-SFT', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Atlaset-SFT'), pr_revision=None, pr_num=None)

In [None]:
# SFT conversations (the MBZUAI mixture after the subset filtering above)
dataset_dict = DatasetDict({
    # preserve_index=False: the frame was row-filtered, so its index is no longer
    # a plain range and would otherwise be uploaded as an extra
    # `__index_level_0__` column.
    'train': Dataset.from_pandas(df_sft_mbzuai, preserve_index=False),
})

dataset_dict.push_to_hub(
    DATA_HUB_PATH,
    config_name='mbzuai_conversations_sft',
    # The previous message ('Embedding.') was copied from the embedding cell.
    commit_message='SFT conversations.'
)

Creating parquet from Arrow format: 100%|██████████| 196/196 [00:01<00:00, 108.14ba/s]
Creating parquet from Arrow format: 100%|██████████| 196/196 [00:02<00:00, 83.54ba/s]
Uploading the dataset shards: 100%|██████████| 2/2 [00:16<00:00,  8.12s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Atlaset-SFT/commit/a427bb331316ca0ded64db6df9d7fd8fe3396946', commit_message='Embedding.', commit_description='', oid='a427bb331316ca0ded64db6df9d7fd8fe3396946', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Atlaset-SFT', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Atlaset-SFT'), pr_revision=None, pr_num=None)

In [None]:
# Configuration 3: Semantic Tasks
# One (config name, source frame) entry per Hub configuration; each frame is
# uploaded as the 'train' split of its configuration.
semantic_configs = [
    # Triplets
    ('similarity_triplets', df_triplet),
    # Negation triplets
    ('entailment_triplets', df_negation_triplet),
    # Scored sentence pairs
    ('sentence_pairs', df_pair_score),
]

for config_name, source_df in semantic_configs:
    dataset_dict = DatasetDict({'train': Dataset.from_pandas(source_df)})
    commit_info = dataset_dict.push_to_hub(
        DATA_HUB_PATH,
        config_name=config_name,
        # Same message for all three pushes (the first one used to read 'Embedding.').
        commit_message='Embedding task.',
    )

# Show the CommitInfo of the last push as the cell output.
commit_info

Creating parquet from Arrow format: 100%|██████████| 37/37 [00:00<00:00, 80.11ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:02<00:00, 18.36ba/s]
Uploading the dataset shards: 100%|██████████| 2/2 [00:11<00:00,  5.87s/it]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 569.26ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.42it/s]
Creating parquet from Arrow format: 100%|██████████| 200/200 [00:00<00:00, 409.05ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.44s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Atlaset-SFT/commit/e56f442240181bdcfd69f87d09c93b203ca25d1a', commit_message='Embedding task.', commit_description='', oid='e56f442240181bdcfd69f87d09c93b203ca25d1a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Atlaset-SFT', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Atlaset-SFT'), pr_revision=None, pr_num=None)