In [1]:
from datasets import (load_dataset,
                      DatasetDict,
                      concatenate_datasets,
                    )
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DATASET_HUB = "BounharAbdelaziz/AL-Atlas-Moroccan-Darija-Pretraining-Dataset"

# Load the datasets to combine

In [3]:
jasper_dataset = load_dataset("JasperV13/Darija_Dataset")

In [4]:
jasper_darija_instruct_dataset = load_dataset("JasperV13/Darija_instruct")

In [5]:
abdeljalil_darija_topic_ds = load_dataset("abdeljalilELmajjodi/darija_topic_ds")

In [6]:
abdeljalil_darija_s2s_ds = load_dataset("abdeljalilELmajjodi/darija_s2s_ds")

In [7]:
abdeljalil_darija_qa_ds = load_dataset("abdeljalilELmajjodi/darija_qa_ds")

In [8]:
abdeljalil_darija_classification_ds = load_dataset("abdeljalilELmajjodi/darija_classification_ds")

In [9]:
bourbouh_subtitles_dataset = load_dataset("bourbouh/moroccan-darija-youtube-subtitles")

In [10]:
Darija_QA_dataset = load_dataset("Lyte/Darija-QA")

In [11]:
mo_darija_merged_dataset = load_dataset("tachicart/mo_darija_merged")

# Concatenate the datasets

In [12]:
# ------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------------------------------------------------------------------ #

def extract_and_stack_dataset(dataset, dataset_source, dataset_source_column, use_source_from_dataset, metadata_columns):
    """
    Stack multiple text columns into a single "text" column, keeping metadata, and add dataset_source as a separate column.

    Args:
        dataset (DatasetDict or Dataset): The dataset or dataset dict to process.
        dataset_source (str): The source of the dataset to add as a separate column.
        dataset_source_column (str): In case the dataset has the source, use it.
        use_source_from_dataset (bool): Whether to use the 'source' column from the dataset.
        metadata_columns (list or str): List of columns (or single column) to exclude (used as metadata).

    Returns:
        DatasetDict: A DatasetDict with stacked text, metadata columns, and dataset_source as a separate column.
    """
    if isinstance(metadata_columns, str):
        metadata_columns = [metadata_columns]  # Handle single column as a list

    def transform(sample):
        # Check for columns that don't exist in the dataset
        for col in metadata_columns:
            if col not in sample:
                raise KeyError(f"Column '{col}' is not in the dataset. Available columns: {list(sample.keys())}")

        # Identify text columns to stack (all columns except excluded)
        text_columns = [col for col in sample.keys() if col not in metadata_columns]
        stacked_text = []
        metadata_list = {col: [] for col in metadata_columns}  # Initialize metadata dict

        # Stack text and metadata
        for col in text_columns:
            if isinstance(sample[col], list):  # Handle batch mapping with lists
                stacked_text.extend(sample[col])
                for meta_col in metadata_columns:
                    metadata_list[meta_col].extend(sample[meta_col])
            else:
                stacked_text.append(sample[col])
                for meta_col in metadata_columns:
                    metadata_list[meta_col].append(sample[meta_col])

        # Prepare metadata as string for each entry
        merged_metadata = [
            {meta_col: str(metadata_list[meta_col][i]) for meta_col in metadata_columns}
            for i in range(len(stacked_text))
        ]
        
        # Convert entire metadata to string (by converting the whole dict to a string)
        merged_metadata_as_str = [str(metadata) for metadata in merged_metadata]
       
        # Determine the dataset_source_column value based on whether to use it from the dataset
        if use_source_from_dataset and dataset_source_column in sample:
            if isinstance(sample[dataset_source_column], list):  # Batch processing
                dataset_source_column_value = sample[dataset_source_column]
            else:
                dataset_source_column_value = [sample[dataset_source_column]] * len(stacked_text)
        else:
            dataset_source_column_value = [dataset_source] * len(stacked_text)

            
        return {
            "text": stacked_text,
            "dataset_source": dataset_source_column_value,  # Separate column for dataset source
            "metadata": merged_metadata_as_str,  # Entire metadata as string
        }

    # Apply transformation across splits, preserving split structure
    if isinstance(dataset, DatasetDict):
        new_splits = {}
        for split in dataset.keys():
            print(f"Processing split: {split}...")
            new_splits[split] = dataset[split].map(
                transform,
                batched=True,  # Needed to handle stacking correctly
                remove_columns=dataset[split].column_names
            ).flatten_indices()
        return DatasetDict(new_splits)
    else:
        return dataset.map(
            transform,
            batched=True,
            remove_columns=dataset.column_names
        ).flatten_indices()


# ------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------------------------------------------------------------------ #

from datasets import DatasetDict, concatenate_datasets

def concat_datasetdicts(d1: DatasetDict, d2: DatasetDict) -> DatasetDict:
    """
    Concatenates two DatasetDict objects, handling missing splits and columns.
    If a split exists in one DatasetDict but not the other, it is included as-is.
    If columns are missing in one dataset, they are added with None values.

    Args:
        d1 (DatasetDict): The first DatasetDict to concatenate.
        d2 (DatasetDict): The second DatasetDict to concatenate.

    Returns:
        DatasetDict: A new DatasetDict with concatenated splits.
    """
    # Get all unique splits from both DatasetDicts
    all_splits = set(d1.keys()).union(d2.keys())

    # Create a new DatasetDict by combining splits
    concatenated = DatasetDict()
    
    for split in all_splits:
        if split in d1 and split in d2:
            # Both DatasetDicts have this split, so concatenate them
            # Get the feature sets from both datasets
            features_d1 = set(d1[split].column_names)
            features_d2 = set(d2[split].column_names)

            # Find missing columns in both datasets
            missing_in_d1 = features_d2 - features_d1
            missing_in_d2 = features_d1 - features_d2

            # Add missing columns with None values
            for missing_col in missing_in_d1:
                d1[split] = d1[split].add_column(missing_col, [None] * len(d1[split]))
            for missing_col in missing_in_d2:
                d2[split] = d2[split].add_column(missing_col, [None] * len(d2[split]))

            # Now concatenate the datasets with the same features
            concatenated[split] = concatenate_datasets([d1[split], d2[split]])

        elif split in d1:
            # Only the first DatasetDict has this split
            concatenated[split] = d1[split]
        elif split in d2:
            # Only the second DatasetDict has this split
            concatenated[split] = d2[split]

    return concatenated

# ------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------------------------------------------------------------------ #

def combine_all_datasets(all_datasets_dict: dict={}):
    """ Combines all dataset from a given dictionary of structure. """
    
    # Create a new DatasetDict by combining splits
    combined = DatasetDict()
        
    for dataset_source, data in all_datasets_dict.items():
        
        if data["dataset_source_column_name"] is not None:
            use_source_from_dataset = True
            dataset_source_column = data["dataset_source_column_name"]
        else:
            use_source_from_dataset = False
            dataset_source_column = None
        
        dataset = extract_and_stack_dataset(dataset=data["dataset"], 
                                                dataset_source=dataset_source, 
                                                dataset_source_column=dataset_source_column, 
                                                use_source_from_dataset=use_source_from_dataset, 
                                                metadata_columns=data["metadata_columns"]
                )
        combined = concat_datasetdicts(combined, dataset)
        
        print(f"=" * 25)
        
    return combined
    
# ------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------------------------------------------------------------------ #

In [13]:
all_datasets_dict= {
                "JasperV13/Darija_Dataset": {
                    "dataset": jasper_dataset,
                    "metadata_columns": ['source'],
                    "dataset_source_column_name": 'source',
                },
                "abdeljalilELmajjodi/darija_topic_ds": {
                    "dataset": abdeljalil_darija_topic_ds,
                    "metadata_columns": ['topic'],
                    "dataset_source_column_name": None,
                },
                "abdeljalilELmajjodi/darija_qa_ds": {
                    "dataset": abdeljalil_darija_qa_ds,
                    "metadata_columns": ['question_number', 'correct_answer_num'],
                    "dataset_source_column_name": None,
                },
                "abdeljalilELmajjodi/abdeljalil_darija_classification_ds": {
                    "dataset": abdeljalil_darija_classification_ds,
                    "metadata_columns": ['index_id', 'category'],
                    "dataset_source_column_name": None,
                },
                "bourbouh/moroccan-darija-youtube-subtitles": {
                    "dataset": bourbouh_subtitles_dataset,
                    "metadata_columns": ['video_id', 'title'],
                    "dataset_source_column_name": None,
                },
                "JasperV13/Darija_instruct": {
                    "dataset": jasper_darija_instruct_dataset,
                    "metadata_columns": [],
                    "dataset_source_column_name": None,
                },
                "tachicart/mo_darija_merged": {
                    "dataset": mo_darija_merged_dataset,
                    "metadata_columns": ['ar'],
                    "dataset_source_column_name": None,
                },
                "Lyte/Darija-QA": {
                    "dataset": Darija_QA_dataset,
                    "metadata_columns": [],
                    "dataset_source_column_name": None,
                },
            }

In [14]:
combined = combine_all_datasets(all_datasets_dict)

Processing split: train...
Processing split: train...
Processing split: train...
Processing split: train...
Processing split: train...
Processing split: train...
Processing split: train...
Processing split: test...
Processing split: train...


In [15]:
combined

DatasetDict({
    train: Dataset({
        features: ['text', 'dataset_source', 'metadata'],
        num_rows: 2774431
    })
    test: Dataset({
        features: ['text', 'dataset_source', 'metadata'],
        num_rows: 1820
    })
})

In [16]:
set(combined['train']['dataset_source'])

{'ArabicDarija_xP3x',
 'Darija-Stories-Dataset',
 'DarijaBridge',
 'DarijaEnglish-xP3x',
 'JasperV13/Darija_instruct',
 'Lyte/Darija-QA',
 'MArSum',
 'ML101',
 'MTCD',
 'abdeljalilELmajjodi/abdeljalil_darija_classification_ds',
 'abdeljalilELmajjodi/darija_qa_ds',
 'abdeljalilELmajjodi/darija_topic_ds',
 'atlasia/darija_english',
 'bourbouh/moroccan-darija-youtube-subtitles',
 'darija_speech_to_text',
 'darija_youtube_subtitles',
 'dataset_dyal_darija',
 'goud-sum',
 'moroccan_darija_wikipedia_dataset',
 'tachicart/mo_darija_merged'}

# Push the dataset to the Hugging Face Hub

In [None]:
combined.push_to_hub(DATASET_HUB)

Creating parquet from Arrow format: 100%|██████████| 925/925 [00:01<00:00, 570.44ba/s]
Creating parquet from Arrow format: 100%|██████████| 925/925 [00:01<00:00, 581.73ba/s]
