In [1]:
from datasets import (load_dataset,
                      DatasetDict,
                      concatenate_datasets,
                      Dataset
                    )
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATASET_HUB = "atlasia/AL-Atlas-Moroccan-Darija-Pretraining-Dataset"

In [4]:
dataset_in_hub = load_dataset(DATASET_HUB)

Downloading data: 100%|██████████| 48/48 [04:25<00:00,  5.53s/files]
Generating train split: 100%|██████████| 71999640/71999640 [02:03<00:00, 581154.77 examples/s] 
Generating test split: 100%|██████████| 406331/406331 [00:00<00:00, 613579.42 examples/s]


# Load the datasets to combine

In [3]:
jasper_dataset = load_dataset("JasperV13/Darija_Dataset")

Generating train split: 100%|██████████| 2731313/2731313 [00:07<00:00, 367983.00 examples/s]


In [4]:
jasper_darija_instruct_dataset = load_dataset("JasperV13/Darija_instruct")

Generating train split: 100%|██████████| 843/843 [00:00<00:00, 174503.91 examples/s]


In [None]:
abdeljalil_darija_topic_ds = load_dataset("abdeljalilELmajjodi/darija_topic_ds")

In [None]:
abdeljalil_darija_s2s_ds = load_dataset("abdeljalilELmajjodi/darija_s2s_ds")

In [None]:
abdeljalil_darija_qa_ds = load_dataset("abdeljalilELmajjodi/darija_qa_ds")

In [None]:
abdeljalil_darija_classification_ds = load_dataset("abdeljalilELmajjodi/darija_classification_ds")

In [None]:
bourbouh_subtitles_dataset = load_dataset("bourbouh/moroccan-darija-youtube-subtitles")

In [None]:
Darija_QA_dataset = load_dataset("Lyte/Darija-QA")

In [None]:
mo_darija_merged_dataset = load_dataset("tachicart/mo_darija_merged")

In [6]:
fine_web_sawalni_filtered = load_dataset("sawalni-ai/fw-darija")

In [4]:
atlasia_facebook_darija_dataset = load_dataset("atlasia/facebook_darija_dataset")

Generating train split: 100%|██████████| 1374/1374 [00:00<00:00, 117302.89 examples/s]


In [5]:
atlasia_facebook_darija_dataset

DatasetDict({
    train: Dataset({
        features: ['pageName', 'text', '__index_level_0__'],
        num_rows: 1374
    })
})

In [7]:
atlasia_facebook_darija_dataset = atlasia_facebook_darija_dataset.remove_columns(["__index_level_0__"])
atlasia_facebook_darija_dataset

DatasetDict({
    train: Dataset({
        features: ['pageName', 'text'],
        num_rows: 1374
    })
})

#### MAC dataset available as CSV

In [None]:
mac_df = pd.read_csv("MAC corpus.csv")

https://github.com/Lafifi-24/arabic-dialect-identification

In [None]:
mac_df.head()

In [None]:
# keep only the dialect
filtered_mac_df = mac_df[mac_df['class'] == 'dialectal']

In [None]:
filtered_mac_df.head()

In [None]:
mac_dataset = Dataset.from_pandas(filtered_mac_df)

In [None]:
# Hold out 1% of the MAC corpus for test. The fixed seed keeps the split identical
# across kernel restarts; the pandas index column left by from_pandas is dropped.
mac_dataset = mac_dataset.train_test_split(test_size=0.01, seed=42).remove_columns('__index_level_0__')

In [None]:
mac_dataset

In [4]:
fineweb_2_ary = load_dataset("Omartificial-Intelligence-Space/FineWeb2-Moroccan-Arabic")

Downloading data: 100%|██████████| 34/34 [04:34<00:00,  8.07s/files]
Generating train split: 100%|██████████| 69181074/69181074 [01:37<00:00, 707992.38 examples/s] 
Generating test split: 100%|██████████| 404456/404456 [00:00<00:00, 698677.54 examples/s]


In [5]:
fineweb_2_ary

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 69181074
    })
    test: Dataset({
        features: ['text'],
        num_rows: 404456
    })
})

# Concatenate the datasets

In [6]:
# ------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------------------------------------------------------------------ #

def extract_and_stack_dataset(dataset, dataset_source, dataset_source_column, use_source_from_dataset, metadata_columns):
    """
    Stack every non-metadata column into a single "text" column.

    Each input column that is neither a metadata column nor the per-row source
    column is treated as text. The values of all text columns are appended one
    column after another, so a batch of N rows with K text columns yields N * K
    output rows. Metadata and source values are repeated once per text column so
    that all three output columns stay row-aligned.

    Args:
        dataset (DatasetDict or Dataset): The dataset or dataset dict to process.
        dataset_source (str): Label written to "dataset_source" when the source is
            not read from the dataset itself.
        dataset_source_column (str or None): Name of the column holding a per-row source.
        use_source_from_dataset (bool): If True and `dataset_source_column` exists,
            its values are used for "dataset_source"; otherwise `dataset_source` is used.
        metadata_columns (list, str or None): Column(s) excluded from stacking and
            serialized into the "metadata" column. None means no metadata columns.

    Returns:
        DatasetDict or Dataset: Same container type as the input, with exactly the
        columns "text", "dataset_source" and "metadata" (a stringified dict).

    Raises:
        KeyError: If a metadata column is missing from the dataset.
    """
    if metadata_columns is None:
        metadata_columns = []
    elif isinstance(metadata_columns, str):
        metadata_columns = [metadata_columns]  # Handle single column as a list
    else:
        metadata_columns = list(metadata_columns)

    def as_list(value):
        """Wrap a scalar so batched (list) and single-sample inputs share one code path."""
        return value if isinstance(value, list) else [value]

    def transform(sample):
        # Fail early, with the available columns listed, on a misconfigured entry
        for col in metadata_columns:
            if col not in sample:
                raise KeyError(f"Column '{col}' is not in the dataset. Available columns: {list(sample.keys())}")

        source_from_sample = use_source_from_dataset and dataset_source_column in sample

        # The per-row source column is never text, even if it was not listed as metadata
        text_columns = [
            col for col in sample.keys()
            if col not in metadata_columns
            and not (source_from_sample and col == dataset_source_column)
        ]

        metadata_values = {col: as_list(sample[col]) for col in metadata_columns}
        source_values = as_list(sample[dataset_source_column]) if source_from_sample else None

        stacked_text = []
        stacked_sources = []
        stacked_metadata = []
        for col in text_columns:
            values = as_list(sample[col])
            stacked_text.extend(values)

            # Repeat the source once per text column so it stays aligned with the text
            if source_from_sample:
                stacked_sources.extend(source_values)
            else:
                stacked_sources.extend([dataset_source] * len(values))

            # Each row's metadata is a dict of stringified values, itself stored as a string
            stacked_metadata.extend(
                str({meta_col: str(metadata_values[meta_col][i]) for meta_col in metadata_columns})
                for i in range(len(values))
            )

        return {
            "text": stacked_text,
            "dataset_source": stacked_sources,  # Separate column for dataset source
            "metadata": stacked_metadata,  # Entire metadata as string
        }

    # Apply transformation across splits, preserving split structure
    if isinstance(dataset, DatasetDict):
        new_splits = {}
        for split in dataset.keys():
            print(f"Processing split: {split}...")
            new_splits[split] = dataset[split].map(
                transform,
                batched=True,  # Needed because the output has more rows than the input
                remove_columns=dataset[split].column_names
            ).flatten_indices()
        return DatasetDict(new_splits)
    else:
        return dataset.map(
            transform,
            batched=True,
            remove_columns=dataset.column_names
        ).flatten_indices()


# ------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------------------------------------------------------------------ #

from datasets import DatasetDict, concatenate_datasets

def concat_datasetdicts(d1: DatasetDict, d2: DatasetDict) -> DatasetDict:
    """
    Concatenates two DatasetDict objects, handling missing splits and columns.

    If a split exists in only one DatasetDict, it is included as-is.
    If a split exists in both, columns present on only one side are added to the
    other side filled with None before the two are concatenated (d1 rows first).
    Neither input is modified: padding is done on local copies.

    Splits are emitted in a deterministic order: d1's splits in their original
    order, followed by splits that only d2 has.

    Args:
        d1 (DatasetDict): The first DatasetDict to concatenate.
        d2 (DatasetDict): The second DatasetDict to concatenate.

    Returns:
        DatasetDict: A new DatasetDict with concatenated splits.
    """
    # Ordered union of split names (a set would make the split order vary between runs)
    split_names = list(d1.keys()) + [name for name in d2.keys() if name not in d1]

    concatenated = DatasetDict()

    for split in split_names:
        if split in d1 and split in d2:
            # Work on local references so the caller's DatasetDicts are left untouched
            left, right = d1[split], d2[split]
            left_columns = list(left.column_names)
            right_columns = list(right.column_names)

            # Pad each side with the columns it lacks, in the other side's column order
            for col in right_columns:
                if col not in left_columns:
                    left = left.add_column(col, [None] * len(left))
            for col in left_columns:
                if col not in right_columns:
                    right = right.add_column(col, [None] * len(right))

            # Both sides now expose the same set of columns
            concatenated[split] = concatenate_datasets([left, right])
        elif split in d1:
            # Only the first DatasetDict has this split
            concatenated[split] = d1[split]
        else:
            # Only the second DatasetDict has this split
            concatenated[split] = d2[split]

    return concatenated

# ------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------------------------------------------------------------------ #

def combine_all_datasets(all_datasets_dict: dict=None):
    """
    Stack and concatenate every dataset described in `all_datasets_dict`.

    Args:
        all_datasets_dict (dict or None): Maps a source label to a spec dict with keys
            - "dataset": the dataset to process (required);
            - "metadata_columns": columns kept as metadata rather than stacked as
              text (optional, defaults to no metadata columns);
            - "dataset_source_column_name": column holding a per-row source; when
              missing or None, the source label (the outer key) is used instead.
            None or an empty dict yields an empty DatasetDict.

    Returns:
        DatasetDict: All processed datasets concatenated split by split, in the
        iteration order of `all_datasets_dict`.
    """
    # Start from an empty DatasetDict and fold each processed dataset into it
    combined = DatasetDict()

    for dataset_source, data in (all_datasets_dict or {}).items():
        # A configured column name is what switches on per-row sources
        dataset_source_column = data.get("dataset_source_column_name")

        stacked = extract_and_stack_dataset(
            dataset=data["dataset"],
            dataset_source=dataset_source,
            dataset_source_column=dataset_source_column,
            use_source_from_dataset=dataset_source_column is not None,
            metadata_columns=data.get("metadata_columns", []),
        )
        combined = concat_datasetdicts(combined, stacked)

        print("=" * 25)

    return combined
    
# ------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------------------------------------------------------------------ #

In [7]:
# Registry of the datasets to merge. Every variable referenced here must already have
# been loaded by the cells above, otherwise this cell raises NameError.
#
# Each key is the label passed to combine_all_datasets as the dataset source.
# Per-entry fields:
#   - "dataset": the loaded dataset object.
#   - "metadata_columns": columns excluded from text stacking and serialized into
#     the "metadata" column.
#   - "dataset_source_column_name": column providing a per-row source, or None to
#     use the key as the source label.
all_datasets_dict= {
                "JasperV13/Darija_Dataset": {
                    "dataset": jasper_dataset,
                    "metadata_columns": ['source'],
                    "dataset_source_column_name": 'source',
                },
                "abdeljalilELmajjodi/darija_topic_ds": {
                    "dataset": abdeljalil_darija_topic_ds,
                    "metadata_columns": ['topic'],
                    "dataset_source_column_name": None,
                },
                "abdeljalilELmajjodi/darija_qa_ds": {
                    "dataset": abdeljalil_darija_qa_ds,
                    "metadata_columns": ['question_number', 'correct_answer_num'],
                    "dataset_source_column_name": None,
                },
                # NOTE(review): this label differs from the repo id the dataset was loaded
                # from ("abdeljalilELmajjodi/darija_classification_ds"); confirm it is intended.
                "abdeljalilELmajjodi/abdeljalil_darija_classification_ds": {
                    "dataset": abdeljalil_darija_classification_ds,
                    "metadata_columns": ['index_id', 'category'],
                    "dataset_source_column_name": None,
                },
                "bourbouh/moroccan-darija-youtube-subtitles": {
                    "dataset": bourbouh_subtitles_dataset,
                    "metadata_columns": ['video_id', 'title'],
                    "dataset_source_column_name": None,
                },
                "JasperV13/Darija_instruct": {
                    "dataset": jasper_darija_instruct_dataset,
                    "metadata_columns": [],
                    "dataset_source_column_name": None,
                },
                "tachicart/mo_darija_merged": {
                    "dataset": mo_darija_merged_dataset,
                    "metadata_columns": ['ar'],
                    "dataset_source_column_name": None,
                },
                "Lyte/Darija-QA": {
                    "dataset": Darija_QA_dataset,
                    "metadata_columns": [],
                    "dataset_source_column_name": None,
                },
                "MAC Corpus": {
                    "dataset": mac_dataset,
                    "metadata_columns": ['type', 'class'],
                    "dataset_source_column_name": None,
                },
                "sawalni-ai/fw-darija": {
                    "dataset": fine_web_sawalni_filtered,
                    "metadata_columns": ['gherbal_cleaned_text', 'gherbal_predictions', 'gherbal_lang', 'gherbal_score', 'id', 'metadata', 'domain'],
                    "dataset_source_column_name": None,
                },
            }

NameError: name 'jasper_dataset' is not defined

In [8]:
to_add_datasets_dict= {
                "Omartificial-Intelligence-Space/FineWeb2-Moroccan-Arabic": {
                    "dataset": fineweb_2_ary,
                    "metadata_columns": [],
                    "dataset_source_column_name": None,
                },
            }

In [9]:
fineweb_2_ary

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 69181074
    })
    test: Dataset({
        features: ['text'],
        num_rows: 404456
    })
})

In [10]:
to_add_dataset = combine_all_datasets(to_add_datasets_dict)

Processing split: train...


Map: 100%|██████████| 69181074/69181074 [07:09<00:00, 160941.05 examples/s]
Flattening the indices: 100%|██████████| 69181074/69181074 [03:59<00:00, 289345.28 examples/s] 


Processing split: test...


Map: 100%|██████████| 404456/404456 [00:01<00:00, 204918.55 examples/s]
Flattening the indices: 100%|██████████| 404456/404456 [00:00<00:00, 634658.87 examples/s] 






In [11]:
to_add_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'dataset_source', 'metadata'],
        num_rows: 69181074
    })
    test: Dataset({
        features: ['text', 'dataset_source', 'metadata'],
        num_rows: 404456
    })
})

In [12]:
# Sanity check of the source labels. Dataset.unique works on the column directly,
# instead of materializing ~69M Python strings just to build a set.
set(to_add_dataset['train'].unique('dataset_source'))

{'Omartificial-Intelligence-Space/FineWeb2-Moroccan-Arabic'}

In [13]:
combined = concat_datasetdicts(to_add_dataset, dataset_in_hub)

In [14]:
# List every source label present after the merge. Dataset.unique avoids loading the
# whole column into a Python list first.
set(combined['train'].unique('dataset_source'))

{'ArabicDarija_xP3x',
 'Darija-Stories-Dataset',
 'DarijaBridge',
 'DarijaEnglish-xP3x',
 'JasperV13/Darija_instruct',
 'Lyte/Darija-QA',
 'MAC Corpus',
 'MArSum',
 'ML101',
 'MTCD',
 'Omartificial-Intelligence-Space/FineWeb2-Moroccan-Arabic',
 'abdeljalilELmajjodi/abdeljalil_darija_classification_ds',
 'abdeljalilELmajjodi/darija_qa_ds',
 'abdeljalilELmajjodi/darija_topic_ds',
 'atlasia/darija_english',
 'atlasia/facebook_darija_dataset',
 'bourbouh/moroccan-darija-youtube-subtitles',
 'darija_speech_to_text',
 'darija_youtube_subtitles',
 'dataset_dyal_darija',
 'goud-sum',
 'moroccan_darija_wikipedia_dataset',
 'sawalni-ai/fw-darija',
 'tachicart/mo_darija_merged'}

# Push the dataset to the Hugging Face Hub

In [15]:
combined.push_to_hub(DATASET_HUB)

Creating parquet from Arrow format: 100%|██████████| 1500/1500 [00:01<00:00, 792.19ba/s]
Creating parquet from Arrow format: 100%|██████████| 1500/1500 [00:01<00:00, 790.82ba/s]
Creating parquet from Arrow format: 100%|██████████| 1500/1500 [00:01<00:00, 831.83ba/s]
Creating parquet from Arrow format: 100%|██████████| 1500/1500 [00:01<00:00, 829.63ba/s]
Creating parquet from Arrow format: 100%|██████████| 1500/1500 [00:01<00:00, 810.89ba/s]
Creating parquet from Arrow format: 100%|██████████| 1500/1500 [00:01<00:00, 810.63ba/s]
Creating parquet from Arrow format: 100%|██████████| 1500/1500 [00:01<00:00, 818.38ba/s]
Creating parquet from Arrow format: 100%|██████████| 1500/1500 [00:01<00:00, 822.53ba/s]
Creating parquet from Arrow format: 100%|██████████| 1500/1500 [00:01<00:00, 827.39ba/s]
Creating parquet from Arrow format: 100%|██████████| 1500/1500 [00:01<00:00, 857.38ba/s]
Creating parquet from Arrow format: 100%|██████████| 1500/1500 [00:01<00:00, 855.97ba/s]
Creating parquet from

CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/AL-Atlas-Moroccan-Darija-Pretraining-Dataset/commit/5783cacb61280a2fc47d090ebcf8e6404f9b3467', commit_message='Upload dataset', commit_description='', oid='5783cacb61280a2fc47d090ebcf8e6404f9b3467', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/AL-Atlas-Moroccan-Darija-Pretraining-Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/AL-Atlas-Moroccan-Darija-Pretraining-Dataset'), pr_revision=None, pr_num=None)

# MinHash Deduplication

In [16]:
from datasets import DatasetDict
from datasketch import MinHash, MinHashLSH
from sklearn.feature_extraction.text import CountVectorizer

In [85]:
def run_minhash_deduplication(dataset_dict, text_column, n=3, num_perm=128, threshold=0.8, LOG_FREQUENCY=1000):
    """
    Drop empty and near-duplicate rows from every split using MinHash + LSH.

    Rows are visited split by split, in the iteration order of `dataset_dict`, and
    row by row within a split. A row is kept only if its text is a non-blank string
    with at least `n` words and no previously kept row matches it in the LSH index.
    The index is shared by all splits, so a row in a later split that duplicates a
    row kept in an earlier split is dropped as well.

    Args:
        dataset_dict: Mapping of split name to dataset.
        text_column (str): Name of the column holding the text to compare.
        n (int): Size of the word n-grams hashed into each signature.
        num_perm (int): Number of permutations for MinHash / MinHashLSH.
        threshold (float): Similarity threshold passed to MinHashLSH.
        LOG_FREQUENCY (int): Currently unused; kept so existing callers that pass
            it keep working.

    Returns:
        DatasetDict: The same splits with dropped rows removed and the original columns.

    Raises:
        ValueError: If `n` is smaller than 1.
    """
    import re  # local import keeps the function self-contained

    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    # Same tokenisation as CountVectorizer(analyzer='word', token_pattern=r'\b\w+\b',
    # ngram_range=(n, n)) with its default lower-casing, but without building and
    # fitting a vectorizer for every single row.
    word_pattern = re.compile(r'\b\w+\b')

    def generate_ngrams(text, n):
        """Return the set of lower-cased word n-grams of `text` (empty if fewer than n words)."""
        tokens = word_pattern.findall(text.lower())
        return {" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

    def get_minhash(ngrams, num_perm):
        """Create a MinHash signature from n-grams."""
        m = MinHash(num_perm=num_perm)
        for ngram in ngrams:
            m.update(ngram.encode('utf8'))
        return m

    # One index for all splits, so duplicates are detected across splits too
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

    def process_example(example, idx, split_name):
        """Flag a single example: `_keep` is False for empty, too-short or duplicate text."""
        global_id = f"{split_name}_{idx}"
        text = example.get(text_column, None)

        if not isinstance(text, str) or not text.strip():
            return {"_keep": False}

        ngrams = generate_ngrams(text, n)
        if len(ngrams) == 0:  # Fewer than n words: nothing to hash
            return {"_keep": False}

        m = get_minhash(ngrams, num_perm)

        # Only rows that are kept are inserted, so the first occurrence always wins
        if lsh.query(m):
            return {"_keep": False}
        lsh.insert(global_id, m)
        return {"_keep": True}

    # Deduplicate each split
    deduplicated_splits = {}
    for split_name, split_data in dataset_dict.items():
        print(f"Processing split: {split_name} (Size: {len(split_data)})")
        flagged_split = split_data.map(
            lambda example, idx: process_example(example, idx, split_name),
            with_indices=True,
            # process_example fills the LSH index as a side effect; a cached result
            # would skip that and leave later splits unchecked against this one.
            load_from_cache_file=False,
        )
        deduplicated_split = flagged_split.filter(lambda x: x["_keep"])

        deduplicated_splits[split_name] = deduplicated_split.remove_columns(["_keep"])

    print("[INFO] Deduplication complete.")
    return DatasetDict(deduplicated_splits)


In [87]:
LSH_THRESHOLD = 0.75
NUM_PERM = 128
N_GRAM = 3
text_column ='text'
LOG_FREQUENCY = 1000

In [88]:
dataset_in_hub

DatasetDict({
    train: Dataset({
        features: ['text', 'dataset_source', 'metadata'],
        num_rows: 71999640
    })
    test: Dataset({
        features: ['text', 'dataset_source', 'metadata'],
        num_rows: 406331
    })
})

In [89]:
# Run MinHash deduplication on the dataset loaded from the hub, using the
# parameters defined in the configuration cell above.
deduplicated_dataset = run_minhash_deduplication(dataset_dict=dataset_in_hub, 
                                                text_column=text_column, 
                                                 n=N_GRAM, 
                                                 num_perm=NUM_PERM, 
                                                 threshold=LSH_THRESHOLD,
                                                 LOG_FREQUENCY=LOG_FREQUENCY)

Processing split: train (Size: 71999640)


Map:   0%|          | 120/71999640 [00:00<34:23:00, 581.67 examples/s]

Map:   1%|          | 362046/71999640 [11:07<36:42:39, 542.05 examples/s] 


KeyboardInterrupt: 

In [39]:
deduplicated_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 1072
    })
})