# core

> This module contains all the core functions used in the library.

In [None]:
#| default_exp core

In [None]:
#| export
import logging
import os

from datasets import concatenate_datasets, Dataset
from rich.logging import RichHandler

# Package-level logger: INFO and above, rendered through rich.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Attach the rich handler only once. Re-running this cell (or reloading the
# module) would otherwise stack a new handler each time and print every
# message repeatedly.
if not any(isinstance(handler, RichHandler) for handler in logger.handlers):
    logger.addHandler(RichHandler(rich_tracebacks=True))
# Turn off logging for datasets
logging.getLogger("datasets").setLevel(logging.ERROR)

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
from datasets import load_dataset
from squeakily.filter import check_char_repetition, check_flagged_words, minhash_dedup
from squeakily.clean import remove_empty_lines, normalize_whitespace

In [None]:
#| export
class Pipeline:
    """
    A pipeline is a collection of datasources and their associated transformations to be run.

    Each datasource is a dict. `run` reads the keys `"dataset"`, `"columns"`
    (only the first column is processed), `"filters"`, `"cleaners"` and the
    optional boolean `"skip_global"`.
    """
    def __init__(
        self,
        datasources # The datasources to be run
    ):
        self.datasources = datasources

    @staticmethod
    def _apply_cleaners(dataset, cleaners, column):
        "Map every cleaner, in order, over `column` and return the resulting dataset."
        for cleaner in cleaners:
            name = cleaner.__name__
            logger.info(f"Running cleaner: {name} on {column}")
            dataset = dataset.map(
                lambda x: {column: cleaner(x[column])},
                num_proc=os.cpu_count(),
            )
        return dataset

    @staticmethod
    def _apply_filters(dataset, filters, column, dry_run):
        "Apply every filter, in order, to `column` (or only record its criteria when `dry_run`)."
        for filter_fn in filters:
            name = filter_fn.__name__
            logger.info(f"Running filter: {name} on {column}")
            if dry_run:
                logger.info("Running in dry-run mode")
                # Keep every row; store what the filter reports in `<name>_criteria`.
                dataset = dataset.map(
                    lambda x: {f"{name}_criteria": filter_fn(x[column], dry_run=True)},
                    num_proc=os.cpu_count(),
                )
            else:
                dataset = dataset.filter(
                    lambda x: filter_fn(x[column]),
                    num_proc=os.cpu_count(),
                )
        return dataset

    def _run_global_filters(self, global_filters, dry_run):
        "Run dataset-level filters over the concatenation of all datasources not marked `skip_global`."
        # Keep the datasource dicts themselves (not just their datasets) so the
        # results are written back to the right entries even when some
        # datasources are skipped.
        participating = [
            d for d in self.datasources
            if not d.get("skip_global", False)
        ]
        if not participating:
            logger.warning("No datasources eligible for global filters; skipping them")
            return

        # concatenate all datasets
        datasets = [d["dataset"] for d in participating]
        global_column = participating[0]["columns"][0]
        global_dataset = concatenate_datasets(datasets)

        # Add a column representing the original dataset name
        md = []
        for d in datasets:
            md.extend([d.builder_name] * len(d))
        meta_data = Dataset.from_dict({"meta_data": md})
        global_dataset_with_meta = concatenate_datasets([global_dataset, meta_data], axis=1)

        # Run the global filters
        for f in global_filters:
            logger.info(f"Running global filter: {f.__name__}")
            global_dataset_with_meta = f(global_dataset_with_meta, global_column, dry_run=dry_run)

        # Split the dataset back up
        # NOTE(review): rows are matched on `builder_name`, so two participating
        # datasources sharing a builder name would each receive the rows of both.
        for source, dataset in zip(participating, datasets):
            # Capture only the name so the closure does not drag the whole
            # dataset object along with it.
            builder_name = dataset.builder_name
            source["dataset"] = global_dataset_with_meta.filter(
                lambda x: x["meta_data"] == builder_name,
                num_proc=os.cpu_count(),
            )

    def run(
        self,
        global_filters=[],      # Filters to be run at the dataset level rather than the example level
        global_cleaners=[],     # Cleaners to be run at the dataset level rather than the example level
        cleaning_first=False,   # Whether to run the cleaning transformations first
        globals_first=False,    # Whether to run the global transformations first
        dry_run=False,          # Whether to run the pipeline or only calculate the various criteria and add as a column
    ):
        """
        Run the pipeline.

        Every datasource's `"dataset"` entry is replaced in place with the
        processed dataset. Per-example cleaners and filters run first, then
        `global_filters` over all datasources not marked `skip_global`.
        `global_cleaners` and `globals_first` are accepted but are not used by
        this implementation. The list defaults are never mutated.
        """
        for source in self.datasources:
            column = source["columns"][0]
            logger.info(f"Running datasource: {source['dataset'].builder_name}")
            dataset = source["dataset"]
            if cleaning_first:
                dataset = self._apply_cleaners(dataset, source["cleaners"], column)
                dataset = self._apply_filters(dataset, source["filters"], column, dry_run)
            else:
                dataset = self._apply_filters(dataset, source["filters"], column, dry_run)
                dataset = self._apply_cleaners(dataset, source["cleaners"], column)
            source["dataset"] = dataset

        if global_filters:
            self._run_global_filters(global_filters, dry_run)

In [None]:
#|echo: true
# Render the generated API documentation for `Pipeline.run` below this cell.
show_doc(Pipeline.run)

---

[source](https://github.com/CarperAI/squeakily/blob/main/squeakily/core.py#L30){target="_blank" style="float:right; font-size:smaller"}

### Pipeline.run

>      Pipeline.run (global_filters=[], global_cleaners=[],
>                    cleaning_first=False, globals_first=False, dry_run=False)

Run the pipeline.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| global_filters | list | [] | Filters to be run at the dataset level rather than the example level |
| global_cleaners | list | [] | Cleaners to be run at the dataset level rather than the example level |
| cleaning_first | bool | False | Whether to run the cleaning transformations first |
| globals_first | bool | False | Whether to run the global transformations first |
| dry_run | bool | False | Whether to run the pipeline or only calculate the various criteria and add as a column |

In [None]:
# test dry run
# In dry-run mode no rows are dropped: each filter only adds a column
# describing what it would have done.
ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]")
logger.info(f"Original dataset size: {len(ds)}")
datasources = [
    dict(
        dataset=ds,
        columns=["text"],
        filters=[check_char_repetition, check_flagged_words],
        cleaners=[remove_empty_lines, normalize_whitespace],
    ),
    # ...
]

pipeline = Pipeline(datasources)
pipeline.run(dry_run=True, global_filters=[minhash_dedup])

dry_run_result = pipeline.datasources[0]["dataset"]
assert len(ds) == len(dry_run_result)
# Per-example criteria columns, then the columns added by the global pass.
for expected_column in (
    "check_char_repetition_criteria",
    "check_flagged_words_criteria",
    "duplicate",
    "meta_data",
    "__id__",
):
    assert expected_column in dry_run_result.features

                                

                                

                                

                                

                                 

Adding index... #0:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #1:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #2:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #3:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #4:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #5:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #6:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #7:   0%|          | 0/563 [00:00<?, ?ex/s]

   

Adding index... #8:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #9:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #10:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #11:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #12:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #13:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #14:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #15:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #16:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #17:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #18:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #19:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #20:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #21:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #22:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #23:   0%|          | 0/563 [00:00<?, ?ex/s]

   

Adding index... #26:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #24:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #27:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #25:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #28:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #29:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #30:   0%|          | 0/562 [00:00<?, ?ex/s]

 

Adding index... #31:   0%|          | 0/562 [00:00<?, ?ex/s]

                                                         

Fingerprinting... #3:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #2:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #8:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Fingerprinting... #7:   0%|          | 0/563 [00:00<?, ?ex/s]

   

Fingerprinting... #4:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #11:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #5:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Fingerprinting... #12:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #0:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #6:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #10:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #13:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #1:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #9:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Fingerprinting... #19:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #15:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #14:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #22:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #21:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #17:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Fingerprinting... #28:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #16:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #18:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #30:   0%|          | 0/562 [00:00<?, ?ex/s]

Fingerprinting... #26:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #27:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #29:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #20:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #23:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #25:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #31:   0%|          | 0/562 [00:00<?, ?ex/s]

Fingerprinting... #24:   0%|          | 0/563 [00:00<?, ?ex/s]

Indexing signatures...:   0%|          | 0/18014 [00:00<?, ?it/s]

                                        

Querying... #1:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Querying... #3:   0%|          | 0/563 [00:00<?, ?ex/s]

     

Querying... #8:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #10:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #4:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #9:   0%|          | 0/563 [00:00<?, ?ex/s]

   

Querying... #29:   0%|          | 0/563 [00:00<?, ?ex/s]

   

Querying... #30:   0%|          | 0/562 [00:00<?, ?ex/s]

Querying... #17:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Querying... #25:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #7:   0%|          | 0/563 [00:00<?, ?ex/s]

   

Querying... #11:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #0:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Querying... #19:   0%|          | 0/563 [00:00<?, ?ex/s]

     

Querying... #21:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #23:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #20:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #18:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #13:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #12:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #27:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #6:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #24:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #5:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #16:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #14:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #31:   0%|          | 0/562 [00:00<?, ?ex/s]

Querying... #26:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #15:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #2:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #22:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Querying... #28:   0%|          | 0/563 [00:00<?, ?ex/s]

                                   

Finding duplicates... #0:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #2:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Finding duplicates... #3:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #12:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #14:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #15:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #17:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Finding duplicates... #18:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Finding duplicates... #19:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #20:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #21:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #22:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #23:   0%|          | 0/1 [00:00<?, ?ba/s]

   

Finding duplicates... #26:   0%|          | 0/1 [00:00<?, ?ba/s]

    

Finding duplicates... #27:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #28:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #25:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #24:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #31:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #29:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #30:   0%|          | 0/1 [00:00<?, ?ba/s]

Constructing graph...:   0%|          | 0/7757 [00:00<?, ?it/s]

Iterating over components...:   0%|          | 0/10560 [00:00<?, ?it/s]

                                                            

Labeling duplicates... #0:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #4:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #1:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Labeling duplicates... #5:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #3:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Labeling duplicates... #8:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #6:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #2:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #7:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #9:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #14:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #11:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #10:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #17:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #20:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #19:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #22:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #21:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #18:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #28:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Labeling duplicates... #23:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #27:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #30:   0%|          | 0/562 [00:00<?, ?ex/s]

Labeling duplicates... #16:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #12:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #15:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #26:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #13:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #31:   0%|          | 0/562 [00:00<?, ?ex/s]

Labeling duplicates... #24:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #29:   0%|          | 0/563 [00:00<?, ?ex/s]

Labeling duplicates... #25:   0%|          | 0/563 [00:00<?, ?ex/s]

                                  

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# test dry run with partials
from functools import partial

ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]")
logger.info(f"Original dataset size: {len(ds)}")
# A `partial` object has no `__name__`, which the pipeline reads to label the
# filter, so one is assigned by hand.
check_char_repetition_p = partial(check_char_repetition, char_repetition_len=3)
check_char_repetition_p.__name__ = "check_char_repetition"
datasources = [
    dict(
        dataset=ds,
        columns=["text"],
        filters=[check_char_repetition_p, check_flagged_words],
        cleaners=[remove_empty_lines, normalize_whitespace],
    ),
    # ...
]

pipeline = Pipeline(datasources)
pipeline.run(dry_run=True)

partial_result = pipeline.datasources[0]["dataset"]
assert len(ds) == len(partial_result)
for expected_column in (
    "check_char_repetition_criteria",
    "check_flagged_words_criteria",
):
    assert expected_column in partial_result.features

                                    

#0:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/563 [00:00<?, ?ex/s]

#2:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#4:   0%|          | 0/563 [00:00<?, ?ex/s]

#3:   0%|          | 0/563 [00:00<?, ?ex/s]

#5:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#6:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#7:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#8:   0%|          | 0/563 [00:00<?, ?ex/s]

#9:   0%|          | 0/563 [00:00<?, ?ex/s]

#10:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#11:   0%|          | 0/563 [00:00<?, ?ex/s]

#12:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#13:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#14:   0%|          | 0/563 [00:00<?, ?ex/s]

#15:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#16:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#17:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#18:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#19:   0%|          | 0/563 [00:00<?, ?ex/s]

#20:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#21:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#22:   0%|          | 0/563 [00:00<?, ?ex/s]

#23:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#24:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#26:   0%|          | 0/563 [00:00<?, ?ex/s]

#25:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#27:   0%|          | 0/563 [00:00<?, ?ex/s]

#28:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#29:   0%|          | 0/563 [00:00<?, ?ex/s]

#30:   0%|          | 0/562 [00:00<?, ?ex/s]

#31:   0%|          | 0/562 [00:00<?, ?ex/s]

                                   

#0:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/563 [00:00<?, ?ex/s]

#4:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#8:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#9:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#10:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#11:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#12:   0%|          | 0/563 [00:00<?, ?ex/s]

#13:   0%|          | 0/563 [00:00<?, ?ex/s]

#14:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#15:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#16:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#17:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#18:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#19:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#20:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#21:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#22:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#23:   0%|          | 0/563 [00:00<?, ?ex/s]

#24:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#25:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#26:   0%|          | 0/563 [00:00<?, ?ex/s]

#27:   0%|          | 0/563 [00:00<?, ?ex/s]

   

#28:   0%|          | 0/563 [00:00<?, ?ex/s]

#29:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#30:   0%|          | 0/562 [00:00<?, ?ex/s]

#31:   0%|          | 0/562 [00:00<?, ?ex/s]

                                  

#0:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/563 [00:00<?, ?ex/s]

#5:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#8:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#9:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#10:   0%|          | 0/563 [00:00<?, ?ex/s]

#11:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#12:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#13:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#14:   0%|          | 0/563 [00:00<?, ?ex/s]

#15:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#16:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#17:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#18:   0%|          | 0/563 [00:00<?, ?ex/s]

#19:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#20:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#21:   0%|          | 0/563 [00:00<?, ?ex/s]

#22:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#23:   0%|          | 0/563 [00:00<?, ?ex/s]

#24:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#25:   0%|          | 0/563 [00:00<?, ?ex/s]

#26:   0%|          | 0/563 [00:00<?, ?ex/s]

    

#27:   0%|          | 0/563 [00:00<?, ?ex/s]

#28:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#29:   0%|          | 0/563 [00:00<?, ?ex/s]

#30:   0%|          | 0/562 [00:00<?, ?ex/s]

#31:   0%|          | 0/562 [00:00<?, ?ex/s]

                                   

#0:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/563 [00:00<?, ?ex/s]

#2:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/563 [00:00<?, ?ex/s]

#6:   0%|          | 0/563 [00:00<?, ?ex/s]

   

#7:   0%|          | 0/563 [00:00<?, ?ex/s]

#8:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#9:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#10:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#11:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#12:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#13:   0%|          | 0/563 [00:00<?, ?ex/s]

#14:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#15:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#16:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#17:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#18:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#19:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#20:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#21:   0%|          | 0/563 [00:00<?, ?ex/s]

#22:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#23:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#24:   0%|          | 0/563 [00:00<?, ?ex/s]

#25:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#26:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#27:   0%|          | 0/563 [00:00<?, ?ex/s]

#28:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#29:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#30:   0%|          | 0/562 [00:00<?, ?ex/s]

#31:   0%|          | 0/562 [00:00<?, ?ex/s]

In [None]:
# Full (non dry-run) pass: the filters and the global dedup actually drop rows.
ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]")
logger.info(f"Original dataset size: {len(ds)}")
datasources = [
    dict(
        dataset=ds,
        columns=["text"],
        filters=[check_char_repetition, check_flagged_words],
        cleaners=[remove_empty_lines, normalize_whitespace],
    ),
    # ...
]

global_filters = [minhash_dedup]
pipeline = Pipeline(datasources)
pipeline.run(global_filters=global_filters)
final_dataset = pipeline.datasources[0]["dataset"]
logger.info(f"Final dataset size: {len(final_dataset)}")

# The processed dataset must be strictly smaller than the input.
assert len(final_dataset) < len(ds)

                                

                                

                                

                                   

#0:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/563 [00:00<?, ?ex/s]

#6:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#8:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#9:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#10:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#11:   0%|          | 0/563 [00:00<?, ?ex/s]

#12:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#13:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#14:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#15:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#16:   0%|          | 0/563 [00:00<?, ?ex/s]

   

#17:   0%|          | 0/563 [00:00<?, ?ex/s]

#18:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#20:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#19:   0%|          | 0/563 [00:00<?, ?ex/s]

#21:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#22:   0%|          | 0/563 [00:00<?, ?ex/s]

  

#23:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#24:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#25:   0%|          | 0/563 [00:00<?, ?ex/s]

#26:   0%|          | 0/563 [00:00<?, ?ex/s]

 

#27:   0%|          | 0/562 [00:00<?, ?ex/s]

 

#28:   0%|          | 0/562 [00:00<?, ?ex/s]

 

#29:   0%|          | 0/562 [00:00<?, ?ex/s]

 

#30:   0%|          | 0/562 [00:00<?, ?ex/s]

#31:   0%|          | 0/562 [00:00<?, ?ex/s]

                                  

Adding index... #0:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #1:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #2:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #3:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #4:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #5:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #6:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #7:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #8:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #9:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #10:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #11:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #12:   0%|          | 0/563 [00:00<?, ?ex/s]

   

Adding index... #13:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #14:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #15:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #16:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #17:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #18:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #19:   0%|          | 0/563 [00:00<?, ?ex/s]

   

Adding index... #21:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #20:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #22:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #23:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #24:   0%|          | 0/563 [00:00<?, ?ex/s]

Adding index... #25:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Adding index... #26:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Adding index... #27:   0%|          | 0/562 [00:00<?, ?ex/s]

 

Adding index... #28:   0%|          | 0/562 [00:00<?, ?ex/s]

 

Adding index... #29:   0%|          | 0/562 [00:00<?, ?ex/s]

 

Adding index... #30:   0%|          | 0/562 [00:00<?, ?ex/s]

Adding index... #31:   0%|          | 0/562 [00:00<?, ?ex/s]

                                                             

Fingerprinting... #0:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #2:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Fingerprinting... #4:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Fingerprinting... #8:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #3:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #7:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #1:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #6:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #9:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #13:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #12:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #18:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #11:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #15:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #10:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #17:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #26:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #19:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #16:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #21:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #27:   0%|          | 0/562 [00:00<?, ?ex/s]

Fingerprinting... #22:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #5:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #28:   0%|          | 0/562 [00:00<?, ?ex/s]

Fingerprinting... #23:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #24:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #14:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #25:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #30:   0%|          | 0/562 [00:00<?, ?ex/s]

Fingerprinting... #29:   0%|          | 0/562 [00:00<?, ?ex/s]

Fingerprinting... #20:   0%|          | 0/563 [00:00<?, ?ex/s]

Fingerprinting... #31:   0%|          | 0/562 [00:00<?, ?ex/s]

Indexing signatures...:   0%|          | 0/18011 [00:00<?, ?it/s]

                                                

Querying... #11:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #9:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Querying... #4:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Querying... #13:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #0:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #8:   0%|          | 0/563 [00:00<?, ?ex/s]

  

Querying... #31:   0%|          | 0/562 [00:00<?, ?ex/s]

Querying... #28:   0%|          | 0/562 [00:00<?, ?ex/s]

 

Querying... #15:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #14:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #10:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #3:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Querying... #20:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #19:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #24:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Querying... #26:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Querying... #29:   0%|          | 0/562 [00:00<?, ?ex/s]

  

Querying... #30:   0%|          | 0/562 [00:00<?, ?ex/s]

 

Querying... #18:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #25:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Querying... #27:   0%|          | 0/562 [00:00<?, ?ex/s]

 

Querying... #16:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Querying... #2:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #1:   0%|          | 0/563 [00:00<?, ?ex/s]

 

Querying... #6:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #5:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #23:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #7:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #21:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #22:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #12:   0%|          | 0/563 [00:00<?, ?ex/s]

Querying... #17:   0%|          | 0/563 [00:00<?, ?ex/s]

                                   

Finding duplicates... #0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #4:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Finding duplicates... #5:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #9:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #12:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #14:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Finding duplicates... #15:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #16:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #17:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Finding duplicates... #18:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #20:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #21:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Finding duplicates... #22:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #23:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #24:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Finding duplicates... #25:   0%|          | 0/1 [00:00<?, ?ba/s]

     

Finding duplicates... #26:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #27:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #29:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #28:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #30:   0%|          | 0/1 [00:00<?, ?ba/s]

Finding duplicates... #31:   0%|          | 0/1 [00:00<?, ?ba/s]

Constructing graph...:   0%|          | 0/7757 [00:00<?, ?it/s]

Iterating over components...:   0%|          | 0/10557 [00:00<?, ?it/s]

                                                        

Filtering duplicates... #2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Filtering duplicates... #1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Filtering duplicates... #7:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Filtering duplicates... #6:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Filtering duplicates... #0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Filtering duplicates... #9:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #13:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #4:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #11:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #18:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #10:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #16:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #19:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #15:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #8:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #17:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #14:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #12:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #28:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #22:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #20:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #29:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #31:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #27:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #24:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #25:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #30:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #23:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #21:   0%|          | 0/1 [00:00<?, ?ba/s]

Filtering duplicates... #26:   0%|          | 0/1 [00:00<?, ?ba/s]

                                  

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Test that global filters can be skipped per datasource: the entry marked
# `skip_global=True` is expected to bypass them, so it should end up larger
# than the entry that goes through them.
ds_1 = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]")

# Both entries share the same column/filter/cleaner configuration and differ
# only in the dataset and the `skip_global` flag. The literals are evaluated
# once per entry, so each datasource gets its own fresh lists.
datasources = [
    {
        "dataset": dataset,
        "columns": ["text"],
        "filters": [check_char_repetition, check_flagged_words],
        "cleaners": [remove_empty_lines, normalize_whitespace],
        "skip_global": skip_global,
    }
    for dataset, skip_global in ((ds, False), (ds_1, True))
]

pipeline = Pipeline(datasources)
pipeline.run(global_filters=global_filters)
logger.info(f"Final dataset size: {len(pipeline.datasources[0]['dataset'])}")

# Index 0 was configured with skip_global=False, index 1 with skip_global=True.
with_global = pipeline.datasources[0]["dataset"]
without_global = pipeline.datasources[1]["dataset"]
assert len(without_global) > len(with_global)

                                

                                

                                

                                

                                

                                

                                

                                

                                                                

Indexing signatures...:   0%|          | 0/18011 [00:00<?, ?it/s]

                                                                

Constructing graph...:   0%|          | 0/7757 [00:00<?, ?it/s]

Iterating over components...:   0%|          | 0/10557 [00:00<?, ?it/s]

                                                                

In [None]:
#| hide
# Export the notebooks' `#| export` cells to the library's Python modules.
import nbdev

nbdev.nbdev_export()