# core

> This module contains the core functionality of the library: the `Pipeline` class, which runs filters and cleaners over a collection of datasources.

In [None]:
#| default_exp core

In [None]:
#| export
import logging
import os

from datasets import concatenate_datasets, Dataset
from rich.logging import RichHandler

# Module-level logger for this library
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Emit records through rich's logging handler, with rich tracebacks enabled
logger.addHandler(RichHandler(rich_tracebacks=True))
# Quiet the `datasets` logger: only ERROR-and-above records get through
logging.getLogger("datasets").setLevel(logging.ERROR)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
from datasets import load_dataset
from squeakily.filter import check_char_repetition, check_flagged_words, minhash_dedup
from squeakily.clean import remove_empty_lines, normalize_whitespace

In [None]:
#| export
class Pipeline:
    """
    A pipeline is a collection of datasources and their associated transformations to be run.

    Each datasource is a dict. `run` reads the keys `"dataset"`, `"columns"`
    (only the first column is processed), `"filters"` and `"cleaners"`, and
    writes the transformed dataset back under `"dataset"`.
    """
    def __init__(
        self,
        datasources # The datasources to be run
    ):
        self.datasources = datasources

    @staticmethod
    def _run_cleaners(
        dataset, # The dataset to clean
        column, # The name of the column the cleaners are applied to
        cleaners, # Callables mapping a column value to its cleaned value
    ):
        """
        Apply each cleaner in order to `column` and return the resulting dataset.
        """
        for c in cleaners:
            logger.info(f"Running cleaner: {c.__name__} on {column}")
            dataset = dataset.map(
                lambda x: {column: c(x[column])},
                num_proc=os.cpu_count(),
            )
        return dataset

    @staticmethod
    def _run_filters(
        dataset, # The dataset to filter
        column, # The name of the column the filters are applied to
        filters, # Callables mapping a column value to a truthy (keep) / falsy (drop) result
    ):
        """
        Apply each filter in order to `column` and return the resulting dataset.
        """
        for f in filters:
            logger.info(f"Running filter: {f.__name__} on {column}")
            dataset = dataset.filter(
                lambda x: f(x[column]),
                num_proc=os.cpu_count(),
            )
        return dataset

    def run(
        self,
        global_filters=[], # Filters to be run at the dataset level rather than the example level
        global_cleaners=[], # Cleaners to be run at the dataset level rather than the example level
        cleaning_first=False, # Whether to run the cleaning transformations first
        globals_first=False, # Whether to run the global transformations first
    ):
        """
        Run the pipeline.

        The example-level filters and cleaners of every datasource are applied
        first and the result is stored back in the datasource. If any
        `global_filters` are given, all datasets are then concatenated, passed
        through each global filter as `f(dataset, column)`, and split back into
        their datasources using an added `meta_data` column that holds each
        row's originating `builder_name`.

        Note: `global_cleaners` and `globals_first` are accepted but are not
        used by this implementation. The default lists are never mutated.
        """
        for datasource in self.datasources:
            dataset = datasource["dataset"]
            column = datasource["columns"][0]
            logger.info(f"Running datasource: {dataset.builder_name}")
            if cleaning_first:
                dataset = self._run_cleaners(dataset, column, datasource["cleaners"])
                dataset = self._run_filters(dataset, column, datasource["filters"])
            else:
                dataset = self._run_filters(dataset, column, datasource["filters"])
                dataset = self._run_cleaners(dataset, column, datasource["cleaners"])
            # Persist the result; without this the transformations above are
            # silently discarded and later stages see the untouched dataset.
            datasource["dataset"] = dataset

        # Nothing to do globally without filters or without any datasource
        if global_filters and self.datasources:
            # concatenate all datasets
            datasets = [d["dataset"] for d in self.datasources]
            global_column = self.datasources[0]["columns"][0]
            global_dataset = concatenate_datasets(datasets)

            # Add a column representing the original dataset name
            md = []
            for d in datasets:
                md.extend([d.builder_name] * len(d))
            meta_data = Dataset.from_dict({"meta_data": md})
            global_dataset_with_meta = concatenate_datasets([global_dataset, meta_data], axis=1)

            # Run the global filters
            for f in global_filters:
                logger.info(f"Running global filter: {f.__name__}")
                global_dataset_with_meta = f(global_dataset_with_meta, global_column)

            # Split the dataset back up
            for i, d in enumerate(datasets):
                # Capture only the name so the closure does not hold the whole dataset
                source_name = d.builder_name
                self.datasources[i]["dataset"] = global_dataset_with_meta.filter(
                    lambda x: x["meta_data"] == source_name,
                    num_proc=os.cpu_count(),
                )

In [None]:
#|echo: true
# Render the documentation for `Pipeline.run`
show_doc(Pipeline.run)

---

[source](https://github.com/CarperAI/squeakily/blob/main/squeakily/core.py#L30){target="_blank" style="float:right; font-size:smaller"}

### Pipeline.run

>      Pipeline.run (global_filters=[], global_cleaners=[],
>                    cleaning_first=False, globals_first=False)

Run the pipeline.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| global_filters | list | [] | Filters to be run at the dataset level rather than the example level |
| global_cleaners | list | [] | Cleaners to be run at the dataset level rather than the example level |
| cleaning_first | bool | False | Whether to run the cleaning transformations first |
| globals_first | bool | False | Whether to run the global transformations first |

In [None]:
# Example: load a small slice (`train[:1%]`) of WikiText-103 to run the pipeline on
ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]")
logger.info(f"Original dataset size: {len(ds)}")
# Each datasource pairs a dataset with the column to process and the
# example-level filters and cleaners to apply to that column
datasources = [
    {
        "dataset": ds,
        "columns": ["text"],
        "filters": [check_char_repetition, check_flagged_words],
        "cleaners": [remove_empty_lines, normalize_whitespace],
    },
    # ...
]

# Global filters are called on the concatenated dataset rather than on single examples
global_filters = [minhash_dedup]
pipeline = Pipeline(datasources)
pipeline.run(global_filters=global_filters)
logger.info(f"Final dataset size: {len(pipeline.datasources[0]['dataset'])}")

Fingerprinting... #1:   0%|          | 0/251 [00:00<?, ?ex/s]
[A

Fingerprinting... #3:   0%|          | 0/251 [00:00<?, ?ex/s]


Fingerprinting... #4:   0%|          | 0/251 [00:00<?, ?ex/s]



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






Fingerprinting... #0:  29%|██▉       | 74/251 [00:00<00:00, 735.55ex/s]







[A[A[A[A[A[A[A[A
[A








[A[A[A[A[A[A[A[A[A

[A[A









[A[A[A[A[A[A[A[A[A[A


[A[A[A



[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A






[A[A[A[A[A[A[A












Fingerprinting... #0:  65%|██████▌   | 164/251 [00:00<00:00, 831.33ex/s]







[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
[A








[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

[A[A


[A[A[A


Indexing signatures...: 100%|██████████| 18014/18014 [00:01<00:00, 9486.98it/s] 









[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


[A[A[A

Querying... #0:   0%|          | 0/251 [00:00<?, ?ex/s]



[A[A[A[A
[A








[A[A[A[A[A[A[A[A[A





[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A






[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


[A[A[A

[A[A



Querying... #0:  10%|█         | 26/251 [00:00<00:00, 244.72ex/s]
[A








[A[A[A[A[A[A[A[A[A





[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A






[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

[A[A


Querying... #0:  23%|██▎       | 58/251 [00:00<00:00, 280.10ex/s]



[A[A[A[A
[A





[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A

Finding duplicates... #0:   0%|          | 0/1 [00:00<?, ?ba/s]
[A

Finding duplicates... #3:   0%|          | 0/1 [00:00<?, ?ba/s]


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A









Finding duplicates... #0: 100%|██████████| 1/1 [00:00<00:00,  4.65ba/s]











[A[A[A[A[A[A[A[A[A[A[A











Finding duplicates... #13:   0%|          | 0/1 [00:00<?, ?ba/s]
Finding duplicates... #2: 100%|██████████| 1/1 [00:00<00:00,  4.49ba/s]













Finding duplicates... #1: 100%|██████████| 1/1 [00:00<00:00,  3.75ba/s]














[A[A[A[A[A[A[A[A[A[A[A[A[A[A


Finding duplicates... #4: 100%|██████████| 1/1 [00:00<00:00,  4.53ba/s]


Finding duplicates... #3: 100%|██████████| 1/1 [00:00<00:00,  3.95ba/s]





Finding duplicates... #5: 100%|██████████| 1/1 [00:00<00:00,  4.52ba/s][A[A[A[A














[A[A[A[A[A[A[A[A[A[A[

In [None]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the module named by `#| default_exp`)
import nbdev; nbdev.nbdev_export()