In [None]:
#| hide
import logging

from squeakily.core import *

# Quiet the `datasets` library: only ERROR-level (and above) records get through
datasets_logger = logging.getLogger("datasets")
datasets_logger.setLevel(logging.ERROR)

  from .autonotebook import tqdm as notebook_tqdm


# Tutorial: Using another library

> This tutorial shows how to use another library with `squeakily`. We will use the [scrubadub](https://scrubadub.readthedocs.io/en/stable/index.html) library to remove personal information from text.

First off, we need to install the library.

```bash
pip install scrubadub
```

Now we will use the same (wikitext) dataset as in the previous tutorial.

In [None]:
from datasets import load_dataset

# Load only the first 1% of the wikitext-103 training split
ds = load_dataset(
    path="wikitext",
    name="wikitext-103-v1",
    split="train[:1%]",
)

We will use the `scrubadub` library to remove personal information from the text. `scrubadub` usually defaults to removing the following types:
* [credential](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-credentialdetector) - username and password combinations
* [credit_card](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-creditcarddetector) - credit card numbers
* [drivers_license](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-driverslicencedetector) - driver's license numbers
* [email](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-emaildetector) - email addresses
* [national_insurance_number](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-en-gb-nationalinsurancenumberdetector) - GB National Insurance numbers (NINOs)
* [phone](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-phonedetector) - phone numbers
* [postalcode](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-postalcodedetector) - British postal codes
* [social_security_number](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-en-us-socialsecuritynumberdetector) - US Social Security numbers (SSNs)
* [tax_reference_number](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-en-gb-taxreferencenumberdetector) - UK PAYE temporary reference number (TRN)
* [twitter](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-twitterdetector) - Twitter handles
* [url](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-urldetector) - URLs
* [vehicle_license_plate](https://scrubadub.readthedocs.io/en/stable/api_scrubadub_detectors.html#scrubadub-detectors-vehiclelicenceplatedetector) - British vehicle license plates

However, while experimenting with the library, we found that some of these do not appear to be enabled by default. Either way, we are only going to focus on the `credit_card`, `drivers_license`, `email`, `phone`, and `social_security_number` detectors. Therefore, we must turn the others off:

In [None]:
from scrubadub import Scrubber
from scrubadub.detectors import CredentialDetector, TwitterDetector, UrlDetector

# Start from a default Scrubber, then drop the detectors we do not want to apply
scrubber = Scrubber()
for unwanted_detector in (CredentialDetector, TwitterDetector, UrlDetector):
    scrubber.remove_detector(unwanted_detector)

# Describe the wikitext source: clean the "text" column with the scrubber, no filters
wikitext_source = {
    "dataset": ds,
    "name": "wikitext",
    "columns": ["text"],
    "filters": [],
    "cleaners": [scrubber.clean],
}

datasources = [
    wikitext_source,
    # ...
]

Essentially, any function that takes in a string and returns a string will work out of the box with `squeakily`. Luckily for us, a `scrubadub` `Scrubber` has a `clean` method that does just that. We can use this method to remove personal information from the text!

A similar process can be used for filters, except that a filter function returns a `bool` (denoting whether or not the text should be kept) instead of a `str`.

:::{.callout-note}
Note: If you want to mix and match, it is super easy!

```python
from squeakily.clean import remove_empty_lines, remove_ip
datasources = [
    {
        "dataset": ds,
        "name": "wikitext",
        "columns": ["text"],
        "filters": [],
        "cleaners": [scrubber.clean, remove_empty_lines, remove_ip],
    },
    # ...
]
```
:::

Now we can process the `datasources` as before with a `Pipeline` object.

In [None]:
#|output: false
from squeakily.core import Pipeline

# Build a pipeline over the datasources configured earlier in this notebook
pipeline = Pipeline(datasources)
# Execute the pipeline.
# NOTE(review): presumably this applies each source's filters and cleaners to
# its listed columns — confirm against the squeakily documentation.
pipeline.run()

#0:   0%|          | 0/251 [00:00<?, ?ex/s]
[A

[A[A


[A[A[A



#0: 100%|██████████| 251/251 [00:00<00:00, 3072.01ex/s]





#6:   0%|          | 0/251 [00:00<?, ?ex/s]





#1: 100%|██████████| 251/251 [00:00<00:00, 2612.54ex/s]
#2: 100%|██████████| 251/251 [00:00<00:00, 2855.57ex/s]







#3: 100%|██████████| 251/251 [00:00<00:00, 2935.28ex/s]
#4: 100%|██████████| 251/251 [00:00<00:00, 3264.68ex/s]








[A[A[A[A[A[A[A[A








#10:   0%|          | 0/251 [00:00<?, ?ex/s]



[A[A[A[A









#5: 100%|██████████| 251/251 [00:00<00:00, 2389.82ex/s]
#6: 100%|██████████| 251/251 [00:00<00:00, 2589.32ex/s]











[A[A[A[A[A[A[A[A[A[A[A





[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A






#7: 100%|██████████| 251/251 [00:00<00:00, 2034.34ex/s]
#9: 100%|██████████| 251/251 [00:00<00:00, 2617.65ex/s]













#11: 100%|██████████| 251/251 [00:00<00:00, 3306.24ex/s]
#8: 100%|██████████| 251/251 [00:00<00:00, 1814.93ex/s]




