# clean

> This module contains all the various cleaning options supported.

In [None]:
#| default_exp clean

In [None]:
#| export
import re
from faker import Faker

fake = Faker()

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
# From: https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L95
whitespace={
    " ",
    " ",
    " ",
    " ",
    " ",
    "　",
    " ",
    " ",
    " ",
    " ",
    "￼",
    "",
}

def normalize_whitespace(
    text: str,  # The text to normalize
) -> str:       # The normalized text
    """
    Map every character listed in `whitespace` to a plain ASCII space.
    All other characters are passed through unchanged.
    """
    return "".join(" " if char in whitespace else char for char in text)

In [None]:
# test the normalize_whitespace function
assert normalize_whitespace("a b c d e　f g h i￼jk") == "a b c d e f g h i j k"

In [None]:
#| export
# Lookup table: unicode punctuation character -> ASCII replacement.
# A replacement may be longer than one character (e.g. the ellipsis).
unicode_punctuation = {
    "，": ",",
    "。": ".",
    "、": ",",
    "„": '"',
    "”": '"',
    "“": '"',
    "«": '"',
    "»": '"',
    # NOTE(review): this key is a fullwidth digit one, not a quotation mark;
    # the mapping looks unintended but is kept as-is - confirm before changing.
    "１": '"',
    "」": '"',
    "「": '"',
    "《": '"',
    "》": '"',
    "´": "'",
    "∶": ":",
    "：": ":",
    "？": "?",
    "！": "!",
    "（": "(",
    "）": ")",
    "；": ";",
    "–": "-",
    "—": " - ",
    "．": ". ",
    "～": "~",
    "’": "'",
    "…": "...",
    "━": "-",
    "〈": "<",
    "〉": ">",
    "【": "[",
    "】": "]",
    "％": "%",
    "►": "-",
}

def normalize_punctuation(
    text: str,  # The text to normalize
) -> str:       # The normalized text
    """
    Map each unicode punctuation character in the text to its ASCII counterpart.
    Characters without an entry in `unicode_punctuation` are kept unchanged.
    """
    # The translation table is derived on each call so that later edits to
    # `unicode_punctuation` are still honored.
    table = str.maketrans(unicode_punctuation)
    return text.translate(table)

In [None]:
# test the normalize_punctuation function
# `text` lists every key of `unicode_punctuation`, in declaration order
text = "，。、„”“«»１」「《》´∶：？！（）；–—．～’…━〈〉【】％►"
expected = ",.,\"\"\"\"\"\"\"\"\"\"\'::?!();- - . ~\'...-<>[]%-"

assert normalize_punctuation(text) == expected

In [None]:
#| export
def remove_empty_lines(
    text: str,  # The text to remove empty lines from
) -> str:       # The text with empty lines removed
    """
    Drop every line that is empty or made up only of whitespace.
    The remaining lines are joined back together with a newline character.
    Solution from https://stackoverflow.com/a/3711884/5768407
    """
    kept = [
        line
        for line in text.splitlines()
        if re.match(r'^\s*$', line) is None  # keep lines with visible content
    ]
    return "\n".join(kept)

In [None]:
# test the remove_empty_lines function
# the blank line sits at the start, in the middle and at the end respectively
starts_with_newline, multiple_newlines, ends_with_newline = "\nfoo\nbar", "foo\n\nbar", "foo\nbar\n"

for sample in (starts_with_newline, multiple_newlines, ends_with_newline):
    assert remove_empty_lines(sample) == "foo\nbar"

In [None]:
#| export
def replace_urls(
    text: str,                              # The text to replace URLs in
    dummy: str = "https://example.com/",    # The dummy text to replace URLs with
) -> str:                                   # The text with URLs replaced
    """
    Replace every http(s) URL in the text with a dummy.
    A URL is `http://` or `https://` plus the run of non-whitespace characters that follows it.
    """
    # Require the "://" separator so that ordinary words that merely start
    # with "http" (e.g. "httpd", "http-equiv") are left untouched.
    # A callable replacement makes `re.sub` insert `dummy` verbatim; a plain
    # string would have its backslashes processed as escapes/group references.
    return re.sub(r"https?://\S+", lambda _match: dummy, text)

In [None]:
# test the replace_urls function
# the URL is swapped for the default dummy wherever it sits in the text
url_after_space, url_before_space = "foo http://bar.com", "http://foo.com bar"
for sample, expected in [
    (url_after_space, "foo https://example.com/"),
    (url_before_space, "https://example.com/ bar"),
]:
    assert replace_urls(sample) == expected

In [None]:
#| export
def replace_dates(
    text: str,                  # The text to replace dates in
    dummy: str = fake.date(),   # The dummy text to replace dates with
) -> str:                       # The text with dates replaced
    """
    Replace dates in the text with a dummy.
    Only slash-separated dates made of one or two digits, one or two digits and four digits (e.g. `1/1/2020`) are recognized.
    """
    # NOTE: the default `dummy` is generated once, when the function is
    # defined, so every call that omits it reuses the same fake date.
    # A callable replacement makes `re.sub` insert `dummy` verbatim; a plain
    # string would have its backslashes processed as escapes/group references.
    return re.sub(r'\d{1,2}/\d{1,2}/\d{4}', lambda _match: dummy, text)

In [None]:
# test the replace_dates function
# the date is swapped for the given dummy; the surrounding text is kept
date_after_space, date_before_space = "foo 1/1/2020", "1/1/2020 bar"
replacement = "1/1/1970"
assert replace_dates(date_after_space, replacement) == f"foo {replacement}"
assert replace_dates(date_before_space, replacement) == f"{replacement} bar"

## PII Removal

Currently, we support the following PII removal options:

  * `replace_email`
  * `replace_phone`
  * `replace_ip`
  * `replace_credit_card`
  * `replace_ssn`

However, for emails, phone numbers, credit cards, and SSNs, we recommend using the [scrubadub](https://scrubadub.readthedocs.io/en/stable/index.html) library.

In [None]:
#| export
def replace_email(
    text: str,                          # The text to replace email addresses in
    dummy: str = fake.email(),          # The dummy text to replace email addresses with
) -> str:                               # The text with email addresses replaced
    """
    Replace email addresses in the text with a dummy.
    An address is any run of word characters, dots and hyphens on both sides of an `@`.
    """
    # NOTE: the default `dummy` is generated once, when the function is
    # defined, so every call that omits it reuses the same fake address.
    # A callable replacement makes `re.sub` insert `dummy` verbatim; a plain
    # string would have its backslashes processed as escapes/group references.
    return re.sub(r"[\w\.-]+@[\w\.-]+", lambda _match: dummy, text)

In [None]:
# test the replace_email function
email_after_space = "foo fake@email.com"
email_before_space = "fake@email.com bar"
email_with_forward_periods = "foo.bar@email.com"
email_with_backward_periods = "foo@bar.email.com"

# (input, expected output) pairs, all using the same replacement address
replacement = "example@email.com"
for sample, expected in [
    (email_after_space, f"foo {replacement}"),
    (email_before_space, f"{replacement} bar"),
    (email_with_forward_periods, replacement),
    (email_with_backward_periods, replacement),
]:
    assert replace_email(sample, replacement) == expected

In [None]:
#| export
def replace_phone(
    text: str,                          # The text to replace phone numbers in
    dummy: str = fake.phone_number(),   # The dummy text to replace phone numbers with
) -> str:                               # The text with phone numbers replaced
    """
    Replace phone numbers in the text with a dummy.
    Recognizes ten digits grouped 3-3-4, with optional parentheses around the first group and optional hyphens or spaces between the groups.
    """
    # NOTE: the default `dummy` is generated once, when the function is
    # defined, so every call that omits it reuses the same fake number.
    # A callable replacement makes `re.sub` insert `dummy` verbatim; a plain
    # string would have its backslashes processed as escapes/group references.
    return re.sub(r"\(?\d{3}\)?-? *\d{3}-? *-?\d{4}", lambda _match: dummy, text)

In [None]:
# test the replace_phone function
phone_after_space = "foo 111-222-3333"
phone_before_space = "111-222-3333 bar"
phone_with_parens = "(111) 222-3333"
phone_with_spaces = "111 222 3333"
phone_with_dashes = "111-222-3333"

# (input, expected output) pairs, all using the same replacement number
replacement = "123-456-7890"
for sample, expected in [
    (phone_after_space, f"foo {replacement}"),
    (phone_before_space, f"{replacement} bar"),
    (phone_with_parens, replacement),
    (phone_with_spaces, replacement),
    (phone_with_dashes, replacement),
]:
    assert replace_phone(sample, replacement) == expected

In [None]:
#| export
def replace_ip(
    text: str,                  # The text to replace ip addresses in
    dummy1: str = fake.ipv4(),  # The dummy text to replace ipv4 addresses with
    dummy2: str = fake.ipv6(),  # The dummy text to replace ipv6 addresses with
) -> str:                       # The text with ip addresses replaced
    """
    Replace ip addresses in the text with a dummy.
    IPv4 addresses are replaced first, then IPv6 addresses.
    Solution from https://github.com/bigcode-project/bigcode-analysis/blob/main/data_analysis/pii/utils/emails_ip_addresses_detection.py#L48
    """
    # NOTE: the default dummies are generated once, when the function is
    # defined, so every call that omits them reuses the same fake addresses.
    # Callable replacements make `re.sub` insert the dummies verbatim; plain
    # strings would have their backslashes processed as escapes/group references.
    ipv4_pattern = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}"
    text = re.sub(ipv4_pattern, lambda _match: dummy1, text)
    # NOTE(review): the alternatives below are tried left to right and the
    # pattern has no boundary checks, so a compressed address such as
    # `2001:db8::1` is matched by the second alternative as `2001:db8::` and
    # its last group stays in the text - confirm whether boundaries should be added.
    ipv6_pattern = r"(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])"
    text = re.sub(ipv6_pattern, lambda _match: dummy2, text)
    return text

In [None]:
# test the replace_ip function
ip4_after_space = "foo 111.222.3.4"
ip4_before_space = "111.222.3.4 bar"
ip6_with_colons = "2001:0db8:0000:0000:0000:8a2e:0370:7334"

# IPv4 addresses take the first dummy ...
v4_dummy = "127.0.0.1"
assert replace_ip(ip4_after_space, v4_dummy) == f"foo {v4_dummy}"
assert replace_ip(ip4_before_space, v4_dummy) == f"{v4_dummy} bar"
# ... and IPv6 addresses take the second one
v6_dummy = "0:0:0:0:0:0:0:1"
assert replace_ip(ip6_with_colons, v4_dummy, v6_dummy) == v6_dummy

In [None]:
#| export
def replace_credit_card(
    text: str,                              # The text to replace credit card numbers in
    dummy: str = fake.credit_card_number(), # The dummy text to replace credit card numbers with
) -> str:                                   # The text with credit card numbers replaced
    """
    Replace credit card numbers in the text with a dummy.
    Only numbers written as four hyphen-separated groups of four digits are recognized.
    """
    # NOTE: the default `dummy` is generated once, when the function is
    # defined, so every call that omits it reuses the same fake number.
    # A callable replacement makes `re.sub` insert `dummy` verbatim; a plain
    # string would have its backslashes processed as escapes/group references.
    return re.sub(r"\d{4}-\d{4}-\d{4}-\d{4}", lambda _match: dummy, text)

In [None]:
# test the replace_credit_card function
credit_card_after_space, credit_card_before_space = "foo 1111-2222-3333-4444", "1111-2222-3333-4444 bar"

# the card number is swapped for the given dummy; the surrounding text is kept
replacement = "1234-5678-9012-3456"
assert replace_credit_card(credit_card_after_space, replacement) == f"foo {replacement}"
assert replace_credit_card(credit_card_before_space, replacement) == f"{replacement} bar"

In [None]:
#| export
def replace_ssn(
    text: str,                  # The text to replace social security numbers in
    dummy: str = fake.ssn(),    # The dummy text to replace social security numbers with
) -> str:                       # The text with social security numbers replaced
    """
    Replace social security numbers in the text with a dummy.
    Only numbers written as three, two and four digits separated by hyphens (e.g. `111-22-3333`) are recognized.
    """
    # NOTE: the default `dummy` is generated once, when the function is
    # defined, so every call that omits it reuses the same fake number.
    # A callable replacement makes `re.sub` insert `dummy` verbatim; a plain
    # string would have its backslashes processed as escapes/group references.
    return re.sub(r"\d{3}-\d{2}-\d{4}", lambda _match: dummy, text)

In [None]:
# test the replace_ssn function
ssn_after_space, ssn_before_space = "foo 111-22-3333", "111-22-3333 bar"

# the number is swapped for the given dummy; the surrounding text is kept
replacement = "123-45-6789"
assert replace_ssn(ssn_after_space, replacement) == f"foo {replacement}"
assert replace_ssn(ssn_before_space, replacement) == f"{replacement} bar"

In [None]:
#| hide
# Export the cells marked for export from this notebook.
import nbdev

nbdev.nbdev_export()