diff --git a/CHANGELOG.md b/CHANGELOG.md index 878f2e24..a0790621 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -298,6 +298,7 @@ ## dev (0.9.6) - Add Python 3.11. +- Add pre-processor when parsing addresses. - Drop Python 3.7 support since newer Python versions are faster and [Torch 2.0 does not support Python 3.7](https://dev-discuss.pytorch.org/t/dropping-support-for-cuda-11-6-and-python-3-7-from-pytorch-2-0-release/1021). - Add `torch.compile` integration to improve performance (Torch 1.x still supported) with `mode="reduce-overhead"` as diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index f048a8a5..ad064405 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -11,7 +11,7 @@ import warnings from functools import partial from pathlib import Path -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Tuple, Union, Callable import torch from poutyne.framework import Experiment @@ -39,7 +39,8 @@ from ..errors import FastTextModelError from ..metrics import nll_loss, accuracy from ..network import ModelFactory -from ..preprocessing import AddressCleaner +from ..pre_processing import trailing_whitespace_cleaning, double_whitespaces_cleaning +from ..pre_processing import coma_cleaning, lower_cleaning, hyphen_cleaning from ..tools import CACHE_PATH, valid_poutyne_version from ..vectorizer import VectorizerFactory @@ -276,6 +277,7 @@ def __call__( batch_size: int = 32, num_workers: int = 0, with_hyphen_split: bool = False, + pre_processors: Union[None, List[Callable]] = None, ) -> Union[FormattedParsedAddress, List[FormattedParsedAddress]]: # pylint: disable=too-many-arguments """ @@ -304,16 +306,23 @@ def __call__( the hyphen split between the unit and the street number (e.g. Canada). For example, ``'3-305'`` will be replaced as ``'3 305'`` for the parsing. Where ``'3'`` is the unit, and ``'305'`` is the street number. 
We use a regular expression to replace alphanumerical characters separated by a hyphen at - the start of the string. We do so since some cities use hyphens in their names. Default is ``False``. + the start of the string. We do so since some cities use hyphens in their names. The default + is ``False``. If True, it adds the :func:`~deepparse.pre_processing.pre_processor.hyphen_cleaning` + pre-processor **at the end** of the pre-processor list to apply. + pre_processors (Union[None, List[Callable]]): A list of functions (callable) to apply pre-processing on + all the addresses to parse before parsing. See :ref:`pre_processor_label` for examples of + pre-processors. Since models were trained on lowercase data, during the parsing, we always apply a + lowercase pre-processor. If you pass a list of pre-processors, a lowercase pre-processor is + added **at the end** of the pre-processor list to apply. By default, None, + meaning we use the default setup, which is (in order) the comma removal pre-processor, lowercase, + trailing whitespace removal and double whitespace cleaning. Return: Either a :class:`~FormattedParsedAddress` or a list of :class:`~FormattedParsedAddress` when given more than one address. Note: - During the parsing, the addresses are lowercase, and commas are removed. One can also use the - ``with_hyphen_split`` bool argument for replacing hyphens (used to separate units from street numbers, - e.g. ``'3-305 a street name'``) by whitespace for proper cleaning. + Since the model was trained on lowercase data, during the parsing, we always apply a lowercase pre-processor. Examples: @@ -351,6 +360,16 @@ def __call__( addresses_to_parse = CSVDatasetContainer("./a_path.csv", column_names=["address_column_name"], is_training_container=False) address_parser(addresses_to_parse) + + Using a user-defined pre-processor + + .. 
code-block:: python + + def strip_parenthesis(address): + return address.strip("(").strip(")") + + address_parser(addresses_to_parse, pre_processors=[strip_parenthesis]) + # It will also use the default lower case pre-processor. """ self._model_os_validation(num_workers=num_workers) @@ -363,7 +382,18 @@ def __call__( if isinstance(addresses_to_parse, DatasetContainer): addresses_to_parse = addresses_to_parse.data - clean_addresses = AddressCleaner(with_hyphen_split=with_hyphen_split).clean(addresses_to_parse) + if pre_processors is None: + # Default pre_processing setup. + pre_processors = [coma_cleaning, lower_cleaning, trailing_whitespace_cleaning, double_whitespaces_cleaning] + else: + # We add, at the end, a lower casing cleaning pre-processor. + pre_processors.append(lower_cleaning) + + if with_hyphen_split: + pre_processors.append(hyphen_cleaning) + + self.pre_processors = pre_processors + clean_addresses = self._apply_pre_processors(addresses_to_parse) if self.verbose and len(addresses_to_parse) > PREDICTION_TIME_PERFORMANCE_THRESHOLD: print("Vectorizing the address") @@ -1197,3 +1227,12 @@ def _model_os_validation(self, num_workers): "FastText objects are not pickleable with the parallelism process used by default by MacOS. " "Thus, you need to set torch.multiprocessing.set_start_method('fork') to allow torch parallelism." 
def _apply_pre_processors(self, addresses: List[str]) -> List[str]:
    """
    Apply every pre-processor in ``self.pre_processors``, in order, to each address.

    The pre-processors are chained: the output of one pre-processor is the input of the next one.
    Once all of them have run, the whitespaces of the address are normalized (leading and trailing
    whitespaces are removed and inner runs of whitespaces are collapsed into a single space).

    Args:
        addresses (List[str]): The raw addresses to clean.

    Return:
        A list of the cleaned addresses, in the same order as ``addresses``.
    """
    res = []

    for address in addresses:
        # Start from the raw address so an empty pre-processor list still yields a
        # (whitespace-normalized) address instead of an unbound variable.
        processed_address = address

        for pre_processor in self.pre_processors:
            # Feed the previous result, not the raw address; otherwise only the last
            # pre-processor of the list would have any effect.
            processed_address = pre_processor(processed_address)

        res.append(" ".join(processed_address.split()))
    return res
# The first group is the unit, and the second is the street number.
# Both include letters since they can include letters in some countries. For example,
# unit 3a or address 305a.
hyphen_splitted_unit_and_street_number_regex = r"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "

# Compiled once at import time: the pre-processor is called once per address to parse.
_HYPHEN_SPLITTED_UNIT_AND_STREET_NUMBER_PATTERN = re.compile(hyphen_splitted_unit_and_street_number_regex)


def hyphen_cleaning(address: str) -> str:
    """
    Pre-processor to clean hyphen between the street number and unit in an address. Since some addresses use the
    hyphen to split the unit and street address, we replace the hyphen with whitespaces to allow a
    proper splitting of the address. For example, the proper parsing of the address 3-305 street name is
    Unit: 3, StreetNumber: 305, StreetName: street name.

    See `issue 137 <https://github.com/GRAAL-Research/deepparse/issues/137>`_ for more details.

    The regular expression use to clean hyphen is the following ``"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "``.
    The first group is the unit, and the second is the street number. Both include letters since they can include
    letters in some countries. For example, unit 3a or address 305a.

    Note: the hyphen is also used in some cities' names, such as Saint-Jean; thus, we use regex to detect
    the proper hyphen to replace. The regular expression only matches lowercase letters and only at the
    start of the address.

    Args:
        address: The address to apply hyphen cleaning on.

    Return:
        The address with the hyphen between the unit and the street number replaced by a whitespace. The
        address is returned unchanged when it does not start with a hyphen-separated unit and street number.
    """
    return _HYPHEN_SPLITTED_UNIT_AND_STREET_NUMBER_PATTERN.sub(r"\1 \2 ", address)
-# Both include letters since they can include letters in some countries. For example, -# unit 3a or address 305a. -hyphen_splitted_unit_and_street_number_regex = r"^([0-9]*[a-z]?)-([0-9]*[a-z]?) " - - -class AddressCleaner: - def __init__(self, with_hyphen_split: bool = False) -> None: - self.with_hyphen_split = with_hyphen_split - - def clean(self, addresses: List[str]) -> List[str]: - res = [] - - for address in addresses: - processed_address = self.coma_cleaning(address) - - processed_address = self.lower_cleaning(processed_address) - - if self.with_hyphen_split: - processed_address = self.hyphen_cleaning(processed_address) - - res.append(" ".join(processed_address.split())) - return res - - @staticmethod - def coma_cleaning(text: str) -> str: - # See issue 56 https://github.com/GRAAL-Research/deepparse/issues/56 - return text.replace(",", "") - - @staticmethod - def lower_cleaning(text: str) -> str: - # Since the original training data was in lowercase - return text.lower() - - @staticmethod - def hyphen_cleaning(text: str) -> str: - # See issue 137 for more details https://github.com/GRAAL-Research/deepparse/issues/137. - # Since some addresses use the hyphen to split the unit and street address, we replace the hyphen - # with whitespaces to allow a proper splitting of the address. - # For example, the proper parsing of the address 3-305 street name is - # Unit: 3, StreetNumber: 305, StreetName: street name. - # Note: the hyphen is also used in some cities' names, such as Saint-Jean; thus, we use regex to detect - # the proper hyphen to replace. 
- return re.sub(hyphen_splitted_unit_and_street_number_regex, r"\1 \2 ", text) diff --git a/docs/source/index.rst b/docs/source/index.rst index 8b67f1e6..e8e81429 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -765,6 +765,7 @@ API Reference :caption: API parser + pre_processor dataset_container comparer cli diff --git a/docs/source/pre_processor.rst b/docs/source/pre_processor.rst new file mode 100644 index 00000000..49fc2641 --- /dev/null +++ b/docs/source/pre_processor.rst @@ -0,0 +1,16 @@ +.. _pre_processor_label: + +.. role:: hidden + :class: hidden-section + + +Pre-Processors +================= +Here are the available pre-processors in Deepparse. The first four are used as default settings when parsing +addresses. + +.. autofunction:: deepparse.pre_processing.pre_processor.coma_cleaning +.. autofunction:: deepparse.pre_processing.pre_processor.lower_cleaning +.. autofunction:: deepparse.pre_processing.pre_processor.trailing_whitespace_cleaning +.. autofunction:: deepparse.pre_processing.pre_processor.double_whitespaces_cleaning +.. 
from unittest import TestCase

from deepparse.pre_processing import (
    coma_cleaning,
    lower_cleaning,
    trailing_whitespace_cleaning,
    hyphen_cleaning,
    double_whitespaces_cleaning,
)


class PreProcessorTest(TestCase):
    """Unit tests of the stand-alone address pre-processors, each one exercised in isolation."""

    @classmethod
    def setUpClass(cls):
        cls.a_clean_address = "350 rue des lilas ouest québec québec g1l 1b6"
        cls.a_dirty_address_with_commas = "350 rue des lilas , ouest ,québec québec, g1l 1b6"
        cls.a_commas_separated_address = "350, rue des lilas, ouest, québec, québec, g1l 1b6"
        cls.a_dirty_address_with_uppercase = "350 rue des Lilas Ouest Québec Québec G1L 1B6"
        # The dirty whitespace fixtures must differ from the clean address; otherwise the
        # whitespace tests pass even if the pre-processors do nothing.
        cls.a_dirty_address_with_trailing_whitespaces = "350 rue des lilas ouest québec québec g1l 1b6   "
        cls.a_dirty_address_with_whitespaces = "350  rue des lilas   ouest québec  québec g1l 1b6"

        cls.an_address_with_hyphen_split_address_components = "3-350 rue des lilas ouest"
        cls.a_unit_clean_address = "3 350 rue des lilas ouest"

        cls.an_address_with_hyphen_split_address_components_with_hyphen_city = "3-350 rue des lilas ouest saint-jean"
        cls.a_unit_hyphen_city_name_clean_address = "3 350 rue des lilas ouest saint-jean"

        cls.a_unit_with_letter_hyphen_split = "3a-350 rue des lilas ouest saint-jean"
        cls.a_unit_with_letter_hyphen_split_clean_address = "3a 350 rue des lilas ouest saint-jean"

        cls.a_unit_with_letter_only_hyphen_split = "a-350 rue des lilas ouest saint-jean"
        cls.a_unit_with_letter_only_hyphen_split_clean_address = "a 350 rue des lilas ouest saint-jean"

        cls.a_street_number_with_letter_hyphen_split = "3-350a rue des lilas ouest saint-jean"
        cls.a_street_number_with_letter_hyphen_split_clean_address = "3 350a rue des lilas ouest saint-jean"

        cls.letters_hyphen_address = "3a-350b rue des lilas ouest saint-jean"
        cls.letters_hyphen_address_split_clean_address = "3a 350b rue des lilas ouest saint-jean"

    def test_givenADirtyAddressWithCommas_whenComaCleaning_thenShouldRemoveCommas(
        self,
    ):
        cleaned_address = coma_cleaning(self.a_commas_separated_address)

        self.assertEqual(self.a_clean_address, cleaned_address)

    def test_givenADirtyAddressWithCommas_whenComaAndDoubleWhitespacesCleaning_thenShouldReturnCleanAddress(
        self,
    ):
        # Removing a comma surrounded by whitespaces leaves a double whitespace behind,
        # which the double whitespace pre-processor is expected to collapse.
        cleaned_address = double_whitespaces_cleaning(coma_cleaning(self.a_dirty_address_with_commas))

        self.assertEqual(self.a_clean_address, cleaned_address)

    def test_givenADirtyAddressWithUppercase_whenLowerCleaning_thenShouldLower(self):
        cleaned_address = lower_cleaning(self.a_dirty_address_with_uppercase)

        self.assertEqual(self.a_clean_address, cleaned_address)

    def test_givenADirtyAddressWithWhitespaces_whenTrailingWhitespaceCleaning_thenShouldRemoveWhitespaces(
        self,
    ):
        cleaned_address = trailing_whitespace_cleaning(self.a_dirty_address_with_trailing_whitespaces)

        self.assertEqual(self.a_clean_address, cleaned_address)

    def test_givenADirtyAddressWithWhitespacesInAddress_whenDoubleWhitespacesCleaning_thenShouldRemoveWhitespaces(
        self,
    ):
        cleaned_address = double_whitespaces_cleaning(self.a_dirty_address_with_whitespaces)

        self.assertEqual(self.a_clean_address, cleaned_address)

    def test_givenAHyphenUnitStreetNumberAddress_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        cleaned_address = hyphen_cleaning(self.an_address_with_hyphen_split_address_components)

        self.assertEqual(self.a_unit_clean_address, cleaned_address)

    def test_givenAHyphenUnitAndCityAddress_whenCleaningAddress_thenShouldReplaceUnitStreetNumberHyphenWithWhiteSpace(
        self,
    ):
        cleaned_address = hyphen_cleaning(self.an_address_with_hyphen_split_address_components_with_hyphen_city)

        self.assertEqual(self.a_unit_hyphen_city_name_clean_address, cleaned_address)

    def test_givenAnAlphabeticalUnitStreetNumberHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        cleaned_address = hyphen_cleaning(self.a_unit_with_letter_hyphen_split)

        self.assertEqual(self.a_unit_with_letter_hyphen_split_clean_address, cleaned_address)

    def test_givenAnAlphabeticalOnlyUnitHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        cleaned_address = hyphen_cleaning(self.a_unit_with_letter_only_hyphen_split)

        self.assertEqual(self.a_unit_with_letter_only_hyphen_split_clean_address, cleaned_address)

    def test_givenAnAlphabeticalStreetNumberUnitHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        cleaned_address = hyphen_cleaning(self.a_street_number_with_letter_hyphen_split)

        self.assertEqual(self.a_street_number_with_letter_hyphen_split_clean_address, cleaned_address)

    def test_givenAnAlphabeticalComponentsStreetNumberUnit_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(
        self,
    ):
        cleaned_address = hyphen_cleaning(self.letters_hyphen_address)

        self.assertEqual(self.letters_hyphen_address_split_clean_address, cleaned_address)
cls.an_address_with_hyphen_split_address_components = "3-350 rue des lilas ouest" - cls.a_unit_clean_address = "3 350 rue des lilas ouest" - - cls.an_address_with_hyphen_split_address_components_with_hyphen_city = "3-350 rue des lilas ouest, saint-jean" - cls.a_unit_hyphen_city_name_clean_address = "3 350 rue des lilas ouest saint-jean" - - cls.a_unit_with_letter_hyphen_split = "3a-350 rue des lilas ouest saint-jean" - cls.a_unit_with_letter_hyphen_split_clean_address = "3a 350 rue des lilas ouest saint-jean" - - cls.a_unit_with_letter_only_hyphen_split = "a-350 rue des lilas ouest saint-jean" - cls.a_unit_with_letter_only_hyphen_split_clean_address = "a 350 rue des lilas ouest saint-jean" - - cls.a_street_number_with_letter_hyphen_split = "3-350a rue des lilas ouest saint-jean" - cls.a_street_number_with_letter_hyphen_split_clean_address = "3 350a rue des lilas ouest saint-jean" - - cls.letters_hyphen_address = "3a-350b rue des lilas ouest saint-jean" - cls.letters_hyphen_address_split_clean_address = "3a 350b rue des lilas ouest saint-jean" - - cls.address_cleaner = AddressCleaner() - - def test_givenACleanAddress_whenCleaningAddress_thenShouldNotMakeAnyChange(self): - cleaned_address = self.address_cleaner.clean([self.a_clean_address]) - - self.assertEqual(self.a_clean_address, cleaned_address[0]) - - def test_givenADirtyAddressWithCommas_whenCleaningAddress_thenShouldRemoveCommas( - self, - ): - cleaned_address = self.address_cleaner.clean([self.a_dirty_address_with_commas]) - - self.assertEqual(self.a_clean_address, cleaned_address[0]) - - cleaned_address = self.address_cleaner.clean([self.a_commas_separated_address]) - - self.assertEqual(self.a_clean_address, cleaned_address[0]) - - def test_givenADirtyAddressWithUppercase_whenCleaningAddress_thenShouldLower(self): - cleaned_address = self.address_cleaner.clean([self.a_dirty_address_with_uppercase]) - - self.assertEqual(self.a_clean_address, cleaned_address[0]) - - def 
test_givenADirtyAddressWithWhitespaces_whenCleaningAddress_thenShouldRemoveWhitespaces( - self, - ): - cleaned_address = self.address_cleaner.clean([self.a_dirty_address_with_whitespaces]) - - self.assertEqual(self.a_clean_address, cleaned_address[0]) - - def test_givenMultipleDirtyAddresses_whenCleaningAddresses_thenShouldCleanAllAddresses( - self, - ): - cleaned_address = self.address_cleaner.clean( - [self.a_dirty_address_with_whitespaces, self.a_dirty_address_with_uppercase] - ) - - self.assertEqual(self.a_clean_address, cleaned_address[0]) - self.assertEqual(self.a_clean_address, cleaned_address[1]) - - def test_givenAHyphenUnitStreetNumberAddress_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self): - self.address_cleaner = AddressCleaner(with_hyphen_split=True) - - cleaned_address = self.address_cleaner.clean([self.an_address_with_hyphen_split_address_components]) - - self.assertEqual(self.a_unit_clean_address, cleaned_address[0]) - - def test_givenAHyphenUnitAndCityAddress_whenCleaningAddress_thenShouldReplaceUnitStreetNumberHyphenWithWhiteSpace( - self, - ): - self.address_cleaner = AddressCleaner(with_hyphen_split=True) - - cleaned_address = self.address_cleaner.clean( - [self.an_address_with_hyphen_split_address_components_with_hyphen_city] - ) - - self.assertEqual(self.a_unit_hyphen_city_name_clean_address, cleaned_address[0]) - - def test_givenAnAlphabeticalUnitStreetNumberHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self): - self.address_cleaner = AddressCleaner(with_hyphen_split=True) - - cleaned_address = self.address_cleaner.clean([self.a_unit_with_letter_hyphen_split]) - - self.assertEqual(self.a_unit_with_letter_hyphen_split_clean_address, cleaned_address[0]) - - def test_givenAnAlphabeticalOnlyUnitHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self): - self.address_cleaner = AddressCleaner(with_hyphen_split=True) - - cleaned_address = 
self.address_cleaner.clean([self.a_unit_with_letter_only_hyphen_split]) - - self.assertEqual(self.a_unit_with_letter_only_hyphen_split_clean_address, cleaned_address[0]) - - def test_givenAnAlphabeticalStreetNumberUnitHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self): - self.address_cleaner = AddressCleaner(with_hyphen_split=True) - - cleaned_address = self.address_cleaner.clean([self.a_street_number_with_letter_hyphen_split]) - - self.assertEqual(self.a_street_number_with_letter_hyphen_split_clean_address, cleaned_address[0]) - - def test_givenAnAlphabeticalComponentsStreetNumberUnit_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace( - self, - ): - self.address_cleaner = AddressCleaner(with_hyphen_split=True) - - cleaned_address = self.address_cleaner.clean([self.letters_hyphen_address]) - - self.assertEqual(self.letters_hyphen_address_split_clean_address, cleaned_address[0])