diff --git a/CHANGELOG.md b/CHANGELOG.md index 87ffa126..62e7d008 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -298,3 +298,4 @@ ## dev - Add Python 3.11 + - Add pre-processor when parsing addresses. diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index f8fd5442..4086a5b1 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -11,7 +11,7 @@ import warnings from functools import partial from pathlib import Path -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Tuple, Union, Callable import torch from poutyne.framework import Experiment @@ -39,7 +39,7 @@ from ..errors import FastTextModelError from ..metrics import nll_loss, accuracy from ..network import ModelFactory -from ..preprocessing import AddressCleaner +from ..preprocessing import coma_cleaning, lower_cleaning, hyphen_cleaning from ..tools import CACHE_PATH, valid_poutyne_version from ..vectorizer import VectorizerFactory @@ -276,6 +276,7 @@ def __call__( batch_size: int = 32, num_workers: int = 0, with_hyphen_split: bool = False, + pre_processors: Union[None, List[Callable]] = None, ) -> Union[FormattedParsedAddress, List[FormattedParsedAddress]]: # pylint: disable=too-many-arguments """ @@ -304,16 +305,22 @@ def __call__( the hyphen split between the unit and the street number (e.g. Canada). For example, ``'3-305'`` will be replaced as ``'3 305'`` for the parsing. Where ``'3'`` is the unit, and ``'305'`` is the street number. We use a regular expression to replace alphanumerical characters separated by a hyphen at - the start of the string. We do so since some cities use hyphens in their names. Default is ``False``. + the start of the string. We do so since some cities use hyphens in their names. The default + is ``False``. If True, it adds the :func:`~deepparse.preprocessing.pre_processor.hyphen_cleaning` + pre-processor **at the end** of the pre-processor list to apply. 
+ pre_processors (Union[None, List[Callable]]): A list of functions (callable) to apply pre-processing on + all the addresses to parse before parsing. See :ref:`pre_processor_label` for examples of + pre-processors. Since models were trained on lowercase data, during the parsing, we always apply a + lowercase pre-processor. If you pass a list of pre-processors, a lowercase pre-processor is + added **at the end** of the pre-processor list to apply. By default, None, + meaning we use the default setup, which is the comma removal pre-processor and lowercase. Return: Either a :class:`~FormattedParsedAddress` or a list of :class:`~FormattedParsedAddress` when given more than one address. Note: - During the parsing, the addresses are lowercase, and commas are removed. One can also use the - ``with_hyphen_split`` bool argument for replacing hyphens (used to separate units from street numbers, - e.g. ``'3-305 a street name'``) by whitespace for proper cleaning. + Since the model was trained on lowercase data, during the parsing, we always apply a lowercase pre-processor. Examples: @@ -351,6 +358,16 @@ def __call__( addresses_to_parse = CSVDatasetContainer("./a_path.csv", column_names=["address_column_name"], is_training_container=False) address_parser(addresses_to_parse) + + Using a user-defined pre-processor + + .. code-block:: python + + def strip_parenthesis(address): + return address.strip("(").strip(")") + + address_parser(addresses_to_parse, pre_processors=[strip_parenthesis]) + # It will also use the default lower case pre-processor. """ self._model_os_validation(num_workers=num_workers) @@ -363,7 +380,18 @@ def __call__( if isinstance(addresses_to_parse, DatasetContainer): addresses_to_parse = addresses_to_parse.data - clean_addresses = AddressCleaner(with_hyphen_split=with_hyphen_split).clean(addresses_to_parse) + if pre_processors is None: + # Default pre_processing setup.
+ pre_processors = [coma_cleaning, lower_cleaning] + else: + # We add, at the end, a lower casing pre-processor, on a copy so the caller's list is untouched. + pre_processors = [*pre_processors, lower_cleaning] + + if with_hyphen_split: + pre_processors.append(hyphen_cleaning) + + self.pre_processors = pre_processors + clean_addresses = self._apply_pre_processors(addresses_to_parse) if self.verbose and len(addresses_to_parse) > PREDICTION_TIME_PERFORMANCE_THRESHOLD: print("Vectorizing the address") @@ -411,6 +439,7 @@ def retrain( seq2seq_params: Union[Dict, None] = None, layers_to_freeze: Union[str, None] = None, name_of_the_retrain_parser: Union[None, str] = None, + pre_processors: Union[None, List[Callable]] = None, ) -> List[Dict]: # pylint: disable=too-many-arguments, too-many-locals, too-many-branches, too-many-statements @@ -1189,3 +1218,16 @@ def _model_os_validation(self, num_workers): "FastText objects are not pickleable with the parallelism process used by default by MacOS. " "Thus, you need to set torch.multiprocessing.set_start_method('fork') to allow torch parallelism." ) + + def _apply_pre_processors(self, addresses: List[str]) -> List[str]: + """ + Apply every pre-processor of ``self.pre_processors``, in order, to each address and collapse runs of whitespace. + """ + res = [] + + for address in addresses: + # Chain the pre-processors: each one receives the output of the previous one. + for pre_processor in self.pre_processors: + address = pre_processor(address) + res.append(" ".join(address.split())) + return res diff --git a/deepparse/preprocessing/__init__.py b/deepparse/preprocessing/__init__.py index 71b53988..1c5c715f 100644 --- a/deepparse/preprocessing/__init__.py +++ b/deepparse/preprocessing/__init__.py @@ -1 +1 @@ -from .address_cleaner import * +from .pre_processor import * diff --git a/deepparse/preprocessing/address_cleaner.py b/deepparse/preprocessing/address_cleaner.py deleted file mode 100644 index abcc23a0..00000000 --- a/deepparse/preprocessing/address_cleaner.py +++ /dev/null @@ -1,47 +0,0 @@ -import re -from typing import List - -# The first group is the unit, and the second is the street number.
-# Both include letters since they can include letters in some countries. For example, -# unit 3a or address 305a. -hyphen_splitted_unit_and_street_number_regex = r"^([0-9]*[a-z]?)-([0-9]*[a-z]?) " - - -class AddressCleaner: - def __init__(self, with_hyphen_split: bool = False) -> None: - self.with_hyphen_split = with_hyphen_split - - def clean(self, addresses: List[str]) -> List[str]: - res = [] - - for address in addresses: - processed_address = self.coma_cleaning(address) - - processed_address = self.lower_cleaning(processed_address) - - if self.with_hyphen_split: - processed_address = self.hyphen_cleaning(processed_address) - - res.append(" ".join(processed_address.split())) - return res - - @staticmethod - def coma_cleaning(text: str) -> str: - # See issue 56 https://github.com/GRAAL-Research/deepparse/issues/56 - return text.replace(",", "") - - @staticmethod - def lower_cleaning(text: str) -> str: - # Since the original training data was in lowercase - return text.lower() - - @staticmethod - def hyphen_cleaning(text: str) -> str: - # See issue 137 for more details https://github.com/GRAAL-Research/deepparse/issues/137. - # Since some addresses use the hyphen to split the unit and street address, we replace the hyphen - # with whitespaces to allow a proper splitting of the address. - # For example, the proper parsing of the address 3-305 street name is - # Unit: 3, StreetNumber: 305, StreetName: street name. - # Note: the hyphen is also used in some cities' names, such as Saint-Jean; thus, we use regex to detect - # the proper hyphen to replace. - return re.sub(hyphen_splitted_unit_and_street_number_regex, r"\1 \2 ", text) diff --git a/deepparse/preprocessing/pre_processor.py b/deepparse/preprocessing/pre_processor.py new file mode 100644 index 00000000..67517fb2 --- /dev/null +++ b/deepparse/preprocessing/pre_processor.py @@ -0,0 +1,58 @@ +import re + + +def coma_cleaning(address: str) -> str: + """ + Pre-processor to remove coma. 
It is based on `issue 56 <https://github.com/GRAAL-Research/deepparse/issues/56>`_. + + Args: + address: The address to apply comma cleaning on. + + Return: + The comma-cleaned address. + """ + return address.replace(",", "") + + +def lower_cleaning(address: str) -> str: + """ + Pre-processor to lowercase an address since the original training data was in lowercase. + + Args: + address: The address to lowercase. + + Return: + The lowercase address. + """ + return address.lower() + + +# The first group is the unit, and the second is the street number. +# Both include letters since they can include letters in some countries. For example, +# unit 3a or address 305a. +hyphen_splitted_unit_and_street_number_regex = r"^([0-9]*[a-z]?)-([0-9]*[a-z]?) " + + +def hyphen_cleaning(address: str) -> str: + """ + Pre-processor to clean hyphen between the street number and unit in an address. Since some addresses use the + hyphen to split the unit and street address, we replace the hyphen with whitespaces to allow a + proper splitting of the address. For example, the proper parsing of the address 3-305 street name is + Unit: 3, StreetNumber: 305, StreetName: street name. + + See `issue 137 <https://github.com/GRAAL-Research/deepparse/issues/137>`_ for more details. + + The regular expression used to clean the hyphen is the following ``"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "``. + The first group is the unit, and the second is the street number. Both include letters since they can include + letters in some countries. For example, unit 3a or address 305a. + + Note: the hyphen is also used in some cities' names, such as Saint-Jean; thus, we use regex to detect + the proper hyphen to replace. + + Args: + address: The address to apply hyphen cleaning on. + + Return: + The hyphen-cleaned address.
+ """ + return re.sub(hyphen_splitted_unit_and_street_number_regex, r"\1 \2 ", address) diff --git a/docs/source/index.rst b/docs/source/index.rst index 8b67f1e6..e8e81429 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -765,6 +765,7 @@ API Reference :caption: API parser + pre_processor dataset_container comparer cli