reworked DataCleaning into a more flexible approach

GRAAL-Research · Mar 21, 2023 · d20dd3c · d20dd3c
1 parent b9751d9
commit d20dd3c
Show file tree

Hide file tree

Showing 6 changed files with 106 additions and 55 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -298,3 +298,4 @@
 ## dev
 
  - Add Python 3.11
+ - Add pre-processor when parsing addresses.
diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py
@@ -11,7 +11,7 @@
 import warnings
 from functools import partial
 from pathlib import Path
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Tuple, Union, Callable
 
 import torch
 from poutyne.framework import Experiment
@@ -39,7 +39,7 @@
 from ..errors import FastTextModelError
 from ..metrics import nll_loss, accuracy
 from ..network import ModelFactory
-from ..preprocessing import AddressCleaner
+from ..preprocessing import coma_cleaning, lower_cleaning, hyphen_cleaning
 from ..tools import CACHE_PATH, valid_poutyne_version
 from ..vectorizer import VectorizerFactory
 
@@ -276,6 +276,7 @@ def __call__(
         batch_size: int = 32,
         num_workers: int = 0,
         with_hyphen_split: bool = False,
+        pre_processors: Union[None, List[Callable]] = None,
     ) -> Union[FormattedParsedAddress, List[FormattedParsedAddress]]:
         # pylint: disable=too-many-arguments
         """
@@ -304,16 +305,22 @@ def __call__(
                 the hyphen split between the unit and the street number (e.g. Canada). For example, ``'3-305'`` will be
                 replaced as ``'3 305'`` for the parsing. Where ``'3'`` is the unit, and ``'305'`` is the street number.
                 We use a regular expression to replace alphanumerical characters separated by a hyphen at
-                the start of the string. We do so since some cities use hyphens in their names. Default is ``False``.
+                the start of the string. We do so since some cities use hyphens in their names. The default
+                is ``False``. If True, it adds the :func:`~deepparse.preprocessing.pre_processor.hyphen_cleaning`
+                pre-processor **at the end** of the pre-processor list to apply.
+            pre_processors (Union[None, List[Callable]]): A list of functions (callable) to apply pre-processing on
+                all the addresses to parse before parsing. See :ref:`pre_processor_label` for examples of
+                pre-processors. Since models were trained on lowercase data, during the parsing, we always apply a
+                lowercase pre-processor. If you pass a list of pre-processors, a lowercase pre-processor is
+                added **at the end** of the pre-processor list to apply. By default, None,
+                meaning we use the default setup, which is the comma removal pre-processor and lowercase.
 
         Return:
             Either a :class:`~FormattedParsedAddress` or a list of
             :class:`~FormattedParsedAddress` when given more than one address.
 
         Note:
-            During the parsing, the addresses are lowercase, and commas are removed. One can also use the
-            ``with_hyphen_split`` bool argument for replacing hyphens (used to separate units from street numbers,
-            e.g. ``'3-305 a street name'``) by whitespace for proper cleaning.
+            Since the model was trained on lowercase data, we always apply a lowercase pre-processor during parsing.
 
         Examples:
 
@@ -351,6 +358,16 @@ def __call__(
                 addresses_to_parse = CSVDatasetContainer("./a_path.csv", column_names=["address_column_name"],
                                                          is_training_container=False)
                 address_parser(addresses_to_parse)
+
+            Using a user-defined pre-processor
+
+            .. code-block:: python
+
+                def strip_parenthesis(address):
+                    return address.strip("(").strip(")")
+
+                address_parser(addresses_to_parse, pre_processors=[strip_parenthesis])
+                # It will also use the default lower case pre-processor.
         """
         self._model_os_validation(num_workers=num_workers)
 
@@ -363,7 +380,18 @@ def __call__(
         if isinstance(addresses_to_parse, DatasetContainer):
             addresses_to_parse = addresses_to_parse.data
 
-        clean_addresses = AddressCleaner(with_hyphen_split=with_hyphen_split).clean(addresses_to_parse)
+        if pre_processors is None:
+            # Default pre_processing setup.
+            pre_processors = [coma_cleaning, lower_cleaning]
+        else:
+            # We add, at the end, a lower casing cleaning pre-processor.
+            pre_processors.append(lower_cleaning)
+
+        if with_hyphen_split:
+            pre_processors.append(hyphen_cleaning)
+
+        self.pre_processors = pre_processors
+        clean_addresses = self._apply_pre_processors(addresses_to_parse)
 
         if self.verbose and len(addresses_to_parse) > PREDICTION_TIME_PERFORMANCE_THRESHOLD:
             print("Vectorizing the address")
@@ -411,6 +439,7 @@ def retrain(
         seq2seq_params: Union[Dict, None] = None,
         layers_to_freeze: Union[str, None] = None,
         name_of_the_retrain_parser: Union[None, str] = None,
+        pre_processors: Union[None, List[Callable]] = None,
     ) -> List[Dict]:
         # pylint: disable=too-many-arguments, too-many-locals, too-many-branches, too-many-statements
 
@@ -1189,3 +1218,12 @@ def _model_os_validation(self, num_workers):
                 "FastText objects are not pickleable with the parallelism process used by default by MacOS. "
                 "Thus, you need to set torch.multiprocessing.set_start_method('fork') to allow torch parallelism."
             )
+
+    def _apply_pre_processors(self, addresses: List[str]) -> List[str]:
+        """
+        Apply, in order, every pre-processor in ``self.pre_processors`` to each address.
+
+        Each pre-processor receives the output of the previous one, so the cleaning steps are chained. Once all
+        the pre-processors have been applied, consecutive whitespaces are collapsed into a single one and
+        leading/trailing whitespaces are removed.
+
+        Args:
+            addresses (List[str]): The addresses to pre-process.
+
+        Return:
+            A list with exactly one cleaned address per input address, in the same order.
+        """
+        res = []
+
+        for address in addresses:
+            processed_address = address
+            for pre_processor in self.pre_processors:
+                # Feed the previous output into the next pre-processor so every cleaning step is kept.
+                processed_address = pre_processor(processed_address)
+            # Append once per address (not once per pre-processor) so the output stays aligned with the input.
+            res.append(" ".join(processed_address.split()))
+        return res
diff --git a/deepparse/preprocessing/__init__.py b/deepparse/preprocessing/__init__.py
@@ -1 +1 @@
-from .address_cleaner import *
+from .pre_processor import *
diff --git a/deepparse/preprocessing/address_cleaner.py b/deepparse/preprocessing/address_cleaner.py
diff --git a/deepparse/preprocessing/pre_processor.py b/deepparse/preprocessing/pre_processor.py
@@ -0,0 +1,58 @@
+import re
+
+
+def coma_cleaning(address: str) -> str:
+    """
+    Pre-processor that strips every comma from an address. It is based on
+    `issue 56 <https://github.com/GRAAL-Research/deepparse/issues/56>`_.
+
+    Args:
+        address: The address to remove the commas from.
+
+    Return:
+        The address without any comma.
+    """
+    comma_free_parts = address.split(",")
+    return "".join(comma_free_parts)
+
+
+def lower_cleaning(address: str) -> str:
+    """
+    Pre-processor that converts an address to lowercase, since the original training data was in lowercase.
+
+    Args:
+        address: The address to lowercase.
+
+    Return:
+        The lowercase address.
+    """
+    lowercase_address = address.lower()
+    return lowercase_address
+
+
+# Matches a unit and a street number separated by a hyphen at the very start of an address (e.g. "3-305 ").
+# The first group is the unit, and the second is the street number. Both groups accept a trailing lowercase
+# letter since they can include letters in some countries. For example, unit 3a or address 305a.
+hyphen_splitted_unit_and_street_number_regex = r"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "
+
+
+def hyphen_cleaning(address: str) -> str:
+    """
+    Pre-processor to replace the hyphen between the unit and the street number of an address with a whitespace.
+    Since some addresses use the hyphen to split the unit and the street number, replacing it allows a proper
+    splitting of the address. For example, the proper parsing of the address 3-305 street name is
+    Unit: 3, StreetNumber: 305, StreetName: street name.
+
+    See `issue 137 <https://github.com/GRAAL-Research/deepparse/issues/137>`_ for more details.
+
+    The regular expression used to clean the hyphen is the following ``"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "``.
+    The first group is the unit, and the second is the street number. Both include letters since they can include
+    letters in some countries. For example, unit 3a or address 305a. The pattern only accepts lowercase letters
+    and is anchored at the start of the address.
+
+    Note: the hyphen is also used in some cities' names, such as Saint-Jean; thus, we use regex to detect
+    the proper hyphen to replace.
+
+    Args:
+        address: The address to apply hyphen cleaning on.
+
+    Return:
+        The address with the unit/street number hyphen replaced by a whitespace, or the address unchanged
+        when it does not start with the hyphenated pattern.
+    """
+    unit_and_street_number_pattern = re.compile(hyphen_splitted_unit_and_street_number_regex)
+    return unit_and_street_number_pattern.sub(r"\1 \2 ", address)
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -765,6 +765,7 @@ API Reference
   :caption: API
 
   parser
+  pre_processor
   dataset_container
   comparer
   cli