merge solved

GRAAL-Research · Mar 21, 2023 · d0782aa · d0782aa
2 parents 46b4330 + e4baa2e
commit d0782aa
Show file tree

Hide file tree

Showing 11 changed files with 248 additions and 174 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -298,6 +298,7 @@
 ## dev (0.9.6)
 
 - Add Python 3.11.
+- Add pre-processor when parsing addresses.
 - Drop Python 3.7 support since newer Python versions are faster
   and [Torch 2.0 does not support Python 3.7](https://dev-discuss.pytorch.org/t/dropping-support-for-cuda-11-6-and-python-3-7-from-pytorch-2-0-release/1021).
 - Add `torch.compile` integration to improve performance (Torch 1.x still supported) with `mode="reduce-overhead"` as

diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py
@@ -11,7 +11,7 @@
 import warnings
 from functools import partial
 from pathlib import Path
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Tuple, Union, Callable
 
 import torch
 from poutyne.framework import Experiment
@@ -39,7 +39,8 @@
 from ..errors import FastTextModelError
 from ..metrics import nll_loss, accuracy
 from ..network import ModelFactory
-from ..preprocessing import AddressCleaner
+from ..pre_processing import trailing_whitespace_cleaning, double_whitespaces_cleaning
+from ..pre_processing import coma_cleaning, lower_cleaning, hyphen_cleaning
 from ..tools import CACHE_PATH, valid_poutyne_version
 from ..vectorizer import VectorizerFactory
 
@@ -276,6 +277,7 @@ def __call__(
         batch_size: int = 32,
         num_workers: int = 0,
         with_hyphen_split: bool = False,
+        pre_processors: Union[None, List[Callable]] = None,
     ) -> Union[FormattedParsedAddress, List[FormattedParsedAddress]]:
         # pylint: disable=too-many-arguments
         """
@@ -304,16 +306,23 @@ def __call__(
                 the hyphen split between the unit and the street number (e.g. Canada). For example, ``'3-305'`` will be
                 replaced as ``'3 305'`` for the parsing. Where ``'3'`` is the unit, and ``'305'`` is the street number.
                 We use a regular expression to replace alphanumerical characters separated by a hyphen at
-                the start of the string. We do so since some cities use hyphens in their names. Default is ``False``.
+                the start of the string. We do so since some cities use hyphens in their names. The default
+                is ``False``. If True, it adds the :func:`~deepparse.pre_processing.pre_processor.hyphen_cleaning`
+                pre-processor **at the end** of the pre-processor list to apply.
+            pre_processors (Union[None, List[Callable]]): A list of functions (callable) to apply pre-processing on
+                all the addresses to parse before parsing. See :ref:`pre_processor_label` for examples of
+                pre-processors. Since models were trained on lowercase data, during the parsing, we always apply a
+                lowercase pre-processor. If you pass a list of pre-processors, a lowercase pre-processor is
+                added **at the end** of the pre-processor list to apply. By default, ``None``,
+                meaning we use the default setup, which is (in order) the coma removal pre-processor, lowercase,
+                trailing whitespace removal and double whitespace cleaning.
 
         Return:
             Either a :class:`~FormattedParsedAddress` or a list of
             :class:`~FormattedParsedAddress` when given more than one address.
 
         Note:
-            During the parsing, the addresses are lowercase, and commas are removed. One can also use the
-            ``with_hyphen_split`` bool argument for replacing hyphens (used to separate units from street numbers,
-            e.g. ``'3-305 a street name'``) by whitespace for proper cleaning.
+            Since the models were trained on lowercase data, we always apply a lowercase pre-processor during parsing.
 
         Examples:
 
@@ -351,6 +360,16 @@ def __call__(
                 addresses_to_parse = CSVDatasetContainer("./a_path.csv", column_names=["address_column_name"],
                                                          is_training_container=False)
                 address_parser(addresses_to_parse)
+
+            Using a user-defined pre-processor
+
+            .. code-block:: python
+
+                def strip_parenthesis(address):
+                    return address.strip("(").strip(")")
+
+                address_parser(addresses_to_parse, pre_processors=[strip_parenthesis])
+                # It will also use the default lower case pre-processor.
         """
         self._model_os_validation(num_workers=num_workers)
 
@@ -363,7 +382,18 @@ def __call__(
         if isinstance(addresses_to_parse, DatasetContainer):
             addresses_to_parse = addresses_to_parse.data
 
-        clean_addresses = AddressCleaner(with_hyphen_split=with_hyphen_split).clean(addresses_to_parse)
+        if pre_processors is None:
+            # Default pre_processing setup.
+            pre_processors = [coma_cleaning, lower_cleaning, trailing_whitespace_cleaning, double_whitespaces_cleaning]
+        else:
+            # We add, at the end, a lower casing cleaning pre-processor.
+            pre_processors.append(lower_cleaning)
+
+        if with_hyphen_split:
+            pre_processors.append(hyphen_cleaning)
+
+        self.pre_processors = pre_processors
+        clean_addresses = self._apply_pre_processors(addresses_to_parse)
 
         if self.verbose and len(addresses_to_parse) > PREDICTION_TIME_PERFORMANCE_THRESHOLD:
             print("Vectorizing the address")
@@ -1197,3 +1227,12 @@ def _model_os_validation(self, num_workers):
                 "FastText objects are not pickleable with the parallelism process used by default by MacOS. "
                 "Thus, you need to set torch.multiprocessing.set_start_method('fork') to allow torch parallelism."
             )
+
+    def _apply_pre_processors(self, addresses: List[str]) -> List[str]:
+        """
+        Apply every pre-processor in ``self.pre_processors``, in order, to each address.
+
+        Each pre-processor receives the output of the previous one, so the cleaning steps are chained. Once all
+        the pre-processors have been applied, runs of whitespace are collapsed into a single space and the
+        leading/trailing whitespace is removed.
+
+        Args:
+            addresses (List[str]): The addresses to clean.
+
+        Return:
+            A list holding exactly one cleaned address per input address, in the same order.
+        """
+        res = []
+
+        for address in addresses:
+            processed_address = address
+
+            # Feed each pre-processor the result of the previous one rather than the raw address; otherwise,
+            # the effect of the other pre-processors would be lost.
+            for pre_processor in self.pre_processors:
+                processed_address = pre_processor(processed_address)
+
+            # Append once per address (outside the inner loop) so the output stays aligned with the input.
+            res.append(" ".join(processed_address.split()))
+        return res
diff --git a/deepparse/pre_processing/__init__.py b/deepparse/pre_processing/__init__.py
@@ -0,0 +1 @@
+from .pre_processor import *
diff --git a/deepparse/pre_processing/pre_processor.py b/deepparse/pre_processing/pre_processor.py
@@ -0,0 +1,85 @@
+import re
+
+
+def double_whitespaces_cleaning(address: str) -> str:
+    """
+    Pre-processor that collapses every run of two or more consecutive spaces into a single space.
+    The regular expression used to find the runs of spaces is the following ``" {2,}"``.
+
+    Args:
+        address: The address in which to collapse the repeated spaces.
+
+    Return:
+        The address with each run of spaces replaced by one space.
+    """
+    single_space = " "
+    return re.sub(r" {2,}", single_space, address)
+
+
+def trailing_whitespace_cleaning(address: str) -> str:
+    """
+    Pre-processor that strips the space characters found at both ends of an address.
+    Only the space character (``" "``) is stripped; other whitespace characters are left untouched.
+
+    Args:
+        address: The address to strip the leading and trailing spaces from.
+
+    Return:
+        The address without leading or trailing spaces.
+    """
+    without_leading_spaces = address.lstrip(" ")
+    return without_leading_spaces.rstrip(" ")
+
+
+def coma_cleaning(address: str) -> str:
+    """
+    Pre-processor that deletes every comma (``","``) from an address. It is based on
+    `issue 56 <https://github.com/GRAAL-Research/deepparse/issues/56>`_.
+
+    Args:
+        address: The address to remove the commas from.
+
+    Return:
+        The address without any comma.
+    """
+    comma_free_parts = address.split(",")
+    return "".join(comma_free_parts)
+
+
+def lower_cleaning(address: str) -> str:
+    """
+    Pre-processor that converts an address to lowercase, since the original training data was in lowercase.
+
+    Args:
+        address: The address to lowercase.
+
+    Return:
+        The lowercase address.
+    """
+    lowercase_address = address.lower()
+    return lowercase_address
+
+
+# Regular expression anchored at the start of the address: the first group captures the unit, and the second
+# captures the street number. Each group accepts digits optionally followed by one lowercase letter since
+# these components can include letters in some countries. For example, unit 3a or address 305a.
+hyphen_splitted_unit_and_street_number_regex = r"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "
+
+
+def hyphen_cleaning(address: str) -> str:
+    """
+    Pre-processor that replaces the hyphen between the unit and the street number with a whitespace. Since some
+    addresses use the hyphen to split the unit and the street number, replacing it allows a proper splitting of
+    the address. For example, the proper parsing of the address 3-305 street name is
+    Unit: 3, StreetNumber: 305, StreetName: street name.
+
+    See `issue 137 <https://github.com/GRAAL-Research/deepparse/issues/137>`_ for more details.
+
+    The regular expression used to clean the hyphen is the following ``"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "``.
+    The first group is the unit, and the second is the street number. Both accept a letter since they can
+    include letters in some countries. For example, unit 3a or address 305a.
+
+    Note: the hyphen is also used in some cities' names, such as Saint-Jean; thus, the regular expression only
+    matches at the start of the address, and any other hyphen is left untouched.
+
+    Args:
+        address: The address to apply the hyphen cleaning on.
+
+    Return:
+        The address with the unit and street number hyphen replaced by a whitespace.
+    """
+    unit_and_street_number_with_space = r"\1 \2 "
+    return re.sub(hyphen_splitted_unit_and_street_number_regex, unit_and_street_number_with_space, address)
diff --git a/deepparse/preprocessing/__init__.py b/deepparse/preprocessing/__init__.py
diff --git a/deepparse/preprocessing/address_cleaner.py b/deepparse/preprocessing/address_cleaner.py
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -765,6 +765,7 @@ API Reference
   :caption: API
 
   parser
+  pre_processor
   dataset_container
   comparer
   cli

diff --git a/docs/source/pre_processor.rst b/docs/source/pre_processor.rst
@@ -0,0 +1,16 @@
+.. _pre_processor_label:
+
+.. role:: hidden
+    :class: hidden-section
+
+
+Pre-Processors
+=================
+Here are the available pre-processors in Deepparse. The first four are used as default settings when parsing
+addresses.
+
+.. autofunction:: deepparse.pre_processing.pre_processor.coma_cleaning
+.. autofunction:: deepparse.pre_processing.pre_processor.lower_cleaning
+.. autofunction:: deepparse.pre_processing.pre_processor.trailing_whitespace_cleaning
+.. autofunction:: deepparse.pre_processing.pre_processor.double_whitespaces_cleaning
+.. autofunction:: deepparse.pre_processing.pre_processor.hyphen_cleaning
diff --git a/tests/preprocessing/__init__.py → tests/pre_processing/__init__.py b/tests/preprocessing/__init__.py → tests/pre_processing/__init__.py
diff --git a/tests/pre_processing/test_address_cleaner.py b/tests/pre_processing/test_address_cleaner.py
@@ -0,0 +1,98 @@
+from unittest import TestCase
+
+from deepparse.pre_processing import (
+    coma_cleaning,
+    lower_cleaning,
+    trailing_whitespace_cleaning,
+    hyphen_cleaning,
+    double_whitespaces_cleaning,
+)
+
+
+class PreProcessorTest(TestCase):
+    # Unit tests of the individual pre-processor functions. Each test applies a single pre-processor to a
+    # "dirty" fixture and compares the result with the matching clean fixture.
+
+    @classmethod
+    def setUpClass(cls):
+        # Reference address and the variants handled by the comma, lowercase and whitespace pre-processors.
+        cls.a_clean_address = "350 rue des lilas ouest québec québec g1l 1b6"
+        cls.a_dirty_address_with_commas = "350 rue des lilas , ouest ,québec québec, g1l 1b6"
+        cls.a_commas_separated_address = "350, rue des lilas, ouest, québec, québec, g1l 1b6"
+        cls.a_dirty_address_with_uppercase = "350 rue des Lilas Ouest Québec Québec G1L 1B6"
+        cls.a_dirty_address_with_trailing_whitespaces = "350 rue des lilas ouest québec québec g1l 1b6 "
+        cls.a_dirty_address_with_whitespaces = "350     rue des lilas ouest québec québec g1l 1b6"
+
+        # Hyphen-split unit/street number fixtures, each followed by its expected cleaned form.
+        cls.an_address_with_hyphen_split_address_components = "3-350 rue des lilas ouest"
+        cls.a_unit_clean_address = "3 350 rue des lilas ouest"
+
+        cls.an_address_with_hyphen_split_address_components_with_hyphen_city = "3-350 rue des lilas ouest saint-jean"
+        cls.a_unit_hyphen_city_name_clean_address = "3 350 rue des lilas ouest saint-jean"
+
+        cls.a_unit_with_letter_hyphen_split = "3a-350 rue des lilas ouest saint-jean"
+        cls.a_unit_with_letter_hyphen_split_clean_address = "3a 350 rue des lilas ouest saint-jean"
+
+        cls.a_unit_with_letter_only_hyphen_split = "a-350 rue des lilas ouest saint-jean"
+        cls.a_unit_with_letter_only_hyphen_split_clean_address = "a 350 rue des lilas ouest saint-jean"
+
+        cls.a_street_number_with_letter_hyphen_split = "3-350a rue des lilas ouest saint-jean"
+        cls.a_street_number_with_letter_hyphen_split_clean_address = "3 350a rue des lilas ouest saint-jean"
+
+        cls.letters_hyphen_address = "3a-350b rue des lilas ouest saint-jean"
+        cls.letters_hyphen_address_split_clean_address = "3a 350b rue des lilas ouest saint-jean"
+
+    def test_givenADirtyAddressWithCommas_whenComaCleaning_thenShouldRemoveCommas(
+        self,
+    ):
+        expected_address = self.a_clean_address
+
+        actual_address = coma_cleaning(self.a_commas_separated_address)
+
+        self.assertEqual(expected_address, actual_address)
+
+    def test_givenADirtyAddressWithUppercase_whenLowerCleaning_thenShouldLower(self):
+        expected_address = self.a_clean_address
+
+        actual_address = lower_cleaning(self.a_dirty_address_with_uppercase)
+
+        self.assertEqual(expected_address, actual_address)
+
+    def test_givenADirtyAddressWithWhitespaces_whenTrailingWhitespaceCleaning_thenShouldRemoveWhitespaces(
+        self,
+    ):
+        expected_address = self.a_clean_address
+
+        actual_address = trailing_whitespace_cleaning(self.a_dirty_address_with_trailing_whitespaces)
+
+        self.assertEqual(expected_address, actual_address)
+
+    def test_givenADirtyAddressWithWhitespacesInAddress_whenDoubleWhitespacesCleaning_thenShouldRemoveWhitespaces(
+        self,
+    ):
+        expected_address = self.a_clean_address
+
+        actual_address = double_whitespaces_cleaning(self.a_dirty_address_with_whitespaces)
+
+        self.assertEqual(expected_address, actual_address)
+
+    def test_givenAHyphenUnitStreetNumberAddress_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
+        expected_address = self.a_unit_clean_address
+
+        actual_address = hyphen_cleaning(self.an_address_with_hyphen_split_address_components)
+
+        self.assertEqual(expected_address, actual_address)
+
+    def test_givenAHyphenUnitAndCityAddress_whenCleaningAddress_thenShouldReplaceUnitStreetNumberHyphenWithWhiteSpace(
+        self,
+    ):
+        # The hyphen of the city name (saint-jean) must be left untouched.
+        expected_address = self.a_unit_hyphen_city_name_clean_address
+
+        actual_address = hyphen_cleaning(self.an_address_with_hyphen_split_address_components_with_hyphen_city)
+
+        self.assertEqual(expected_address, actual_address)
+
+    def test_givenAnAlphabeticalUnitStreetNumberHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
+        expected_address = self.a_unit_with_letter_hyphen_split_clean_address
+
+        actual_address = hyphen_cleaning(self.a_unit_with_letter_hyphen_split)
+
+        self.assertEqual(expected_address, actual_address)
+
+    def test_givenAnAlphabeticalOnlyUnitHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
+        expected_address = self.a_unit_with_letter_only_hyphen_split_clean_address
+
+        actual_address = hyphen_cleaning(self.a_unit_with_letter_only_hyphen_split)
+
+        self.assertEqual(expected_address, actual_address)
+
+    def test_givenAnAlphabeticalStreetNumberUnitHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
+        expected_address = self.a_street_number_with_letter_hyphen_split_clean_address
+
+        actual_address = hyphen_cleaning(self.a_street_number_with_letter_hyphen_split)
+
+        self.assertEqual(expected_address, actual_address)
+
+    def test_givenAnAlphabeticalComponentsStreetNumberUnit_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(
+        self,
+    ):
+        expected_address = self.letters_hyphen_address_split_clean_address
+
+        actual_address = hyphen_cleaning(self.letters_hyphen_address)
+
+        self.assertEqual(expected_address, actual_address)