Skip to content

Commit

Permalink
reworked DataCleaning into a more flexible approach
Browse files Browse the repository at this point in the history
  • Loading branch information
davebulaval committed Mar 21, 2023
1 parent b9751d9 commit d20dd3c
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 55 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,4 @@
## dev

- Add Python 3.11
- Add pre-processor when parsing addresses.
52 changes: 45 additions & 7 deletions deepparse/parser/address_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import warnings
from functools import partial
from pathlib import Path
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Tuple, Union, Callable

import torch
from poutyne.framework import Experiment
Expand Down Expand Up @@ -39,7 +39,7 @@
from ..errors import FastTextModelError
from ..metrics import nll_loss, accuracy
from ..network import ModelFactory
from ..preprocessing import AddressCleaner
from ..preprocessing import coma_cleaning, lower_cleaning, hyphen_cleaning
from ..tools import CACHE_PATH, valid_poutyne_version
from ..vectorizer import VectorizerFactory

Expand Down Expand Up @@ -276,6 +276,7 @@ def __call__(
batch_size: int = 32,
num_workers: int = 0,
with_hyphen_split: bool = False,
pre_processors: Union[None, List[Callable]] = None,
) -> Union[FormattedParsedAddress, List[FormattedParsedAddress]]:
# pylint: disable=too-many-arguments
"""
Expand Down Expand Up @@ -304,16 +305,22 @@ def __call__(
the hyphen split between the unit and the street number (e.g. Canada). For example, ``'3-305'`` will be
replaced as ``'3 305'`` for the parsing. Where ``'3'`` is the unit, and ``'305'`` is the street number.
We use a regular expression to replace alphanumerical characters separated by a hyphen at
the start of the string. We do so since some cities use hyphens in their names. Default is ``False``.
the start of the string. We do so since some cities use hyphens in their names. The default
is ``False``. If True, it adds the :func:`~deepparse.preprocessing.pre_processor.hyphen_cleaning`
pre-processor **at the end** of the pre-processor list to apply.
pre_processors (Union[None, List[Callable]]): A list of functions (callables) to apply pre-processing on
all the addresses to parse before parsing. See :ref:`pre_processor_label` for examples of
pre-processors. Since models were trained on lowercase data, during the parsing, we always apply a
lowercase pre-processor. If you pass a list of pre-processors, a lowercase pre-processor is
added **at the end** of the pre-processor list to apply. By default, None,
meaning we use the default setup, which is the comma removal pre-processor and lowercase.
Return:
Either a :class:`~FormattedParsedAddress` or a list of
:class:`~FormattedParsedAddress` when given more than one address.
Note:
During the parsing, the addresses are lowercase, and commas are removed. One can also use the
``with_hyphen_split`` bool argument for replacing hyphens (used to separate units from street numbers,
e.g. ``'3-305 a street name'``) by whitespace for proper cleaning.
Since the model was trained on lowercase data, during the parsing, we always apply a lowercase pre-processor.
Examples:
Expand Down Expand Up @@ -351,6 +358,16 @@ def __call__(
addresses_to_parse = CSVDatasetContainer("./a_path.csv", column_names=["address_column_name"],
is_training_container=False)
address_parser(addresses_to_parse)
Using a user-defined pre-processor
.. code-block:: python
def strip_parenthesis(address):
return address.strip("(").strip(")")
address_parser(addresses_to_parse, pre_processors=[strip_parenthesis])
# It will also use the default lower case pre-processor.
"""
self._model_os_validation(num_workers=num_workers)

Expand All @@ -363,7 +380,18 @@ def __call__(
if isinstance(addresses_to_parse, DatasetContainer):
addresses_to_parse = addresses_to_parse.data

clean_addresses = AddressCleaner(with_hyphen_split=with_hyphen_split).clean(addresses_to_parse)
if pre_processors is None:
# Default pre_processing setup.
pre_processors = [coma_cleaning, lower_cleaning]
else:
# We add, at the end, a lower casing cleaning pre-processor.
pre_processors.append(lower_cleaning)

if with_hyphen_split:
pre_processors.append(hyphen_cleaning)

self.pre_processors = pre_processors
clean_addresses = self._apply_pre_processors(addresses_to_parse)

if self.verbose and len(addresses_to_parse) > PREDICTION_TIME_PERFORMANCE_THRESHOLD:
print("Vectorizing the address")
Expand Down Expand Up @@ -411,6 +439,7 @@ def retrain(
seq2seq_params: Union[Dict, None] = None,
layers_to_freeze: Union[str, None] = None,
name_of_the_retrain_parser: Union[None, str] = None,
pre_processors: Union[None, List[Callable]] = None,
) -> List[Dict]:
# pylint: disable=too-many-arguments, too-many-locals, too-many-branches, too-many-statements

Expand Down Expand Up @@ -1189,3 +1218,12 @@ def _model_os_validation(self, num_workers):
"FastText objects are not pickleable with the parallelism process used by default by MacOS. "
"Thus, you need to set torch.multiprocessing.set_start_method('fork') to allow torch parallelism."
)

def _apply_pre_processors(self, addresses: List[str]) -> List[str]:
    """
    Apply every pre-processor in ``self.pre_processors``, in order, to each address.

    The pre-processors are chained: each one receives the output of the previous one, so the
    list behaves as a pipeline. Once all of them have run, consecutive whitespaces are collapsed
    into a single one and leading/trailing whitespaces are removed.

    Args:
        addresses (List[str]): The addresses to pre-process.

    Return:
        A list with one cleaned address per input address, in the same order.
    """
    res = []

    for address in addresses:
        # Start from the raw address and feed each pre-processor the previous result. Calling
        # every pre-processor on the raw address would keep only the last one's effect, and an
        # empty pre-processor list would leave the variable unbound.
        processed_address = address
        for pre_processor in self.pre_processors:
            processed_address = pre_processor(processed_address)
        res.append(" ".join(processed_address.split()))
    return res
2 changes: 1 addition & 1 deletion deepparse/preprocessing/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .address_cleaner import *
from .pre_processor import *
47 changes: 0 additions & 47 deletions deepparse/preprocessing/address_cleaner.py

This file was deleted.

58 changes: 58 additions & 0 deletions deepparse/preprocessing/pre_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import re


def coma_cleaning(address: str) -> str:
    """
    Pre-processor to remove every comma of an address. It is based on
    `issue 56 <https://github.com/GRAAL-Research/deepparse/issues/56>`_.

    Args:
        address: The address to remove the commas from.

    Return:
        The address without any comma.
    """
    # Splitting on the comma and gluing the pieces back together drops every comma.
    comma_free_parts = address.split(",")
    return "".join(comma_free_parts)


def lower_cleaning(address: str) -> str:
    """
    Pre-processor to convert an address to lowercase, since the original training data was in lowercase.

    Args:
        address: The address to lowercase.

    Return:
        The lowercase address.
    """
    lowercase_address = str.lower(address)
    return lowercase_address


# Two capture groups separated by a hyphen and anchored at the start of the address: the unit
# comes first, the street number second. Each group accepts an optional trailing letter since
# some countries use them (e.g. unit 3a or street number 305a).
hyphen_splitted_unit_and_street_number_regex = r"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "


def hyphen_cleaning(address: str) -> str:
    """
    Pre-processor to replace the hyphen between the unit and the street number of an address by a whitespace.
    Some addresses use a hyphen to split the unit and the street number, so replacing it allows a proper
    splitting of the address. For example, the proper parsing of the address 3-305 street name is
    Unit: 3, StreetNumber: 305, StreetName: street name.

    See `issue 137 <https://github.com/GRAAL-Research/deepparse/issues/137>`_ for more details.

    The regular expression used to clean the hyphen is the following ``"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "``.
    The first group is the unit, and the second is the street number. Both can include a letter since
    some countries use them. For example, unit 3a or address 305a.

    Note: the hyphen is also used in some cities' names, such as Saint-Jean; thus, the regular expression
    is anchored at the start of the address to only replace the unit/street number hyphen.

    Args:
        address: The address to apply the hyphen cleaning on.

    Return:
        The address with the unit/street number hyphen replaced by a whitespace.
    """

    def _split_unit_and_street_number(match):
        # Rebuild the matched prefix with a whitespace instead of the hyphen.
        unit, street_number = match.group(1), match.group(2)
        return f"{unit} {street_number} "

    pattern = re.compile(hyphen_splitted_unit_and_street_number_regex)
    return pattern.sub(_split_unit_and_street_number, address)
1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,7 @@ API Reference
:caption: API

parser
pre_processor
dataset_container
comparer
cli
Expand Down

0 comments on commit d20dd3c

Please sign in to comment.