Skip to content

Commit

Permalink
merge solved
Browse files Browse the repository at this point in the history
  • Loading branch information
davebulaval committed Mar 21, 2023
2 parents 46b4330 + e4baa2e commit d0782aa
Show file tree
Hide file tree
Showing 11 changed files with 248 additions and 174 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@
## dev (0.9.6)

- Add Python 3.11.
- Add pre-processor when parsing addresses.
- Drop Python 3.7 support since newer Python versions are faster
and [Torch 2.0 does not support Python 3.7](https://dev-discuss.pytorch.org/t/dropping-support-for-cuda-11-6-and-python-3-7-from-pytorch-2-0-release/1021).
- Add `torch.compile` integration to improve performance (Torch 1.x still supported) with `mode="reduce-overhead"` as
Expand Down
53 changes: 46 additions & 7 deletions deepparse/parser/address_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import warnings
from functools import partial
from pathlib import Path
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Tuple, Union, Callable

import torch
from poutyne.framework import Experiment
Expand Down Expand Up @@ -39,7 +39,8 @@
from ..errors import FastTextModelError
from ..metrics import nll_loss, accuracy
from ..network import ModelFactory
from ..preprocessing import AddressCleaner
from ..pre_processing import trailing_whitespace_cleaning, double_whitespaces_cleaning
from ..pre_processing import coma_cleaning, lower_cleaning, hyphen_cleaning
from ..tools import CACHE_PATH, valid_poutyne_version
from ..vectorizer import VectorizerFactory

Expand Down Expand Up @@ -276,6 +277,7 @@ def __call__(
batch_size: int = 32,
num_workers: int = 0,
with_hyphen_split: bool = False,
pre_processors: Union[None, List[Callable]] = None,
) -> Union[FormattedParsedAddress, List[FormattedParsedAddress]]:
# pylint: disable=too-many-arguments
"""
Expand Down Expand Up @@ -304,16 +306,23 @@ def __call__(
the hyphen split between the unit and the street number (e.g. Canada). For example, ``'3-305'`` will be
replaced as ``'3 305'`` for the parsing. Where ``'3'`` is the unit, and ``'305'`` is the street number.
We use a regular expression to replace alphanumerical characters separated by a hyphen at
the start of the string. We do so since some cities use hyphens in their names. Default is ``False``.
the start of the string. We do so since some cities use hyphens in their names. The default
is ``False``. If True, it adds the :func:`~deepparse.pre_processing.pre_processor.hyphen_cleaning`
pre-processor **at the end** of the pre-processor list to apply.
pre_processors (Union[None, List[Callable]]): A list of functions (callable) to apply pre-processing on
all the addresses to parse before parsing. See :ref:`pre_processor_label` for examples of
pre-processors. Since models were trained on lowercase data, during the parsing, we always apply a
lowercase pre-processor. If you pass a list of pre-processors, a lowercase pre-processor is
added **at the end** of the pre-processor list to apply. By default, None,
meaning we use the default setup, which is (in order) the comma removal pre-processor, lowercase,
trailing whitespace removal and double whitespace cleaning.
Return:
Either a :class:`~FormattedParsedAddress` or a list of
:class:`~FormattedParsedAddress` when given more than one address.
Note:
During the parsing, the addresses are lowercase, and commas are removed. One can also use the
``with_hyphen_split`` bool argument for replacing hyphens (used to separate units from street numbers,
e.g. ``'3-305 a street name'``) by whitespace for proper cleaning.
Since the models were trained on lowercase data, we always apply a lowercase pre-processor during the parsing.
Examples:
Expand Down Expand Up @@ -351,6 +360,16 @@ def __call__(
addresses_to_parse = CSVDatasetContainer("./a_path.csv", column_names=["address_column_name"],
is_training_container=False)
address_parser(addresses_to_parse)
Using a user-defined pre-processor
.. code-block:: python
def strip_parenthesis(address):
return address.strip("(").strip(")")
address_parser(addresses_to_parse, pre_processors=[strip_parenthesis])
# It will also use the default lower case pre-processor.
"""
self._model_os_validation(num_workers=num_workers)

Expand All @@ -363,7 +382,18 @@ def __call__(
if isinstance(addresses_to_parse, DatasetContainer):
addresses_to_parse = addresses_to_parse.data

clean_addresses = AddressCleaner(with_hyphen_split=with_hyphen_split).clean(addresses_to_parse)
if pre_processors is None:
# Default pre_processing setup.
pre_processors = [coma_cleaning, lower_cleaning, trailing_whitespace_cleaning, double_whitespaces_cleaning]
else:
# We add, at the end, a lower casing cleaning pre-processor.
pre_processors.append(lower_cleaning)

if with_hyphen_split:
pre_processors.append(hyphen_cleaning)

self.pre_processors = pre_processors
clean_addresses = self._apply_pre_processors(addresses_to_parse)

if self.verbose and len(addresses_to_parse) > PREDICTION_TIME_PERFORMANCE_THRESHOLD:
print("Vectorizing the address")
Expand Down Expand Up @@ -1197,3 +1227,12 @@ def _model_os_validation(self, num_workers):
"FastText objects are not pickleable with the parallelism process used by default by MacOS. "
"Thus, you need to set torch.multiprocessing.set_start_method('fork') to allow torch parallelism."
)

def _apply_pre_processors(self, addresses: List[str]) -> List[str]:
    """
    Apply all the pre-processors in ``self.pre_processors``, in order, on each address.

    The pre-processors are chained: each one receives the output of the previous one, so every
    pre-processor of the list contributes to the cleaned address. Once all the pre-processors have been applied,
    the address is split on whitespace and joined again with a single whitespace.

    Args:
        addresses (List[str]): The addresses to pre-process.

    Return:
        The list of pre-processed addresses, in the same order as ``addresses``.
    """
    res = []

    for address in addresses:
        # Start from the raw address so that the pre-processors are composed one after the other,
        # and so that an empty pre-processor list only applies the whitespace normalization below.
        processed_address = address

        for pre_processor in self.pre_processors:
            processed_address = pre_processor(processed_address)

        # Collapse any run of whitespace and drop leading/trailing whitespace.
        res.append(" ".join(processed_address.split()))
    return res
1 change: 1 addition & 0 deletions deepparse/pre_processing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .pre_processor import *
85 changes: 85 additions & 0 deletions deepparse/pre_processing/pre_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import re


def double_whitespaces_cleaning(address: str) -> str:
    """
    Pre-processor that collapses every run of two or more consecutive spaces into a single space.

    The regular expression used to detect the runs of spaces is the following ``" {2,}"``.

    Args:
        address: The address to apply double whitespace cleaning on.

    Return:
        The address in which each run of consecutive spaces is replaced by one space.
    """
    multiple_spaces = re.compile(r" {2,}")
    return multiple_spaces.sub(r" ", address)


def trailing_whitespace_cleaning(address: str) -> str:
    """
    Pre-processor that removes the space characters at both ends of an address.

    Only the space character (``" "``) is stripped; other whitespace characters such as tabs or
    newlines are left untouched.

    Args:
        address: The address to apply trailing whitespace cleaning on.

    Return:
        The address without leading or trailing spaces.
    """
    without_leading_spaces = address.lstrip(" ")
    return without_leading_spaces.rstrip(" ")


def coma_cleaning(address: str) -> str:
    """
    Pre-processor that removes every comma from an address. It is based on
    `issue 56 <https://github.com/GRAAL-Research/deepparse/issues/56>`_.

    Args:
        address: The address to apply comma cleaning on.

    Return:
        The address without any comma.
    """
    comma_free_parts = address.split(",")
    return "".join(comma_free_parts)


def lower_cleaning(address: str) -> str:
    """
    Pre-processor that lowercases an address, since the original training data was in lowercase.

    Args:
        address: The address to lowercase.

    Return:
        The lowercase address.
    """
    lowercase_address = address.lower()
    return lowercase_address


# Pattern anchored at the start of the address: group one captures the unit and group two the
# street number. Each group accepts an optional trailing letter because, in some countries,
# units and street numbers can include one (for example, unit 3a or address 305a).
hyphen_splitted_unit_and_street_number_regex = r"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "


def hyphen_cleaning(address: str) -> str:
    """
    Pre-processor that replaces the hyphen between the unit and the street number with a whitespace.

    Some addresses use a hyphen to separate the unit from the street number, so we replace that hyphen
    with a whitespace to allow a proper splitting of the address. For example, the proper parsing of the
    address 3-305 street name is Unit: 3, StreetNumber: 305, StreetName: street name.

    See `issue 137 <https://github.com/GRAAL-Research/deepparse/issues/137>`_ for more details.

    The regular expression used to clean the hyphen is the following ``"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "``.
    The first group is the unit, and the second is the street number. Both include letters since they can include
    letters in some countries. For example, unit 3a or address 305a.

    Note: the hyphen is also used in some cities' names, such as Saint-Jean; thus, the regular expression only
    matches at the start of the address, and any other hyphen is left as is.

    Args:
        address: The address to apply hyphen cleaning on.

    Return:
        The address in which the unit and the street number are separated by a whitespace.
    """
    unit_and_street_number = re.compile(hyphen_splitted_unit_and_street_number_regex)
    return unit_and_street_number.sub(r"\1 \2 ", address)
1 change: 0 additions & 1 deletion deepparse/preprocessing/__init__.py

This file was deleted.

47 changes: 0 additions & 47 deletions deepparse/preprocessing/address_cleaner.py

This file was deleted.

1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,7 @@ API Reference
:caption: API

parser
pre_processor
dataset_container
comparer
cli
Expand Down
16 changes: 16 additions & 0 deletions docs/source/pre_processor.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
.. _pre_processor_label:

.. role:: hidden
:class: hidden-section


Pre-Processors
=================
Here are the available pre-processors in Deepparse. The first four are used as default settings when parsing
addresses.

.. autofunction:: deepparse.pre_processing.pre_processor.coma_cleaning
.. autofunction:: deepparse.pre_processing.pre_processor.lower_cleaning
.. autofunction:: deepparse.pre_processing.pre_processor.trailing_whitespace_cleaning
.. autofunction:: deepparse.pre_processing.pre_processor.double_whitespaces_cleaning
.. autofunction:: deepparse.pre_processing.pre_processor.hyphen_cleaning
File renamed without changes.
98 changes: 98 additions & 0 deletions tests/pre_processing/test_address_cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from unittest import TestCase

from deepparse.pre_processing import (
coma_cleaning,
lower_cleaning,
trailing_whitespace_cleaning,
hyphen_cleaning,
double_whitespaces_cleaning,
)


class PreProcessorTest(TestCase):
    # Unit tests of the individual address pre-processor functions.

    @classmethod
    def setUpClass(cls):
        # Reference address and its "dirty" variants used by the non-hyphen pre-processor tests.
        cls.a_clean_address = "350 rue des lilas ouest québec québec g1l 1b6"
        cls.a_dirty_address_with_commas = "350 rue des lilas , ouest ,québec québec, g1l 1b6"
        cls.a_commas_separated_address = "350, rue des lilas, ouest, québec, québec, g1l 1b6"
        cls.a_dirty_address_with_uppercase = "350 rue des Lilas Ouest Québec Québec G1L 1B6"
        cls.a_dirty_address_with_trailing_whitespaces = "350 rue des lilas ouest québec québec g1l 1b6 "
        cls.a_dirty_address_with_whitespaces = "350 rue des lilas ouest québec québec g1l 1b6"

        # Hyphen cleaning fixtures: each input is followed by its expected cleaned form.
        cls.an_address_with_hyphen_split_address_components = "3-350 rue des lilas ouest"
        cls.a_unit_clean_address = "3 350 rue des lilas ouest"

        cls.an_address_with_hyphen_split_address_components_with_hyphen_city = "3-350 rue des lilas ouest saint-jean"
        cls.a_unit_hyphen_city_name_clean_address = "3 350 rue des lilas ouest saint-jean"

        cls.a_unit_with_letter_hyphen_split = "3a-350 rue des lilas ouest saint-jean"
        cls.a_unit_with_letter_hyphen_split_clean_address = "3a 350 rue des lilas ouest saint-jean"

        cls.a_unit_with_letter_only_hyphen_split = "a-350 rue des lilas ouest saint-jean"
        cls.a_unit_with_letter_only_hyphen_split_clean_address = "a 350 rue des lilas ouest saint-jean"

        cls.a_street_number_with_letter_hyphen_split = "3-350a rue des lilas ouest saint-jean"
        cls.a_street_number_with_letter_hyphen_split_clean_address = "3 350a rue des lilas ouest saint-jean"

        cls.letters_hyphen_address = "3a-350b rue des lilas ouest saint-jean"
        cls.letters_hyphen_address_split_clean_address = "3a 350b rue des lilas ouest saint-jean"

    def test_givenADirtyAddressWithCommas_whenComaCleaning_thenShouldRemoveCommas(
        self,
    ):
        expected = self.a_clean_address

        actual = coma_cleaning(self.a_commas_separated_address)

        self.assertEqual(expected, actual)

    def test_givenADirtyAddressWithUppercase_whenLowerCleaning_thenShouldLower(self):
        expected = self.a_clean_address

        actual = lower_cleaning(self.a_dirty_address_with_uppercase)

        self.assertEqual(expected, actual)

    def test_givenADirtyAddressWithWhitespaces_whenTrailingWhitespaceCleaning_thenShouldRemoveWhitespaces(
        self,
    ):
        expected = self.a_clean_address

        actual = trailing_whitespace_cleaning(self.a_dirty_address_with_trailing_whitespaces)

        self.assertEqual(expected, actual)

    def test_givenADirtyAddressWithWhitespacesInAddress_whenDoubleWhitespacesCleaning_thenShouldRemoveWhitespaces(
        self,
    ):
        expected = self.a_clean_address

        actual = double_whitespaces_cleaning(self.a_dirty_address_with_whitespaces)

        self.assertEqual(expected, actual)

    def test_givenAHyphenUnitStreetNumberAddress_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        expected = self.a_unit_clean_address

        actual = hyphen_cleaning(self.an_address_with_hyphen_split_address_components)

        self.assertEqual(expected, actual)

    def test_givenAHyphenUnitAndCityAddress_whenCleaningAddress_thenShouldReplaceUnitStreetNumberHyphenWithWhiteSpace(
        self,
    ):
        # The hyphen of the city name (saint-jean) must be left untouched.
        expected = self.a_unit_hyphen_city_name_clean_address

        actual = hyphen_cleaning(self.an_address_with_hyphen_split_address_components_with_hyphen_city)

        self.assertEqual(expected, actual)

    def test_givenAnAlphabeticalUnitStreetNumberHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        expected = self.a_unit_with_letter_hyphen_split_clean_address

        actual = hyphen_cleaning(self.a_unit_with_letter_hyphen_split)

        self.assertEqual(expected, actual)

    def test_givenAnAlphabeticalOnlyUnitHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        expected = self.a_unit_with_letter_only_hyphen_split_clean_address

        actual = hyphen_cleaning(self.a_unit_with_letter_only_hyphen_split)

        self.assertEqual(expected, actual)

    def test_givenAnAlphabeticalStreetNumberUnitHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        expected = self.a_street_number_with_letter_hyphen_split_clean_address

        actual = hyphen_cleaning(self.a_street_number_with_letter_hyphen_split)

        self.assertEqual(expected, actual)

    def test_givenAnAlphabeticalComponentsStreetNumberUnit_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(
        self,
    ):
        expected = self.letters_hyphen_address_split_clean_address

        actual = hyphen_cleaning(self.letters_hyphen_address)

        self.assertEqual(expected, actual)

0 comments on commit d0782aa

Please sign in to comment.