-
-
Notifications
You must be signed in to change notification settings - Fork 31
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
248 additions
and
174 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .pre_processor import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import re | ||
|
||
|
||
def double_whitespaces_cleaning(address: str) -> str:
    """
    Pre-processor to replace every run of two or more whitespaces with a single whitespace.

    The regular expression used to clean multiple whitespaces is the following ``" {2,}"``. Only the space
    character is matched; other blank characters, such as tabs, are left untouched.

    Args:
        address: The address to apply double whitespace cleaning on.

    Return:
        The double whitespace cleaned address.
    """
    return re.sub(pattern=r" {2,}", repl=r" ", string=address)
|
||
|
||
def trailing_whitespace_cleaning(address: str) -> str:
    """
    Pre-processor to remove trailing whitespace.

    The whitespaces are stripped from both ends of the address (leading and trailing). Only the space
    character is removed; other blank characters, such as tabs or newlines, are kept.

    Args:
        address: The address to apply trailing whitespace cleaning on.

    Return:
        The trailing whitespace cleaned address.
    """
    return address.strip(" ")
|
||
|
||
def coma_cleaning(address: str) -> str:
    """
    Pre-processor to remove commas. It is based on
    `issue 56 <https://github.com/GRAAL-Research/deepparse/issues/56>`_.

    Every comma is deleted (it is not replaced by a whitespace), so the surrounding whitespaces are kept as is.

    Args:
        address: The address to apply comma cleaning on.

    Return:
        The comma-cleaned address.
    """
    return address.replace(",", "")
|
||
|
||
def lower_cleaning(address: str) -> str:
    """
    Pre-processor to lowercase an address since the original training data was in lowercase.

    Args:
        address: The address to apply lowercase cleaning on.

    Return:
        The lowercase address.
    """
    return address.lower()
|
||
|
||
# The first group is the unit, and the second is the street number.
# Both groups accept a letter since units and street numbers can include letters in some countries.
# For example, unit 3a or address 305a.
hyphen_splitted_unit_and_street_number_regex = r"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "


def hyphen_cleaning(address: str) -> str:
    """
    Pre-processor to clean hyphen between the street number and unit in an address. Since some addresses use the
    hyphen to split the unit and street address, we replace the hyphen with whitespaces to allow a
    proper splitting of the address. For example, the proper parsing of the address 3-305 street name is
    Unit: 3, StreetNumber: 305, StreetName: street name.

    See `issue 137 <https://github.com/GRAAL-Research/deepparse/issues/137>`_ for more details.

    The regular expression used to clean hyphen is the following ``"^([0-9]*[a-z]?)-([0-9]*[a-z]?) "``.
    The first group is the unit, and the second is the street number. Both include letters since they can include
    letters in some countries. For example, unit 3a or address 305a.

    Note: the hyphen is also used in some cities' names, such as Saint-Jean; thus, we use regex to detect
    the proper hyphen to replace. The pattern is anchored at the beginning of the address, only matches lowercase
    letters, and requires a whitespace after the street number; any other hyphen is left untouched.

    Args:
        address: The address to apply hyphen cleaning on.

    Return:
        The address with the unit and street number hyphen replaced by a whitespace.
    """
    return re.sub(pattern=hyphen_splitted_unit_and_street_number_regex, repl=r"\1 \2 ", string=address)
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -765,6 +765,7 @@ API Reference | |
:caption: API | ||
|
||
parser | ||
pre_processor | ||
dataset_container | ||
comparer | ||
cli | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
.. _pre_processor_label: | ||
|
||
.. role:: hidden | ||
:class: hidden-section | ||
|
||
|
||
Pre-Processors | ||
================= | ||
Here are the available pre-processors in Deepparse. The first four are used as default settings when parsing | ||
addresses. | ||
|
||
.. autofunction:: deepparse.pre_processing.pre_processor.coma_cleaning | ||
.. autofunction:: deepparse.pre_processing.pre_processor.lower_cleaning | ||
.. autofunction:: deepparse.pre_processing.pre_processor.trailing_whitespace_cleaning | ||
.. autofunction:: deepparse.pre_processing.pre_processor.double_whitespaces_cleaning | ||
.. autofunction:: deepparse.pre_processing.pre_processor.hyphen_cleaning |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
from unittest import TestCase | ||
|
||
from deepparse.pre_processing import ( | ||
coma_cleaning, | ||
lower_cleaning, | ||
trailing_whitespace_cleaning, | ||
hyphen_cleaning, | ||
double_whitespaces_cleaning, | ||
) | ||
|
||
|
||
class PreProcessorTest(TestCase):
    """Unit tests of the address pre-processors (comma, lowercase, whitespace and hyphen cleaning)."""

    @classmethod
    def setUpClass(cls):
        # Reference address: the expected output of the comma, lowercase and whitespace cleaning tests.
        cls.a_clean_address = "350 rue des lilas ouest québec québec g1l 1b6"
        cls.a_dirty_address_with_commas = "350 rue des lilas , ouest ,québec québec, g1l 1b6"
        cls.a_commas_separated_address = "350, rue des lilas, ouest, québec, québec, g1l 1b6"
        cls.a_dirty_address_with_uppercase = "350 rue des Lilas Ouest Québec Québec G1L 1B6"
        cls.a_dirty_address_with_trailing_whitespaces = "350 rue des lilas ouest québec québec g1l 1b6 "
        # The fixture must hold runs of two or more whitespaces; otherwise, it is identical to the clean address
        # and the double whitespaces cleaning test cannot fail.
        cls.a_dirty_address_with_whitespaces = "350  rue des    lilas  ouest québec  québec g1l 1b6"

        cls.an_address_with_hyphen_split_address_components = "3-350 rue des lilas ouest"
        cls.a_unit_clean_address = "3 350 rue des lilas ouest"

        cls.an_address_with_hyphen_split_address_components_with_hyphen_city = "3-350 rue des lilas ouest saint-jean"
        cls.a_unit_hyphen_city_name_clean_address = "3 350 rue des lilas ouest saint-jean"

        cls.a_unit_with_letter_hyphen_split = "3a-350 rue des lilas ouest saint-jean"
        cls.a_unit_with_letter_hyphen_split_clean_address = "3a 350 rue des lilas ouest saint-jean"

        cls.a_unit_with_letter_only_hyphen_split = "a-350 rue des lilas ouest saint-jean"
        cls.a_unit_with_letter_only_hyphen_split_clean_address = "a 350 rue des lilas ouest saint-jean"

        cls.a_street_number_with_letter_hyphen_split = "3-350a rue des lilas ouest saint-jean"
        cls.a_street_number_with_letter_hyphen_split_clean_address = "3 350a rue des lilas ouest saint-jean"

        cls.letters_hyphen_address = "3a-350b rue des lilas ouest saint-jean"
        cls.letters_hyphen_address_split_clean_address = "3a 350b rue des lilas ouest saint-jean"

    def test_givenADirtyAddressWithCommas_whenComaCleaning_thenShouldRemoveCommas(
        self,
    ):
        cleaned_address = coma_cleaning(self.a_commas_separated_address)

        self.assertEqual(self.a_clean_address, cleaned_address)

    def test_givenADirtyAddressWithUppercase_whenLowerCleaning_thenShouldLower(self):
        cleaned_address = lower_cleaning(self.a_dirty_address_with_uppercase)

        self.assertEqual(self.a_clean_address, cleaned_address)

    def test_givenADirtyAddressWithWhitespaces_whenTrailingWhitespaceCleaning_thenShouldRemoveWhitespaces(
        self,
    ):
        cleaned_address = trailing_whitespace_cleaning(self.a_dirty_address_with_trailing_whitespaces)

        self.assertEqual(self.a_clean_address, cleaned_address)

    def test_givenADirtyAddressWithWhitespacesInAddress_whenDoubleWhitespacesCleaning_thenShouldRemoveWhitespaces(
        self,
    ):
        cleaned_address = double_whitespaces_cleaning(self.a_dirty_address_with_whitespaces)

        self.assertEqual(self.a_clean_address, cleaned_address)

    def test_givenAHyphenUnitStreetNumberAddress_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        cleaned_address = hyphen_cleaning(self.an_address_with_hyphen_split_address_components)

        self.assertEqual(self.a_unit_clean_address, cleaned_address)

    def test_givenAHyphenUnitAndCityAddress_whenCleaningAddress_thenShouldReplaceUnitStreetNumberHyphenWithWhiteSpace(
        self,
    ):
        # The hyphen of the city name (saint-jean) must be preserved.
        cleaned_address = hyphen_cleaning(self.an_address_with_hyphen_split_address_components_with_hyphen_city)

        self.assertEqual(self.a_unit_hyphen_city_name_clean_address, cleaned_address)

    def test_givenAnAlphabeticalUnitStreetNumberHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        cleaned_address = hyphen_cleaning(self.a_unit_with_letter_hyphen_split)

        self.assertEqual(self.a_unit_with_letter_hyphen_split_clean_address, cleaned_address)

    def test_givenAnAlphabeticalOnlyUnitHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        cleaned_address = hyphen_cleaning(self.a_unit_with_letter_only_hyphen_split)

        self.assertEqual(self.a_unit_with_letter_only_hyphen_split_clean_address, cleaned_address)

    def test_givenAnAlphabeticalStreetNumberUnitHyphen_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(self):
        cleaned_address = hyphen_cleaning(self.a_street_number_with_letter_hyphen_split)

        self.assertEqual(self.a_street_number_with_letter_hyphen_split_clean_address, cleaned_address)

    def test_givenAnAlphabeticalComponentsStreetNumberUnit_whenCleaningAddress_thenShouldReplaceHyphenWithWhiteSpace(
        self,
    ):
        cleaned_address = hyphen_cleaning(self.letters_hyphen_address)

        self.assertEqual(self.letters_hyphen_address_split_clean_address, cleaned_address)
Oops, something went wrong.