Skip to content

Commit

Permalink
improve documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
davebulaval committed Oct 17, 2023
1 parent adcbe41 commit 2f396dc
Show file tree
Hide file tree
Showing 9 changed files with 68 additions and 65 deletions.
20 changes: 10 additions & 10 deletions deepparse/comparer/addresses_comparer.py
Expand Up @@ -11,14 +11,14 @@
class AddressesComparer:
"""
Address comparer to compare addresses with each other and retrieves the differences between them. The addresses
are parsed using an address parser based on one of the seq2seq pretrained networks either with fastText or BPEmb.
are parsed using an address parser based on one of the seq2seq pretrained networks, either with fastText or BPEmb.
The address comparer can compare already parsed addresses. The address parser first recompose the raw
addresses then suggests its own tags, then it makes a comparison with the tags of the source parsing and the
The address comparer can compare already parsed addresses. The address parser first recomposes the raw
addresses, then suggests its own tags; then it makes a comparison with the tags of the source parsing and the
newly parsed address
The address comparer is also able to compare raw addresses by first parsing the addresses using the
address parser and then brings out the differences among the parsed addresses.
address parser and then bringing out the differences among the parsed addresses.
Args:
Expand All @@ -40,13 +40,13 @@ def compare_tags(
) -> Union[List[FormattedComparedAddressesTags], FormattedComparedAddressesTags]:
"""
Compare tags of a source parsing with the parsing from AddressParser. First, it reconstructs the
raw address from the parsing, then AddressParser generates tags and then compares the two parsings.
raw address from the parsing; then AddressParser generates tags, and the two parsings are compared.
Args:
addresses_tags_to_compare (Union[List[tuple], List[List[tuple]]]): list of tuple that contains
the tags for the address components from the source. Can compare multiples parsings if passed as a
the tags for the address components from the source. Can compare multiple parsings if passed as a
list of tuples.
with_prob (Union[None, bool]): A option flag to either or not include prob in the comparison report.
with_prob (Union[None, bool]): An optional flag to include or not the probabilities in the comparison report.
The probabilities are not compared but only included in the report.
The default value is None, which means not taking into account.
Expand Down Expand Up @@ -122,14 +122,14 @@ def compare_raw(
with_prob: Union[None, bool] = None,
) -> List[FormattedComparedAddressesRaw]:
"""
Compare a list of raw addresses together, it starts by parsing the addresses
Compare a list of raw addresses together. It starts by parsing the addresses
with the configured parser and then returns the differences between the address components
retrieved with our model.
Args:
raw_addresses_to_compare (Union[Tuple[str], List[Tuple[str]]]):
List of string that represent raw addresses to compare.
with_prob (Union[None, bool]): A option flag to either or not include prob in the comparison report.
List of strings that represent raw addresses to compare.
with_prob (Union[None, bool]): An optional flag to include or not the probabilities in the comparison report.
The probabilities are not compared but only included in the report.
The default value is None, which means not taking into account.
Expand Down
28 changes: 14 additions & 14 deletions deepparse/comparer/formatted_compared_addresses.py
Expand Up @@ -20,7 +20,7 @@ class FormattedComparedAddresses(ABC):
for the first one.
second_address(FormattedParsedAddress): A formatted parsed address that contains the parsing information
for the second one.
origin: (Tuple[str, str]): The origin of the parsing (ex : from source or from a deepparse pretrained model).
origin: (Tuple[str, str]): The origin of the parsing (e.g. from the source or from a Deepparse pretrained model).
Example:
Expand All @@ -40,7 +40,7 @@ class FormattedComparedAddresses(ABC):
@property
def list_of_bool(self) -> List:
"""
A list of boolean that contains all the address components names and indicates if it is the same for the
A list of booleans that contains all the address components' names and indicates if each one is the same for the
two addresses.
Return:
Expand Down Expand Up @@ -86,7 +86,7 @@ def comparison_report(self, nb_delimiters: Union[int, None] = None) -> None:

def _comparison_report(self, nb_delimiters: Union[int, None]) -> str:
"""
Builds a comparison_report with delimiters to make the beginning and the end of the comparison easier to spot.
Builds a comparison_report with delimiters to make the comparison's beginning and end easier to spot.
"""

# Get terminal size to adapt the output to the user
Expand All @@ -102,15 +102,15 @@ def _comparison_report(self, nb_delimiters: Union[int, None]) -> str:
@abstractmethod
def _comparison_report_builder(self) -> str:
"""
Builds the core of a comparison report for the different comparisons. Since the procedure to make a tags
comparison and the raw addresses comparison is different, the comparison report is not the same for the two.
Builds the core of a comparison report for the various comparisons. Since the procedures to make a tags
comparison and a raw addresses comparison are different, the comparison report is not the same for the two.
It is then implemented in each specific class.
"""

@abstractmethod
def _get_probs(self) -> Dict:
"""
Get the tags from the parsing with their associated probabilities, the method needs to be implemented in each
Get the tags from the parsing with their associated probabilities. The method needs to be implemented in each
class because they don't use the probabilities the same way.
"""

Expand Down Expand Up @@ -180,7 +180,7 @@ def _get_tags_diff_color(
Args:
name_one (str, optional) : Name associated with first color. The default value is the first address.
name_two (str, optional) : Name associated with second color. The default value is the second address.
name_two (str, optional) : Name associated with the second colour. The default value is the second address.
verbose (bool, optional): If True, it will print a presentation of the colours and what they mean.
The default value is True.
Expand Down Expand Up @@ -221,14 +221,14 @@ def _get_tags_diff_color(
def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tuple]], List[tuple]]) -> List[tuple]:
"""
Compare addresses components and put the differences in a dictionary where the keys are the
names of the addresses components, and the values are the value of the addresses component.
names of the addresses components, and the values are the values of the addresses component.
Args:
parsed_addresses (Union[List[List[tuple]], List[tuple]]): Contains the tags and the
address components name for the parsed addresses.
address components' names for the parsed addresses.
Return:
List[tuple]: List of tuples that contains all addresses components that differ from each other.
List[tuple]: List of tuples that contain all addresses components that differ from each other.
"""
unique_address_component_names = self._unique_addresses_component_names(parsed_addresses)

Expand Down Expand Up @@ -258,16 +258,16 @@ def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tupl
@staticmethod
def _unique_addresses_component_names(parsed_addresses: List[List[tuple]]) -> List:
"""
Retrieves all the unique address components names from the comparison then returns it.
Retrieves all the unique address component names from the comparison, then returns them.
Args:
parsed_addresses (List[List[tuple]]): Contains the tags and the
address components name for the parsed addresses.
address components' names for the parsed addresses.
Return:
Returns a list of all the unique address components names.
Returns a list of all the unique address component names.
"""
# Here we don't use a set since order will change and report will also change.
# We don't use a set here since the order would change, and thus the report would also change.
unique_address_component_names = []
for tuple_values in parsed_addresses:
for address_component in tuple_values:
Expand Down
6 changes: 3 additions & 3 deletions deepparse/comparer/formatted_compared_addresses_raw.py
Expand Up @@ -12,7 +12,7 @@ class FormattedComparedAddressesRaw(FormattedComparedAddresses):

def _get_probs(self) -> Dict:
"""
Get the tags from the parsing with their associated probabilities, the method needs to be implemented in each
Get the tags from the parsing with their associated probabilities. The method needs to be implemented in each
class because they don't use the probabilities the same way.
"""
return {
Expand Down Expand Up @@ -45,8 +45,8 @@ def _get_raw_diff_color(self, verbose=True) -> str:

def _comparison_report_builder(self) -> str:
"""
Builds the core of a comparison report for the different comparisons. Since the procedure to make a tags
comparison and the raw addresses comparison is different, the comparison report is not the same for the two.
Builds the core of a comparison report for the various comparisons. Since the procedures to make a tags
comparison and a raw addresses comparison are different, the comparison report is not the same for the two.
It is then implemented in each specific class.
"""
str_formatted = ""
Expand Down
6 changes: 3 additions & 3 deletions deepparse/comparer/formatted_compared_addresses_tags.py
Expand Up @@ -12,7 +12,7 @@ class FormattedComparedAddressesTags(FormattedComparedAddresses):

def _get_probs(self) -> Dict:
"""
Get the tags from the parsing with their associated probabilities, the method needs to be implemented in each
Get the tags from the parsing with their associated probabilities. The method needs to be implemented in each
class because they don't use the probabilities the same way.
"""
return {
Expand All @@ -37,8 +37,8 @@ def _get_probs_of_tags(self, verbose: bool = True) -> str:

def _comparison_report_builder(self) -> str:
"""
Builds the core of a comparison report for the different comparisons. Since the procedure to make a tags
comparison and the raw addresses comparison is different, the comparison report is not the same for the two.
Builds the core of a comparison report for the various comparisons. Since the procedures to make a tags
comparison and a raw addresses comparison are different, the comparison report is not the same for the two.
It is then implemented in each specific class.
"""

Expand Down
15 changes: 9 additions & 6 deletions deepparse/data_validation/data_validation.py
Expand Up @@ -3,30 +3,33 @@

def validate_if_any_empty(string_elements: List) -> bool:
"""
Return true if one of the string element is an empty one.
Return ``True`` if one of the string elements is empty. For example, the second element in the following list is
an empty address: ``["An address", "", "Another address"]``. Thus, it will return ``True``.
Args:
string_elements (list): A list of string to validate.
string_elements (list): A list of strings to validate.
"""
return any(is_empty(string_element) for string_element in string_elements)


def validate_if_any_whitespace_only(string_elements: List) -> bool:
"""
Return true if one of the string element is only whitespace.
Return ``True`` if one of the string elements is only whitespace. For example, the second element in the
following list is only whitespace: ``["An address", " ", "Another address"]``. Thus, it will return ``True``.
Args:
string_elements (list): A list of string to validate.
string_elements (list): A list of strings to validate.
"""
return any(is_whitespace_only(string_element) for string_element in string_elements)


def validate_if_any_none(string_elements: List) -> bool:
"""
Return true if one of the string element is a None value.
Return ``True`` if one string element is a ``None`` value. For example, the second element in the following
list is a ``None`` value: ``["An address", None, "Another address"]``. Thus, it will return ``True``.
Args:
string_elements (list): A list of string to validate.
string_elements (list): A list of strings to validate.
"""
return any(is_none(string_element) for string_element in string_elements)

Expand Down
20 changes: 10 additions & 10 deletions deepparse/dataset_container/dataset_container.py
Expand Up @@ -54,14 +54,14 @@ def __getitem__(
self, idx: Union[int, slice]
) -> Union[List[str], str, List[List[Tuple[str, List]]], Tuple[str, List]]:
"""
If the DatasetContainer is a predict one:
If the DatasetContainer is a "predict" one:
- it can be a list of string items (e.g. a list of address (str)), or
- it can be a list of string items (e.g. a list of addresses (str)), or
- it can be a unique string item (e.g. one address).
If the DatasetContainer is a training one:
- it can be a list of tuple (str, list) items, namely a list of parsed example (e.g. an address with
- it can be a list of tuple (str, list) items, namely a list of parsed examples (e.g. an address with
the tags), or
- it can be a tuple (str, list) item.
Expand Down Expand Up @@ -114,7 +114,7 @@ def _training_validation(self) -> None:

if not self._data_tags_is_same_len_then_address():
print(
f"Some addresses (whitespace-split) and the tags associated with them are not the same len. "
f"Some addresses (whitespace-split) and the associated tags are not the same len. "
f"If you are using a CSVDatasetContainer, consider using the tag_seperator_reformat_fn argument."
f"Here is the report of those cases where len differ to help you out:\n"
f"{self._data_tags_not_the_same_len_diff()}"
Expand Down Expand Up @@ -190,8 +190,8 @@ def __init__(self, data_path: str, is_training_container: bool = True) -> None:
if not is_training_container:
if self._test_predict_container_is_list_of_tuple():
raise DataError(
"The data is a list of tuple by the dataset container is a predict container. "
"Predict container should contains only a list of address."
"The data is a list of tuples, but the dataset container is a predict container. "
"Predict container should contain only a list of addresses."
)

self.validate_dataset()
Expand Down Expand Up @@ -226,17 +226,17 @@ class CSVDatasetContainer(DatasetContainer):
data_path (str): The path to the CSV dataset file.
column_names (list): A column name list to extract the dataset element.
If the dataset container is a predict one, the list must be of exactly one element
(i.e. the address column). On the other hand, if the dataset container is a training one, the list must be
If the dataset container is a "predict" one, the list must be of exactly one element
(i.e. the address column). On the other hand, if the dataset container is a "training" one, the list must be
of exactly two elements: addresses and tags.
is_training_container (bool): Whether or not the dataset container is a training container. This will determine
the dataset validation test we apply to the dataset. That is, a predict dataset doesn't include tags.
The default value is true.
separator (str): The CSV columns separator to use. By default, ``"\\t"``.
tag_seperator_reformat_fn (Callable, optional): A function to parse a tags string and return a list of
address tags. For example, if the tag column is a former python list saved with pandas, the characters ``]``
address tags. For example, if the tag column is a former Python list saved with pandas, the characters ``[``
, ``]`` and ``'`` will be included as the tags' elements. Thus, a parsing function will take a string as its
parameter and output a python list. The default function process it as a former python list.
parameter and output a Python list. The default function processes it as a former Python list.
That is, it removes the ``[],`` characters and splits the sequence at each comma (``","``).
csv_reader_kwargs (dict, optional): Keyword arguments to pass to pandas ``read_csv`` use internally. By default,
the ``data_path`` is passed along with our default ``sep`` value ( ``"\\t"``) and the ``"utf-8"`` encoding
Expand Down
10 changes: 5 additions & 5 deletions deepparse/dataset_container/tools.py
Expand Up @@ -14,23 +14,23 @@ def former_python_list(tags: str) -> List:
A list of the parsed tag set.
"""
# We remove the [ and ] of the list.
# Then, we split each element using a comma as separator.
# Finally, since some case the element are separated by a comma (e.g. element1,element2)
# Then, we split each element using a comma as a separator.
# Finally, since in some cases the elements are separated by a comma (e.g. element1,element2)
# or a comma and a whitespace (e.g. element1, element2), we strip the whitespace on all tags to
# remove the trailing whitespace when element are separated by a coma and a whitespace.
# remove the trailing whitespace when a comma and a whitespace separate elements.
# To fix https://github.com/GRAAL-Research/deepparse/issues/124.
return [tag.strip() for tag in tags.replace("[", "").replace("]", "").replace("'", "").split(",")]


def validate_column_names(column_names: List[str]) -> bool:
"""
Function validate if element of a list of column name are valid.
Function to validate if the elements of a list of column names are valid.
Args:
column_names (List[str]): A list of column names.
Return:
Either or not, the colum name are valid.
Whether or not the column names are valid.
"""
improper_column_names = False
if validate_if_any_empty(column_names) or validate_if_any_whitespace_only(column_names):
Expand Down

0 comments on commit 2f396dc

Please sign in to comment.