Merge branch 'dev' into dependabot/pip/black-24.3.0

GRAAL-Research · Apr 14, 2024 · a3c64ae · a3c64ae
2 parents a82a904 + 3252428
commit a3c64ae
Show file tree

Hide file tree

Showing 51 changed files with 323 additions and 299 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,7 +15,7 @@
 
 - Added "contributing to"
 - Added fix for comma problem (#56)
-- Added content in Address Parser doc for tags definition
+- Added content in Address Parser documentation for tags definition
 - Fixed Pylint bug with PyTorch 1.6
 - Fixed `pack_padded` cpu error with PyTorch new release
 
@@ -75,15 +75,15 @@
 
 ## 0.3.6
 
-- Added a method for a dict conversion of parsed addresses for simpler `Pandas` integration.
+- Added a method for dictionary conversion of parsed addresses for simpler `Pandas` integration.
 - Added examples for parsing addresses and how to convert them into a DataFrame.
 - Fixed error with download module.
 
 ## 0.4
 
 - Added verbose flag to training and test based on the __init__ of address parser.
 - Added a feature to retrain our models with prediction tags dictionary different from the default one.
-- Added in-doc code examples.
+- Added in-documentation code examples.
 - Added code examples.
 - Small improvement of models implementation.
 
@@ -134,7 +134,7 @@
 ## 0.6.2
 
 - Improved (slightly) code speed of data padding method as per PyTorch list or array to Tensor recommendation.
-- Improved doc for RuntimeError due to retraining FastText and BPEmb model in the same directory.
+- Improved documentation for RuntimeError due to retraining FastText and BPEmb model in the same directory.
 - Added error handling RuntimeError when retraining.
 
 ## 0.6.3
@@ -162,21 +162,21 @@
 ## 0.6.6
 
 - Fixed errors in code examples
-- Improved doc of download_from_url
+- Improved documentation of download_from_url
 - Improve error management of retrain and test
 
 ## 0.6.7
 
 - Fixed errors in data validation
-- Improved doc over data validation
+- Improved documentation over data validation
 - Bugfix data slicing error with data containers
 - Add an example on how to use a retrained model
 
 ## 0.7
 
 - Improved CLI
 - Fixed bug in CLI export dataset
-- Improved the doc of the CLI
+- Improved the documentation of the CLI
 
 ## 0.7.1
 
@@ -208,7 +208,7 @@
   user-given name
 - Hot-fix missing raise for DataError validation of address to parse when address is tuple
 - Bug-fix handling of string column name for CSVDatasetContainer that raised ValueError
-- Improve parse CLI doc and fix error in doc stating JSON format is supported as input data
+- Improve parse CLI documentation and fix error in documentation stating JSON format is supported as input data
 - Add batch_size to parse CLI
 - Add minimum version to Gensim 4.0.0.
 - Add a new CLI function, retrain, to retrain from the command line

diff --git a/deepparse/app/request_examples.http b/deepparse/app/request_examples.http
@@ -16,5 +16,5 @@ Content-Type: application/json
 
 [
   {"raw": "16 rue Grande-Place, Victoriaville, QC, G6S 1E6"},
-  {"raw": "123 rue Valancourt, Val-Alain, quebec, g9v1s3"}
+  {"raw": "123 rue valancourt, val-alain, quebec, g9v 1s3"}
 ]
diff --git a/deepparse/cli/download_model.py b/deepparse/cli/download_model.py
@@ -1,13 +1,12 @@
 import argparse
 import sys
 
-
 from deepparse.download_tools import download_model, MODEL_MAPPING_CHOICES
 
 
 def main(args=None) -> None:
     """
-    CLI function to manually download all the dependencies for a pretrained model.
+    CLI function to download all the dependencies for a pretrained model manually.
 
     Example of usage:
 
@@ -41,7 +40,7 @@ def get_parser() -> argparse.ArgumentParser:
         "--saving_cache_dir",
         type=str,
         default=None,
-        help="To change the default saving cache directory (default to None e.g. default path).",
+        help="To change the default saving cache directory (default to None, e.g. default path).",
     )
 
     return parser

diff --git a/deepparse/cli/download_models.py b/deepparse/cli/download_models.py
@@ -6,7 +6,7 @@
 
 def main(args=None) -> None:
     """
-    CLI function to manually download all the dependencies for all pretrained models.
+    CLI function to download all the dependencies for all pretrained models manually.
 
     Example of usage:
 
@@ -34,7 +34,7 @@ def get_parser() -> argparse.ArgumentParser:
         "--saving_cache_dir",
         type=str,
         default=None,
-        help="To change the default saving cache directory (default to None e.g. default path).",
+        help="To change the default saving cache directory (default to None, e.g. default path).",
     )
 
     return parser

diff --git a/deepparse/cli/parse.py b/deepparse/cli/parse.py
@@ -32,15 +32,15 @@
 def main(args=None) -> None:
     # pylint: disable=too-many-locals, too-many-branches
     """
-    CLI function to rapidly parse an addresses dataset and output it in another file.
+    CLI function to easily parse an address dataset and output it in another file.
 
     Examples of usage:
 
     .. code-block:: sh
 
         parse fasttext ./dataset_path.csv parsed_address.pickle
 
-    Using a gpu device
+    Using a GPU device
 
     .. code-block:: sh
 
@@ -119,7 +119,7 @@ def main(args=None) -> None:
 
 
 def get_parser() -> argparse.ArgumentParser:
-    """Return ArgumentParser for the cli."""
+    """Return ArgumentParser for the CLI."""
 
     parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
     parser.add_argument(
@@ -137,11 +137,11 @@ def get_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "export_filename",
         help=wrap(
-            "The filename to use to export the parsed addresses. We will infer the file format base on the "
+            "The filename to use to export the parsed addresses. We will infer the file format based on the "
             "file extension. That is, if the file is a pickle (.p or .pickle), we will export it into a pickle file. "
-            "The supported format are Pickle, CSV and JSON. "
+            "The supported formats are Pickle, CSV and JSON. "
             "The file will be exported in the same repositories as the dataset_path. "
-            "See the doc for more details on the format exporting."
+            "See the documentation for more details on the format exporting."
         ),
         type=str,
     )

diff --git a/deepparse/cli/parser_arguments_adder.py b/deepparse/cli/parser_arguments_adder.py
@@ -25,7 +25,7 @@ def add_csv_column_name_arg(parser: ArgumentParser) -> None:
     parser.add_argument(
         "--csv_column_name",
         help=wrap(
-            "The column name to extract address in the CSV. Need to be specified if the provided dataset_path "
+            "The column name to extract the address in the CSV. It needs to be specified if the provided dataset_path "
             "leads to a CSV file."
         ),
         type=str,
@@ -37,7 +37,7 @@ def add_csv_column_names_arg(parser: ArgumentParser) -> None:
     parser.add_argument(
         "--csv_column_names",
         help=wrap(
-            "The column names to extract address and tags in the CSV. Need to be specified if the provided "
+            "The column names to extract addresses and tags in the CSV. It needs to be specified if the provided "
             "dataset_path leads to a CSV file. Column names have to be separated by a whitespace. For"
             "example, --csv_column_names column1 column2. By default, None."
         ),

diff --git a/deepparse/cli/retrain.py b/deepparse/cli/retrain.py
@@ -64,7 +64,7 @@ def handle_prediction_tags(parsed_args):
 def main(args=None) -> None:
     # pylint: disable=too-many-locals, too-many-branches
     """
-    CLI function to rapidly retrain an addresses parser and saves it. One can retrain a base pretrained model
+    CLI function to easily retrain an address parser and save it. One can retrain a base pretrained model
     using most of the arguments as the :meth:`~AddressParser.retrain` method. By default, all the parameters have
     the same default value as the :meth:`~AddressParser.retrain` method. The supported parameters are the following:
 
@@ -86,7 +86,7 @@ def main(args=None) -> None:
 
         retrain fasttext ./train_dataset_path.csv
 
-    Using a gpu device
+    Using a GPU device
 
     .. code-block:: sh
 
@@ -142,7 +142,7 @@ def main(args=None) -> None:
 
 
 def get_parser() -> argparse.ArgumentParser:
-    """Return ArgumentParser for the cli."""
+    """Return ArgumentParser for the CLI."""
 
     parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
 
@@ -198,8 +198,8 @@ def get_parser() -> argparse.ArgumentParser:
         "--logging_path",
         help=wrap(
             "The logging path for the checkpoints and the retrained model. "
-            "Note that training creates checkpoints, and we use Poutyne library that use the best epoch "
-            "model and reloads the state if any checkpoints are already there. "
+            "Note that training creates checkpoints, and we use the Poutyne library that uses the best epoch "
+            "model and reloads the state if any checkpoints are already there. "
             "Thus, an error will be raised if you change the model type. For example, "
             "you retrain a FastText model and then retrain a BPEmb in the same logging path directory."
             "By default, the path is './checkpoints'."
@@ -241,7 +241,7 @@ def get_parser() -> argparse.ArgumentParser:
         help=wrap(
             "Path to a JSON file of prediction tags to use to retrain. Tags are in a key-value style, where "
             "the key is the tag name, and the value is the index one."
-            "The last element has to be an EOS tag. Read the doc for more detail about EOS tag."
+            "The last element has to be an EOS tag. Read the documentation for more details about the EOS tag."
         ),
         default=None,
         type=str,

diff --git a/deepparse/cli/test.py b/deepparse/cli/test.py
@@ -108,7 +108,7 @@ def main(args=None) -> None:
 
 
 def get_parser() -> argparse.ArgumentParser:
-    """Return ArgumentParser for the cli."""
+    """Return ArgumentParser for the CLI."""
 
     parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
 

diff --git a/deepparse/comparer/addresses_comparer.py b/deepparse/comparer/addresses_comparer.py
@@ -10,8 +10,9 @@
 @dataclass(frozen=True)
 class AddressesComparer:
     """
-    Address comparer to compare addresses with each other and retrieves the differences between them. The addresses
-    are parsed using an address parser based on one of the seq2seq pretrained networks, either with fastText or BPEmb.
+    Address comparer is used to compare addresses with each other and retrieve the differences between them. The
+    addresses are parsed using an address parser based on one of the seq2seq pretrained networks, either with
+    FastText or BPEmb.
 
     The address comparer can compare already parsed addresses. The address parser first recomposes the raw
     addresses then suggest its own tags; then it makes a comparison with the tags of the source parsing and the
@@ -43,12 +44,12 @@ def compare_tags(
         raw address from the parsing, AddressParser generates tags and compares the two parsings.
 
         Args:
-            addresses_tags_to_compare (Union[List[tuple], List[List[tuple]]]): list of tuple that contains
+            addresses_tags_to_compare (Union[List[tuple], List[List[tuple]]]): list of tuples that contain
             the tags for the address components from the source. Can compare multiple parsings if passed as a
             list of tuples.
-            with_prob (Union[None, bool]): An option flag to either or not include prob in the comparison report.
-                The probabilities are not compared but only included in the report.
-                The default value is None, which means not taking into account.
+            with_prob (Union[None, bool]): A flag indicating whether to include the probabilities in the comparison
+                report. The probabilities are not compared but only included in the report. The default value is
+                ``None``, which means they are not taken into account.
 
         Return:
             Either a :class:`~FormattedComparedAddressesTags` or a list of :class:`~FormattedComparedAddressTags`
@@ -123,15 +124,14 @@ def compare_raw(
     ) -> List[FormattedComparedAddressesRaw]:
         """
         Compare a list of raw addresses together. It starts by parsing the addresses
-        with the setted parser and then return the differences between the addresses components
-        retrieved with our model.
+        with the parser and then return the differences between the parsed address components of the two addresses.
 
         Args:
             raw_addresses_to_compare (Union[Tuple[str], List[Tuple[str]]]):
                 List of strings that represent raw addresses to compare.
-            with_prob (Union[None, bool]): An option flag to either or not include prob in the comparison report.
-                The probabilities are not compared but only included in the report.
-                The default value is None, which means not taking into account.
+            with_prob (Union[None, bool]): A flag indicating whether to include the probabilities in the comparison
+                report. The probabilities are not compared but only included in the report. The default value is
+                ``None``, which means they are not taken into account.
 
         Return:
             Either a :class:`~FormattedComparedAddressesRaw` or a list of
@@ -184,8 +184,8 @@ def compare_raw(
     @staticmethod
     def _format_comparisons_dict(comparison_tuples: List, origin_tuple: Tuple[str, str], with_prob: bool) -> List[Dict]:
         """
-        Return formatted dict that contains two FormattedParsedAddress and the origin name tuple and output it in a
-        dict format.
+        Return formatted dictionary that contains two FormattedParsedAddress and the origin name tuple and output it
+        in a dictionary format.
         """
 
         list_of_formatted_comparisons_dict = []

diff --git a/deepparse/comparer/formatted_compared_addresses.py b/deepparse/comparer/formatted_compared_addresses.py
@@ -110,16 +110,16 @@ def _comparison_report_builder(self) -> str:
     @abstractmethod
     def _get_probs(self) -> Dict:
         """
-        To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each
-        class because they don't use the probabilities the same way.
+        Get the tags from the parsing with their associated probabilities. This method needs to be implemented in
+        each class because they don't use the probabilities the same way.
         """
 
     @staticmethod
     def _get_color_diff(string_one: str, string_two: str, highlight: bool = False) -> str:
         """
-        Compare two strings and determine the difference between the two. The differences are noted with colour code;
-        if the first string has more elements than the second one, it will be noted in one colour; on the contrary,
-        if the other string has something more, it will have a different colour notation.
+        Compare two strings and determine the difference between the two. The differences are highlighted with a
+        colour scheme; if the first string has more elements than the second one, it will be noted in one colour;
+        on the contrary, if the other string has something more, it will have a different colour notation.
 
         Args:
             string_one (str): The first string to compare.
@@ -129,15 +129,15 @@ def _get_color_diff(string_one: str, string_two: str, highlight: bool = False) -
                 two strings are spaces. The default is False.
 
         Notes:
-            the method is colorblind-friendly, which means that the output will be
+            The method is colorblind-friendly, which means that the output will be
             in colours that minimize the risk that a user cannot see the difference as
             defined here https://davidmathlogic.com/colorblind/#%23D81B60-%231E88E5-%23FFC107-%23004D40.
 
             If both the strings share the same character, it will be written in white.
             If the first string has something more than the second one, it will be indicated in blue.
             If the second string has something more than the first one, it will be noted in yellow.
 
-            It uses SequenceMatcher to get the different codes to be later converted into colour codes.
+            It uses SequenceMatcher to get the different codes, which are later converted into colour codes.
 
         Return:
             str: The two strings joined, and the differences are noted in colour codes
@@ -176,13 +176,16 @@ def _get_tags_diff_color(
         verbose: bool = True,
     ) -> str:
         """
-        Print the output of the string with colour codes that represent the differences between the two strings.
+        Print the output of the string with colour codes representing the differences between the two strings.
 
         Args:
-            name_one (str, optional) : Name associated with first color. The default value is the first address.
-            name_two (str, optional) : Name associated with the second colour. The default value is the second address.
-            verbose (bool, optional): If True, it will print a presentation of the colours and what they mean.
-                The default value is True.
+            name_one (str, optional) : Name associated with the first colour. The default value is ``"first address"``,
+                namely the first address of the two. We recommend using whitespace characters between the words.
+            name_two (str, optional) : Name associated with the second colour. The default value is
+                ``"second address"``, namely the second address of the two. We recommend using whitespace
+                characters between the words.
+            verbose (bool, optional): If True, it will print a presentation of the colours and their meaning.
+                The default value is ``True``.
 
         """
 
@@ -220,15 +223,15 @@ def _get_tags_diff_color(
 
     def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tuple]], List[tuple]]) -> List[tuple]:
         """
-        Compare addresses components and put the differences in a dictionary where the keys are the
+        Compare the components between two addresses and put the differences in a dictionary where the keys are the
         names of the addresses components, and the values are the values of the addresses component.
 
         Args:
             parsed_addresses (Union[List[List[tuple]], List[tuple]]): Contains the tags and the
             address components' names for the parsed addresses.
 
         Return:
-            List[tuple]: List of tuples that contain all addresses components that differ from each other.
+            List[tuple]: List of tuples containing the components that differ between the two addresses.
         """
         unique_address_component_names = self._unique_addresses_component_names(parsed_addresses)
 

diff --git a/deepparse/comparer/formatted_compared_addresses_raw.py b/deepparse/comparer/formatted_compared_addresses_raw.py
@@ -12,8 +12,8 @@ class FormattedComparedAddressesRaw(FormattedComparedAddresses):
 
     def _get_probs(self) -> Dict:
         """
-        To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each
-        class because they don't use the probabilities the same way.
+        Get the tags from the parsing with their associated probabilities. This method needs to be
+        implemented in each class because they don't use the probabilities the same way.
         """
         return {
             self.first_address.raw_address: self.first_address.address_parsed_components,

diff --git a/deepparse/comparer/formatted_compared_addresses_tags.py b/deepparse/comparer/formatted_compared_addresses_tags.py
@@ -12,7 +12,7 @@ class FormattedComparedAddressesTags(FormattedComparedAddresses):
 
     def _get_probs(self) -> Dict:
         """
-        To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each
+        To get the tags from the parsing with their associated probabilities, the method needs to be implemented in each
         class because they don't use the probabilities the same way.
         """
         return {