improve documentation

GRAAL-Research · Oct 17, 2023 · 2f396dc · 2f396dc
1 parent adcbe41
commit 2f396dc
Show file tree

Hide file tree

Showing 9 changed files with 68 additions and 65 deletions.
diff --git a/deepparse/comparer/addresses_comparer.py b/deepparse/comparer/addresses_comparer.py
@@ -11,14 +11,14 @@
 class AddressesComparer:
     """
     Address comparer to compare addresses with each other and retrieves the differences between them. The addresses
-    are parsed using an address parser based on one of the seq2seq pretrained networks either with fastText or BPEmb.
+    are parsed using an address parser based on one of the seq2seq pretrained networks, either with fastText or BPEmb.
 
-    The address comparer can compare already parsed addresses. The address parser first recompose the raw
-    addresses then suggests its own tags, then it makes a comparison with the tags of the source parsing and the
+    The address comparer can compare already parsed addresses. The address parser first recomposes the raw
+    addresses then suggests its own tags; then it makes a comparison with the tags of the source parsing and the
     newly parsed address
 
     The address comparer is also able to compare raw addresses by first parsing the addresses using the
-    address parser and then brings out the differences among the parsed addresses.
+    address parser and then bringing out the differences among the parsed addresses.
 
 
     Args:
@@ -40,13 +40,13 @@ def compare_tags(
     ) -> Union[List[FormattedComparedAddressesTags], FormattedComparedAddressesTags]:
         """
         Compare tags of a source parsing with the parsing from AddressParser. First, it reconstructs the
-        raw address from the parsing, then AddressParser generates tags and then compares the two parsings.
+        raw address from the parsing, AddressParser generates tags and compares the two parsings.
 
         Args:
             addresses_tags_to_compare (Union[List[tuple], List[List[tuple]]]): list of tuple that contains
-            the tags for the address components from the source. Can compare multiples parsings if passed as a
+            the tags for the address components from the source. Can compare multiple parsings if passed as a
             list of tuples.
-            with_prob (Union[None, bool]): A option flag to either or not include prob in the comparison report.
+            with_prob (Union[None, bool]): An option flag to either or not include prob in the comparison report.
                 The probabilities are not compared but only included in the report.
                 The default value is None, which means not taking into account.
 
@@ -122,14 +122,14 @@ def compare_raw(
         with_prob: Union[None, bool] = None,
     ) -> List[FormattedComparedAddressesRaw]:
         """
-        Compare a list of raw addresses together, it starts by parsing the addresses
+        Compare a list of raw addresses together. It starts by parsing the addresses
         with the setted parser and then return the differences between the addresses components
         retrieved with our model.
 
         Args:
             raw_addresses_to_compare (Union[Tuple[str], List[Tuple[str]]]):
-                List of string that represent raw addresses to compare.
-            with_prob (Union[None, bool]): A option flag to either or not include prob in the comparison report.
+                List of strings that represent raw addresses to compare.
+            with_prob (Union[None, bool]): An option flag to either or not include prob in the comparison report.
                 The probabilities are not compared but only included in the report.
                 The default value is None, which means not taking into account.
 

diff --git a/deepparse/comparer/formatted_compared_addresses.py b/deepparse/comparer/formatted_compared_addresses.py
@@ -20,7 +20,7 @@ class FormattedComparedAddresses(ABC):
                                                 for the first one.
         second_address(FormattedParsedAddress): A formatted parsed address that contains the parsing information
                                                 for the second one.
-        origin: (Tuple[str, str]): The origin of the parsing (ex : from source or from a deepparse pretrained model).
+        origin: (Tuple[str, str]): The origin of the parsing (ex : from source or a Deepparse pretrained model).
 
     Example:
 
@@ -40,7 +40,7 @@ class FormattedComparedAddresses(ABC):
     @property
     def list_of_bool(self) -> List:
         """
-        A list of boolean that contains all the address components names and indicates if it is the same for the
+        A list of booleans that contains all the address components' names and indicates if it is the same for the
         two addresses.
 
         Return:
@@ -86,7 +86,7 @@ def comparison_report(self, nb_delimiters: Union[int, None] = None) -> None:
 
     def _comparison_report(self, nb_delimiters: Union[int, None]) -> str:
         """
-        Builds a comparison_report with delimiters to make the beginning and the end of the comparison easier to spot.
+        Builds a comparison_report with delimiters to make the comparison's beginning and end easier to spot.
         """
 
         # Get terminal size to adapt the output to the user
@@ -102,15 +102,15 @@ def _comparison_report(self, nb_delimiters: Union[int, None]) -> str:
     @abstractmethod
     def _comparison_report_builder(self) -> str:
         """
-        Builds the core of a comparison report for the different comparisons. Since the procedure to make a tags
-        comparison and the raw addresses comparison is different, the comparison report is not the same for the two.
+        Builds the core of a comparison report for the various comparisons. Since the procedures to make a tags
+        comparison and a raw addresses comparison are different, the comparison report is not the same for the two.
         It is then implemented in each specific class.
         """
 
     @abstractmethod
     def _get_probs(self) -> Dict:
         """
-        Get the tags from the parsing with their associated probabilities, the method needs to be implemented in each
+        Get the tags from the parsing with their associated probabilities. The method needs to be implemented in each
         class because they don't use the probabilities the same way.
         """
 
@@ -180,7 +180,7 @@ def _get_tags_diff_color(
 
         Args:
             name_one (str, optional) : Name associated with first color. The default value is the first address.
-            name_two (str, optional) : Name associated with second color. The default value is the second address.
+            name_two (str, optional) : Name associated with the second color. The default value is the second address.
             verbose (bool, optional): If True, it will print a presentation of the colours and what they mean.
                 The default value is True.
 
@@ -221,14 +221,14 @@ def _get_tags_diff_color(
     def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tuple]], List[tuple]]) -> List[tuple]:
         """
         Compare addresses components and put the differences in a dictionary where the keys are the
-        names of the addresses components, and the values are the value of the addresses component.
+        names of the addresses components, and the values are the values of the addresses component.
 
         Args:
             parsed_addresses (Union[List[List[tuple]], List[tuple]]): Contains the tags and the
-            address components name for the parsed addresses.
+            address components' names for the parsed addresses.
 
         Return:
-            List[tuple]: List of tuples that contains all addresses components that differ from each other.
+            List[tuple]: List of tuples that contain all addresses components that differ from each other.
         """
         unique_address_component_names = self._unique_addresses_component_names(parsed_addresses)
 
@@ -258,16 +258,16 @@ def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tupl
     @staticmethod
     def _unique_addresses_component_names(parsed_addresses: List[List[tuple]]) -> List:
         """
-        Retrieves all the unique address components names from the comparison then returns it.
+        Retrieves all the unique address component names from the comparison, then returns them.
 
         Args:
             parsed_addresses (List[List[tuple]]): Contains the tags and the
-            address components name for the parsed addresses.
+            address components' names for the parsed addresses.
 
         Return:
-            Returns a list of all the unique address components names.
+            Returns a list of all the unique address component names.
         """
-        # Here we don't use a set since order will change and report will also change.
+        # We don't use a set here since it would change the order, and thus the report.
         unique_address_component_names = []
         for tuple_values in parsed_addresses:
             for address_component in tuple_values:

diff --git a/deepparse/comparer/formatted_compared_addresses_raw.py b/deepparse/comparer/formatted_compared_addresses_raw.py
@@ -12,7 +12,7 @@ class FormattedComparedAddressesRaw(FormattedComparedAddresses):
 
     def _get_probs(self) -> Dict:
         """
-        Get the tags from the parsing with their associated probabilities, the method needs to be implemented in each
+        Get the tags from the parsing with their associated probabilities. The method needs to be implemented in each
         class because they don't use the probabilities the same way.
         """
         return {
@@ -45,8 +45,8 @@ def _get_raw_diff_color(self, verbose=True) -> str:
 
     def _comparison_report_builder(self) -> str:
         """
-        Builds the core of a comparison report for the different comparisons. Since the procedure to make a tags
-        comparison and the raw addresses comparison is different, the comparison report is not the same for the two.
+        Builds the core of a comparison report for the various comparisons. Since the procedures to make a tags
+        comparison and a raw addresses comparison are different, the comparison report is not the same for the two.
         It is then implemented in each specific class.
         """
         str_formatted = ""

diff --git a/deepparse/comparer/formatted_compared_addresses_tags.py b/deepparse/comparer/formatted_compared_addresses_tags.py
@@ -12,7 +12,7 @@ class FormattedComparedAddressesTags(FormattedComparedAddresses):
 
     def _get_probs(self) -> Dict:
         """
-        Get the tags from the parsing with their associated probabilities, the method needs to be implemented in each
+        Get the tags from the parsing with their associated probabilities. The method needs to be implemented in each
         class because they don't use the probabilities the same way.
         """
         return {
@@ -37,8 +37,8 @@ def _get_probs_of_tags(self, verbose: bool = True) -> str:
 
     def _comparison_report_builder(self) -> str:
         """
-        Builds the core of a comparison report for the different comparisons. Since the procedure to make a tags
-        comparison and the raw addresses comparison is different, the comparison report is not the same for the two.
+        Builds the core of a comparison report for the various comparisons. Since the procedures to make a tags
+        comparison and a raw addresses comparison are different, the comparison report is not the same for the two.
         It is then implemented in each specific class.
         """
 

diff --git a/deepparse/data_validation/data_validation.py b/deepparse/data_validation/data_validation.py
@@ -3,30 +3,33 @@
 
 def validate_if_any_empty(string_elements: List) -> bool:
     """
-    Return true if one of the string element is an empty one.
+    Return ``True`` if one of the string elements is empty. For example, the second element in the following list is
+    an empty address: ``["An address", "", "Another address"]``. Thus, it will return ``True``.
 
     Args:
-        string_elements (list): A list of string to validate.
+        string_elements (list): A list of strings to validate.
     """
     return any(is_empty(string_element) for string_element in string_elements)
 
 
 def validate_if_any_whitespace_only(string_elements: List) -> bool:
     """
-    Return true if one of the string element is only whitespace.
+    Return ``True`` if one of the string elements is only whitespace. For example, the second element in the
+    following list is only whitespace: ``["An address", " ", "Another address"]``. Thus, it will return ``True``.
 
     Args:
-        string_elements (list): A list of string to validate.
+        string_elements (list): A list of strings to validate.
     """
     return any(is_whitespace_only(string_element) for string_element in string_elements)
 
 
 def validate_if_any_none(string_elements: List) -> bool:
     """
-    Return true if one of the string element is a None value.
+    Return ``True`` if one string element is a ``None`` value. For example, the second element in the following
+    list is a ``None`` value: ``["An address", None, "Another address"]``. Thus, it will return ``True``.
 
     Args:
-        string_elements (list): A list of string to validate.
+        string_elements (list): A list of strings to validate.
     """
     return any(is_none(string_element) for string_element in string_elements)
 

diff --git a/deepparse/dataset_container/dataset_container.py b/deepparse/dataset_container/dataset_container.py
@@ -54,14 +54,14 @@ def __getitem__(
         self, idx: Union[int, slice]
     ) -> Union[List[str], str, List[List[Tuple[str, List]]], Tuple[str, List]]:
         """
-        If the DatasetContainer is a predict one:
+        If the DatasetContainer is a "predict" one:
 
-            - it can be a list of string items (e.g. a list of address (str)), or
+            - it can be a list of string items (e.g. a list of addresses (str)), or
             - it can be a unique string item (e.g. one address).
 
         If the DatasetContainer is a training one:
 
-            - it can be a list of tuple (str, list) items, namely a list of parsed example (e.g. an address with
+            - it can be a list of tuple (str, list) items, namely a list of parsed examples (e.g. an address with
                 the tags), or
             - it can be a tuple (str, list) item.
 
@@ -114,7 +114,7 @@ def _training_validation(self) -> None:
 
         if not self._data_tags_is_same_len_then_address():
             print(
-                f"Some addresses (whitespace-split) and the tags associated with them are not the same len. "
+                f"Some addresses (whitespace-split) and the associated tags are not the same len. "
                 f"If you are using a CSVDatasetContainer, consider using the tag_seperator_reformat_fn argument."
                 f"Here is the report of those cases where len differ to help you out:\n"
                 f"{self._data_tags_not_the_same_len_diff()}"
@@ -190,8 +190,8 @@ def __init__(self, data_path: str, is_training_container: bool = True) -> None:
         if not is_training_container:
             if self._test_predict_container_is_list_of_tuple():
                 raise DataError(
-                    "The data is a list of tuple by the dataset container is a predict container. "
-                    "Predict container should contains only a list of address."
+                    "The data is a list of tuples, but the dataset container is a predict container. "
+                    "Predict container should contain only a list of addresses."
                 )
 
         self.validate_dataset()
@@ -226,17 +226,17 @@ class CSVDatasetContainer(DatasetContainer):
 
         data_path (str): The path to the CSV dataset file.
         column_names (list): A column name list to extract the dataset element.
-            If the dataset container is a predict one, the list must be of exactly one element
-            (i.e. the address column). On the other hand, if the dataset container is a training one, the list must be
+            If the dataset container is a "predict" one, the list must be of exactly one element
+            (i.e. the address column). On the other hand, if the dataset container is a "training" one, the list must be
             of exactly two elements: addresses and tags.
         is_training_container (bool): Either or not, the dataset container is a training container. This will determine
             the dataset validation test we apply to the dataset. That is, a predict dataset doesn't include tags.
             The default value is true.
         separator (str): The CSV columns separator to use. By default, ``"\\t"``.
         tag_seperator_reformat_fn (Callable, optional): A function to parse a tags string and return a list of
-            address tags. For example, if the tag column is a former python list saved with pandas, the characters ``]``
+            address tags. For example, if the tag column is a former Python list saved with pandas, the characters ``[``
             , ``]`` and ``'`` will be included as the tags' element. Thus, a parsing function will take a string as its
-            parameter and output a python list. The default function process it as a former python list.
+            parameter and output a Python list. The default function processes it as a former Python list.
             That is, it removes the ``[],`` characters and splits the sequence at each comma (``","``).
         csv_reader_kwargs (dict, optional): Keyword arguments to pass to pandas ``read_csv`` use internally. By default,
             the ``data_path`` is passed along with our default ``sep`` value ( ``"\\t"``) and the ``"utf-8"`` encoding

diff --git a/deepparse/dataset_container/tools.py b/deepparse/dataset_container/tools.py
@@ -14,23 +14,23 @@ def former_python_list(tags: str) -> List:
         A list of the parsed tag set.
     """
     # We remove the [ and ] of the list.
-    # Then, we split each element using a comma as separator.
-    # Finally, since some case the element are separated by a comma (e.g. element1,element2)
+    # Then, we split each element using a comma as a separator.
+    # Finally, since in some cases the elements are separated by a comma (e.g. element1,element2)
     # or a comma and a whitespace (e.g. element1, element2), we strip the whitespace on all tags to
-    # remove the trailing whitespace when element are separated by a coma and a whitespace.
+    # remove the trailing whitespace when a comma and a whitespace separate elements.
     # To fix https://github.com/GRAAL-Research/deepparse/issues/124.
     return [tag.strip() for tag in tags.replace("[", "").replace("]", "").replace("'", "").split(",")]
 
 
 def validate_column_names(column_names: List[str]) -> bool:
     """
-    Function validate if element of a list of column name are valid.
+    Function to validate if the elements of a list of column names are valid.
 
     Args:
         column_names (List[str]): A list of column names.
 
     Return:
-        Either or not, the colum name are valid.
+        Either or not, the column names are valid.
     """
     improper_column_names = False
     if validate_if_any_empty(column_names) or validate_if_any_whitespace_only(column_names):