Number candidates: no multiple verbatim entries for same occurrence.

It's better to just have a single verbatim entry for numbers that begin with non-digit start characters. Normalized matching produces the same weight, even if users prefer to see "matched verbatim".
Charcoal-SE · Dec 13, 2021 · 6c3d856 · 6c3d856
1 parent 3cdf36e
commit 6c3d856
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 21 deletions.
diff --git a/findspam.py b/findspam.py
@@ -955,13 +955,6 @@ def check_set_and_add_matches(reported_match_type, these_candidates, number_dete
         if reported_match_type == 'verbatim' and in_both:
             # Need to regenerate the normalized list without the ones derived from the verbatim matches
             candidates_without_verbatim_matches = candidates - in_both
-            # The way the verbatim candidates are generated results in multiple entries when there are characters
-            # starting the match other than a digit. This is done because humans like to have those non-digits and
-            # see verbatim matches.
-            # Instead of trying to figure out which variation(s) are actually included in the candidates list, we
-            # just remove all of the variations prior to recreating the normalized list.
-            for base in in_both:
-                candidates_without_verbatim_matches -= phone_numbers.get_candidate_set_with_start_characters(base)
             normalized_candidates = set(phone_numbers.normalize_list(candidates_without_verbatim_matches))
         for found in in_both:
             matches.append('{} found {}'.format(found, reported_match_type))

diff --git a/phone_numbers.py b/phone_numbers.py
@@ -154,10 +154,12 @@ def get_candidates(text, also_normalized=False):
     # The differences between this implementation and the original get_candidates(), which was based on a
     # regex implementation, are:
     #   1. This doesn't have the same potential for catastrophic CPU usage based on input text.
-    #   2. When the first character in the candidate is not a digit, this returns up to three candidates.
-    #      For example "+(123..." will return ["+(123...", "(123...", "123..."]. The regex version does not return
+    #   2. When the first character in the candidate is not a digit, this returns only one candidate.
+    #      For example "+(123..." will return ["+(123..."]. The regex version returns two candidates, but not
     #      the version without the non-digit start characters (i.e. it returns ["+(123...", "(123..."]).
     #      The characters other than digits which are valid at the start are in VALID_NON_DIGIT_START_CHARACTERS.
+    #      Returning multiple candidates was intended to generate more verbatim matches, but it's better to just
+    #      have the one result. Since then, normalized matching has been improved and more emphasis placed on it.
     #   3. The regex version routinely returned duplicate entries. This implementation only returns duplicate
     #      entries if there are duplicates in the input text.
     candidates = []
@@ -229,18 +231,10 @@ def if_digits_add_digits_to_all_in_process_and_promote():
                 if prev_non_digit in VALID_NON_DIGIT_START_CHARACTERS:
                     if prev_prev_non_digit in VALID_NON_DIGIT_START_CHARACTERS:
                         in_process.append(prev_prev_non_digit + prev_non_digit + digits)
-                        # In order to keep the same number of entries in the normalized list, we add an extra
-                        # in_process_normalized entry here. We will have to remove it later, or just never
-                        # promote such entries to the candidates list.
-                        in_process_normalized.append('z' + digits)
-                        in_process_digit_counts.append(len_digits)
-                    in_process.append(prev_non_digit + digits)
-                    # Again, an extra in_process_normalized entry
-                    in_process_normalized.append('z' + digits)
-                    in_process_digit_counts.append(len_digits)
-                # The original regex version didn't include as candidates the sequence without the non-digit
-                # start characters when those characters existed.
-                in_process.append(digits)
+                    else:
+                        in_process.append(prev_non_digit + digits)
+                else:
+                    in_process.append(digits)
                 in_process_normalized.append(digits)
                 in_process_digit_counts.append(len_digits)
                 promote_any_in_process_with_appropriate_digit_count()