Skip to content

Commit

Permalink
Number candidates: no multiple verbatim entries for same occurrence.
Browse files Browse the repository at this point in the history
It's better to just have a single verbatim entry for numbers that
begin with non-digit start characters. Normalized matching produces
the same weight, even if users prefer to see "matched verbatim".
  • Loading branch information
makyen committed Dec 13, 2021
1 parent 3cdf36e commit 6c3d856
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 21 deletions.
7 changes: 0 additions & 7 deletions findspam.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,13 +955,6 @@ def check_set_and_add_matches(reported_match_type, these_candidates, number_dete
if reported_match_type == 'verbatim' and in_both:
# Need to regenerate the normalized list without the ones derived from the verbatim matches
candidates_without_verbatim_matches = candidates - in_both
# The way the verbatim candidates are generated results in multiple entries when there are characters
# starting the match other than a digit. This is done because humans like to have those non-digits and
# see verbatim matches.
# Instead of trying to figure out which variation(s) are actually included in the candidates list, we
# just remove all of the variations prior to recreating the normalized list.
for base in in_both:
candidates_without_verbatim_matches -= phone_numbers.get_candidate_set_with_start_characters(base)
normalized_candidates = set(phone_numbers.normalize_list(candidates_without_verbatim_matches))
for found in in_both:
matches.append('{} found {}'.format(found, reported_match_type))
Expand Down
22 changes: 8 additions & 14 deletions phone_numbers.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,12 @@ def get_candidates(text, also_normalized=False):
# The differences between this implementation and the original get_candidates(), which was based on a
# regex implementation, are:
# 1. This doesn't have the same potential for catastrophic CPU usage based on input text.
# 2. When the first character in the candidate is not a digit, this returns up to three candidates.
# For example "+(123..." will return ["+(123...", "(123...", "123..."]. The regex version does not return
# 2. When the first character in the candidate is not a digit, this returns only one candidate.
# For example "+(123..." will return ["+(123..."]. The regex version returns two candidates, but not
# the version without the non-digit start characters (i.e. it returns ["+(123...", "(123..."]).
# The characters other than digits which are valid at the start are in VALID_NON_DIGIT_START_CHARACTERS.
# The intent at that time was to generate more verbatim matches, but it's better to just have the one
# result. In the meantime, normalized matching has been improved and more emphasis placed on it.
# 3. The regex version routinely returned duplicate entries. This implementation only returns duplicate
# entries if there are duplicates in the input text.
candidates = []
Expand Down Expand Up @@ -229,18 +231,10 @@ def if_digits_add_digits_to_all_in_process_and_promote():
if prev_non_digit in VALID_NON_DIGIT_START_CHARACTERS:
if prev_prev_non_digit in VALID_NON_DIGIT_START_CHARACTERS:
in_process.append(prev_prev_non_digit + prev_non_digit + digits)
# In order to keep the same number of entries in the normalized list, we add an extra
# in_process_normalized entry here. We will have to remove it later, or just never
# promote such entries to the candidates list.
in_process_normalized.append('z' + digits)
in_process_digit_counts.append(len_digits)
in_process.append(prev_non_digit + digits)
# Again, an extra in_process_normalized entry
in_process_normalized.append('z' + digits)
in_process_digit_counts.append(len_digits)
# The original regex version didn't include as candidates the sequence without the non-digit
# start characters when those characters existed.
in_process.append(digits)
else:
in_process.append(prev_non_digit + digits)
else:
in_process.append(digits)
in_process_normalized.append(digits)
in_process_digit_counts.append(len_digits)
promote_any_in_process_with_appropriate_digit_count()
Expand Down

0 comments on commit 6c3d856

Please sign in to comment.