/
suggest.py
546 lines (441 loc) · 25.8 KB
/
suggest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
"""
The main "suggest correction for this misspelling" module.
On a bird-eye view level, suggest does:
* tries small word "edits" (remove letters, insert letters, swap letters) and checks (with the help
of :mod:`lookup <spylls.hunspell.algo.lookup>`) there are any valid ones
* if no good suggestions found, tries "ngram-based" suggestions (calculating ngram-based distance to
all dictionary words and select the closest ones), handled by
:mod:`ngram_suggest <spylls.hunspell.algo.ngram_suggest>`
* if possible, tries metaphone-based suggestions, handled by :mod:`phonet_suggest <spylls.hunspell.algo.phonet_suggest>`
Note that Spylls's implementation takes one liberty comparing to Hunspell's:
In Hunspell, ngram suggestions (select all words from dictionary that ngram-similar => produce suggestions)
and phonetic suggestions (select all words from dictionary that phonetically similar => produce suggestions)
are done in the same cycle, because they both iterate through entire dictionary. Spylls does it
in two separate cycles, again, for the sake of clarity (note that dictionaries with metaphone
transformation rules defined are extremely rare).
To follow algorithm details, start reading from :meth:`Suggest.__call__`
.. toctree::
:caption: Specific algorithms
algo_ngram_suggest
algo_phonet_suggest
.. autoclass:: Suggest
Suggestion classes
^^^^^^^^^^^^^^^^^^
.. autoclass:: Suggestion
:members:
.. autoclass:: MultiWordSuggestion
:members:
"""
from typing import Iterator, List, Set, Union
import dataclasses
from dataclasses import dataclass
from spylls.hunspell import data
from spylls.hunspell.algo.capitalization import Type as CapType
from spylls.hunspell.algo import ngram_suggest, phonet_suggest, permutations as pmt
MAXPHONSUGS = 2
MAXSUGGESTIONS = 15
GOOD_EDITS = ['spaceword', 'uppercase', 'replchars']
@dataclass
class Suggestion:
"""
Suggestions is what Suggest produces internally to store enough information about some suggestion
to make sure it is a good one.
"""
#: Actual suggestion text
text: str
#: How suggestion was produced, useful for debugging, typically same as the method
#: of the editing which led to this suggestion, like "badchar", "twowords", "phonet" etc.
kind: str
def __repr__(self):
return f"Suggestion[{self.kind}]({self.text})"
def replace(self, **changes):
return dataclasses.replace(self, **changes)
@dataclass
class MultiWordSuggestion:
"""
Represents suggestion to split words into several.
"""
#: List of words
words: List[str]
#: Same as :attr:`Suggestion.source`
source: str
#: Whether those words are allowed to be joined by dash. We should disallow it if the multi-word
#: suggestion was produced by :attr:`Aff.REP <spylls.hunspell.data.aff.Aff.REP>` table, see
#: :meth:`Suggest.edits` for details.
allow_dash: bool = True
def stringify(self, separator=' '):
return Suggestion(separator.join(self.words), self.source)
def __repr__(self):
return f"Suggestion[{self.source}]({self.words!r})"
class Suggest:
"""
``Suggest`` object is created on :class:`Dictionary <spylls.hunspell.Dictionary>` reading. Typically,
you would not use it directly, but you might want for experiments::
>>> dictionary = Dictionary.from_files('dictionaries/en_US')
>>> suggest = dictionary.suggester
>>> [*suggest('spylls')]
['spells', 'spills']
>>> for suggestion in suggest.suggestions('spylls'):
... print(suggestion)
Suggestion[badchar](spell)
Suggestion[badchar](spill)
See :meth:`__call__` as the main entry point for algorithm explanation.
**Main methods**
.. automethod:: __call__
.. automethod:: suggestions
**Suggestion types**
.. automethod:: edits
.. automethod:: ngram_suggestions
.. automethod:: phonet_suggestions
"""
def __init__(self, aff: data.Aff, dic: data.Dic, lookup):
self.aff = aff
self.dic = dic
self.lookup = lookup
# TODO: also NONGRAMSUGGEST and ONLYUPCASE
bad_flags = {*filter(None, [self.aff.FORBIDDENWORD, self.aff.NOSUGGEST, self.aff.ONLYINCOMPOUND])}
self.words_for_ngram = [word for word in self.dic.words if not bad_flags.intersection(word.flags)]
def __call__(self, word: str) -> Iterator[str]:
"""
Outer "public" interface: returns a list of all valid suggestions, as strings.
Method returns a generator, so it is up to client code to fetch as many suggestions as it
needs::
>>> suggestions = suggester('unredable')
<generator object Suggest.__call__ at 0x7f74f5056350>
>>> suggestions.__next__()
'unreadable'
Note that suggestion to split words in two also returned as a single string, with a space::
>>> [*suggester('badcat')]
['bad cat', 'bad-cat', 'baccarat']
Internally, the method just calls :meth:`suggestions` (which returns instances of :class:`Suggestion`)
and yields suggestion texts.
Args:
word: Word to check
"""
yield from (suggestion.text for suggestion in self.suggestions(word))
def suggestions(self, word: str) -> Iterator[Suggestion]: # pylint: disable=too-many-statements
"""
Main suggestion search loop. What it does, in general, is:
* generates possible misspelled word cases (for ex., "KIttens" in dictionary might've been
'KIttens', 'kIttens', 'kittens', or 'Kittens')
* produces word edits with :meth:`edits` (with the help of
:mod:`permutations <spylls.hunspell.algo.permutations>` module), checks them with
:class:`Lookup <spylls.hunspell.algo.lookup.Lookup>`, and decides if that's maybe enough
* but if it is not (and if .aff settings allow), ngram-based suggestions are produced with
:meth:`ngram_suggestions`, and phonetically similar suggestions with :meth:`phonet_suggestions`
That's very simplified explanation, read the code!
Args:
word: Word to check
"""
# Whether some suggestion (permutation of the word) is an existing and allowed word,
# just delegates to Lookup
def is_good_suggestion(word):
# Note that instead of using Lookup's main method, we just see if there is any good forms
# of this exact word, avoiding ICONV and trying to break word by dashes.
return any(self.lookup.good_forms(word, capitalization=False, allow_nosuggest=False))
# The suggestion is considered forbidden if there is ANY homonym in dictionary with flag
# FORBIDDENWORD. Besides marking swearing words, this feature also allows to include in
# dictionaries known "correctly-looking but actually non-existent" forms, which might important
# with very flexive languages.
def is_forbidden(word):
return self.aff.FORBIDDENWORD and self.dic.has_flag(word, self.aff.FORBIDDENWORD)
# This set will gather all good suggestions that were already returned (in order, for example,
# to not return same suggestion twice)
handled: Set[str] = set()
# Suggestions that are already considered good are passed through this method, which converts
# it to proper capitalization form, and then either yields it (if it is not forbidden,
# hadn't already seen, etc), or just does nothing.
# Method is quite lengthy, but is nested because updates and reuses ``handled`` local var
def handle_found(suggestion, *, check_inclusion=False):
text = suggestion.text
# If any of the homonyms has KEEPCASE flag, we shouldn't coerce it from the base form.
# But CHECKSHARPS flag presence changes the meaning of KEEPCASE...
if (self.aff.KEEPCASE and self.dic.has_flag(text, self.aff.KEEPCASE) and not
(self.aff.CHECKSHARPS and 'ß' in text)):
# Don't try to change text's case
pass
else:
# "Coerce" suggested text from the capitalization that it has in the dictionary, to
# the capitalization of the misspelled word. E.g., if misspelled was "Kiten", suggestion
# is "kitten" (how it is in the dictionary), and coercion (what we really want
# to return to user) is "Kitten"
text = self.aff.casing.coerce(text, captype)
# ...but if this particular capitalized form is forbidden, return back to original text
if text != suggestion.text and is_forbidden(text):
text = suggestion.text
# "aNew" will suggest "a new", here we fix it back to "a New"
if captype in [CapType.HUH, CapType.HUHINIT] and ' ' in text:
pos = text.find(' ')
if text[pos + 1] != word[pos] and text[pos + 1].upper() == word[pos]:
text = text[:pos+1] + word[pos] + text[pos+2:]
# If the word is forbidden, nothing more to do
if is_forbidden(text):
return
# Finally, OCONV table in .aff-file might specify what chars to replace in suggestions
# (for example, "'" to proper typographic "’", or common digraphs)
text = self.aff.OCONV(text) if self.aff.OCONV else text
# If we already seen this suggestion, nothing to do
# Note that this should happen AFTER the OCONV: it sometimes changes the content significantly.
# For example, Dutch dictionary recodes "ij" into ligature (one character representing this
# two letters) to enforce proper capitalization (and decodes it back on OCONV).
if text in handled:
return
# Sometimes we want to skip suggestions even if they are same as PARTS of already
# seen ones: for examle, ngram-based suggestions might produce very similar forms, like
# "impermanent" and "permanent" -- both of them are correct, but if the first is
# closer (by length/content) to misspelling, there is no point in suggesting the second
if check_inclusion and any(previous.lower() in text.lower() for previous in handled):
return
# Remember we seen it
handled.add(text)
# And here we are!
yield suggestion.replace(text=text)
# **Start of the main suggest code**
# First, produce all possible "good capitalizations" from the provided word. For example,
# if the word is "MsDonalds", good capitalizations (what it might've been in dictionary) are
# "msdonalds" (full lowercase) "msDonalds" (first letter lowercased), or maybe "Msdonalds"
# (only first letter capitalized). Note that "MSDONALDS" (it should've been all caps) is not
# produced as a possible good form, but checked separately in ``edits``
captype, variants = self.aff.casing.corrections(word)
# Check a special case: if it is possible that words would be possible to be capitalized
# on compounding, then we check capitalized form of the word. If it is correct, that's the
# only suggestion we ever need.
if self.aff.FORCEUCASE and captype == CapType.NO:
for capitalized in self.aff.casing.capitalize(word):
if is_good_suggestion(capitalized):
yield from handle_found(Suggestion(capitalized.capitalize(), 'forceucase'))
return # No more need to check anything
good_edits_found = False
# Now, for all capitalization variant
for idx, variant in enumerate(variants):
# If it is different from original capitalization, and is good, we suggest it
if idx > 0 and is_good_suggestion(variant):
yield from handle_found(Suggestion(variant, 'case'))
# Now we do two rounds:
# * generate edits and check whether they are valid non-compound words,
# * generate the same edits again and check whether they are valid compound words
#
# The reason of splitting the checks is: a) optimization (checking whether the word is
# a valid compound is much more pricey) and b) suggestion quality (it is considered that
# non-compound words are more probable).
# There are cases when this algorithm brings problems: for example, "11thhour" will NOT
# suggest "11th hour" -- because "11th" is compound word, and "hour" is not, but we
# first check whether the splitting is correct assuming all words are non-compound, and
# then check whether it is correct assuming both are compound. That's how it is in Hunspell,
# and Spylls doesn't tries to fix that.
nocompound = False
# 1. Only generate suggestions that are correct NOT COMPOUND words.
for suggestion in self.edit_suggestions(variant, handle_found, limit=MAXSUGGESTIONS, compounds=False):
yield suggestion
# remember if any "good" edits was found -- in this case we don't need
# ngram suggestions
good_edits_found = good_edits_found or (suggestion.kind in GOOD_EDITS)
# If any of those kinds are found, we don't need to try _any_ compound suggestions
if suggestion.kind in ['uppercase', 'replchars', 'mapchars']:
nocompound = True
# We might break from the method immediately if the suggestion is to split word
# with space, and this phrase is _in the dictionary as a whole_: "alot" => "a lot".
# Then we don't need ANY other suggestions. This is a trick to enforce suggesting
# to break commonly misspelled phrases withot oversuggesting
if suggestion.kind == 'spaceword':
return
# 2. Only generate suggestions that are correct COMPOUND words
if not nocompound:
for suggestion in self.edit_suggestions(variant, handle_found,
limit=self.aff.MAXCPDSUGS, compounds=True):
yield suggestion
good_edits_found = good_edits_found or (suggestion.kind in GOOD_EDITS)
if good_edits_found:
return
# One additional pass at suggesting: if the word contains dashes, and there were no dash-containing
# suggestions, we should try to spellcheck it separately... But only one misspelled part is allowed.
if '-' in word and not any('-' in sug for sug in handled):
chunks = word.split('-')
for idx, chunk in enumerate(chunks):
# If this chunk of the word is misspelled...
if not is_good_suggestion(chunk):
# ...try all suggestions for this separate chunk
for sug in self(chunk):
candidate = '-'.join([*chunks[:idx], sug, *chunks[idx+1:]])
# And check if the whole word with this chunk replaced is a good word. Note that
# we use lookup's main method here, which will also break it into words by
# dashes. It is done (instead of just checking chunk-by-chunk), because there
# are ambiguities how to split: if we are fixing "quas-Afro-American"->"quasi-Afro-American",
# and "Afro-American" is present in the dictionary as a whole word, it is better
# to delegate search for the right breaking to the Lookup.
if self.lookup(candidate):
yield Suggestion(candidate, 'dashes')
# after only one misspelled chunk, we stop the search anyways, whether we found some
# sugestions or not
break
# If there was no good edits that were valid words, we might try
# ngram-based suggestion algorithm: it is slower, but able to correct severely misspelled words
ngrams_seen = 0
for sug in self.ngram_suggestions(word, handled=handled):
for res in handle_found(Suggestion(sug, 'ngram'), check_inclusion=True):
ngrams_seen += 1
yield res
if ngrams_seen >= self.aff.MAXNGRAMSUGS:
break
# Also, if metaphone transformations (phonetic coding of words) were defined in the .aff file,
# we might try to use them to produce suggestions
phonet_seen = 0
for sug in self.phonet_suggestions(word):
for res in handle_found(Suggestion(sug, 'phonet'), check_inclusion=True):
phonet_seen += 1
yield res
if phonet_seen >= MAXPHONSUGS:
break
def edit_suggestions(self, word: str, handle_found, *, compounds: bool, limit: int) -> Iterator[Suggestion]:
def is_good_suggestion(word):
# Note that instead of using Lookup's main method, we just see if there is any good forms
# of this exact word, avoiding ICONV and trying to break word by dashes.
if compounds:
return any(self.lookup.good_forms(word, capitalization=False, allow_nosuggest=False, affix_forms=False))
return any(self.lookup.good_forms(word, capitalization=False, allow_nosuggest=False, compound_forms=False))
# For some set of suggestions, produces only good ones:
def filter_suggestions(suggestions):
for suggestion in suggestions:
# For multiword suggestion,
if isinstance(suggestion, MultiWordSuggestion):
# ...if all of the words is correct
if all(is_good_suggestion(word) for word in suggestion.words):
# ...we just convert it to plain text suggestion "word1 word2"
yield suggestion.stringify()
if suggestion.allow_dash:
# ...and "word1-word2" if allowed
yield suggestion.stringify('-')
else:
# Singleword suggestion is just yielded if it is good
if is_good_suggestion(suggestion.text):
yield suggestion
count = 0
for suggestion in filter_suggestions(self.edits(word)):
# Then, for each of those correct suggestions,
# we run them through handle_found (which performs some final transformations and
# checks, and may handle 1 or 0 suggestions; 0 in the case it was already seen or
# is a forbidden word)
for res in handle_found(suggestion):
yield res
count += 1
if count > limit:
return
def edits(self, word: str) -> Iterator[Union[Suggestion, MultiWordSuggestion]]:
"""
Produces all possible word edits in a form of :class:`Suggestion` or :class:`MultiWordSuggestion`.
Note that:
* order is important, that's the order user will receive the suggestions (and the further the
suggestion type in the order, the more probably it would be dropped due to suggestion count
limit)
* suggestion "source" tag is important: :meth:`suggestions` uses it to distinguish between
good and questionble edits (if there were any good ones, ngram suggestion wouldn't
be used)
Args:
word: Word to mutate
"""
# suggestions for an uppercase word (html -> HTML)
yield Suggestion(self.aff.casing.upper(word), 'uppercase')
# REP table in affix file specifies "typical misspellings", and we try to replace them.
# Note that the content of REP table taken not only from aff file, but also from "ph:" tag
# in dictionary (lines looking like ``hello ph:helo`` put word "hello" in dictionary, and
# "helo -> hello" in REP-table), see read_dic.
#
# It might return several words if REP table has "REP <something> <some>_<thing>" (_ is code
# for space).
#
# ...in this case we should suggest both "<word1> <word2>" as one dictionary entry, and
# "<word1>" "<word1>" as a sequence -- but clarifying this sequence might NOT be joined by "-"
for suggestion in pmt.replchars(word, self.aff.REP):
if isinstance(suggestion, list):
yield Suggestion(' '.join(suggestion), 'replchars')
yield MultiWordSuggestion(suggestion, 'replchars', allow_dash=False)
else:
yield Suggestion(suggestion, 'replchars')
for words in pmt.twowords(word):
yield Suggestion(' '.join(words), 'spaceword')
if self.use_dash():
# "alot" => "a-lot"
yield Suggestion('-'.join(words), 'spaceword')
# MAP in aff file specifies related chars (for example, "ïi"), and mapchars produces all
# changes of the word with related chars replaced. For example, "naive" produces "naïve".
for suggestion in pmt.mapchars(word, self.aff.MAP):
yield Suggestion(suggestion, 'mapchars')
# Try to swap adjacent characters (ktiten -> kitten), produces all possible forms with ONE
# swap; but for 4- and 5-letter words tries also two swaps "ahev -> have"
for suggestion in pmt.swapchar(word):
yield Suggestion(suggestion, 'swapchar')
# Try longer swaps (up to 4 chars distance)
for suggestion in pmt.longswapchar(word):
yield Suggestion(suggestion, 'longswapchar')
# Try to replace chars by those close on keyboard ("wueue" -> "queue"), KEY in aff file specifies
# keyboard layout.
for suggestion in pmt.badcharkey(word, self.aff.KEY):
yield Suggestion(suggestion, 'badcharkey')
# Try remove character (produces all forms with one char removed: "clat" => "lat", "cat", "clt", "cla")
for suggestion in pmt.extrachar(word):
yield Suggestion(suggestion, 'extrachar')
# Try insert character (from set of all possible language chars specified in aff), produces
# all forms with any of the TRY chars inserted in all possible positions
for suggestion in pmt.forgotchar(word, self.aff.TRY):
yield Suggestion(suggestion, 'forgotchar')
# Try to move a character forward and backwars:
# "rnai" => "nari", "nair", "rain" (forward: r moved 2 and 3 chars, n moved 2 chars)
# => "rina", "irna", "arni" (backward: i moved 2 and 3 chars, a moved 2 chars)
# (no 1-char movements necessary, they are covered by swapchar above)
for suggestion in pmt.movechar(word):
yield Suggestion(suggestion, 'movechar')
# Try replace each character with any of other language characters
for suggestion in pmt.badchar(word, self.aff.TRY):
yield Suggestion(suggestion, 'badchar')
# Try fix two-character doubling: "chickcken" -> "chicken" (one-character doubling is
# already covered by extrachar)
for suggestion in pmt.doubletwochars(word):
yield Suggestion(suggestion, 'doubletwochars')
if not self.aff.NOSPLITSUGS:
# Try split word by space in all possible positions
# NOSPLITSUGS option in aff prohibits it, it is important, say, for Scandinavian languages
for suggestion_pair in pmt.twowords(word):
yield MultiWordSuggestion(suggestion_pair, 'twowords', allow_dash=self.use_dash())
def ngram_suggestions(self, word: str, handled: Set[str]) -> Iterator[str]:
"""
Produces ngram-based suggestions, by passing to
:meth:`ngram_suggest.ngram_suggest <spylls.hunspell.algo.ngram_suggest.ngram_suggest>` current
misspelling, already found suggestions and settings from .aff file.
See :mod:`ngram_suggest <spylls.hunspell.algo.ngram_suggest>`.
Args:
word: Misspelled word
handled: List of already handled (known) suggestions; it is reused in
:meth:`ngram_suggest.filter_guesses <spylls.hunspell.algo.ngram_suggest.filter_guesses>`
to decide whether we add "not really good" ngram-based suggestions to result
"""
if self.aff.MAXNGRAMSUGS == 0:
return
yield from ngram_suggest.ngram_suggest(
word.lower(),
dictionary_words=self.words_for_ngram,
prefixes=self.aff.PFX, suffixes=self.aff.SFX,
known={*(word.lower() for word in handled)},
maxdiff=self.aff.MAXDIFF,
onlymaxdiff=self.aff.ONLYMAXDIFF,
has_phonetic=(self.aff.PHONE is not None))
def phonet_suggestions(self, word: str) -> Iterator[str]:
"""
Produces phonetical similarity-based suggestions, by passing to
:meth:`phonet_suggest.phonet_suggest <spylls.hunspell.algo.phonet_suggest.phonet_suggest>` current
misspelling and settings from .aff file.
See :mod:`phonet_suggest <spylls.hunspell.algo.phonet_suggest.phonet_suggest>`.
Args:
word: Misspelled word
"""
if not self.aff.PHONE:
return
yield from phonet_suggest.phonet_suggest(word, dictionary_words=self.words_for_ngram, table=self.aff.PHONE)
def use_dash(self) -> bool:
"""
Yeah, that's how hunspell defines whether words can be split by dash in this language:
either dash is explicitly mentioned in TRY directive, or TRY directive indicates the
language uses Latinic script. So dictionaries omiting TRY directive, or for languages with,
say, Cyrillic script not including "-" in it, will never suggest "foobar" => "foo-bar",
even if it is the perfectly normal way to spell.
"""
return '-' in self.aff.TRY or 'a' in self.aff.TRY