Skip to content

Commit

Permalink
Treat characters 'ÅåÄäÖö' as special when matching in the Swedish dictionary
Browse files Browse the repository at this point in the history

Resolves: #126

Similar for Norwegian, Danish, and Finnish.
  • Loading branch information
mike-fabian committed Aug 5, 2020
1 parent 113c329 commit 7cf78ee
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 40 deletions.
141 changes: 111 additions & 30 deletions engine/hunspell_suggest.py
Expand Up @@ -61,13 +61,103 @@
# letter of a word until the candidate lookup table pops up.
MAX_WORDS = 100

# Languages where accent insensitive matching makes sense.
#
# The value for each language code lists the characters which must NOT
# be treated as accented variants when matching: they are genuine
# letters of that language’s alphabet (e.g. 'åäö' in Swedish) and are
# therefore kept unchanged.  An empty value means all accents may be
# stripped for matching.
_KEEP_CHARS = {
    'da': 'æÆøØåÅ',
    'fi': 'åÅäÄöÖ',
    'nb': 'æÆøØåÅ',
    'nn': 'æÆøØåÅ',
    'sv': 'åÅäÄöÖ',
}

ACCENT_LANGUAGES = {
    language: _KEEP_CHARS.get(language, '')
    for language in (
        'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
        'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fi',
        'fo', 'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw',
        'hr', 'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky',
        'lb', 'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb',
        'nds', 'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt',
        'qu', 'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj',
        'sq', 'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk',
        'uz', 've', 'vi', 'wa', 'xh',
    )
}

class Dictionary:
'''A class to hold a hunspell dictionary
'''
def __init__(self, name='en_US'):
if DEBUG_LEVEL > 1:
LOGGER.debug('Dictionary.__init__(name=%s)\n', name)
self.name = name
self.language = self.name.split('_')[0]
self.dic_path = ''
self.encoding = 'UTF-8'
self.words = []
Expand All @@ -90,21 +180,10 @@ def load_dictionary(self):
self.encoding,
self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
if self.words:
# List of languages where accent insensitive matching makes sense:
accent_languages = (
'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fi', 'fo',
'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw', 'hr',
'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky', 'lb',
'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb', 'nds',
'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt', 'qu',
'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj', 'sq',
'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk', 'uz',
've', 'vi', 'wa', 'xh',
)
if self.name.split('_')[0] in accent_languages:
if self.language in ACCENT_LANGUAGES:
self.word_pairs = [
(x, itb_util.remove_accents(x))
(x, itb_util.remove_accents(
x, keep=ACCENT_LANGUAGES[self.language]))
for x in self.words
]
for word in self.words:
Expand Down Expand Up @@ -561,9 +640,6 @@ def suggest(self, input_phrase):
# make sure input_phrase is in the internal normalization form (NFD):
input_phrase = unicodedata.normalize(
itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
input_phrase_no_accents = unicodedata.normalize(
itb_util.NORMALIZATION_FORM_INTERNAL,
itb_util.remove_accents(input_phrase))
# But enchant and pyhunspell want NFC as input, make a copy in NFC:
input_phrase_nfc = unicodedata.normalize('NFC', input_phrase)

Expand All @@ -578,7 +654,11 @@ def suggest(self, input_phrase):
suggested_words.update([
(x[0], 0)
for x in dictionary.word_pairs
if x[1].startswith(input_phrase_no_accents)])
if x[1].startswith(
itb_util.remove_accents(
input_phrase,
keep=ACCENT_LANGUAGES[
dictionary.language]))])
else:
suggested_words.update([
(x, 0)
Expand All @@ -602,18 +682,19 @@ def suggest(self, input_phrase):
for x in
dictionary.spellcheck_suggest(input_phrase)
]
suggested_words.update([
(suggestion, -1)
for suggestion in extra_suggestions
if suggestion not in suggested_words])
for word in suggested_words:
if (suggested_words[word] == -1
and
itb_util.remove_accents(word)
== itb_util.remove_accents(input_phrase)):
# This spell checking correction is actually even
# an accent insensitive match, adjust accordingly:
suggested_words[word] = 0
for suggestion in extra_suggestions:
if suggestion not in suggested_words:
if (dictionary.word_pairs
and
itb_util.remove_accents(
suggestion,
keep=ACCENT_LANGUAGES[dictionary.language])
== itb_util.remove_accents(
input_phrase,
keep=ACCENT_LANGUAGES[dictionary.language])):
suggested_words[suggestion] = 0
else:
suggested_words[suggestion] = -1
sorted_suggestions = sorted(
suggested_words.items(),
key=lambda x: (
Expand Down
39 changes: 31 additions & 8 deletions engine/itb_util.py
Expand Up @@ -2602,17 +2602,19 @@ def is_ascii(text):
ord('ł'): 'l',
}

def remove_accents(text, keep=''):
    '''Removes accents from the text

    Returns the text with all accents removed, except for the
    characters listed in “keep”, which are left unchanged.

    Using “from unidecode import unidecode” is maybe more
    sophisticated, but I am not sure whether I can require
    “unidecode”. And maybe it cannot easily keep some accents for some
    languages.

    :param text: The text to change
    :type text: string
    :param keep: A string of characters which should be kept unchanged
    :type keep: string
    :return: The text with some or all accents removed
             in NORMALIZATION_FORM_INTERNAL
    :rtype: string

    Examples:

    >>> remove_accents('abcÅøßẞüxyz')
    'abcAossSSuxyz'

    >>> remove_accents('ÅÆ挜ijøßẞü')
    'AAEaeOEoeijossSSu'

    >>> unicodedata.normalize('NFC', remove_accents('abcÅøßẞüxyz', keep='åÅØø'))
    'abcÅøssSSuxyz'

    >>> unicodedata.normalize('NFC', remove_accents('alkoholförgiftning', keep='åÅÖö'))
    'alkoholförgiftning'
    '''
    if not keep:
        # Fast path: decompose, drop all combining marks (category 'Mn'),
        # then transliterate remaining special letters (ø, ß, …) via
        # TRANS_TABLE.
        result = ''.join([
            x for x in unicodedata.normalize('NFKD', text)
            if unicodedata.category(x) != 'Mn']).translate(TRANS_TABLE)
        return unicodedata.normalize(NORMALIZATION_FORM_INTERNAL, result)
    result = ''
    # Normalize both the text and the “keep” list to NFC so that a
    # precomposed character like 'Å' is compared as a single code point
    # against the characters to keep:
    keep = unicodedata.normalize('NFC', keep)
    for char in unicodedata.normalize('NFC', text):
        if char in keep:
            result += char
            continue
        result += ''.join([
            x for x in unicodedata.normalize('NFKD', char)
            if unicodedata.category(x) != 'Mn']).translate(TRANS_TABLE)
    return unicodedata.normalize(NORMALIZATION_FORM_INTERNAL, result)

def is_right_to_left_messages():
'''
Expand Down
35 changes: 33 additions & 2 deletions tests/test_hunspell_suggest.py
Expand Up @@ -189,8 +189,10 @@ def test_fi_FI_dictionary_file(self):
('kissajuttu', 0),
('kissamaiseksi',0)])
self.assertEqual(
h.suggest('Pariisin-suurlahettila'),
[('Pariisin-suurla\u0308hettila\u0308s', 0)])
h.suggest('Pariisin-suurlähettila'),
[('Pariisin-suurla\u0308hettila\u0308s', 0),
('Pariisin-suurlähetetila', -1),
('Pariisin-suurlähettiala', -1)])

@unittest.skipUnless(
IMPORT_LIBVOIKKO_SUCCESSFUL,
Expand Down Expand Up @@ -259,5 +261,34 @@ def test_fi_FI_spellcheck_suggest_voikko(self):
d.spellcheck_suggest_voikko('kisssa'),
['kissa', 'kissaa', 'kisassa', 'kisussa'])

@unittest.skipUnless(
    itb_util.get_hunspell_dictionary_wordlist('sv_SE')[0],
    "Skipping because no Swedish dictionary could be found. ")
def test_sv_SE(self):
    '''Accent handling when matching in the Swedish dictionary.

    'åÅäÄöÖ' are genuine letters of the Swedish alphabet and must be
    treated as distinct when typed; all other accents are matched
    insensitively.
    '''
    h = hunspell_suggest.Hunspell(['sv_SE'])
    # A plain 'o' matches both 'o' and 'ö' (accent insensitive):
    self.assertEqual(
        h.suggest('östgo'),
        [('östgot', 0),
         ('östgöte', 0),
         ('östgotisk', 0),
         ('östgötsk', 0),
         ('östgötska', 0)])
    self.assertEqual(
        h.suggest('östgot'),
        [('östgot', 0),
         ('östgotisk', 0),
         ('Östgot', -1)])
    # An explicit 'ö' matches only 'ö', not 'o':
    self.assertEqual(
        h.suggest('östgö'),
        [('östgöte', 0),
         ('östgötsk', 0),
         ('östgötska', 0)])
    self.assertEqual(
        h.suggest('östgöt')[0:4],
        [('östgöte', 0),
         ('östgötsk', 0),
         ('östgötska', 0),
         ('östgot', -1)])

if __name__ == '__main__':
unittest.main()
34 changes: 34 additions & 0 deletions tests/test_itb_util.py
Expand Up @@ -24,6 +24,7 @@
import sys
import locale
import unittest
import unicodedata

from gi import require_version
require_version('IBus', '1.0')
Expand Down Expand Up @@ -65,5 +66,38 @@ def test_is_right_to_left_messages(self):
locale.setlocale(locale.LC_ALL, 'C')
self.assertEqual(itb_util.is_right_to_left_messages(), False)

def test_remove_accents(self):
    '''remove_accents() strips combining marks and transliterates,
    keeping only the characters listed in “keep” unchanged; the input
    and the “keep” argument may be in NFC or NFD form.
    '''
    # Without keep= every accent is removed (result is ASCII here,
    # so no output normalization is needed for the comparison):
    for form in ('NFC', 'NFD'):
        self.assertEqual(
            itb_util.remove_accents(
                unicodedata.normalize(form, 'abcÅøßẞüxyz')),
            'abcAossSSuxyz')
    # With keep=, the listed characters survive; text and keep are
    # given in the same normalization form:
    keep_cases = [
        ('abcÅøßẞüxyz', 'åÅØø', 'abcÅøssSSuxyz'),
        ('alkoholförgiftning', 'åÅÖö', 'alkoholförgiftning'),
    ]
    for text, keep, expected in keep_cases:
        for form in ('NFC', 'NFD'):
            self.assertEqual(
                unicodedata.normalize(
                    'NFC',
                    itb_util.remove_accents(
                        unicodedata.normalize(form, text),
                        keep=unicodedata.normalize(form, keep))),
                expected)

if __name__ == '__main__':
unittest.main()

0 comments on commit 7cf78ee

Please sign in to comment.