Skip to content

Commit

Permalink
Treat characters 'ÅåÄäÖö' as special when matching in the Swedish dictionary
Browse files Browse the repository at this point in the history

Resolves: #126

Similar for Norwegian, Danish, and Finnish.
  • Loading branch information
mike-fabian committed Aug 5, 2020
1 parent 113c329 commit 7cf78ee
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 40 deletions.
141 changes: 111 additions & 30 deletions engine/hunspell_suggest.py
Expand Up @@ -61,13 +61,103 @@
# letter of a word until the candidate lookup table pops up.
MAX_WORDS = 100

# Languages where accent insensitive matching makes sense.
#
# The value for each language code lists the characters which must NOT
# be treated as accented variants when matching: they are genuine
# letters of that language’s alphabet (e.g. 'åäö' in Swedish) and are
# therefore kept unchanged.  An empty value means all accents may be
# stripped for matching.
_KEEP_CHARS = {
    'da': 'æÆøØåÅ',
    'fi': 'åÅäÄöÖ',
    'nb': 'æÆøØåÅ',
    'nn': 'æÆøØåÅ',
    'sv': 'åÅäÄöÖ',
}

ACCENT_LANGUAGES = {
    language: _KEEP_CHARS.get(language, '')
    for language in (
        'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
        'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fi',
        'fo', 'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw',
        'hr', 'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky',
        'lb', 'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb',
        'nds', 'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt',
        'qu', 'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj',
        'sq', 'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk',
        'uz', 've', 'vi', 'wa', 'xh',
    )
}

class Dictionary:
'''A class to hold a hunspell dictionary
'''
def __init__(self, name='en_US'):
if DEBUG_LEVEL > 1:
LOGGER.debug('Dictionary.__init__(name=%s)\n', name)
self.name = name
self.language = self.name.split('_')[0]
self.dic_path = ''
self.encoding = 'UTF-8'
self.words = []
Expand All @@ -90,21 +180,10 @@ def load_dictionary(self):
self.encoding,
self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
if self.words:
# List of languages where accent insensitive matching makes sense:
accent_languages = (
'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fi', 'fo',
'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw', 'hr',
'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky', 'lb',
'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb', 'nds',
'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt', 'qu',
'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj', 'sq',
'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk', 'uz',
've', 'vi', 'wa', 'xh',
)
if self.name.split('_')[0] in accent_languages:
if self.language in ACCENT_LANGUAGES:
self.word_pairs = [
(x, itb_util.remove_accents(x))
(x, itb_util.remove_accents(
x, keep=ACCENT_LANGUAGES[self.language]))
for x in self.words
]
for word in self.words:
Expand Down Expand Up @@ -561,9 +640,6 @@ def suggest(self, input_phrase):
# make sure input_phrase is in the internal normalization form (NFD):
input_phrase = unicodedata.normalize(
itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
input_phrase_no_accents = unicodedata.normalize(
itb_util.NORMALIZATION_FORM_INTERNAL,
itb_util.remove_accents(input_phrase))
# But enchant and pyhunspell want NFC as input, make a copy in NFC:
input_phrase_nfc = unicodedata.normalize('NFC', input_phrase)

Expand All @@ -578,7 +654,11 @@ def suggest(self, input_phrase):
suggested_words.update([
(x[0], 0)
for x in dictionary.word_pairs
if x[1].startswith(input_phrase_no_accents)])
if x[1].startswith(
itb_util.remove_accents(
input_phrase,
keep=ACCENT_LANGUAGES[
dictionary.language]))])
else:
suggested_words.update([
(x, 0)
Expand All @@ -602,18 +682,19 @@ def suggest(self, input_phrase):
for x in
dictionary.spellcheck_suggest(input_phrase)
]
suggested_words.update([
(suggestion, -1)
for suggestion in extra_suggestions
if suggestion not in suggested_words])
for word in suggested_words:
if (suggested_words[word] == -1
and
itb_util.remove_accents(word)
== itb_util.remove_accents(input_phrase)):
# This spell checking correction is actually even
# an accent insensitive match, adjust accordingly:
suggested_words[word] = 0
for suggestion in extra_suggestions:
if suggestion not in suggested_words:
if (dictionary.word_pairs
and
itb_util.remove_accents(
suggestion,
keep=ACCENT_LANGUAGES[dictionary.language])
== itb_util.remove_accents(
input_phrase,
keep=ACCENT_LANGUAGES[dictionary.language])):
suggested_words[suggestion] = 0
else:
suggested_words[suggestion] = -1
sorted_suggestions = sorted(
suggested_words.items(),
key=lambda x: (
Expand Down
39 changes: 31 additions & 8 deletions engine/itb_util.py
Expand Up @@ -2602,17 +2602,19 @@ def is_ascii(text):
ord('ł'): 'l',
}

def remove_accents(text, keep=''):
    '''Removes accents from the text

    Returns the text with all accents removed, except for the
    characters listed in “keep”, which are left unchanged.

    Using “from unidecode import unidecode” is maybe more
    sophisticated, but I am not sure whether I can require
    “unidecode”. And maybe it cannot easily keep some accents for some
    languages.

    :param text: The text to change
    :type text: string
    :param keep: A string of characters which should be kept unchanged
    :type keep: string
    :return: The text with some or all accents removed
             in NORMALIZATION_FORM_INTERNAL
    :rtype: string

    Examples:

    >>> remove_accents('abcÅøßẞüxyz')
    'abcAossSSuxyz'

    >>> remove_accents('ÅÆ挜ijøßẞü')
    'AAEaeOEoeijossSSu'

    >>> unicodedata.normalize('NFC', remove_accents('abcÅøßẞüxyz', keep='åÅØø'))
    'abcÅøssSSuxyz'

    >>> unicodedata.normalize('NFC', remove_accents('alkoholförgiftning', keep='åÅÖö'))
    'alkoholförgiftning'
    '''
    if not keep:
        # Fast path: decompose, drop all combining marks (category 'Mn'),
        # then transliterate remaining special letters (ø, ß, …) via
        # TRANS_TABLE.
        result = ''.join([
            x for x in unicodedata.normalize('NFKD', text)
            if unicodedata.category(x) != 'Mn']).translate(TRANS_TABLE)
        return unicodedata.normalize(NORMALIZATION_FORM_INTERNAL, result)
    result = ''
    # Normalize both the text and the “keep” list to NFC so that a
    # precomposed character like 'Å' is compared as a single code point
    # against the characters to keep:
    keep = unicodedata.normalize('NFC', keep)
    for char in unicodedata.normalize('NFC', text):
        if char in keep:
            result += char
            continue
        result += ''.join([
            x for x in unicodedata.normalize('NFKD', char)
            if unicodedata.category(x) != 'Mn']).translate(TRANS_TABLE)
    return unicodedata.normalize(NORMALIZATION_FORM_INTERNAL, result)

def is_right_to_left_messages():
'''
Expand Down
35 changes: 33 additions & 2 deletions tests/test_hunspell_suggest.py
Expand Up @@ -189,8 +189,10 @@ def test_fi_FI_dictionary_file(self):
('kissajuttu', 0),
('kissamaiseksi',0)])
self.assertEqual(
h.suggest('Pariisin-suurlahettila'),
[('Pariisin-suurla\u0308hettila\u0308s', 0)])
h.suggest('Pariisin-suurlähettila'),
[('Pariisin-suurla\u0308hettila\u0308s', 0),
('Pariisin-suurlähetetila', -1),
('Pariisin-suurlähettiala', -1)])

@unittest.skipUnless(
IMPORT_LIBVOIKKO_SUCCESSFUL,
Expand Down Expand Up @@ -259,5 +261,34 @@ def test_fi_FI_spellcheck_suggest_voikko(self):
d.spellcheck_suggest_voikko('kisssa'),
['kissa', 'kissaa', 'kisassa', 'kisussa'])

@unittest.skipUnless(
    itb_util.get_hunspell_dictionary_wordlist('sv_SE')[0],
    "Skipping because no Swedish dictionary could be found. ")
def test_sv_SE(self):
    '''Accent handling when matching in the Swedish dictionary.

    'åÅäÄöÖ' are genuine letters of the Swedish alphabet and must be
    treated as distinct when typed; all other accents are matched
    insensitively.
    '''
    h = hunspell_suggest.Hunspell(['sv_SE'])
    # A plain 'o' matches both 'o' and 'ö' (accent insensitive):
    self.assertEqual(
        h.suggest('östgo'),
        [('östgot', 0),
         ('östgöte', 0),
         ('östgotisk', 0),
         ('östgötsk', 0),
         ('östgötska', 0)])
    self.assertEqual(
        h.suggest('östgot'),
        [('östgot', 0),
         ('östgotisk', 0),
         ('Östgot', -1)])
    # An explicit 'ö' matches only 'ö', not 'o':
    self.assertEqual(
        h.suggest('östgö'),
        [('östgöte', 0),
         ('östgötsk', 0),
         ('östgötska', 0)])
    self.assertEqual(
        h.suggest('östgöt')[0:4],
        [('östgöte', 0),
         ('östgötsk', 0),
         ('östgötska', 0),
         ('östgot', -1)])

if __name__ == '__main__':
unittest.main()
34 changes: 34 additions & 0 deletions tests/test_itb_util.py
Expand Up @@ -24,6 +24,7 @@
import sys
import locale
import unittest
import unicodedata

from gi import require_version
require_version('IBus', '1.0')
Expand Down Expand Up @@ -65,5 +66,38 @@ def test_is_right_to_left_messages(self):
locale.setlocale(locale.LC_ALL, 'C')
self.assertEqual(itb_util.is_right_to_left_messages(), False)

def test_remove_accents(self):
    '''remove_accents() strips combining marks and transliterates,
    keeping only the characters listed in “keep” unchanged; the input
    and the “keep” argument may be in NFC or NFD form.
    '''
    # Without keep= every accent is removed (result is ASCII here,
    # so no output normalization is needed for the comparison):
    for form in ('NFC', 'NFD'):
        self.assertEqual(
            itb_util.remove_accents(
                unicodedata.normalize(form, 'abcÅøßẞüxyz')),
            'abcAossSSuxyz')
    # With keep=, the listed characters survive; text and keep are
    # given in the same normalization form:
    keep_cases = [
        ('abcÅøßẞüxyz', 'åÅØø', 'abcÅøssSSuxyz'),
        ('alkoholförgiftning', 'åÅÖö', 'alkoholförgiftning'),
    ]
    for text, keep, expected in keep_cases:
        for form in ('NFC', 'NFD'):
            self.assertEqual(
                unicodedata.normalize(
                    'NFC',
                    itb_util.remove_accents(
                        unicodedata.normalize(form, text),
                        keep=unicodedata.normalize(form, keep))),
                expected)

if __name__ == '__main__':
unittest.main()

0 comments on commit 7cf78ee

Please sign in to comment.