feat/ get_gender_es (MycroftAI#23)

Co-authored-by: jarbasal <jarbasai@mailfence.com>
ChanceNCounter · May 9, 2021 · a91dd6f · a91dd6f
1 parent 3041954
commit a91dd6f
Show file tree

Hide file tree

Showing 3 changed files with 92 additions and 23 deletions.
diff --git a/lingua_nostra/lang/common_data_es.py b/lingua_nostra/lang/common_data_es.py
@@ -51,7 +51,6 @@
     80: 'ochenta',
     90: 'noventa'
 }
-
 _STRING_NUM_ES = {
     "cero": 0,
     "un": 1,
@@ -117,8 +116,6 @@
     "novecientos": 900,
     "novecientas": 900,
     "mil": 1000}
-
-
 _FRACTION_STRING_ES = {
     2: 'medio',
     3: 'tercio',
@@ -171,8 +168,6 @@
     (1e336, "sexquinquagintillón"),
     (1e366, "unsexagintillón")
 ])
-
-
 _SHORT_SCALE_ES = OrderedDict([
     (100, 'centena'),
     (1000, 'millar'),
@@ -280,7 +275,6 @@
     1e3: "milésimo"
 }
 
-
 _SHORT_ORDINAL_STRING_ES = {
     1e6: "millonésimo",
     1e9: "milmillonésimo",
@@ -296,7 +290,6 @@
 }
 _SHORT_ORDINAL_STRING_ES.update(_ORDINAL_STRING_BASE_ES)
 
-
 _LONG_ORDINAL_STRING_ES = {
     1e6: "millonésimo",
     1e12: "billionth",
@@ -311,3 +304,41 @@
     # TODO > 1e60
 }
 _LONG_ORDINAL_STRING_ES.update(_ORDINAL_STRING_BASE_ES)
+
+# word-ending rules for gender
+_FEMALE_ENDINGS_ES = ["a", "as", "triz"]
+_MALE_ENDINGS_ES = ["o", "os"]
+
+# special cases: lookup for words the ending rules above miss or get wrong
+_GENDERS_ES = {
+    "mujer": "f",
+    "mujeres": "f",
+    "madre": "f",
+    "hombre": "m",
+    "padre": "m",
+    "mapa": "m",
+    "poema": "m",
+    "problema": "m",
+    "día": "m",
+    "moto": "f",
+    "radio": "f",
+    "mano": "f",
+    "foto": "f",
+    "amor": "m",
+    "corazón": "m",
+    "árbol": "m",
+    "canción": "f",
+    "ciudad": "f",
+    "flor": "f",
+    "jefe": "m",
+    "café": "m",
+    "baile": "m",
+    "gente": "f",
+    "serie": "f"
+}
+
+# context rules for gender: the determiner before a word marks its gender
+_MALE_DETERMINANTS_ES = ["lo", "los", "este", "estos", "ese", "esos",
+                         "un", "unos", "aquel", "aquellos", "el"]
+_FEMALE_DETERMINANTS_ES = ["la", "las", "esta", "estas", "esa", "esas",
+                           "una", "unas", "aquella", "aquellas"]
diff --git a/lingua_nostra/lang/parse_es.py b/lingua_nostra/lang/parse_es.py
@@ -19,7 +19,9 @@
 from lingua_nostra.time import now_local
 from lingua_nostra.lang.format_es import pronounce_number_es
 from lingua_nostra.lang.parse_common import *
-from lingua_nostra.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES
+from lingua_nostra.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES, \
+    _GENDERS_ES, _FEMALE_DETERMINANTS_ES, _MALE_DETERMINANTS_ES, \
+    _FEMALE_ENDINGS_ES, _MALE_ENDINGS_ES
 from lingua_nostra.parse import normalize_decimals
 
 
@@ -1106,23 +1108,34 @@ def get_gender_es(word, context=""):
         str: The code "m" (male), "f" (female) or "n" (neutral) for the gender,
              or None if unknown/or unused in the given language.
     """
-    # Next rules are imprecise and incompleted, but is a good starting point.
-    # For more detailed explanation, see
-    # http://www.wikilengua.org/index.php/Género_gramatical
-    word = word.rstrip("s")
-    gender = False
-    words = context.split(" ")
+    # parse gender taking context into account
+    word = word.lower()
+    words = context.lower().split(" ")
     for idx, w in enumerate(words):
         if w == word and idx != 0:
-            previous = words[idx - 1]
-            gender = get_gender_es(previous)
-            break
-    if not gender:
-        if word[-1] == "a":
-            gender = "f"
-        if word[-1] == "o" or word[-1] == "e":
-            gender = "m"
-    return gender
+            # in Spanish the preceding word (usually a determiner)
+            # typically marks the gender of the word that follows
+            previous = words[idx - 1].lower()
+            if previous in _MALE_DETERMINANTS_ES:
+                return "m"
+            elif previous in _FEMALE_DETERMINANTS_ES:
+                return "f"
+    # get gender using only the individual word
+    # see if this word has the gender defined
+    if word in _GENDERS_ES:
+        return _GENDERS_ES[word]
+    singular = word.rstrip("s")
+    if singular in _GENDERS_ES:
+        return _GENDERS_ES[singular]
+    # in Spanish the ending of a word usually indicates its gender;
+    # a determiner found in the context takes precedence over this rule
+    for end_str in _FEMALE_ENDINGS_ES:
+        if word.endswith(end_str):
+            return "f"
+    for end_str in _MALE_ENDINGS_ES:
+        if word.endswith(end_str):
+            return "m"
+    return None
 
 
 class SpanishNormalizer(Normalizer):

diff --git a/test/test_parse_es.py b/test/test_parse_es.py
@@ -21,6 +21,7 @@
                                  extract_datetime)
 from lingua_nostra.lang.parse_es import extract_datetime_es, is_fractional_es
 from lingua_nostra.time import default_timezone
+from lingua_nostra.parse import get_gender
 
 
 def setUpModule():
@@ -244,5 +245,29 @@ def test_extract_datetime_relative_failing(self):
             lang='es')[0], datetime(1997, 12, 29, 21))
 
 
+
+class TestExtractGender(unittest.TestCase):
+    def test_gender_es(self):
+        # words with well-defined grammatical gender rules
+        self.assertEqual(get_gender("vaca", lang="es"), "f")
+        self.assertEqual(get_gender("cabalo", lang="es"), "m")
+        self.assertEqual(get_gender("vacas", lang="es"), "f")
+
+        # words specifically defined in a lookup dictionary
+        self.assertEqual(get_gender("hombre", lang="es"), "m")
+        self.assertEqual(get_gender("mujer", lang="es"), "f")
+        self.assertEqual(get_gender("hombres", lang="es"), "m")
+        self.assertEqual(get_gender("mujeres", lang="es"), "f")
+
+        # words where gender rules do not work but context does
+        self.assertEqual(get_gender("buey", lang="es"), None)
+        self.assertEqual(get_gender("buey", "el buey come hierba", lang="es"), "m")
+        self.assertEqual(get_gender("hombre", "este hombre come bueyes",
+                                    lang="es"), "m")
+        self.assertEqual(get_gender("cantante", lang="es"), None)
+        self.assertEqual(get_gender("cantante", "esa cantante es muy buena",
+                                    lang="es"), "f")
+
+
 if __name__ == "__main__":
     unittest.main()