Skip to content

Commit

Permalink
feat/ get_gender_es (MycroftAI#23)
Browse files Browse the repository at this point in the history
Co-authored-by: jarbasal <jarbasai@mailfence.com>
  • Loading branch information
JarbasAl and JarbasAl committed May 9, 2021
1 parent 3041954 commit a91dd6f
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 23 deletions.
45 changes: 38 additions & 7 deletions lingua_nostra/lang/common_data_es.py
Expand Up @@ -51,7 +51,6 @@
80: 'ochenta',
90: 'noventa'
}

_STRING_NUM_ES = {
"cero": 0,
"un": 1,
Expand Down Expand Up @@ -117,8 +116,6 @@
"novecientos": 900,
"novecientas": 900,
"mil": 1000}


_FRACTION_STRING_ES = {
2: 'medio',
3: 'tercio',
Expand Down Expand Up @@ -171,8 +168,6 @@
(1e336, "sexquinquagintillón"),
(1e366, "unsexagintillón")
])


_SHORT_SCALE_ES = OrderedDict([
(100, 'centena'),
(1000, 'millar'),
Expand Down Expand Up @@ -280,7 +275,6 @@
1e3: "milésimo"
}


_SHORT_ORDINAL_STRING_ES = {
1e6: "millonésimo",
1e9: "milmillonésimo",
Expand All @@ -296,7 +290,6 @@
}
_SHORT_ORDINAL_STRING_ES.update(_ORDINAL_STRING_BASE_ES)


_LONG_ORDINAL_STRING_ES = {
1e6: "millonésimo",
1e12: "billionth",
Expand All @@ -311,3 +304,41 @@
# TODO > 1e60
}
_LONG_ORDINAL_STRING_ES.update(_ORDINAL_STRING_BASE_ES)

# Suffix heuristics: a word ending in one of these strings is taken to have
# the corresponding grammatical gender.
_FEMALE_ENDINGS_ES = [
    "a",
    "as",
    "triz",
]
_MALE_ENDINGS_ES = [
    "o",
    "os",
]

# Explicit word -> gender code ("f" / "m") table for words that the suffix
# heuristics above either do not reach or would classify differently.
_GENDERS_ES = {
    # people / kinship terms without a matching ending
    "mujer": "f",
    "mujeres": "f",
    "madre": "f",
    "hombre": "m",
    "padre": "m",
    # end in "a" yet are listed as masculine
    "mapa": "m",
    "poema": "m",
    "problema": "m",
    "día": "m",
    # end in "o" yet are listed as feminine
    "moto": "f",
    "radio": "f",
    "mano": "f",
    "foto": "f",
    # endings not covered by either suffix list
    "amor": "m",
    "corazón": "m",
    "árbol": "m",
    "canción": "f",
    "ciudad": "f",
    "flor": "f",
    "jefe": "m",
    "café": "m",
    "baile": "m",
    "gente": "f",
    "serie": "f",
}

# Context heuristics: determiners grouped by the gender they carry, so the
# word that precedes a noun can be used to decide the noun's gender.
_MALE_DETERMINANTS_ES = [
    "lo", "los",
    "este", "estos",
    "ese", "esos",
    "un", "unos",
    "aquel", "aquellos",
    "el",
]
_FEMALE_DETERMINANTS_ES = [
    "la", "las",
    "esta", "estas",
    "esa", "esas",
    "una", "unas",
    "aquella", "aquellas",
]
45 changes: 29 additions & 16 deletions lingua_nostra/lang/parse_es.py
Expand Up @@ -19,7 +19,9 @@
from lingua_nostra.time import now_local
from lingua_nostra.lang.format_es import pronounce_number_es
from lingua_nostra.lang.parse_common import *
from lingua_nostra.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES
from lingua_nostra.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES, \
_GENDERS_ES, _FEMALE_DETERMINANTS_ES, _MALE_DETERMINANTS_ES, \
_FEMALE_ENDINGS_ES, _MALE_ENDINGS_ES
from lingua_nostra.parse import normalize_decimals


Expand Down Expand Up @@ -1106,23 +1108,34 @@ def get_gender_es(word, context=""):
str: The code "m" (male), "f" (female) or "n" (neutral) for the gender,
or None if unknown/or unused in the given language.
"""
# Next rules are imprecise and incompleted, but is a good starting point.
# For more detailed explanation, see
# http://www.wikilengua.org/index.php/Género_gramatical
word = word.rstrip("s")
gender = False
words = context.split(" ")
# parse gender taking context into account
word = word.lower()
words = context.lower().split(" ")
for idx, w in enumerate(words):
if w == word and idx != 0:
previous = words[idx - 1]
gender = get_gender_es(previous)
break
if not gender:
if word[-1] == "a":
gender = "f"
if word[-1] == "o" or word[-1] == "e":
gender = "m"
return gender
# in spanish usually the previous word (a determinant)
# assigns gender to the next word
previous = words[idx - 1].lower()
if previous in _MALE_DETERMINANTS_ES:
return "m"
elif previous in _FEMALE_DETERMINANTS_ES:
return "f"
# get gender using only the individual word
# see if this word has the gender defined
if word in _GENDERS_ES:
return _GENDERS_ES[word]
singular = word.rstrip("s")
if singular in _GENDERS_ES:
return _GENDERS_ES[singular]
# in spanish the last vowel usually defines the gender of a word
# the gender of the determinant takes precedence over this rule
for end_str in _FEMALE_ENDINGS_ES:
if word.endswith(end_str):
return "f"
for end_str in _MALE_ENDINGS_ES:
if word.endswith(end_str):
return "m"
return None


class SpanishNormalizer(Normalizer):
Expand Down
25 changes: 25 additions & 0 deletions test/test_parse_es.py
Expand Up @@ -21,6 +21,7 @@
extract_datetime)
from lingua_nostra.lang.parse_es import extract_datetime_es, is_fractional_es
from lingua_nostra.time import default_timezone
from lingua_nostra.parse import get_gender


def setUpModule():
Expand Down Expand Up @@ -244,5 +245,29 @@ def test_extract_datetime_relative_failing(self):
lang='es')[0], datetime(1997, 12, 29, 21))



class TestExtractGender(unittest.TestCase):
    """Tests for ``get_gender`` called with ``lang="es"``."""

    def test_gender_es(self):
        """Check single-word lookups and lookups that pass a context."""
        # words with well defined grammatical gender rules
        self.assertEqual(get_gender("vaca", lang="es"), "f")
        # "caballo" (horse); the earlier spelling "cabalo" is not Spanish
        self.assertEqual(get_gender("caballo", lang="es"), "m")
        self.assertEqual(get_gender("vacas", lang="es"), "f")

        # words specifically defined in a lookup dictionary
        self.assertEqual(get_gender("hombre", lang="es"), "m")
        self.assertEqual(get_gender("mujer", lang="es"), "f")
        self.assertEqual(get_gender("hombres", lang="es"), "m")
        self.assertEqual(get_gender("mujeres", lang="es"), "f")

        # words where gender rules do not work but context does
        self.assertIsNone(get_gender("buey", lang="es"))
        self.assertEqual(
            get_gender("buey", "el buey come hierba", lang="es"), "m")
        self.assertEqual(
            get_gender("hombre", "este hombre come bueyes", lang="es"), "m")
        self.assertIsNone(get_gender("cantante", lang="es"))
        self.assertEqual(
            get_gender("cantante", "esa cantante es muy buena", lang="es"),
            "f")


if __name__ == "__main__":
unittest.main()

0 comments on commit a91dd6f

Please sign in to comment.