# Spanish Conjugations

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd

In [3]:
import json

In [4]:
import re

Load this [conjugations spreadsheet](https://github.com/ghidinelli/fred-jehle-spanish-verbs) converted to CSV:

In [5]:
# Read the conjugation table; missing cells are replaced with empty strings.
conjugations = pd.read_csv('conjugations.csv', encoding='utf-8').fillna('')

In [6]:
# Read the per-verb table; later cells use its 'verbo', 'trad_en', 'comun',
# 'reflexivo' and 'irregular' columns.
meta = pd.read_csv('meta.csv', encoding='utf-8')

Check for any duplicate translations:

In [7]:
def check_meta():
    """Print a line for every repeated English translation in ``meta``.

    Rows are visited in order. The first row index at which each ``trad_en``
    value occurs is remembered; every later row carrying the same value is
    reported together with that first index. Nothing is returned.
    """
    first_index = {}
    for idx, record in meta.iterrows():
        translation = record['trad_en']
        if translation not in first_index:
            # First occurrence: remember where it was seen and move on.
            first_index[translation] = idx
            continue
        print(f'Duplicate ({first_index[translation]}, {idx}): "{translation}"')

check_meta()

Define the pronoun headings used in the `conjugations` table:

In [8]:
# Column names of the `conjugations` table that hold the conjugated forms,
# one column per grammatical person.
tpronouns = (
    'yo', 'tu', 'el',
    'ns', 'vs', 'ellos',
)
# Number of persons per tense.
npronouns = len(tpronouns)

Define the pronouns to use in the applications:

In [9]:
# Pronoun labels exported to the application (six entries, one per person).
# Spelling fixed: 'nosotros' (was 'nosostros') and 'ustedes' (was 'udstedes').
apronouns = ['yo', 'tú', 'él/ella/usted', 'nosotros', 'vosotros', 'ellos/ellas/ustedes']

Tabulate the list of possible tenses, each identified by the key `mood,time`, in the order they appear. Note that reflexive verbs use one extra tense.

In [10]:
def get_tenses():
    """Return the ordered list of 'modo,tiempo' keys for the verb 'aburrirse'.

    A reflexive verb is used as the reference because reflexive verbs carry
    two rows for 'Indicativo,Presente progresivo', so their rows give the
    longest tense list. Keys are returned in table order and may repeat.
    """
    reference_rows = conjugations[conjugations['verbo'] == 'aburrirse']
    return [
        f'{modo},{tiempo}'
        for modo, tiempo in zip(reference_rows['modo'], reference_rows['tiempo'])
    ]

In [11]:
# Build the tense list once; get_regular() and build_dictionary() read this
# module-level name. Print it so the order can be checked by eye.
tenses = get_tenses()
print(tenses)

['Indicativo,Presente', 'Indicativo,Pretérito', 'Indicativo,Futuro', 'Condicional,Condicional', 'Indicativo,Imperfecto', 'Indicativo,Presente progresivo', 'Indicativo,Presente progresivo', 'Indicativo,Pretérito perfecto', 'Indicativo,Pluscuamperfecto', 'Indicativo,Futuro perfecto', 'Condicional,Condicional perfecto', 'Indicativo,Pretérito anterior', 'Subjuntivo,Subjuntivo presente', 'Subjuntivo,Subjuntivo imperfecto', 'Subjuntivo,Subjuntivo imperfecto', 'Subjuntivo,Subjuntivo futuro', 'Subjuntivo,Subjuntivo pretérito perfecto', 'Subjuntivo,Subjuntivo pluscuamperfecto', 'Subjuntivo,Subjuntivo pluscuamperfecto', 'Subjuntivo,Subjuntivo futuro perfecto', 'Imperativo,Imperativo positivo', 'Imperativo,Imperativo negativo']


Tabulate the regular conjugations of each tense for both non-reflexive and reflexive verbs:

In [12]:
def get_regular(prototypes='hablar,comer,vivir,afeitarse,esconderse,aburrirse'):
    """Tabulate the conjugation pattern of each prototype verb.

    Parameters
    ----------
    prototypes : str
        Comma-separated infinitives. A name ending in 'se' is treated as
        reflexive and contributes a four-letter ending (e.g. 'arse');
        any other name contributes a two-letter ending (e.g. 'ar').

    Returns
    -------
    dict
        Maps each ending to a list aligned with ``tenses``. Each entry is
        the list of forms for the columns in ``tpronouns`` with the verb stem
        replaced by '_'. For non-reflexive prototypes, position 6 holds an
        empty list as a placeholder for the tense they do not have.

    Raises
    ------
    AssertionError
        If a row's 'modo,tiempo' key does not match ``tenses`` at its position.
    """
    table = {}
    for verb in prototypes.split(','):
        is_reflexive = verb.endswith('se')
        suffix_len = 4 if is_reflexive else 2
        stem = verb[:-suffix_len]
        ending = verb[-suffix_len:]
        stem_re = re.compile(f'\\b{stem}')
        verb_rows = conjugations[conjugations['verbo'] == verb]
        forms = []
        for _, row in verb_rows.iterrows():
            if len(forms) == 6 and not is_reflexive:
                # Skip the missing non-reflexive tense.
                forms.append([])
            # The current length of `forms` is the position within `tenses`.
            assert tenses[len(forms)] == f'{row["modo"]},{row["tiempo"]}'
            forms.append([stem_re.sub('_', row[p]) for p in tpronouns])
        table[ending] = forms
    return table

In [13]:
# Regular patterns keyed by infinitive ending; build_dictionary() compares
# every verb against these and also exports them.
regular = get_regular()

Build the dictionary used internally by the application:

In [14]:
def build_dictionary(maxverbs=None):
    """Build the data structure consumed by the application.

    Walks the rows of ``meta`` in order. For each verb, its rows in
    ``conjugations`` are compared tense by tense against the regular pattern
    for its ending (from ``regular``), and regular forms are replaced by
    short markers. A summary of how many verbs, tenses and pronoun forms were
    regular is printed at the end.

    Parameters
    ----------
    maxverbs : int or None
        Stop after converting this many verbs. ``None`` (the default)
        converts every row of ``meta``.

    Returns
    -------
    dict
        ``pronouns``: the ``apronouns`` list.
        ``regular``: the ``regular`` table.
        ``conjugations``: one ``dict(info=..., conjs=...)`` per verb, where
        ``info`` holds ``name``, ``com`` and ``en``, and ``conjs`` is either
        ``'$'`` (every tense regular) or a list aligned with ``tenses`` whose
        entries are ``'*'`` (regular tense), ``[]`` (placeholder for the tense
        non-reflexive verbs lack), or a per-pronoun list in which regular
        forms are replaced by ``'*'``.

    Raises
    ------
    AssertionError
        On a duplicate verb in ``meta``, an unexpected infinitive ending, an
        unexpected number of tense rows, a tense key out of order, or a verb
        found fully regular while flagged irregular in ``meta``.
    """
    conjdata = []
    seen_verbs = set()
    nverb = ntense = npronoun = 0
    nreg_verb = nreg_tense = nreg_pronoun = 0
    for _, metarow in meta.iterrows():
        if nverb == maxverbs:
            break
        name = infinitive = metarow['verbo']
        # Track names in a set: the `in` operator on a DataFrame tests its
        # column labels, not its cell values, so it cannot detect duplicates.
        assert name not in seen_verbs, f'Found duplicate verb {name}'
        seen_verbs.add(name)
        reflexive = bool(metarow['reflexivo'])
        if reflexive:
            # Remove -se suffix from infinitive of reflexive verbs.
            assert infinitive[-2:] == 'se', f'Unexpected reflexive ending for {infinitive}'
            infinitive = infinitive[:-2]
            ntenses = len(tenses)
        else:
            # Non-reflexive verbs have one tense fewer than the full list.
            ntenses = len(tenses) - 1
        assert infinitive[-2:] in ('ar', 'er', 'ir', 'ír'), f'Unexpected ending for {infinitive}'
        stem, ending = infinitive[:-2], infinitive[-2:]
        if ending == 'ír': # remove the accent
            ending = 'ir'
        if reflexive:
            ending += 'se'
        # The verbs "ir" and "irse" have an empty stem and are left untouched.
        # The stem is escaped so it is always matched literally.
        stem_pattern = re.compile(f'\\b{re.escape(stem)}') if stem else None
        info = dict(name=name, com=metarow['comun'], en=metarow['trad_en'])
        conjs = []
        tense_idx = 0
        irregular = False
        sel = (conjugations['verbo'] == name)
        # Summing a boolean mask counts its True entries.
        assert int(sel.sum()) == ntenses, f'Unexpected number of tenses for {name}'
        for _, row in conjugations[sel].iterrows():
            key = f'{row["modo"]},{row["tiempo"]}'
            if tense_idx == 6 and not reflexive:
                # Skip the missing non-reflexive tense.
                conjs.append([])
                tense_idx += 1
            assert tenses[tense_idx] == key
            # Replace the verb stem with "_" for each pronoun.
            if stem_pattern is None:
                conj = [row[p] for p in tpronouns]
            else:
                conj = [stem_pattern.sub('_', row[p]) for p in tpronouns]
            # Flag any regular conjugations.
            reg = regular[ending][tense_idx]
            if conj == reg:
                conj = '*'
                nreg_tense += 1
                nreg_pronoun += npronouns
            else:
                irregular = True
                # Check for individual pronouns with regular conjugations.
                for j in range(npronouns):
                    if conj[j] == reg[j]:
                        conj[j] = '*'
                        nreg_pronoun += 1
            tense_idx += 1
            ntense += 1
            npronoun += npronouns
            conjs.append(conj)
        assert len(conjs) == len(tenses)
        if not irregular:
            # All tenses of this verb are regular.
            conjs = '$'
            nreg_verb += 1
            # Check that this verb is not flagged as irregular.
            assert not metarow['irregular']
        conjdata.append(dict(info=info, conjs=conjs))
        nverb += 1

    def report(count, total, what):
        # Guard the percentage so an empty run does not divide by zero.
        percent = 100. * count / total if total else 0.
        print(f'{count} / {total} ({percent:.1f}%) {what} are regular.')

    print(f'Converted {nverb} verbs.')
    report(nreg_verb, nverb, 'verbs')
    report(nreg_tense, ntense, 'tenses')
    report(nreg_pronoun, npronoun, 'pronouns')
    return dict(pronouns=apronouns, regular=regular, conjugations=conjdata)

# Convert every verb in `meta` (maxverbs defaults to None, so there is no cap).
d = build_dictionary()

Converted 637 verbs.
287 / 637 (45.1%) verbs are regular.
11269 / 13448 (83.8%) tenses are regular.
70534 / 80688 (87.4%) pronouns are regular.


Save an indented version for debugging and a whitespace-compressed version for use in the application:

In [15]:
# Indented copy of the data, wrapped in an initData(...) call.
with open('conjugo_data_indented.js', mode='w', encoding='utf8') as f:
    f.write('initData(' + json.dumps(d, indent=2, ensure_ascii=False) + ')')

In [16]:
# Compact copy: separators without spaces remove all optional whitespace.
with open('conjugo_data.js', mode='w', encoding='utf8') as f:
    f.write('initData(' + json.dumps(d, separators=(",", ":"), ensure_ascii=False) + ')')

In [17]:
!ls -lh conjugo_data*.js

-rw-r--r--+ 1 david  staff   220K Jul 28 17:38 conjugo_data.js
-rw-r--r--+ 1 david  staff   498K Jul 28 17:38 conjugo_data_indented.js
