In [17]:
import nltk
import numpy as np
from nltk.tokenize import word_tokenize

# Entrenamiento

In [2]:
from conllu import parse_incr

In [3]:
word_list = []
with open('../data/UD_Spanish-AnCora/es_ancora-ud-dev.conllu','r',encoding='utf-8') as f:
    for token_list in parse_incr(f):
        word_list.append(token_list)

In [4]:
word_list[0][0]

{'id': 1,
 'form': 'El',
 'lemma': 'el',
 'upos': 'DET',
 'xpos': None,
 'feats': {'Definite': 'Def',
  'Gender': 'Masc',
  'Number': 'Sing',
  'PronType': 'Art'},
 'head': 2,
 'deprel': 'det',
 'deps': None,
 'misc': None}

### Entrenamiento del modelo - cálculo de probabilidades

$$P(t_i|w_i, t_{i-1}) = \frac{C(t_i, w_i, t_{i-1})}{C(w_i, t_{i-1})}$$
* `Prob_dict` = $\large C(t_i, w_i, t_{i-1})$
* `Context_dict` = $\large C(w_i, t_{i-1})$

In [5]:
def counter_dict(dict_t,tag):
    if tag in dict_t.keys():
        dict_t[tag] += 1
    else:
        dict_t[tag] = 1
    return dict_t

In [111]:
Prob_dict = {}
Context_dict = {}
Unique_prob = {}

with open('../data/UD_Spanish-AnCora/es_ancora-ud-train.conllu','r',encoding='utf-8') as f:
    for token_list in parse_incr(f):
        pre_tag = 'NONE'
        for token in token_list:
        # Prob_dict
            tag = token['upos']+'|'+token['form'].lower()+','+ pre_tag
            Prob_dict = counter_dict(Prob_dict,tag)

        # Context Dict

            tag_c = token['form'].lower()+','+ pre_tag
            Context_dict = counter_dict(Context_dict,tag_c)

        # Unique Dict

    #         tag_u = token['upos']+'|'+token['form']+','+ pre_tag

    #         Unique_prob[tag_u] = Prob_dict[tag]/Context_dict[tag_c]

            pre_tag = token['upos']


In [112]:
for key in Prob_dict.keys():
    if len(key.split('|')) == 3:
        Unique_prob[key] = Prob_dict[key]/Context_dict['|'+key.split('|')[-1]]
    else:
        Unique_prob[key] = Prob_dict[key]/Context_dict[key.split('|')[1]]

In [113]:
# Probabilidad inicial
inicialTagProb = {}
count = 0
with open('../data/UD_Spanish-AnCora/es_ancora-ud-train.conllu','r',encoding='utf-8') as f:
    for token_list in parse_incr(f):
        count += 1
        token = token_list[0]['upos']
        if token in inicialTagProb.keys():
            inicialTagProb[token] += 1
        else: 
            inicialTagProb[token] = 1
for key in inicialTagProb.keys():
    inicialTagProb[key] /= count

In [114]:
StateSetDict = {}
for idx, p in enumerate(set([w.split('|')[0] for w in Prob_dict])):
    StateSetDict[p] = idx
StateSetDict

{'PUNCT': 0,
 'VERB': 1,
 'ADV': 2,
 'NOUN': 3,
 'PART': 4,
 '_': 5,
 'AUX': 6,
 'PRON': 7,
 'SYM': 8,
 'INTJ': 9,
 'DET': 10,
 'ADP': 11,
 'CCONJ': 12,
 'NUM': 13,
 'SCONJ': 14,
 'ADJ': 15,
 'X': 16,
 'PROPN': 17}

In [13]:
inicialTagProb

{'DET': 0.34799021321216356,
 'ADP': 0.14931842013282068,
 'VERB': 0.04557846906675987,
 'ADV': 0.07577770010485844,
 'PROPN': 0.10506815798671792,
 'PRON': 0.04173365955959455,
 'ADJ': 0.010136315973435861,
 'CCONJ': 0.036980076896190144,
 'PART': 0.002446696959105208,
 'PUNCT': 0.09143656064313177,
 'NOUN': 0.025026214610276126,
 'NUM': 0.0068507514854945824,
 '_': 0.009227542817196784,
 'SCONJ': 0.027123383432366307,
 'AUX': 0.022789234533379936,
 'INTJ': 0.0020272631946871723,
 'SYM': 0.0004893393918210416}

# Entrenamiento 

### Construcción del algoritmo de Viterbi

Dada una secuencia de palabras $\{p_1, p_2, \dots, p_n \}$, y un conjunto de categorias gramaticales dadas por la convención `upos`, se considera la matriz de probabilidades de Viterbi así:

$$
\begin{array}{c c}
\begin{array}{c c c c}
\text{ADJ} \\
\text{ADV}\\
\text{PRON} \\
\vdots \\
{}
\end{array} 
&
\left[
\begin{array}{c c c c}
\nu_1(\text{ADJ}) & \nu_2(\text{ADJ}) & \dots  & \nu_n(\text{ADJ})\\
\nu_1(\text{ADV}) & \nu_2(\text{ADV}) & \dots  & \nu_n(\text{ADV})\\ 
\nu_1(\text{PRON}) & \nu_2(\text{PRON}) & \dots  & \nu_n(\text{PRON})\\
\vdots & \vdots & \dots & \vdots \\ \hdashline
p_1 & p_2 & \dots & p_n 
\end{array}
\right] 
\end{array}
$$

Donde las probabilidades de Viterbi en la primera columna (para una categoria $i$) están dadas por: 

$$
\nu_1(i) = \underbrace{\rho_i^{(0)}}_{\text{probabilidad inicial}} \times P(i \vert p_1, \text{"None"})
$$

y para las siguientes columnas: 

$$
\nu_{t}(j) = \max_i \{ \overbrace{\nu_{t-1}(i)}^{\text{estado anterior}} \times P(j \vert p_t, i) \}
$$


In [146]:
def MEMM(sentence, inicialTagProb=inicialTagProb, StateSetDict=StateSetDict, Unique_prob=Unique_prob, Context_dict=Context_dict):
    s = word_tokenize(sentence)
    viterbi_matrix = np.zeros([18,len(s)])
    prev_tag = 'NONE'
    result_list = []
    #Primera columna de la matriz
    for key in StateSetDict.keys():
        tag_row = StateSetDict[key]
        word_tag =  key+'|'+s[0]+','+prev_tag
        if word_tag in Unique_prob:
            viterbi_matrix[tag_row,0] = inicialTagProb[key]*Unique_prob[word_tag]
            
    #Segunda Columna de la matriz
    for col in range(1,len(s)):
    #Recorro de la segunda columna en adelante
        for key in StateSetDict.keys():
            tag_row = StateSetDict[key]
            for key_p in StateSetDict.keys():
                word_tag = key+'|'+s[col]+','+ key_p
                if word_tag in Unique_prob:
                    possible_probs = []
                    for p_key in StateSetDict.keys():
                        prev_tag_row = StateSetDict[p_key]
                        prev_word_tag = s[col-1]+','+p_key
                        if prev_word_tag in Context_dict.keys():
                            #print(prev_word_tag, prev_tag_row)
                            if viterbi_matrix[prev_tag_row,col-1] > 0:
                                viterbi_prob = viterbi_matrix[prev_tag_row,col-1] * Unique_prob[word_tag]
                                possible_probs.append(viterbi_prob)
        
                    viterbi_matrix[tag_row,col] = max(possible_probs)
                    #print(possible_probs)

    # Retornar el resultado 
    for idx,col in enumerate(s):
        resul_idx = np.argmax(viterbi_matrix[:,idx])
        for key in StateSetDict.keys():
            if StateSetDict[key] == resul_idx:
                result_list.append((col,key))
                
    return result_list

In [147]:
MEMM('el mundo es pequeño')

[('el', 'DET'), ('mundo', 'PROPN'), ('es', 'AUX'), ('pequeño', 'ADJ')]