In [1]:
import conllu
from io import open
import numpy as np

In [2]:
data_train = conllu.parse_incr(open("pt_gsd-ud-train.conllu",'r',encoding = "utf-8")) # парсим данные

In [3]:
def getSentence(tokenlist):
    sentence = [["*","*"]] + [["*","*"]]
    for i in range(len(tokenlist)):
        sentence += [[tokenlist[i]["lemma"], tokenlist[i]["upostag"]]]
    sentence += [["STOP","STOP"]]
    return sentence

In [4]:
trigrams = {} 
bigrams = {} 
unigrams = {} 
tagWords = {} 

In [5]:
for tokenlist in data_train:
    sentence = getSentence(tokenlist)
    # считаем триграммы
    for i in range(len(sentence)-2):  
        count = trigrams.get( (sentence[i][1], sentence[i+1][1], sentence[i+2][1]), 0)
        trigrams.update( { (sentence[i][1], sentence[i+1][1], sentence[i+2][1]): count+1 } )
    # считаем биграммы
    for i in range(len(sentence)-1):
        count = bigrams.get( (sentence[i][1], sentence[i+1][1]), 0)
        bigrams.update( { (sentence[i][1], sentence[i+1][1]): count+1 } )
    # считаем униграммы
    for i in range(len(sentence)):
        count = unigrams.get( (sentence[i][1]), 0)
        unigrams.update( { (sentence[i][1]): count+1 } )
        count = tagWords.get( (sentence[i][1], sentence[i][0]), 0)
        tagWords.update( { (sentence[i][1], sentence[i][0]): count+1 } )

In [6]:
tags = list(unigrams.keys())# уникальные теги
tags.remove('*')
tags.remove('STOP')

In [7]:
def q(s, u, v):
    eps = 1e-7
    return trigrams.get((u, v, s), eps) / bigrams.get((u, v), eps)


def e(x, s):
    eps = 1e-7
    return tagWords.get((s, x), eps) / unigrams.get((s), eps)


def getTags(n): # в начале предложения - *, * 
    if n == -1 or n == 0:
        return ['*']
    else:
        return tags

In [8]:
def viterbi(sentence):
    pi = {(0, '*', '*'): 1}
    backpointers = {}
    n = len(sentence) - 2
    y = [""] * (n + 1)
    
    for k in range(1, n + 1):
        word = sentence[k]
        for u in getTags(k-1):
            for v in getTags(k):
                # находим вероятности для каждой цепочки длины k для всех пар тегов (u, v)
                w = getTags(k-2)
                piTemp = list(map(lambda wi:
                            pi.get((k-1, wi, u)) *
                            q(v, wi, u) *
                            e(word, v), w))
                piMax = max(piTemp)
                bp = w[piTemp.index(piMax)]
                pi.update({(k, u, v): piMax})
                backpointers.update({(k, u, v): bp})
    # вероятности всех цепочек длины n - 2 для всех пар тегов (u, v)
    ends = {}
    for u in getTags(n-1):
        for v in getTags(n):
            ends.update({(n, u, v): pi.get((n, u, v)) * q("STOP", u, v)})
    # нахождение двух последник тегов
    endMax = max(list(ends.values()))
    for (n, u, v), val in ends.items():
        if val == endMax:
            (y[n-1], y[n]) = (u,v)
    # восстановление ответа
    for k in range(n-2, 0, -1):
        y[k] = backpointers.get((k + 2, y[k+1], y[k+2]))
    return y[1:]

In [9]:
data_test = conllu.parse_incr(open("pt_gsd-ud-test.conllu",'r',encoding = "utf-8"))

In [10]:
test_tags = [] # теги корпуса
predict_tags = [] # теги, предсказанные алгоритмом

In [11]:
for tokenlist in data_test:
    # получаем предложение
    sentence=["*"] + [tokenlist[i]["lemma"] for i in range(len(tokenlist)) ] + ["STOP"]
    # получаем оригинальные теги
    test_tags.append([tokenlist[i]["upostag"] for i in range(len(tokenlist)) ])
    # получаем предсказанные таги
    predict_tags.append(viterbi(sentence))

In [12]:
err = 0.
for tag in range(len(test_tags)):
    err += np.mean(np.array(test_tags[tag]) != np.array(predict_tags[tag]))
print('Error = ', err/len(test_tags))

Error =  0.29640079044549783
