In [44]:
from ufal.udpipe import Model, Pipeline
from collections import defaultdict
from bs4 import BeautifulSoup

In [84]:
def make_tag(s):
    """Parse a ``Name=Value|Name=Value`` feature string into a dict.

    Args:
        s: feature string; ``"_"`` (and the empty string) mean "no features".

    Returns:
        A dict mapping each feature name to its value. Only the first ``=``
        of an item separates name from value.

    Raises:
        ValueError: if a non-empty item contains no ``=``.
    """
    # An empty string would otherwise reach dict() as a 1-element item and crash.
    if s in ("_", ""):
        return {}
    # Skip empty segments (e.g. a trailing "|") for the same reason.
    return dict(item.split("=", maxsplit=1) for item in s.split("|") if item)

def is_verb(pos, tag):
    """Tell whether a token counts as a verb for this analysis.

    Args:
        pos: part-of-speech label of the token.
        tag: container of the token's feature names (the dict from make_tag).

    Returns:
        True when ``pos`` is "VERB" or "AUX" and ``tag`` has no "Case" entry.
    """
    if pos not in ("VERB", "AUX"):
        return False
    # NOTE(review): presumably excludes case-marked verb forms such as
    # participles -- confirm against the tagset in use.
    return "Case" not in tag

In [96]:
class SyntaxTreeGraph:
    """Dependency tree of one sentence, built from column-indexed token rows.

    Tokens are numbered from 1 in row order; node 0 is the artificial root
    that a row's head column may point to.

    Attributes set by the constructor:
        nodes_number: number of token rows (node 0 is not counted).
        edges: ``edges[h]`` lists the children of node ``h`` in row order.
        deps: ``deps[h][k]`` is the relation label of edge ``h -> edges[h][k]``.
        tags: ``tags[i - 1]`` is ``(row[3], make_tag(row[5]))`` for token ``i``.
    """

    def __init__(self, data):
        """Build the tree from ``data``, a sequence of token rows.

        Every row must be indexable at positions 3 (part of speech),
        5 (feature string accepted by make_tag), 6 (head index, convertible
        with int()) and 7 (relation label).
        """
        self._make_edges(data)

    def __len__(self):
        """Return the number of tokens in the sentence (node 0 excluded)."""
        # nodes_number is the only size attribute this class ever assigns.
        return self.nodes_number

    def _make_edges(self, data):
        """Fill ``nodes_number``, ``edges``, ``deps`` and ``tags`` from ``data``."""
        self.nodes_number = len(data)
        # One extra slot so that index 0 can hold the artificial root.
        self.edges = [[] for _ in range(self.nodes_number + 1)]
        self.deps = [[] for _ in range(self.nodes_number + 1)]
        self.tags = [(elem[3], make_tag(elem[5])) for elem in data]
        for child, elem in enumerate(data, 1):
            root, dep = int(elem[6]), elem[7]
            self.edges[root].append(child)
            self.deps[root].append(dep)

    def _order(self):
        """Return the nodes reachable from node 0 in reversed DFS post-order.

        In the returned list every node appears before all of its
        descendants; nodes not reachable from node 0 are left out.
        """
        # 0 = not visited, 1 = children pushed, 2 = already emitted.
        color = [0] * (self.nodes_number + 1)
        stack, order = [0], []
        while stack:
            v = stack[-1]
            if color[v] == 0:
                color[v] = 1
                stack.extend(self.edges[v])
                continue
            if color[v] == 1:
                color[v] = 2
                order.append(v)
            stack.pop()
        return order[::-1]

    def get_indexes(self):
        """Return token indexes of verbs and their subject/object dependents.

        Returns:
            A dict with keys "verb", "subj" and "obj", each mapping to a list
            of 1-based token indexes.
        """
        verb_indexes = self._get_verb_indexes()
        subj_indexes, obj_indexes = self._get_noun_indexes(verb_indexes)
        return {"verb": verb_indexes, "subj": subj_indexes, "obj": obj_indexes}

    def _get_verb_indexes(self):
        """Collect verbs linked to node 0 through "root"/"xcomp" edges only.

        Starting from node 0, a child is accepted when its incoming relation
        is "root" or "xcomp" and is_verb() holds for its tag; accepted verbs
        are expanded in turn.
        """
        parents_for_verbs, answer = [0], []
        while parents_for_verbs:
            parent_index = parents_for_verbs.pop()
            children = zip(self.edges[parent_index], self.deps[parent_index])
            for verb_index, dep in children:
                # tags is 0-based while node indexes start at 1.
                if dep in ("root", "xcomp") and is_verb(*self.tags[verb_index - 1]):
                    answer.append(verb_index)
                    parents_for_verbs.append(verb_index)
        return answer

    def _get_noun_indexes(self, verb_indexes):
        """Split the direct dependents of the given verbs into subjects/objects.

        Args:
            verb_indexes: 1-based indexes of the verbs to inspect.

        Returns:
            ``(subj_indexes, obj_indexes)``. An "nsubj" dependent of a verb
            whose features have Voice=Pass is reported as an object;
            "nsubj:pass" and "obj" dependents are always objects. Every other
            relation is ignored.
        """
        subj_indexes, obj_indexes = [], []
        for verb_index in verb_indexes:
            is_passive = self.tags[verb_index - 1][-1].get("Voice") == "Pass"
            children = zip(self.edges[verb_index], self.deps[verb_index])
            for dep_index, dep in children:
                if dep == "nsubj":
                    dest = obj_indexes if is_passive else subj_indexes
                elif dep in ("nsubj:pass", "obj"):
                    dest = obj_indexes
                else:
                    continue
                dest.append(dep_index)
        return subj_indexes, obj_indexes

SyntaxError: invalid syntax (<ipython-input-96-fecd8b263f63>, line 40)

In [40]:
# (head, relation) for each token of a six-token toy tree; token i is the i-th pair.
edges = [(2, ""), (4, ""), (2, ""), (0, ""), (6, ""), (4, "")]
# SyntaxTreeGraph indexes each row at positions 3, 5, 6 and 7, so bare pairs
# raise IndexError; pad them into 8-column rows with "_" placeholders.
rows = [
    [str(token_id), "_", "_", "_", "_", "_", str(head), dep]
    for token_id, (head, dep) in enumerate(edges, 1)
]
graph = SyntaxTreeGraph(rows)
print(graph._order())

[0, 4, 2, 1, 3, 6, 5]


In [7]:
MODEL_PATH = "russian-syntagrus-ud-2.0-170801.udpipe"
model = Model.load(MODEL_PATH)
# Model.load reports failure by returning an empty value rather than raising,
# so fail here instead of with an obscure error when the pipeline is used.
if not model:
    raise RuntimeError("Cannot load UDPipe model from file '{}'".format(MODEL_PATH))

In [9]:
# Positional arguments after the model: input format, tagger options,
# parser options, output format (per the UDPipe bindings -- confirm).
pipeline = Pipeline(
    model,
    "tokenize",
    Pipeline.DEFAULT,
    Pipeline.DEFAULT,
    "conllu",
)

In [24]:
def prettify_UD_output(s):
    """Split CoNLL-U text into sentences of tab-separated token rows.

    A sentence starts at a line beginning with "# sent_id" and ends at the
    next empty line (or at the end of the text). Lines seen before a
    "# sent_id" line are ignored, as are comment lines ("#...") inside a
    sentence.

    Args:
        s: CoNLL-U formatted text.

    Returns:
        A list of sentences; each sentence is a list of rows and each row is
        the list of tab-separated fields of one token line. Empty input
        yields an empty list.
    """
    answer, curr_sent = [], []
    in_sentence = False
    for line in s.split("\n"):
        if not in_sentence:
            # Wait for the sentence header; anything else here is skipped.
            if line.startswith("# sent_id"):
                in_sentence, curr_sent = True, []
            continue
        if line == "":
            answer.append(curr_sent)
            curr_sent, in_sentence = [], False
        elif line.startswith("#"):
            # Comment such as "# text = ..."; token lines never start with "#".
            continue
        else:
            curr_sent.append(line.split("\t"))
    # The text may end without the blank line that closes the last sentence.
    if curr_sent:
        answer.append(curr_sent)
    return answer

In [41]:
def read_paraphrases(infile):
    """Read sentence pairs and their labels from a paraphrase XML file.

    Args:
        infile: path to a UTF-8 encoded file containing <paraphrase> elements.

    Returns:
        ``(pairs, targets)``: ``pairs`` is a list of two-item lists with the
        texts of each paraphrase, ``targets`` the matching integer labels.

    Raises:
        ValueError: if a <paraphrase> does not contain exactly three children
            named "text_1", "text_2" or "class", or if a label is not an
            integer.
    """
    with open(infile, "r", encoding="utf-8") as fin:
        markup = fin.read()
    soup = BeautifulSoup(markup, "lxml")
    wanted = {"name": ["text_1", "text_2", "class"]}
    pairs, targets = [], []
    for node in soup.find_all("paraphrase"):
        # NOTE(review): the unpacking presumably relies on the file listing
        # text_1, text_2 and class in this order -- confirm against the data.
        text_1, text_2, label = node.find_all(attrs=wanted)
        pairs.append([text_1.text, text_2.text])
        targets.append(int(label.text))
    return pairs, targets

In [None]:
pairs, targets = read_paraphrases("paraphraser/paraphrases_train.xml")
# Flatten the pairs into one list, keeping both members of a pair adjacent.
phrases = [
    text
    for text_pair in pairs
    for text in text_pair
]

In [97]:
# Parse the first 20 phrases, dump the first eight columns of each parse to
# parses.out, then print the verb / subject / object tokens found per phrase.
parses = []
with open("parses.out", "w", encoding="utf8") as fout:
    for i, phrase in enumerate(phrases[:20], 1):
        if i % 100 == 0:
            print("{} phrases parsed".format(i))
        # endswith() with a tuple also copes with an empty phrase, where
        # indexing phrase[-1] would raise IndexError.
        if not phrase.endswith((".", "?", "!")):
            phrase += "."
        # Only the first sentence of the parser output is kept.
        parse = prettify_UD_output(pipeline.process(phrase))[0]
        fout.write("\n".join("\t".join(elem[:8]) for elem in parse) + "\n\n")
        parses.append(parse)
parse_data = [SyntaxTreeGraph(parse) for parse in parses]
for j, elem in enumerate(parse_data):
    for key, indexes in sorted(elem.get_indexes().items()):
        for index in indexes:
            # Token indexes are 1-based; column 1 of a row is the word form.
            print(j, key, index, parses[j][index-1][1])

0 verb 2 разрешат
0 verb 3 стрелять
1 subj 1 Полиции
1 verb 2 могут
1 verb 3 разрешить
1 verb 4 стрелять
2 subj 1 Право
2 verb 7 решили
2 verb 8 ограничить
3 subj 1 Правила
3 verb 7 уточнят
4 obj 5 положение
4 subj 1 Президент
4 verb 3 ввел
5 obj 8 положение
5 subj 1 Власти
5 verb 3 угрожают
5 verb 4 ввести
6 subj 6 вопрос
6 verb 5 волнует
7 subj 1 Самолеты
7 verb 3 вывезут
8 subj 7 самолета
8 verb 5 вернулись
9 subj 1 Самолеты
9 verb 3 вывезут
10 obj 3 отпечатки
10 subj 1 Приставы
10 verb 2 соберут
11 subj 1 Приставы
11 verb 2 снимут
12 obj 11 дело
12 verb 10 заведено
13 subj 2 дебошир
13 verb 3 отказывается
13 verb 4 возвращаться
14 subj 1 ЦИК
14 verb 2 хочет
14 verb 3 отказаться
15 subj 1 ЦИК
15 verb 2 может
15 verb 3 отказаться
16 subj 1 Суд
16 verb 3 оставил
18 obj 4 добычу
18 subj 1 Страны
18 verb 3 сократили
19 obj 3 полномочия
19 subj 1 Обама
19 verb 2 продлил
