In [1]:
from lxml import etree

In [2]:
# Parse straight from the path: lxml then reads the raw bytes and honours the
# encoding declared in the XML itself, instead of decoding the file with the
# platform's default codec and re-encoding the whole corpus in memory.
root = etree.parse("../data/annot.opcorpora.xml").getroot()

In [3]:
def _parse_text_el(text_el):
    for paragraph in text_el.find('paragraphs').getchildren():
        for sentence in paragraph.getchildren():
            # TODO: Clean
            for token in sentence.find('tokens').getchildren():
                text = token.get('text')
                lemma = token.find(".//l").get('t')
                yield text, lemma
            yield None, None

# tuple(_parse_text_el(text))

In [4]:
good_chars = "абвгдежзийклмнопрстуфхцчшщъыьэюя"
good_chars += good_chars.upper()

def _cleanup(raw_text):
    return "".join(ch.lower() if ch in good_chars else ' '
                   for ch in raw_text)

def _do_filter(_out):
    for text, lemma in _out:
        if text is None:
            yield text, lemma
            continue

        if text := _cleanup(text).strip():
            yield text, _cleanup(lemma).strip()

# tuple(_do_filter(_parse_text_el(text)))

In [5]:
def _do_sentence(_out):
    sentence = []
    result = []

    for text, lemma in _out:
        if text is None:
            yield " ".join(sentence), tuple(result)
            sentence.clear()
            result.clear()
            continue

        sentence.append(text)
        result.append(lemma)

    if sentence:
        yield " ".join(sentence), result

# tuple(_do_sentence(_do_filter(_parse_text_el(text))))

In [6]:
def _gen_sentence(text):
    """Yield what ``_do_sentence`` produces for one text element.

    Chains the three stages: ``_parse_text_el`` -> ``_do_filter`` ->
    ``_do_sentence``.
    """
    raw_tokens = _parse_text_el(text)
    clean_tokens = _do_filter(raw_tokens)
    yield from _do_sentence(clean_tokens)

# tuple(_gen_sentence(text))

In [7]:
def gen_sentences(*texts, count=10):
    """Collect up to ``count`` distinct sentences from ``texts``.

    Args:
        *texts: text elements, each passed to ``_gen_sentence``.
        count: maximum number of sentences to collect; ``count <= 0`` gives
            an empty result.

    Returns:
        dict mapping each sentence string to its lemmas, in the order the
        sentences were first seen.

    A sentence string that occurs more than once keeps the lemmas of its
    first occurrence; later occurrences are ignored even if their lemmas
    differ.
    """
    sentences = {}
    if count <= 0:
        return sentences

    for text in texts:
        for sentence, lemmas in _gen_sentence(text):
            # Membership test rather than truthiness of the stored value, so
            # an entry with empty lemmas still counts as already seen.
            if sentence in sentences:
                continue
            sentences[sentence] = lemmas

            if len(sentences) >= count:
                return sentences

    return sentences

# Slicing the root element yields its children as a list, which replaces the
# deprecated ``getchildren()``. The first child is skipped.
# NOTE(review): presumably the first child is not a usable text -- confirm.
sentences = gen_sentences(*root[1:])

In [8]:
# Sanity check: how many distinct sentences were collected.
len(sentences)

10

In [10]:
class AbstractLemmatizer:
    """Interface shared by the lemmatizers in this notebook.

    An instance is called with a sentence string and returns a tuple.
    Subclasses override ``__call__``; the base implementation always raises.
    """

    def __call__(self, sentence: str) -> tuple:
        """Lemmatize ``sentence``. Not implemented on the base class."""
        raise NotImplementedError

In [11]:
class Dummy(AbstractLemmatizer):
    """Baseline that does no lemmatization: every word is its own lemma."""

    def __call__(self, sentence):
        """Return the whitespace-separated words of ``sentence`` as a tuple."""
        words = sentence.split()
        return tuple(words)

In [12]:
class MyStem3(AbstractLemmatizer):
    """Lemmatizer backed by ``pymystem3.Mystem``."""

    def __init__(self):
        # Imported here rather than at module level, so pymystem3 is only
        # required once this class is instantiated.
        import pymystem3
        self.m = pymystem3.Mystem()

    @staticmethod
    def check_word(analyze_result):
        """Pick the lemma out of one item returned by ``Mystem.analyze``.

        Returns the stripped ``lex`` of the first entry in the item's
        ``analysis`` list. If the item has no ``analysis`` key, the list is
        empty, or the entry has no ``lex``, the item's stripped ``text`` is
        returned instead.

        Raises:
            KeyError: if the item has no ``text`` key.
        """
        orig_word: str = analyze_result['text'].strip()

        try:
            return analyze_result['analysis'][0]['lex'].strip()
        except (KeyError, IndexError):
            # No usable analysis for this item: fall back to its own text.
            return orig_word

    def __call__(self, sentence):
        """Return the lemmas of ``sentence`` as a tuple.

        Items whose lemma is an empty string after stripping are dropped
        (presumably the whitespace separators Mystem reports between words
        -- confirm against pymystem3 output).
        """
        lemmas = (self.check_word(item) for item in self.m.analyze(sentence))
        return tuple(lemma for lemma in lemmas if lemma)

In [24]:
class Morphy2(Dummy):
    """Lemmatizer backed by ``pymorphy2.MorphAnalyzer``.

    Words are obtained with ``Dummy``'s whitespace split and each one is
    replaced by the normal form of its first pymorphy2 parse.
    """

    def __init__(self):
        # Function-scope import: pymorphy2 is needed only when instantiated.
        import pymorphy2
        self.m = pymorphy2.MorphAnalyzer()

    def _parse(self, word):
        """Return the normal form of the first parse of ``word``."""
        parses = self.m.parse(word)
        return parses[0].normal_form

    def __call__(self, sentence: str):
        """Return the normal form of every word of ``sentence`` as a tuple."""
        words = super().__call__(sentence)
        return tuple(map(self._parse, words))


In [25]:
# One instance of every lemmatizer under comparison, in reporting order.
lemmatizers = [cls() for cls in (Dummy, MyStem3, Morphy2)]

In [14]:
def _test(text: str, lemmas_: tuple, l: AbstractLemmatizer) -> float:
    result = set(lemmas_)
    return len(set(l(text)).intersection(result)) / len(result)


# Smoke test: score a fresh MyStem3 instance on the first collected sentence.
_test(*tuple(sentences.items())[0], MyStem3())

0.6

In [20]:
def do_test(sentences: dict, l: AbstractLemmatizer):
    """Yield the ``_test`` score of ``l`` for every entry of ``sentences``.

    ``sentences`` maps a sentence string to its reference lemmas; scores are
    produced lazily, in the dict's iteration order.
    """
    yield from (
        _test(sentence, reference, l)
        for sentence, reference in sentences.items()
    )

In [26]:
# Report one line per lemmatizer: the sum of its per-sentence scores from
# do_test. Each score is at most 1, so the best possible total is
# len(sentences).
for l in lemmatizers:
    print(l, sum(do_test(sentences, l)))

<__main__.Dummy object at 0x11593f040> 5.351091851696689
<__main__.MyStem3 object at 0x11593f190> 8.302871165572778
<__main__.Morphy2 object at 0x1159589d0> 8.471871654331332
