In [1]:
from typing import *
from lxml import etree
from time import time

In [2]:
with open("../data/annot.opcorpora.xml", "rt") as f:
    root = etree.fromstring(f.read().encode())

In [3]:
def _parse_text_el(text_el):
    for paragraph in text_el.find('paragraphs').getchildren():
        for sentence in paragraph.getchildren():
            # TODO: Clean
            for token in sentence.find('tokens').getchildren():
                text = token.get('text')
                lemma = token.find(".//l").get('t')
                yield text, lemma
            yield None, None

# tuple(_parse_text_el(text))

In [4]:
good_chars = "абвгдежзийклмнопрстуфхцчшщъыьэюя"
good_chars += good_chars.upper()

def _cleanup(raw_text):
    return "".join(ch.lower() if ch in good_chars else ' '
                   for ch in raw_text)

def _do_filter(_out):
    for text, lemma in _out:
        if text is None:
            yield text, lemma
            continue

        if text := _cleanup(text).strip():
            yield text, _cleanup(lemma).strip()

# tuple(_do_filter(_parse_text_el(text)))

In [5]:
def _do_sentence(_out):
    sentence = []
    result = []

    for text, lemma in _out:
        if text is None:
            yield " ".join(sentence), tuple(result)
            sentence.clear()
            result.clear()
            continue

        sentence.append(text)
        result.append(lemma)

    if sentence:
        yield " ".join(sentence), result

# tuple(_do_sentence(_do_filter(_parse_text_el(text))))

In [6]:
def _gen_sentence(text):
    yield from _do_sentence(_do_filter(_parse_text_el(text)))

# tuple(_gen_sentence(text))

In [7]:
def gen_sentences(*texts, count=10):
    sentences = {}
    for text in texts:
        for sentence, lemmas in _gen_sentence(text):
            if exist := sentences.get(sentence):
                continue
                assert exist == lemmas, (exist, lemmas)
            sentences[sentence] = lemmas

            if count is not None and len(sentences) >= count:
                break

        if count is not None and len(sentences) >= count:
                break

    return sentences

sentences = gen_sentences(*root.getchildren()[1:], count=None)

In [8]:
len(sentences)

106892

In [9]:
class AbstractLemmatizer:
    def __call__(self, sentence: str) -> tuple:
        raise NotImplementedError()

In [10]:
class Dummy(AbstractLemmatizer):
    def _parse(self, word):
        return word

    def __call__(self, sentence: str):
        return tuple(
            self._parse(word)
            for word in sentence.split()
        )

In [11]:
class MyStem3(AbstractLemmatizer):
    def __init__(self):
        import pymystem3
        self.m = m = pymystem3.Mystem()

    @staticmethod
    def check_word(analyze_result):
        orig_word: str = analyze_result['text'].strip()

        try:
            word: str = analyze_result['analysis'][0]['lex'].strip()
        except (KeyError, IndexError):
            word: str = orig_word

        return word

    def __call__(self, sentence):
        return tuple(x for word in self.m.analyze(sentence)
            if (x := self.check_word(word))
            )

In [12]:
class Morphy2(Dummy):
    def __init__(self):
        import pymorphy2
        self.m = pymorphy2.MorphAnalyzer()

    def _parse(self, word):
        return self.m.parse(word)[0].normal_form

In [13]:
class NLTKL(Dummy):
    def __init__(self):
        from nltk.stem.snowball import RussianStemmer
        self.m = RussianStemmer()

    def _parse(self, word):
        return self.m.stem(word)


In [14]:
class SpaCY(Dummy):
    def __init__(self):
        from spacy.lemmatizer import Lemmatizer
        from spacy.lookups import Lookups
        lookups = Lookups()
        lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
        self.l = Lemmatizer(lookups)
    def _parse(self, word):
        return self.l(word, "NOUN")[0]

In [15]:
lemmatizers = [
    Dummy(),
    MyStem3(),
    Morphy2(),
    NLTKL(),
]

In [16]:
def _test(text: str, lemmas_: tuple, l: AbstractLemmatizer) -> Tuple[int, int]:
    result = set(lemmas_)
    return len(set(l(text)).intersection(result)), len(result)


print(_test(*tuple(sentences.items())[0], lemmatizers[-1]))

lemmatizers[-1](tuple(sentences.items())[0][0])


(1, 5)


('школ', 'злослов', 'уч', 'прикус', 'язык')

In [17]:
def do_test(sentences: dict, l: AbstractLemmatizer):
    good = 0
    all_ = 0
    start = time()
    for text, lemmas in sentences.items():
        _g, _a = _test(text, lemmas, l)
        good += _g
        all_ += _a

    finish = time() - start

    return good, all_, finish

In [18]:
for l in lemmatizers:
    g, a, tm = tuple(do_test(sentences, l))
    print(f"{l.__class__.__name__} : {100 * g / a:.3f}% : {a / tm:.4f} words/s")

Dummy : 48.550% : 1860743.2113 words/s
MyStem3 : 80.071% : 18537.3361 words/s
Morphy2 : 79.994% : 4078.2828 words/s
NLTKL : 34.932% : 18279.6556 words/s
