In [58]:
import sys
import csv
import unicodedata
import numpy as np

In [59]:
from pymystem3 import Mystem
from multiprocessing import Pool
from IPython.display import display
from ipywidgets import FloatProgress
from sklearn.pipeline import Pipeline
from parsers.text_utils import DefaultTextProcessor, Lemmatizer

In [60]:
# Raise the csv module's maximum field size to sys.maxsize.
# Presumably needed because whole article texts are stored in a single field
# and exceed the default limit — confirm against the dataset.
# The call returns the previous limit, which is the value the cell displays.
csv.field_size_limit(sys.maxsize)

9223372036854775807

Разобьём процесс на две части — токенизацию документов (без фильтрации) и, собственно, лемматизацию.

In [87]:
%%time

# Tokenize the documents, saving the intermediate
# result to ruwiki.tonekized.csv.tmp

# Project text processor; the loop below calls its fit_transform() on each text.
tokenizer = DefaultTextProcessor()

# 1361758 is the precomputed number of documents
f = FloatProgress(min=0, max=1361758)
display(f)

# Unassigned code points used as temporary stand-ins for "й" / "Й" while
# combining marks are stripped. They carry no decomposition and are not in
# category "Mn", so normalization and the filter below leave them untouched.
unused_char = '\U00037b84'
unused_char_upper = '\U00037b85'


def strip_accents(s):
    """Remove combining marks (e.g. stress accents) from ``s``, keeping "й"/"Й".

    The string is decomposed with NFD and every character of Unicode category
    "Mn" (nonspacing mark) is dropped. "й" and "Й" decompose into "и"/"И" plus
    a combining breve, so they are swapped for placeholder characters before
    the decomposition and restored afterwards.

    The input is first composed with NFC so that a "й" that arrives already
    decomposed (и + U+0306) is protected as well.

    All other decomposable letters lose their marks; in particular "ё" is
    still folded to "е".
    NOTE(review): confirm that folding "ё" to "е" is intended.

    Parameters:
        s: the text to clean.

    Returns:
        The text without nonspacing marks, with "й"/"Й" preserved.
    """
    s = unicodedata.normalize("NFC", s)
    s = s.replace("й", unused_char).replace("Й", unused_char_upper)
    stripped = "".join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
    )
    return stripped.replace(unused_char, "й").replace(unused_char_upper, "Й")

# Stream the plain-text dump through accent stripping and the tokenizer, and
# write (title, space-joined tokens) rows to the intermediate file.
# Rows are buffered and written in batches of 1000; the file is flushed after
# each batch so progress is persisted, and the progress bar is advanced.
# newline="" is what the csv module documentation prescribes for files handed
# to csv.reader / csv.writer, so line endings are left for the csv module to
# interpret and emit.
with open("../datasets/ruwiki/ruwiki.plain.csv", "r", newline="") as infile, \
        open("ruwiki.tonekized.csv.tmp", "w", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    count = 0
    cached_rows = []
    for title, text in reader:
        tokens = tokenizer.fit_transform(strip_accents(text))
        cached_rows.append((title, " ".join(tokens)))
        count += 1
        if count % 1000 == 0:
            writer.writerows(cached_rows)
            outfile.flush()
            f.value += len(cached_rows)
            cached_rows = []
    # Write the remaining rows (fewer than a full batch)
    writer.writerows(cached_rows)
    f.value += len(cached_rows)

CPU times: user 38min 48s, sys: 15.2 s, total: 39min 4s
Wall time: 39min 4s


In [88]:
%%time

# Now lemmatize all documents with pymystem3,
# parallelising the work across N_PROCS worker processes

N_PROCS = 4

# 1361758 is the precomputed number of documents
f = FloatProgress(min=0, max=1361758)
display(f)

# Module-level Mystem instance; lemmatize() below reads it as a global.
m = Mystem()

def lemmatize(text):
    """Lemmatize ``text`` with the module-level Mystem instance ``m``.

    The pieces returned by ``m.lemmatize`` are concatenated without any
    separator, and leading/trailing whitespace is stripped from the result.
    """
    pieces = m.lemmatize(text)
    joined = "".join(pieces)
    return joined.strip()

# Lemmatize the tokenized documents in batches of 1000 and write
# (title, lemmatized text) rows to the output file, flushing after each batch
# and advancing the progress bar.
#
# The worker pool is created once for the whole run rather than once per
# batch, so worker processes are not re-spawned for every 1000 documents.
# NOTE(review): each freshly spawned worker presumably also has to start its
# own mystem backend on first use, which would explain the gap between CPU
# time and wall time in the recorded run — confirm against pymystem3.
#
# newline="" is what the csv module documentation prescribes for files handed
# to csv.reader / csv.writer.
with open("ruwiki.tonekized.csv.tmp", "r", newline="") as infile, \
        open("../datasets/ruwiki/ruwiki.lemmatized.csv", "w", newline="") as outfile, \
        Pool(N_PROCS) as p:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    count = 0
    cached_titles = []
    cached_texts = []
    for title, text in reader:
        cached_titles.append(title)
        cached_texts.append(text)
        count += 1
        if count % 1000 == 0:
            # Pool.map returns results in input order, so titles and
            # lemmatized texts stay aligned when zipped.
            lemmatized_texts = p.map(lemmatize, cached_texts)
            writer.writerows(zip(cached_titles, lemmatized_texts))
            outfile.flush()
            f.value += len(cached_titles)
            cached_texts = []
            cached_titles = []
    # Write the remaining rows (fewer than a full batch; may be empty)
    lemmatized_texts = p.map(lemmatize, cached_texts)
    writer.writerows(zip(cached_titles, lemmatized_texts))
    f.value += len(cached_titles)

CPU times: user 5min 27s, sys: 1min 47s, total: 7min 14s
Wall time: 3h 33min 19s


---