## Парсер Постнауки

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import re

In [3]:
from text_utils import BaseSource, BaseProcessor, BaseSink
from text_utils import DefaultTextProcessor, DefaultDocumentProcessor, DefaultCollectionProcessor
from text_utils import UciBowSink, MongoDbSink

In [4]:
from sklearn.pipeline import Pipeline
from ipywidgets import FloatProgress
from IPython.display import display
from pathlib import Path

Определим сначала пайплайн для одного документа (`PostnaukaFileSource`, `PostnaukaFileProcessor`).

In [5]:
# Matches inline markers of the form [post id="123"] so they can be removed
# from the article text. Written as a raw string: in a normal string literal
# "\[" and "\d" are invalid escape sequences, which newer Python versions
# report with a warning. The compiled pattern itself is unchanged.
post_id_trim = re.compile(r'\[post id="\d+"\]')

In [6]:
class PostnaukaFileSource(BaseSource):
    """Source step that records where a single document lives on disk.

    After ``fit`` the instance carries ``text_path`` and ``meta_path``, the
    two attributes read by ``PostnaukaFileProcessor.transform``.
    """

    def fit(self, params, *args):
        """Remember the pair of paths describing one document.

        Args:
            params: an iterable of exactly two items,
                ``(text_path, meta_path)``.
            *args: accepted and ignored.

        Returns:
            This instance.
        """
        # Unpacking raises before anything is assigned if ``params`` does not
        # contain exactly two items.
        self.text_path, self.meta_path = params
        return self

In [7]:
class PostnaukaFileProcessor(BaseProcessor):
    """Converts one text file plus its meta file into a document dict."""

    def __init__(self, stop_words):
        """Build the inner pipeline applied to the body text of a document.

        Args:
            stop_words: forwarded to ``DefaultDocumentProcessor`` as
                ``stop_lemmas``.
        """
        self.doc_pipeline = Pipeline([
            ("text-processor",     DefaultTextProcessor()),
            ("document-processor", DefaultDocumentProcessor(stop_lemmas=stop_words)),
        ])

    def transform(self, src, *args):
        """Parse the files referenced by ``src`` and return a document.

        Args:
            src: object exposing ``text_path`` and ``meta_path``; both must
                provide ``.open()`` (``PostnaukaFileSource`` sets them).
            *args: accepted and ignored.

        Returns:
            A dict with the keys ``"title"``, ``"description"``,
            ``"modalities"`` (the inner pipeline's output, extended with
            ``"flat_tag"`` and ``"authors"`` lists) and ``"markdown"`` (the
            body text with ``[post id="..."]`` markers removed).
        """
        # Parse text file. Layout read here: line 1 is the title, line 3 the
        # description, lines 2 and 4 are skipped, everything after is the body.
        # NOTE(review): the file is opened with the platform default encoding;
        # confirm the corpus encoding and pass it explicitly if it can differ.
        with src.text_path.open() as fi:
            title = fi.readline().strip()
            fi.readline()
            description = fi.readline().strip()
            fi.readline()
            text = fi.read()
        # The substitution does not need the file, so run it after closing it.
        text = post_id_trim.sub("", text)
        # Parse meta file
        flat_tags = []
        authors = []
        with src.meta_path.open() as fi:
            for ln in fi:
                # Raw-string pattern avoids the invalid "\s" escape, and
                # ``maxsplit`` is passed by keyword because the positional
                # form is deprecated. With at most two splits the value is the
                # last token and may itself contain whitespace.
                toks = re.split(r"\s+", ln, maxsplit=2)
                if toks[0] == "post_tag":
                    flat_tags.append(toks[-1].strip().lower())
                elif toks[0] == "author":
                    authors.append(toks[-1].strip().lower())
        # Run inner pipeline to form modalities
        modalities = self.doc_pipeline.fit_transform(text)
        # Finally, make a document and return it
        doc = {}
        doc["title"] = title
        doc["description"] = description
        doc["modalities"] = modalities
        doc["modalities"]["flat_tag"] = flat_tags
        doc["modalities"]["authors"] = authors
        doc["markdown"] = text
        return doc

Теперь определим пайплайн всей коллекции файлов на диске (`PostnaukaCollectionSource`, `PostnaukaCollectionProcessor`).

In [8]:
class PostnaukaCollectionSource(BaseSource):
    """Source step that prepares everything needed to parse the collection."""

    def fit(self, root_path, *args):
        """Collect inputs and open outputs under ``root_path``.

        Reads ``stopwords.txt``, opens ``vocab.pn.txt`` and ``docword.pn.txt``
        for writing, and lists the text files in ``raw_data/`` and the meta
        files in ``raw_data/meta/``.

        Args:
            root_path: a ``pathlib.Path``-like directory object.
            *args: accepted and ignored.

        Returns:
            This instance, with ``root_path``, ``file_parser``,
            ``vocab_file``, ``docword_file``, ``files_paths`` and
            ``metas_paths`` set.
        """
        # Close the stop-words file as soon as it has been read instead of
        # leaving the handle open.
        with (root_path / "stopwords.txt").open() as fi:
            stop_words = fi.read().split()
        self.root_path = root_path
        # We will spawn this pipeline in parallel for each document
        self.file_parser = Pipeline([
            ("take-file-name",      PostnaukaFileSource()),
            ("convert-to-document", PostnaukaFileProcessor(stop_words)),
        ])
        # Save source state
        # NOTE(review): these two handles stay open after fit(); presumably
        # the consumer (UciBowSink) writes and closes them - confirm.
        self.vocab_file = (root_path / "vocab.pn.txt").open("w")
        self.docword_file = (root_path / "docword.pn.txt").open("w")
        # The two lists are later paired by position, so both must be ordered
        # by the same key. Sorting by full file name is not enough: "1.txt"
        # sorts before "10.txt" while "10_meta.txt" sorts before "1_meta.txt"
        # ('.' < digits < '_'). Sort by the document stem instead, dropping
        # the trailing "_meta" that the glob guarantees on meta stems.
        # Pairing is still correct only if every text file has exactly one
        # meta file; that is not checked here.
        meta_tail = len("_meta")
        self.files_paths = sorted(
            root_path.glob("raw_data/*.txt"),
            key=lambda p: p.stem,
        )
        self.metas_paths = sorted(
            root_path.glob("raw_data/meta/*_meta.txt"),
            key=lambda p: p.stem[:-meta_tail],
        )
        return self

In [9]:
class PostnaukaCollectionProcessor(BaseProcessor):
    """Parses every document of the collection and hands the result to sinks."""

    def transform(self, src, *args):
        """Parse all documents listed by ``src`` and store the collection.

        Args:
            src: object exposing ``files_paths``, ``metas_paths``,
                ``file_parser``, ``vocab_file`` and ``docword_file``
                (``PostnaukaCollectionSource`` sets them).
            *args: accepted and ignored.

        Returns:
            None; the results are passed to ``MongoDbSink`` and
            ``UciBowSink``.
        """
        # Progress widget sized by the number of text files.
        progress = FloatProgress(min=0, max=len(src.files_paths))
        display(progress)

        parsed = []
        # Text and meta paths are paired by position; zip stops at the
        # shorter of the two lists.
        # TODO: run these in parallel threads
        for path_pair in zip(src.files_paths, src.metas_paths):
            parsed.append(src.file_parser.fit_transform(path_pair))
            progress.value += 1

        collection_step = DefaultCollectionProcessor(min_len=1, min_df=2)
        collection = collection_step.fit_transform(parsed)

        # Save Markdown texts in MongoDB
        mongo_sink = MongoDbSink("postnauka")
        mongo_sink.fit_transform(collection)

        # Save collection UCI BOW format
        uci_sink = UciBowSink(src.vocab_file, src.docword_file)
        uci_sink.fit_transform(collection)

Соберём парсер Постнауки из источника и обработчика коллекции, определённых выше.

In [10]:
# Two-step pipeline: the source step receives the root path, the processor
# step consumes what the source prepared.
postnauka_parser = Pipeline(
    steps=[
        ("take-root-path", PostnaukaCollectionSource()),
        ("process-the-collection", PostnaukaCollectionProcessor()),
    ]
)

Запустим парсер.

In [11]:
# Dataset root, relative to the notebook's working directory.
# PostnaukaCollectionSource.fit reads stopwords.txt and raw_data/ under it and
# opens vocab.pn.txt and docword.pn.txt there for writing.
root_path = Path("../datasets/postnauka")

In [12]:
%%time

# Run the whole pipeline on the dataset root. The collection processor has no
# return statement, so this cell shows only the timing report.
postnauka_parser.fit_transform(root_path)

CPU times: user 2min 46s, sys: 7.93 s, total: 2min 54s
Wall time: 5min 30s


---