### Парсер Хабрахабра

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to them take effect without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import bson

In [3]:
from text_utils import BaseSource, BaseProcessor, BaseSink
from text_utils import DefaultTextProcessor, DefaultDocumentProcessor, DefaultCollectionProcessor
from text_utils import UciBowSink, MongoDbSink

In [4]:
from sklearn.pipeline import Pipeline
from ipywidgets import FloatProgress
from IPython.display import display
from pathlib import Path

Определим пайплайн обработки всей коллекции Хабрахабра из BSON-дампа: источник данных `HabrahabrCollectionSource` и обработчик коллекции `HabrahabrCollectionProcessor`.

In [5]:
class HabrahabrCollectionSource(BaseSource):
    """Source step: opens the Habrahabr dataset files and builds the per-document pipeline.

    After ``fit`` the instance exposes the state that the downstream processor
    reads: ``root_path``, ``bson_file``, ``doc_pipeline``, ``vocab_file`` and
    ``docword_file``.
    """

    def fit(self, root_path, *args):
        """Open the input/output files under ``root_path`` and prepare the document pipeline.

        Args:
            root_path: Dataset directory, as a ``pathlib.Path`` or anything
                ``Path()`` accepts. It must contain ``stopwords.txt`` and
                ``habrahabr.bson``.
            *args: Unused.

        Returns:
            self, with the opened handles and the pipeline stored as attributes.
        """
        # Accept plain strings as well as Path objects.
        root_path = Path(root_path)

        # Read the whitespace-separated stop words through a context manager so
        # the handle is closed immediately (it used to be left open).
        with (root_path / "stopwords.txt").open() as stop_words_file:
            stop_words = stop_words_file.read().split()

        self.root_path = root_path
        # Left open on purpose: the dump is read later, outside this method.
        self.bson_file = (root_path / "habrahabr.bson").open("rb")
        self.doc_pipeline = Pipeline([
            ("text-processor",     DefaultTextProcessor()),
            ("document-processor", DefaultDocumentProcessor(stop_lemmas=stop_words)),
        ])
        # Output files; mode "w" truncates any existing content.
        # NOTE(review): the ".pn." infix looks inherited from another parser
        # (Postnauka?) -- confirm whether these names are intended for Habrahabr.
        self.vocab_file = (root_path / "vocab.pn.txt").open("w")
        self.docword_file = (root_path / "docword.pn.txt").open("w")
        return self

In [10]:
class HabrahabrCollectionProcessor(BaseProcessor):
    """Processor step: converts the BSON dump into documents and writes them to the sinks."""

    def transform(self, src, *args):
        """Read every record of the dump, build documents and store the collection.

        Args:
            src: Fitted source object providing ``bson_file``, ``doc_pipeline``,
                ``vocab_file`` and ``docword_file``.
            *args: Unused.

        Returns:
            None. The results are written to MongoDB and to the two files
            supplied by ``src``.

        Side effects:
            ``src.bson_file``, ``src.vocab_file`` and ``src.docword_file`` are
            closed when the method finishes (also on error), so buffered output
            reaches the disk. The dump handle is fully consumed by then anyway.
        """
        try:
            docs = []
            for doc_id, bson_doc in enumerate(bson.decode_file_iter(src.bson_file)):
                # Keep only records whose "company_blog" field is None
                # (presumably this skips corporate-blog posts -- confirm).
                if bson_doc["company_blog"] is not None:
                    continue
                doc = {}
                doc["title"] = bson_doc["title"]
                doc["url"] = bson_doc["url"]
                # The document pipeline yields the modalities mapping, which is
                # then extended with metadata taken directly from the record.
                doc["modalities"] = src.doc_pipeline.fit_transform(bson_doc["content_html"])
                doc["modalities"]["flat_tag"] = bson_doc["tags"]
                doc["modalities"]["authors"] = [bson_doc["author_user"]]
                doc["modalities"]["hubs"] = bson_doc["hubs"]
                # Stored under the "markdown" key, but the value is the raw
                # HTML content from the dump.
                doc["markdown"] = bson_doc["content_html"]
                # 1-based position in the dump; skipped records leave gaps.
                doc["doc_id"] = doc_id + 1
                docs.append(doc)
            docs = DefaultCollectionProcessor(min_len=1, min_df=2).fit_transform(docs)
            # Save the documents in MongoDB under ids of the form "habr_<doc_id>".
            MongoDbSink("habrahabr", id_func=lambda doc: "habr_%d" % doc["doc_id"]).fit_transform(docs)
            # Save the collection in UCI bag-of-words format.
            UciBowSink(src.vocab_file, src.docword_file).fit_transform(docs)
        finally:
            # Nothing else in the notebook closes these handles; close() is a
            # no-op on an already closed file, so this is safe even if a sink
            # closed them itself.
            for handle in (src.bson_file, src.vocab_file, src.docword_file):
                handle.close()

Соберём парсер Хабрахабра в единый пайплайн из компонентов, определённых выше.

In [11]:
# Two-step parser: the source step receives the dataset root path, and the
# processor step consumes the fitted source.
habrahabr_parser = Pipeline(steps=[
    ("take-root-path", HabrahabrCollectionSource()),
    ("process-the-collection", HabrahabrCollectionProcessor()),
])

Запустим парсер.

In [12]:
# Dataset directory, relative to the notebook's working directory.
root_path = Path("..", "datasets", "habrahabr")

In [None]:
%%time

# Run the whole parser on the dataset directory; the value passed here is what
# the first pipeline step receives as its input.
habrahabr_parser.fit_transform(root_path)

---