# Хабрахабр

In [4]:
from collections import Counter
from pymongo import MongoClient
from sklearn.pipeline import Pipeline
from parsers.text_utils import DefaultTextProcessor, DefaultDocumentProcessor

Перегоняем данные из базы `test.habrahabr` в базу `datasets.habrahabr` с изменением формата и сохраняем на диске в формате Vowpal Wabbit.

In [5]:
client = MongoClient()
in_collection = client["test"]["habrahabr"]
out_collection = client["datasets"]["habrahabr"]

In [36]:
stop_words = open("../datasets/habrahabr/stopwords.txt").read().split()
top_5_percent_words = open("../datasets/habrahabr/top5_percent_words.txt").read().split()
stop_lemmas = list(set(stop_words).union(set(top_5_percent_words)))
doc_pipeline = Pipeline([
    ("text-processor",     DefaultTextProcessor(token_pattern="(?u)\\b\\p{L}+\\b")),
    ("document-processor", DefaultDocumentProcessor(stop_lemmas=stop_lemmas)),
])

Подсчитаем количество встречающихся слов.

In [18]:
%%time

word_counter = Counter()

with open("../datasets/habrahabr/habrahabr.txt", "w") as vw_file:
    for doc_id, mongo_doc in enumerate(in_collection.find({ "company_blog": None }), 1):
        modalities = doc_pipeline.fit_transform(mongo_doc["content_html"])
        word_counter.update(modalities["text"])

CPU times: user 8min 35s, sys: 5.54 s, total: 8min 41s
Wall time: 44min 17s


Запишем результат в БД и на диск.

In [31]:
top5_percent_words = list(map(lambda p: p[0], word_counter.most_common(int(len(word_counter) * 0.05))))

In [32]:
open("../datasets/habrahabr/top5_percent_words.txt", "w").write("\n".join(top5_percent_words))

266801

---

In [37]:
%%time

with open("../datasets/habrahabr/habrahabr.txt", "w") as vw_file:
    for doc_id, mongo_doc in enumerate(in_collection.find({ "company_blog": None }), 1):
        doc = {}
        doc["_id"] = "habr_%d" % doc_id
        doc["title"] = mongo_doc["title"]
        doc["url"] = mongo_doc["url"]
        doc["modalities"] = doc_pipeline.fit_transform(mongo_doc["content_html"])
        doc["modalities"]["flat_tag"] = mongo_doc["tags"]
        doc["modalities"]["authors"] = [mongo_doc["author_user"]]
        doc["modalities"]["hubs"] = mongo_doc["hubs"]
        doc["markdown"] = mongo_doc["content_html"]
        # Записать в Vowpal Wabbit
        modalities_str = " ".join(map(lambda p: "|%s %s" % (p[0],
                         " ".join(map(lambda t: "_".join(t.split()), p[1]))), doc["modalities"].items()))
        vw_file.write("%s %s\n" % (doc["_id"], modalities_str))
        # Записать в MongoDB
        out_collection.insert_one(doc)

CPU times: user 9min 42s, sys: 12.2 s, total: 9min 54s
Wall time: 48min 41s


---