# Хабрахабр

In [19]:
import regex

In [20]:
from collections import Counter
from pymongo import MongoClient
from sklearn.pipeline import Pipeline
from parsers.text_utils import DefaultTextProcessor, DefaultDocumentProcessor

Перегоняем данные из базы `test.habrahabr` в базу `datasets.habrahabr` с изменением формата и сохраняем на диске в формате Vowpal Wabbit.

In [21]:
# Connect to MongoDB with the driver's default connection settings.
client = MongoClient()
test_db, datasets_db = client["test"], client["datasets"]
# Source collection (read from) and destination collection (written to).
in_collection = test_db["habrahabr"]
out_collection = datasets_db["habrahabr"]

In [22]:
def _read_word_list(path):
    """Return the whitespace-separated words stored in the text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed deterministically
    instead of being left for the garbage collector.
    """
    with open(path) as word_file:
        return word_file.read().split()


stop_words = _read_word_list("../datasets/habrahabr/stopwords.txt")
# NOTE(review): a later cell in this notebook writes "top_words.txt" (with an
# underscore), while this cell reads "topwords.txt" -- confirm which name is intended.
top_words = _read_word_list("../datasets/habrahabr/topwords.txt")
# Lemmas to drop from documents: the union of both word lists, de-duplicated.
stop_lemmas = list(set(stop_words).union(set(top_words)))
# Two-stage pipeline: the text processor is configured to tokenize on runs of
# Unicode letters, the document processor receives the stop-lemma list.
doc_pipeline = Pipeline([
    ("text-processor",     DefaultTextProcessor(token_pattern="(?u)\\b\\p{L}+\\b")),
    ("document-processor", DefaultDocumentProcessor(stop_lemmas=stop_lemmas)),
])

Подсчитаем количество встречающихся слов.

In [4]:
%%time

# Token frequencies of the "text" modality over every document matched below.
word_counter = Counter()

# The Vowpal Wabbit file is deliberately NOT opened here: opening it in "w" mode
# truncates it, and nothing in this cell writes to it, so doing so would only
# wipe a previously exported dataset.
# Only `content_html` is read, so project it to avoid transferring whole documents.
for mongo_doc in in_collection.find({"company_blog": None}, {"content_html": 1}):
    modalities = doc_pipeline.fit_transform(mongo_doc["content_html"])
    word_counter.update(modalities["text"])

CPU times: user 8min 33s, sys: 5.89 s, total: 8min 39s
Wall time: 44min 18s


Запишем результат в БД и на диск.

In [19]:
# Keep just the words (dropping their counts) of the 10 most frequent tokens.
top_words = [word for word, _count in word_counter.most_common(10)]

In [20]:
# Persist the top words, one per line. The `with` block guarantees the file is
# flushed and closed; a bare open(...).write(...) leaves that to the garbage collector.
with open("../datasets/habrahabr/top_words.txt", "w") as top_words_file:
    top_words_file.write("\n".join(top_words))

87

---

In [29]:
%%time

# Fix-up: fill in the `authors_names` field of the already exported Habrahabr documents
# by copying the "authors" modality into it.

# Only the "authors" modality is needed, so project it instead of loading whole documents.
for doc in out_collection.find({}, {"modalities.authors": 1}):
    # Collection.save() is deprecated since PyMongo 3.0 and removed in 4.0;
    # a targeted $set also avoids re-sending the entire document.
    out_collection.update_one(
        {"_id": doc["_id"]},
        {"$set": {"authors_names": doc["modalities"]["authors"]}},
    )



CPU times: user 1min 29s, sys: 3.79 s, total: 1min 33s
Wall time: 2min 11s


In [11]:
%%time

# TODO: move the splitting of tokens into two modalities (MOD and MOD_habr) into a separate module

# Vocabulary found in the PostNauka file, collected separately for the two
# modalities of interest.
pn_vocab = {"text": set(), "flat_tag": set()}

# `with` closes the file deterministically (the bare open() in a for-loop did not).
with open("../datasets/postnauka/postnauka.txt") as pn_file:
    for line in pn_file:
        # Reset per line: tokens that appear before the first "|modality" marker
        # must not be attributed to the previous line's modality (and must not
        # raise NameError on the very first line).
        cur_mod = None
        # The first whitespace-separated token of each line is skipped.
        for token in line.split()[1:]:
            if token.startswith("|"):
                # "|name" switches the current modality to "name".
                cur_mod = token[1:]
            elif cur_mod in pn_vocab:
                # Tokens of any other modality are ignored.
                pn_vocab[cur_mod].add(token)

CPU times: user 1.51 s, sys: 6 ms, total: 1.52 s
Wall time: 1.52 s


In [12]:
# Combined vocabulary size of the "text" and "flat_tag" modalities.
sum(len(pn_vocab[modality]) for modality in ("text", "flat_tag"))

45093

In [13]:
def preprocess_tag(tag):
    """Normalize a tag into a single underscore-joined token.

    The tag is stripped and lower-cased, its runs of Unicode letters are
    extracted, and those runs are joined with "_". A tag containing no
    letters yields an empty string.
    """
    normalized = tag.strip().lower()
    letter_runs = regex.findall("(?u)\\b\\p{L}+\\b", normalized)
    return "_".join(letter_runs)

In [18]:
%%time

# Documents with this many text tokens or fewer are skipped as too short.
MIN_DOC_TOKENS = 100


def _vw_line(doc_id, modalities):
    """Format one document as a line ``<id> |<modality> tok ... |<modality> tok ...``.

    Whitespace inside a token is replaced with underscores so that every token
    stays a single whitespace-delimited field. Modalities are emitted in the
    dict's iteration order.
    """
    sections = (
        "|%s %s" % (name, " ".join("_".join(token.split()) for token in tokens))
        for name, tokens in modalities.items()
    )
    return "%s %s\n" % (doc_id, " ".join(sections))


with open("../datasets/habrahabr/habrahabr.txt", "w") as vw_file:
    for doc_id, mongo_doc in enumerate(in_collection.find({ "company_blog": None }), 1):
        doc = {}
        doc["_id"] = "habr_%d" % doc_id
        doc["title"] = mongo_doc["title"]
        doc["url"] = mongo_doc["url"]
        doc["modalities"] = {"text_habr": [], "text": [], "flat_tag_habr": [], "flat_tag": []}
        modalities = doc_pipeline.fit_transform(mongo_doc["content_html"])
        # Tokens present in the PostNauka vocabulary go to the shared modality,
        # all others go to the Habrahabr-specific "*_habr" modality.
        for token in modalities["text"]:
            if token in pn_vocab["text"]:
                doc["modalities"]["text"].append(token)
            else:
                doc["modalities"]["text_habr"].append(token)
        for token in map(preprocess_tag, mongo_doc["tags"]):
            if not token:
                # A tag without any letters normalizes to "", which would be
                # stored as an empty token; skip it.
                continue
            if token in pn_vocab["flat_tag"]:
                doc["modalities"]["flat_tag"].append(token)
            else:
                doc["modalities"]["flat_tag_habr"].append(token)
        doc["modalities"]["authors"] = [mongo_doc["author_user"]]
        doc["modalities"]["hubs"] = mongo_doc["hubs"]
        doc["markdown"] = mongo_doc["content_html"]
        # TODO: fetch the authors' real names from Habrahabr
        doc["authors_names"] = doc["modalities"]["authors"]
        # Filter out short Habrahabr documents
        if len(doc["modalities"]["text"]) + len(doc["modalities"]["text_habr"]) > MIN_DOC_TOKENS:
            # Write to the Vowpal Wabbit file
            vw_file.write(_vw_line(doc["_id"], doc["modalities"]))
            # Write to MongoDB. An upsert keeps the cell re-runnable: the VW file is
            # rewritten from scratch ("w" mode), whereas insert_one would raise
            # DuplicateKeyError on the first document that already exists.
            out_collection.replace_one({"_id": doc["_id"]}, doc, upsert=True)

CPU times: user 12min 9s, sys: 15.2 s, total: 12min 24s
Wall time: 1h 3min 17s


---