# Хабрахабр

In [1]:
%matplotlib inline
import regex
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from collections import Counter, defaultdict
from pymongo import MongoClient
from sklearn.pipeline import Pipeline
from parsers.text_utils import DefaultTextProcessor, DefaultDocumentProcessor
from ipywidgets import FloatProgress
from IPython.display import display

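Переносим документы из коллекции `test.habrahabr` в коллекцию `datasets.habrahabr`, приводя их к новому формату, и одновременно сохраняем их на диск в формате Vowpal Wabbit. Обрабатываются только записи, у которых `company_blog` не задан; документы, содержащие не более 100 текстовых токенов, отбрасываются.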

In [3]:
# Connect to MongoDB using the driver defaults (no host/port is passed here).
client = MongoClient()
# Source collection: `habrahabr` in the `test` database.
in_collection = client["test"]["habrahabr"]
# Destination collection: `habrahabr` in the `datasets` database.
out_collection = client["datasets"]["habrahabr"]

In [4]:
def read_word_list(path):
    """Return the whitespace-separated words stored in the text file at `path`."""
    # The context manager closes the handle deterministically; a bare
    # `open(path).read()` leaves that to the garbage collector.
    with open(path) as word_file:
        return word_file.read().split()


stop_words = read_word_list("../datasets/habrahabr/stopwords.txt")
# NOTE: rarewords.txt is written by the last cell of this notebook, so that cell
# must have been run at least once before this one can succeed.
rare_words = read_word_list("../datasets/habrahabr/rarewords.txt")
# Union of both lists, handed to the document processor as `stop_lemmas`.
stop_lemmas = set(stop_words).union(rare_words)
doc_pipeline = Pipeline([
    # The token pattern matches maximal runs of Unicode letters (\p{L}).
    ("text-processor",     DefaultTextProcessor(token_pattern="(?u)\\b\\p{L}+\\b")),
    # NOTE(review): presumably drops the given lemmas from the output - confirm in parsers.text_utils.
    ("document-processor", DefaultDocumentProcessor(stop_lemmas=stop_lemmas)),
])

---

In [5]:
%%time

# TODO: вынести разнесение токенов по двум модальностям (MOD и MOD_habr) в отдельный модуль

pn_vocab = {"text": set(), "flat_tag": set()}

for doc in open("../datasets/postnauka/postnauka.txt"):
    tokens = doc.split()
    for token in tokens[1:]:
        if token.startswith("|"):
            cur_mod = token[1:]
        else:
            if cur_mod == "text" or cur_mod == "flat_tag":
                pn_vocab[cur_mod].add(token)

CPU times: user 1.63 s, sys: 26 ms, total: 1.66 s
Wall time: 1.67 s


In [6]:
# Combined size of the two PostNauka vocabularies ("text" and "flat_tag").
sum(len(pn_vocab[modality]) for modality in ("text", "flat_tag"))

44995

In [7]:
def preprocess_tag(tag):
    """Normalise a tag into a single token.

    The tag is stripped and lower-cased, its letter-only words are extracted,
    and those words are joined with "_". A tag that contains no such word
    yields an empty string.
    """
    normalized = tag.strip().lower()
    letter_words = regex.findall("(?u)\\b\\p{L}+\\b", normalized)
    return "_".join(letter_words)

In [8]:
%%time

docs_count = in_collection.count({ "company_blog": None })
f = FloatProgress(min=0, max=docs_count)
display(f)

with open("../datasets/habrahabr/habrahabr.txt", "w") as vw_file:
    for doc_id, mongo_doc in enumerate(in_collection.find({ "company_blog": None }), 1):
        doc = {}
        doc["_id"] = "habr_%d" % doc_id
        doc["title"] = mongo_doc["title"]
        doc["url"] = mongo_doc["url"]
        doc["modalities"] = {"text_habr": [], "text": [], "flat_tag_habr": [], "flat_tag": []}
        modalities = doc_pipeline.fit_transform(mongo_doc["content_html"])
        for token in modalities["text"]:
            if token in pn_vocab["text"]:
                doc["modalities"]["text"].append(token)
            else:
                doc["modalities"]["text_habr"].append(token)
        for token in map(preprocess_tag, mongo_doc["tags"]):
            if token in pn_vocab["flat_tag"]:
                doc["modalities"]["flat_tag"].append(token)
            else:
                doc["modalities"]["flat_tag_habr"].append(token)
        doc["modalities"]["authors"] = [mongo_doc["author_user"]]
        doc["modalities"]["hubs"] = mongo_doc["hubs"]
        doc["markdown"] = mongo_doc["content_html"]
        # TODO: подтягивать имена авторов с Хабра
        doc["authors_names"] = doc["modalities"]["authors"]
        # Фильтрация коротких документов из Хабра
        if len(doc["modalities"]["text"]) + len(doc["modalities"]["text_habr"]) > 100:
            # Записать в Vowpal Wabbit
            modalities_str = " ".join(map(lambda p: "|%s %s" % (p[0],
                             " ".join(map(lambda t: "_".join(t.split()), p[1]))), doc["modalities"].items()))
            vw_file.write("%s %s\n" % (doc["_id"], modalities_str))
            # Записать в MongoDB
            out_collection.insert_one(doc)
        # Увеличить счетчик прогресс-бара
        f.value += 1

CPU times: user 16min 31s, sys: 35 s, total: 17min 6s
Wall time: 1h 8min 36s


---

### Фильтрация слов с низким DF

In [28]:
%%time

docs_count = in_collection.count({ "company_blog": None })
f = FloatProgress(min=0, max=docs_count)
display(f)

word_counter = defaultdict(set)

for doc_id, mongo_doc in enumerate(in_collection.find({ "company_blog": None }), 1):
    modalities = doc_pipeline.fit_transform(mongo_doc["content_html"])
    for word in modalities["text"]:
        word_counter[word].add(doc_id)
    # Увеличить счетчик прогресс-бара
    f.value += 1

CPU times: user 18min 34s, sys: 29.3 s, total: 19min 3s
Wall time: 1h 7min 55s


In [29]:
# Snapshot of (word, set of document numbers) pairs, used for the rare-word filter below.
words = [(word, doc_ids) for word, doc_ids in word_counter.items()]

In [30]:
# Number of distinct words collected in word_counter.
len(word_counter)

602833

In [68]:
# A word is rare when it was seen in at most one document.
# Note: this rebinds `rare_words`, which earlier in the notebook held the list
# loaded from rarewords.txt.
rare_words = {word for word, doc_ids in words if len(doc_ids) <= 1}

In [69]:
rare_count = len(rare_words)
# First line: number of rare words; second line: their share of all collected words.
print(rare_count, rare_count / len(words), sep="\n")

384972
0.6386047213739129


In [70]:
# Persist the rare words, one per line. `with` guarantees the file is flushed and
# closed; sorting makes the file contents reproducible, since set iteration order
# is arbitrary.
with open("../datasets/habrahabr/rarewords.txt", "w") as rare_file:
    chars_written = rare_file.write("\n".join(sorted(rare_words)))
# Keep the number of characters written as the cell output.
chars_written

4415124

---