In [279]:
import regex
import collections
from lxml import html
from pymongo import MongoClient
from urllib.request import urlopen
from urllib.error import HTTPError
from sklearn.pipeline import Pipeline
from parsers.text_utils import DefaultTextProcessor, DefaultDocumentProcessor

### Загрузка страниц с веб-сайта elementy.ru

In [220]:
def maybe(f, x):
    """Apply ``f`` to ``x`` when ``x`` is truthy; otherwise return ``None``.

    Any falsy ``x`` (``None``, ``""``, ``0``, empty containers) short-circuits
    to ``None`` without calling ``f``.
    """
    if not x:
        return None
    return f(x)

In [25]:
def process_html(text):
    """Return ``text`` with every non-breaking space (U+00A0) turned into a regular space."""
    pieces = text.split("\xa0")
    return " ".join(pieces)

In [61]:
def process_tag(text):
    """Normalise a tag caption into a lowercase, underscore-separated token.

    Non-breaking spaces are first converted to plain spaces by ``process_html``,
    surrounding whitespace is stripped, every remaining whitespace character is
    replaced with ``_`` (one underscore per character, runs are not collapsed),
    and the result is lowercased.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return regex.sub(r"\s", "_", process_html(text).strip()).lower()

In [261]:
%%time

# Парсинг всех страниц
pages_ids = list(range(431231, 433629))
pages = []

for i, page_id in enumerate(pages_ids, 1):
    try:
        page_url = "http://elementy.ru/nauchno-populyarnaya_biblioteka/%d/" % page_id
        page = html.parse(urlopen(page_url))

        title = process_html(page.findtext("//h1"))
        tags = list(map(lambda p: process_tag(p.text),
                        page.findall("//div[@class='mb itemhead newslist']/div/a")[1:-1]))
        article = page.find("//div[@class='itemblock']/div[@class='memo']")

        summary = maybe(process_html, article.findtext("./p[@class='Intro']"))
        text = []
        content_flag = False
        for elem in article.iterfind("p"):
            if len(elem.classes) > 0:
                continue
            # TODO: filter wrong paragraphs
            # TODO: can also be non-paragraphs (h3, ol, etc)
            text.append(process_html(elem.text_content()))
        text = "\n\n".join(text)
        
        pages.append((page_id, title, tags, summary, text))
    except Exception:
        pass
    
    if i % 100 == 0:
        print("Parsed %d/%d pages" % (len(pages), i))

Parsed 98/100 pages
Parsed 198/200 pages
Parsed 296/300 pages
Parsed 392/400 pages
Parsed 491/500 pages
Parsed 589/600 pages
Parsed 686/700 pages
Parsed 784/800 pages
Parsed 881/900 pages
Parsed 981/1000 pages
Parsed 1079/1100 pages
Parsed 1176/1200 pages
Parsed 1273/1300 pages
Parsed 1370/1400 pages
Parsed 1466/1500 pages
Parsed 1566/1600 pages
Parsed 1658/1700 pages
Parsed 1752/1800 pages
Parsed 1848/1900 pages
Parsed 1946/2000 pages
Parsed 2038/2100 pages
Parsed 2129/2200 pages
Parsed 2223/2300 pages
CPU times: user 45.4 s, sys: 3.45 s, total: 48.9 s
Wall time: 28min 17s


In [267]:
# Number of page records currently held in `pages`.
len(pages)

2300

### Парсинг

In [292]:
# Lemmas excluded during preprocessing: the stop-word list plus the rare-word list.
# NOTE: rarewords.txt is written by the low-DF filtering section further down in this
# notebook, so that section has to have been run at least once before this cell.
# Context managers make sure both word-list files are closed after reading.
with open("../datasets/elementy/stopwords.txt") as stop_words_file:
    stop_words = stop_words_file.read().split()
with open("../datasets/elementy/rarewords.txt") as rare_words_file:
    rare_words = rare_words_file.read().split()
stop_lemmas = set(stop_words).union(set(rare_words))

# Two-stage preprocessing pipeline; the token pattern matches runs of Unicode letters.
doc_pipeline = Pipeline([
    ("text-processor",     DefaultTextProcessor(token_pattern="(?u)\\b\\p{L}+\\b")),
    ("document-processor", DefaultDocumentProcessor(stop_lemmas=stop_lemmas)),
])

In [293]:
%%time

# TODO: вынести разнесение токенов по двум модальностям (MOD и MOD_habr) в отдельный модуль

pn_vocab = {"text": set(), "flat_tag": set()}

for doc in open("../datasets/postnauka/postnauka.txt"):
    tokens = doc.split()
    for token in tokens[1:]:
        if token.startswith("|"):
            cur_mod = token[1:]
        else:
            if cur_mod == "text" or cur_mod == "flat_tag":
                pn_vocab[cur_mod].add(token)

CPU times: user 1.48 s, sys: 19 ms, total: 1.5 s
Wall time: 1.52 s


In [294]:
# Combined vocabulary size over the two tracked modalities.
sum(len(pn_vocab[modality]) for modality in ("text", "flat_tag"))

44995

In [295]:
# Connect with the driver's default connection settings, then select the
# database and the collection that receives the processed documents.
client = MongoClient()
datasets_db = client["datasets"]
out_collection = datasets_db["elementy"]

In [296]:
%%time

with open("../datasets/elementy/elementy.txt", "w") as vw_file:
    for i, page in enumerate(pages, 1):
        page_id, title, tags, summary, text = page
        doc = {}
        doc["_id"] = "elem_%d" % page_id
        doc["title"] = title
        doc["url"] = "http://elementy.ru/nauchno-populyarnaya_biblioteka/%d/" % page_id
        doc["modalities"] = {"text_elem": [], "text": [], "flat_tag_elem": [], "flat_tag": []}
        modalities = doc_pipeline.fit_transform(text)
        for token in modalities["text"]:
            if token in pn_vocab["text"]:
                doc["modalities"]["text"].append(token)
            else:
                doc["modalities"]["text"].append(token)
                doc["modalities"]["text_elem"].append(token)
        for token in tags:
            if token in pn_vocab["flat_tag"]:
                doc["modalities"]["flat_tag"].append(token)
            else:
                doc["modalities"]["flat_tag"].append(token)
                doc["modalities"]["flat_tag_elem"].append(token)
        doc["summary"] = summary
        doc["markdown"] = text
        # Фильтрация коротких документов из Элементов
        if len(doc["modalities"]["text"]) > 100:
            # Записать в Vowpal Wabbit
            modalities_str = " ".join(map(lambda p: "|%s %s" % (p[0],
                             " ".join(map(lambda t: "_".join(t.split()), p[1]))), doc["modalities"].items()))
            vw_file.write("%s %s\n" % (doc["_id"], modalities_str))
            # Записать в MongoDB
            out_collection.insert_one(doc)
        if i % 500 == 0:
            print("Written %d pages" % i)

Written 500 pages
Written 1000 pages
Written 1500 pages
Written 2000 pages
CPU times: user 38.1 s, sys: 612 ms, total: 38.7 s
Wall time: 2min 54s


### Фильтрация слов с низким DF

In [285]:
%%time

word_counter = collections.defaultdict(set)

for i, page in enumerate(pages, 1):
    page_id, _, _, _, text = page
    modalities = doc_pipeline.fit_transform(text)
    for word in modalities["text"]:
        word_counter[word].add(page_id)
    if i % 500 == 0:
        print("Processed %d pages" % i)

Processed 500 pages
Processed 1000 pages
Processed 1500 pages
Processed 2000 pages
CPU times: user 32.1 s, sys: 214 ms, total: 32.4 s
Wall time: 2min 50s


In [286]:
# Snapshot of the (word, page-id set) pairs as a list.
words = [*word_counter.items()]

In [287]:
# Number of distinct words recorded in `word_counter`.
len(word_counter)

79946

In [288]:
# Words that occur in at most one page (document frequency <= 1).
rare_words = {word for word, page_ids in words if len(page_ids) <= 1}

In [289]:
# Absolute number of rare words, followed by their share of the whole vocabulary.
rare_count = len(rare_words)
print(rare_count)
print(rare_count / len(words))

39494
0.49400845570760266


In [291]:
# Persist the rare words, one per line. The context manager closes (and flushes) the file
# deterministically; sorting makes the file content reproducible between runs, because
# set iteration order is not stable.
with open("../datasets/elementy/rarewords.txt", "w") as rare_words_file:
    chars_written = rare_words_file.write("\n".join(sorted(rare_words)))
# Keep showing the number of characters written as the cell output.
chars_written

388252

---