In [1]:
import os
import collections
import pymysql
import pymysql.cursors
import pymongo

In [2]:
def id_func(doc_id):
    """Format a sequential document number as a dataset id, e.g. 1 -> "pn_1"."""
    return "pn_%d" % doc_id


# Every entry name in the meta directory is parsed as an integer post id after
# dropping its last 9 characters (presumably a fixed file suffix -- confirm).
ids = sorted(int(name[:-9]) for name in os.listdir("../datasets/postnauka/raw_data/meta"))

# Map raw post ids, in ascending order, to consecutive ids "pn_1", "pn_2", ...
ids_map = {post_id: id_func(position) for position, post_id in enumerate(ids, start=1)}

In [3]:
# MongoDB client created with the driver's default connection settings
# (no host or port is passed); later cells index it by database and collection.
mongodb = pymongo.MongoClient()

In [4]:
%%time

# Enrich the PostNauka posts with the names of their authors.
#
# Reads (post, author) rows from the local "postnauka" MySQL database and
# collects, per dataset document id, the list of author names.
# NOTE(review): connection credentials are hard-coded (root, empty password).

conn = pymysql.connect(
    host="localhost",
    user="root",
    password="",
    db="postnauka",
    charset="utf8",
    cursorclass=pymysql.cursors.DictCursor,  # rows come back as dicts keyed by column alias
)

# dataset document id ("pn_N") -> list of author names, in query order
authors_names = collections.defaultdict(list)

try:
    with conn.cursor() as cursor:
        authors_query = """
        -- Получить имена авторов
        select tr.object_id as post_id, t.term_id as author_id, tt.description as author_name
        from pn_term_taxonomy tt
        join pn_terms t on (t.term_id = tt.term_id)
        join pn_term_relationships tr on (tr.term_taxonomy_id = tt.term_taxonomy_id)
        join pn_posts p on (p.id = tr.object_id)
        where p.post_type = 'post' and p.post_status = 'publish' and tt.taxonomy = 'author'
        order by tr.object_id, t.term_id
        """
        cursor.execute(authors_query)
        for record in cursor:
            # Translate the raw post id into the dataset's "pn_N" id.
            doc_id = ids_map[record["post_id"]]
            # Drop the last three whitespace-separated tokens of the description,
            # then keep the first half of what remains as the author's name.
            # NOTE(review): presumably the description repeats the name twice
            # before three trailing fields -- confirm against the data.
            name_tokens = record["author_name"].split()[:-3]
            kept = len(name_tokens) // 2
            authors_names[doc_id].append(" ".join(name_tokens[:kept]))
finally:
    # Always release the MySQL connection, even if the query or parsing fails.
    conn.close()

CPU times: user 101 ms, sys: 3 ms, total: 104 ms
Wall time: 235 ms


In [5]:
%%time

# Write the collected author names onto the matching documents of the
# "postnauka" collection in the "datasets" database, one update per document.
postnauka_collection = mongodb["datasets"]["postnauka"]
for doc_id, names in authors_names.items():
    selector = {"_id": doc_id}
    change = {"$set": {"authors_names": names}}
    postnauka_collection.update_one(selector, change)

CPU times: user 1.03 s, sys: 78 ms, total: 1.1 s
Wall time: 2.49 s


In [6]:
%%time

# Fill in the missing values: every dataset id in ids_map that received no
# author rows gets an empty "authors_names" list.
#
# dict.values() is not documented as a set-like view (only keys/items views
# are), so build the set explicitly instead of subtracting from the view.
docs_without_authors = set(ids_map.values()).difference(authors_names)

for doc_id in docs_without_authors:
    mongodb["datasets"]["postnauka"].update_one({"_id": doc_id}, {"$set": {"authors_names": []}})

CPU times: user 3 ms, sys: 0 ns, total: 3 ms
Wall time: 2.26 ms


In [7]:
# Show the dataset ids that have no authors. The set is built explicitly
# because dict.values() is not documented as a set-like view.
set(ids_map.values()).difference(authors_names)

{'pn_2523', 'pn_2751', 'pn_3080', 'pn_3444'}

---