In [1]:
import ast
import collections
import csv

import numpy as np
import pandas as pd
import pymongo
import regex

In [2]:
# Location of the raw arXiv metadata CSV that the cells below read.
PATH = "arxiv.csv"

### Экспериментируем

In [59]:
%%time

# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 keeps the first column ("identifier") as the index,
# which is what from_csv did by default. from_csv also attempted to parse the
# index as dates, but the index here holds stringified lists, so that step is
# not reproduced.
df = pd.read_csv(PATH, index_col=0)

CPU times: user 4.06 s, sys: 246 ms, total: 4.3 s
Wall time: 4.68 s


In [121]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,publisher,contributor,type,language,relation,description,title,coverage,rights,format,source,creator,date,subject
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
"['http://arxiv.org/abs/0704.0001', 'Phys.Rev.D76:013009,2007', 'doi:10.1103/PhysRevD.76.013009']",[],[],['text'],[],[],[' A fully differential calculation in pertur...,['Calculation of prompt diphoton production cr...,[],[],[],[],"['Balázs, C.', 'Berger, E. L.', 'Nadolsky, P. ...","['2007-04-02', '2007-07-24']",['High Energy Physics - Phenomenology']
['http://arxiv.org/abs/0704.0002'],[],[],['text'],[],[],"[' We describe a new algorithm, the $(k,\\ell...",['Sparsity-certifying Graph Decompositions'],[],[],[],[],"['Streinu, Ileana', 'Theran, Louis']","['2007-03-30', '2008-12-13']","['Mathematics - Combinatorics', 'Computer Scie..."
['http://arxiv.org/abs/0704.0003'],[],[],['text'],[],[],"["" The evolution of Earth-Moon system is desc...",['The evolution of the Earth-Moon system based...,[],[],[],[],"['Pan, Hongjun']","['2007-04-01', '2008-01-12']",['Physics - General Physics']
['http://arxiv.org/abs/0704.0004'],[],[],['text'],[],[],[' We show that a determinant of Stirling cyc...,['A determinant of Stirling cycle numbers coun...,[],[],[],[],"['Callan, David']",['2007-03-30'],"['Mathematics - Combinatorics', '05A15']"
"['http://arxiv.org/abs/0704.0005', 'Illinois J. Math. 52 (2008) no.2, 681-689']",[],[],['text'],[],[],[' In this paper we show how to compute the $...,['From dyadic $\\Lambda_{\\alpha}$ to $\\Lambd...,[],[],[],[],"['Abu-Shammala, Wael', 'Torchinsky, Alberto']",['2007-04-02'],"['Mathematics - Classical Analysis and ODEs', ..."


Идентификаторы:

In [159]:
%%time

# Build one "arxiv_<id>" document id per row. The index holds a stringified
# Python list of identifiers; the first entry is the arXiv abstract URL, and
# everything after the URL prefix is used as the id.
prefix = "http://arxiv.org/abs/"
prefix_len = len(prefix)

# ast.literal_eval only accepts Python literals, so a malformed or hostile CSV
# cell raises an error instead of executing code as the former eval() could.
doc_id = [
    "arxiv_" + ast.literal_eval(ids)[0][prefix_len:]
    for ids in df.index
]

# One id per row, otherwise the lists built in the other cells would misalign.
assert len(df) == len(doc_id)

CPU times: user 4.56 s, sys: 42 ms, total: 4.6 s
Wall time: 4.6 s


Заголовки:

In [125]:
%%time

# Each "title" cell is a stringified Python list; keep its first element.
# ast.literal_eval parses the literal without executing arbitrary code
# (the previous eval() would run whatever expression the CSV contained).
title = [ast.literal_eval(titles)[0] for titles in df["title"]]

# One title per row.
assert len(df) == len(title)

CPU times: user 3.63 s, sys: 44 ms, total: 3.68 s
Wall time: 3.68 s


Описания:

In [139]:
%%time

# Each "description" cell is a stringified Python list; keep its first element.
# ast.literal_eval parses the literal without executing arbitrary code
# (the previous eval() would run whatever expression the CSV contained).
description = [ast.literal_eval(descs)[0] for descs in df["description"]]

# One description per row.
assert len(df) == len(description)

CPU times: user 7.03 s, sys: 118 ms, total: 7.15 s
Wall time: 7.15 s


In [140]:
# Total number of characters across all extracted descriptions.
sum(len(text) for text in description)

255111394

Даты:

In [97]:
%%time

# Split each row's stringified date list into its first and last entry.
first_date = []
last_date = []

for raw_dates in df["date"]:
    # literal_eval parses the list literal without executing arbitrary code
    # (the previous eval() would run whatever expression the CSV contained).
    dates = ast.literal_eval(raw_dates)
    # Rows without any date get None in both lists so they stay aligned with df.
    first_date.append(dates[0] if dates else None)
    last_date.append(dates[-1] if dates else None)

assert len(df) == len(first_date) == len(last_date)

CPU times: user 3.65 s, sys: 27 ms, total: 3.67 s
Wall time: 3.67 s


Категории:

In [98]:
%%time

# Per-row subject categories:
#   cat0_subjects  - top-level areas ("mathematics")
#   cat1_subjects  - "area - subarea" pairs ("mathematics - combinatorics")
#   other_subjects - every subject string that did not pass the filter below
cat0_subjects = []
cat1_subjects = []
other_subjects = []

# Raw string avoids the invalid "\s" string escape. The hyphen is placed last
# in the character class so it is unambiguously a literal "-" and not a range.
subject_regex = regex.compile(r"^[a-z\s,-]+$")

for subjs in df["subject"]:
    s0, s1 = set(), set()
    # literal_eval parses the list literal without executing arbitrary code
    # (the previous eval() would run whatever expression the CSV contained).
    for subj in ast.literal_eval(subjs):
        subj = subj.lower().strip()
        parts = subj.split(" - ")
        # Accept only lowercase words/commas/hyphens with at most one " - " split.
        if subject_regex.match(subj) and len(parts) <= 2:
            s0.add(parts[0])
            if len(parts) == 2:
                s1.add(" - ".join(parts))
        else:
            other_subjects.append(subj)
    # sorted() instead of list(): set iteration order for strings changes
    # between interpreter runs, which made the output non-reproducible.
    cat0_subjects.append(sorted(s0))
    cat1_subjects.append(sorted(s1))

assert len(df) == len(cat0_subjects) == len(cat1_subjects)

CPU times: user 8.95 s, sys: 76 ms, total: 9.03 s
Wall time: 9.03 s


In [76]:
# Ten most frequent subject strings collected in other_subjects.
collections.Counter(other_subjects).most_common(10)

[('f.2.2', 605),
 ('f.4.1', 364),
 ('g.2.2', 350),
 ('57m25', 307),
 ('f.1.1', 220),
 ('g.3', 213),
 ('05a15', 206),
 ('60k35', 197),
 ('g.2.1', 177),
 ('57m27', 161)]

### Собираем всё вместе

In [3]:
%%time

# Stream the raw CSV row by row and write a cleaned copy to "arxiv_clean.csv"
# with one record per article (columns listed in `fieldnames`).
fieldnames = ("doc_id", "identifiers", "cat0_subject", "cat1_subject", "first_date", "last_date", "authors",
              "title", "description")
# Raw string: "\-" is not a valid string escape, so the non-raw form only
# worked by accident (and warns on recent Python versions).
subject_regex = regex.compile(r"^[a-z_\-,]+$")
prefix = "http://arxiv.org/abs/"
prefix_len = len(prefix)

# The csv module requires files opened with newline="" so that newlines inside
# quoted fields are read correctly and no extra "\r" is written on Windows.
# The encoding is pinned because the data contains non-ASCII author names and
# the platform default is not guaranteed to be UTF-8.
with open(PATH, newline="", encoding="utf-8") as infile, \
        open("arxiv_clean.csv", "w", newline="", encoding="utf-8") as outfile:
    reader = csv.DictReader(infile)
    writer = csv.DictWriter(outfile, fieldnames)
    writer.writeheader()
    for row in reader:
        article = {}
        article["authors"] = row["creator"]
        # Identifiers: keep the raw list string and derive "arxiv_<id>" from
        # the first entry by cutting off the abstract URL prefix.
        # literal_eval parses list literals without executing arbitrary code,
        # unlike the eval() calls it replaces.
        ids = row["identifier"]
        article["identifiers"] = ids
        article["doc_id"] = "arxiv_" + ast.literal_eval(ids)[0][prefix_len:]
        # Categories: spaces become underscores; only subjects made of
        # lowercase letters, underscores, hyphens and commas with at most one
        # "_-_" separator are kept.
        s0, s1 = set(), set()
        for subj in ast.literal_eval(row["subject"]):
            subj = subj.lower().strip().replace(" ", "_")
            parts = subj.split("_-_")
            if subject_regex.match(subj) and len(parts) <= 2:
                s0.add(parts[0])
                if len(parts) == 2:
                    s1.add("_-_".join(parts))
        # sorted() instead of list(): set iteration order for strings changes
        # between interpreter runs, which made the output file non-reproducible.
        article["cat0_subject"] = sorted(s0)
        article["cat1_subject"] = sorted(s1)
        # Dates: first and last entry of the list. When the list is empty the
        # keys are left out and DictWriter writes its default restval ("").
        dates = ast.literal_eval(row["date"])
        if dates:
            article["first_date"] = dates[0]
            article["last_date"] = dates[-1]
        # Title and description: first element of each list.
        article["title"] = ast.literal_eval(row["title"])[0]
        article["description"] = ast.literal_eval(row["description"])[0]
        writer.writerow(article)

CPU times: user 3min 42s, sys: 3.44 s, total: 3min 46s
Wall time: 3min 47s


---