In [8]:
import logging
# Raise the "spacy" logger's threshold so that only ERROR and above get through.
logger = logging.getLogger("spacy")
logger.setLevel("ERROR")

import dask.bag as db
import json
import pandas as pd

# Read the arXiv metadata snapshot as a bag of text lines, then parse each
# line with json.loads so every element of `docs` is one metadata record.
raw_lines = db.read_text('../data/raw/arxiv-metadata-oai-snapshot.json')
docs = raw_lines.map(json.loads)

In [9]:
# Total number of documents: build the count on the bag, then compute it.
doc_count = docs.count()
total_docs = doc_count.compute()

In [10]:
# Show the document count stored in total_docs.
print(total_docs)

2860945


In [11]:
# Peek at a single record to see which metadata fields are available.
docs.take(1)

({'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [12]:
# The dataset is very large, and it may not be practical to use all of it, so start with a subset.
# This subsetting procedure is the one recommended by the arXiv dataset itself.

def get_latest_version(x):
    """Return the `created` value of the last entry in `x['versions']`."""
    newest = x['versions'][-1]
    return newest['created']

# Keep only the metadata fields needed from each record.
def trim(x):
    """Reduce a raw metadata record to the fields kept by this notebook.

    `id`, `authors`, `title`, `doi` and `abstract` are copied unchanged.
    `categories` is split on single spaces and stored as a list under the
    key `category`.
    """
    kept = {key: x[key] for key in ('id', 'authors', 'title', 'doi')}
    kept['category'] = x['categories'].split(' ')
    kept['abstract'] = x['abstract']
    return kept

# Keep records whose last listed version was created in 2019 or later.
# The test is on the latest version, so older papers that were revised since
# then are kept as well.
# Assumes `created` looks like 'Mon, 2 Apr 2007 19:18:42 GMT', i.e. the year is
# the fourth space-separated token -- TODO confirm against the raw data.
# NOTE(review): `columns` is not referenced in this cell; it is kept in case a
# later cell relies on it.
columns = ['id', 'category', 'abstract']
recent_docs = docs.filter(
    lambda paper: int(get_latest_version(paper).split(' ')[3]) > 2018
)

# Trim each kept record, materialise the result and load it into pandas.
docs_df = pd.DataFrame(recent_docs.map(trim).compute())

In [13]:
# Save the trimmed dataset; index=False leaves the row index out of the file.
# NOTE(review): `category` holds a list per row, so it is written as the
# list's string form and has to be parsed when the file is read back. `id`
# values such as '0704.0033' should be read back as strings to keep the
# leading zero.
trimmed_csv_path = '../data/processed/trimmed_arxiv_doc_2018.csv'
docs_df.to_csv(trimmed_csv_path, index=False)

In [18]:
# Preview the first 10 rows of the trimmed frame.
# NOTE(review): the saved output below has an 'Unnamed: 0' column and ids
# without their leading zero (704.0033 rather than 0704.0033), which suggests
# docs_df was re-read from a CSV in a cell not shown here -- re-run from a
# fresh kernel to confirm, and read `id` back as a string if so.
docs_df.head(10)

Unnamed: 0,id,authors,title,doi,category,abstract
0,704.0033,"Maxim A. Yurkin, Valeri P. Maltsev, Alfons G. ...",Convergence of the discrete dipole approximati...,10.1364/JOSAA.23.002578 10.1364/JOSAA.32.002407,"[physics.optics, physics.comp-ph]",We performed a rigorous theoretical converge...
1,704.0038,"Maxim A. Yurkin, Alfons G. Hoekstra",The discrete dipole approximation: an overview...,10.1016/j.jqsrt.2007.01.034 10.1016/j.jqsrt.20...,"[physics.optics, physics.comp-ph]",We present a review of the discrete dipole a...
2,704.0479,T.Geisser,The affine part of the Picard scheme,,"[math.AG, math.KT]",We describe the maximal torus and maximal un...
3,704.1445,Yasha Gindikin and Vladimir A. Sablikov,Deformed Wigner crystal in a one-dimensional q...,10.1103/PhysRevB.76.045122,"[cond-mat.str-el, cond-mat.mes-hall]",The spatial Fourier spectrum of the electron...
4,704.1476,Chris Austin,TeV-scale gravity in Horava-Witten theory on a...,,[hep-th],The field equations and boundary conditions ...
5,704.2253,"William T. Reach, Michael S. Kelley, Mark V. S...",A survey of debris trails from short-period co...,10.1016/j.icarus.2007.03.031,[astro-ph],We observed 34 comets using the 24 micron ca...
6,705.0033,"Nikos Frantzikinakis, Randall McCutcheon",Ergodic Theory: Recurrence,,[math.DS],We survey the impact of the Poincar\'e recur...
7,705.0344,J. P. Pridham,Unifying derived deformation theories,,[math.AG],We develop a framework for derived deformati...
8,705.0825,Ram Gopal Vishwakarma (Zacatecas University),Einstein's Theory of Gravity in the Presence o...,10.1007/s10509-009-0016-8,"[gr-qc, astro-ph, hep-th]",The mysterious `dark energy' needed to expla...
9,705.1155,Kerry M. Soileau,State Vector Determination By A Single Trackin...,,[astro-ph],Using only a single tracking satellite capab...
