In [1]:
import datetime
import json
import re

import pandas as pd
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm
# Timestamp taken once when this cell runs; `now.year` is the reference point
# for the paper-age filter applied while reading the metadata.
now = datetime.datetime.now()


# Location of the arXiv metadata snapshot; read_metadata() streams it line by line.
data_file = '../input/arxiv/arxiv-metadata-oai-snapshot.json'

def read_metadata(path=None):
    """Lazily yield the raw lines of a line-delimited JSON metadata file.

    The file is streamed rather than loaded, so memory use stays flat no matter
    how large the snapshot is.

    Args:
        path: File to read. When omitted, the module-level ``data_file`` is used,
            which keeps existing zero-argument calls working.

    Yields:
        str: One line of the file at a time, exactly as stored (trailing newline
        included). Callers are expected to ``json.loads`` each line.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    if path is None:
        path = data_file
    # JSON text is UTF-8 by specification; do not depend on the locale default,
    # which differs between platforms and would garble non-ASCII author names.
    with open(path, 'r', encoding='utf-8') as f:
        yield from f

In [2]:
# Lookup table from arXiv category code to its human-readable name.
# It is used twice in this notebook:
#   * as an allow-list while filtering -- a paper is kept only if its `categories`
#     value is exactly equal to one of these keys;
#   * to translate the stored code into the display name afterwards.
# NOTE(review): presumably cross-listed papers carry several codes in one
# `categories` string and therefore never match a key here -- confirm against the data.
category_map = {'astro-ph': 'Astrophysics',
                'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
                'astro-ph.EP': 'Earth and Planetary Astrophysics',
                'astro-ph.GA': 'Astrophysics of Galaxies',
                'astro-ph.HE': 'High Energy Astrophysical Phenomena',
                'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
                'astro-ph.SR': 'Solar and Stellar Astrophysics',
                'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
                'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
                'cond-mat.mtrl-sci': 'Materials Science',
                'cond-mat.other': 'Other Condensed Matter',
                'cond-mat.quant-gas': 'Quantum Gases',
                'cond-mat.soft': 'Soft Condensed Matter',
                'cond-mat.stat-mech': 'Statistical Mechanics',
                'cond-mat.str-el': 'Strongly Correlated Electrons',
                'cond-mat.supr-con': 'Superconductivity',
                'cs.AI': 'Artificial Intelligence',
                'cs.AR': 'Hardware Architecture',
                'cs.CC': 'Computational Complexity',
                'cs.CE': 'Computational Engineering, Finance, and Science',
                'cs.CG': 'Computational Geometry',
                'cs.CL': 'Computation and Language',
                'cs.CR': 'Cryptography and Security',
                'cs.CV': 'Computer Vision and Pattern Recognition',
                'cs.CY': 'Computers and Society',
                'cs.DB': 'Databases',
                'cs.DC': 'Distributed, Parallel, and Cluster Computing',
                'cs.DL': 'Digital Libraries',
                'cs.DM': 'Discrete Mathematics',
                'cs.DS': 'Data Structures and Algorithms',
                'cs.ET': 'Emerging Technologies',
                'cs.FL': 'Formal Languages and Automata Theory',
                'cs.GL': 'General Literature',
                'cs.GR': 'Graphics',
                'cs.GT': 'Computer Science and Game Theory',
                'cs.HC': 'Human-Computer Interaction',
                'cs.IR': 'Information Retrieval',
                'cs.IT': 'Information Theory',
                'cs.LG': 'Machine Learning',
                'cs.LO': 'Logic in Computer Science',
                'cs.MA': 'Multiagent Systems',
                'cs.MM': 'Multimedia',
                'cs.MS': 'Mathematical Software',
                'cs.NA': 'Numerical Analysis',
                'cs.NE': 'Neural and Evolutionary Computing',
                'cs.NI': 'Networking and Internet Architecture',
                'cs.OH': 'Other Computer Science',
                'cs.OS': 'Operating Systems',
                'cs.PF': 'Performance',
                'cs.PL': 'Programming Languages',
                'cs.RO': 'Robotics',
                'cs.SC': 'Symbolic Computation',
                'cs.SD': 'Sound',
                'cs.SE': 'Software Engineering',
                'cs.SI': 'Social and Information Networks',
                'cs.SY': 'Systems and Control',
                'econ.EM': 'Econometrics',
                'eess.AS': 'Audio and Speech Processing',
                'eess.IV': 'Image and Video Processing',
                'eess.SP': 'Signal Processing',
                'gr-qc': 'General Relativity and Quantum Cosmology',
                'hep-ex': 'High Energy Physics - Experiment',
                'hep-lat': 'High Energy Physics - Lattice',
                'hep-ph': 'High Energy Physics - Phenomenology',
                'hep-th': 'High Energy Physics - Theory',
                'math.AC': 'Commutative Algebra',
                'math.AG': 'Algebraic Geometry',
                'math.AP': 'Analysis of PDEs',
                'math.AT': 'Algebraic Topology',
                'math.CA': 'Classical Analysis and ODEs',
                'math.CO': 'Combinatorics',
                'math.CT': 'Category Theory',
                'math.CV': 'Complex Variables',
                'math.DG': 'Differential Geometry',
                'math.DS': 'Dynamical Systems',
                'math.FA': 'Functional Analysis',
                'math.GM': 'General Mathematics',
                'math.GN': 'General Topology',
                'math.GR': 'Group Theory',
                'math.GT': 'Geometric Topology',
                'math.HO': 'History and Overview',
                'math.IT': 'Information Theory',
                'math.KT': 'K-Theory and Homology',
                'math.LO': 'Logic',
                'math.MG': 'Metric Geometry',
                'math.MP': 'Mathematical Physics',
                'math.NA': 'Numerical Analysis',
                'math.NT': 'Number Theory',
                'math.OA': 'Operator Algebras',
                'math.OC': 'Optimization and Control',
                'math.PR': 'Probability',
                'math.QA': 'Quantum Algebra',
                'math.RA': 'Rings and Algebras',
                'math.RT': 'Representation Theory',
                'math.SG': 'Symplectic Geometry',
                'math.SP': 'Spectral Theory',
                'math.ST': 'Statistics Theory',
                'math-ph': 'Mathematical Physics',
                'nlin.AO': 'Adaptation and Self-Organizing Systems',
                'nlin.CD': 'Chaotic Dynamics',
                'nlin.CG': 'Cellular Automata and Lattice Gases',
                'nlin.PS': 'Pattern Formation and Solitons',
                'nlin.SI': 'Exactly Solvable and Integrable Systems',
                'nucl-ex': 'Nuclear Experiment',
                'nucl-th': 'Nuclear Theory',
                'physics.acc-ph': 'Accelerator Physics',
                'physics.ao-ph': 'Atmospheric and Oceanic Physics',
                'physics.app-ph': 'Applied Physics',
                'physics.atm-clus': 'Atomic and Molecular Clusters',
                'physics.atom-ph': 'Atomic Physics',
                'physics.bio-ph': 'Biological Physics',
                'physics.chem-ph': 'Chemical Physics',
                'physics.class-ph': 'Classical Physics',
                'physics.comp-ph': 'Computational Physics',
                'physics.data-an': 'Data Analysis, Statistics and Probability',
                'physics.ed-ph': 'Physics Education',
                'physics.flu-dyn': 'Fluid Dynamics',
                'physics.gen-ph': 'General Physics',
                'physics.geo-ph': 'Geophysics',
                'physics.hist-ph': 'History and Philosophy of Physics',
                'physics.ins-det': 'Instrumentation and Detectors',
                'physics.med-ph': 'Medical Physics',
                'physics.optics': 'Optics',
                'physics.plasm-ph': 'Plasma Physics',
                'physics.pop-ph': 'Popular Physics',
                'physics.soc-ph': 'Physics and Society',
                'physics.space-ph': 'Space Physics',
                'q-bio.BM': 'Biomolecules',
                'q-bio.CB': 'Cell Behavior',
                'q-bio.GN': 'Genomics',
                'q-bio.MN': 'Molecular Networks',
                'q-bio.NC': 'Neurons and Cognition',
                'q-bio.OT': 'Other Quantitative Biology',
                'q-bio.PE': 'Populations and Evolution',
                'q-bio.QM': 'Quantitative Methods',
                'q-bio.SC': 'Subcellular Processes',
                'q-bio.TO': 'Tissues and Organs',
                'q-fin.CP': 'Computational Finance',
                'q-fin.EC': 'Economics',
                'q-fin.GN': 'General Finance',
                'q-fin.MF': 'Mathematical Finance',
                'q-fin.PM': 'Portfolio Management',
                'q-fin.PR': 'Pricing of Securities',
                'q-fin.RM': 'Risk Management',
                'q-fin.ST': 'Statistical Finance',
                'q-fin.TR': 'Trading and Market Microstructure',
                'quant-ph': 'Quantum Physics',
                'stat.AP': 'Applications',
                'stat.CO': 'Computation',
                'stat.ME': 'Methodology',
                'stat.ML': 'Machine Learning',
                'stat.OT': 'Other Statistics',
                'stat.TH': 'Statistics Theory'}

In [3]:
# Stream the snapshot and keep only papers that
#   * carry a journal reference whose last four characters parse as an integer year,
#   * were published within the last `years_to_download` years (and not in the future), and
#   * have a `categories` value that is exactly one key of `category_map`.
# NOTE(review): references that end in e.g. "(2007)" do not parse and are skipped;
# this matches the original extraction rule and is left unchanged on purpose.
metadata = read_metadata()
flatten_data = []
years_to_download = 50
max_papers = None  # debugging aid: set to an int to stop after that many kept papers
count = 0
for paper in tqdm(metadata, desc='papers'):
    paper_dict = json.loads(paper)
    ref = paper_dict.get('journal-ref')
    category = paper_dict.get('categories')
    try:
        year = int(ref[-4:])
        known_category = category in category_map
    except (TypeError, ValueError):
        # TypeError: journal-ref is missing (None) or a field has an unexpected type.
        # ValueError: the reference does not end in an integer.
        # Only these are caught (not a bare `except`) so that KeyboardInterrupt
        # and genuine bugs are no longer silently swallowed by the loop.
        continue
    # The lower bound rejects trailing numbers that are not plausible years
    # (e.g. a page number such as 3021 would otherwise pass as a "future" year).
    age = now.year - year
    if known_category and 0 <= age < years_to_download:
        paper_dict['year'] = year
        flatten_data.append(paper_dict)
        count += 1
        if max_papers is not None and count >= max_papers:
            break

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [4]:
# One row per kept paper; columns are the union of keys found in the record dicts.
df = pd.DataFrame(flatten_data)

In [5]:
# Derive display-friendly columns from the raw metadata.
_PAGES_RE = re.compile('[0-9]+ page', re.I)


def _extract_pages(comment):
    """Return the last '<digits> page' match found in `comment`, or 'N/A' if none.

    `comment` is stringified first, so a missing value (None) simply yields 'N/A'.
    """
    matches = _PAGES_RE.findall(str(comment))
    return matches[-1] if matches else 'N/A'


# `.get(code, code)` keeps this cell re-runnable: on a second run the values are
# already display names, which are passed through instead of raising KeyError.
df['categories'] = [category_map.get(code, code) for code in df['categories']]
df['pdf_link'] = ["https://arxiv.org/pdf/{}".format(paper_id) for paper_id in df['id']]
if 'comments' in df.columns:
    df['pages'] = [_extract_pages(comment) for comment in df['comments']]
elif 'pages' not in df.columns:
    # Neither the source column nor a previously derived result exists.
    raise KeyError("'comments' column is missing and 'pages' has not been derived yet")

In [6]:
# Drop columns that are no longer needed: `authors` duplicates `authors_parsed`,
# `comments` has already been mined for the page count, and the rest are unused.
useless_columns = ['authors', 'journal-ref', 'doi', 'report-no', 'license', 'versions', 'comments']
# Rebind instead of mutating in place, and ignore columns that are already gone,
# so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=useless_columns, errors='ignore')

In [7]:
# Preview the first rows to sanity-check the derived `year`, `pdf_link` and `pages` columns.
df.head()

Unnamed: 0,id,submitter,title,categories,abstract,update_date,authors_parsed,year,pdf_link,pages
0,704.0001,Pavel Nadolsky,Calculation of prompt diphoton production cros...,High Energy Physics - Phenomenology,A fully differential calculation in perturba...,2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...",2007,https://arxiv.org/pdf/0704.0001,37 page
1,704.0007,Alejandro Corichi,Polymer Quantum Mechanics and its Continuum Limit,General Relativity and Quantum Cosmology,A rather non-standard quantum representation...,2008-11-26,"[[Corichi, Alejandro, ], [Vukasinac, Tatjana, ...",2007,https://arxiv.org/pdf/0704.0007,16 page
2,704.0009,Paul Harvey,"The Spitzer c2d Survey of Large, Nearby, Inste...",Astrophysics,We discuss the results from the combined IRA...,2010-03-18,"[[Harvey, Paul, ], [Merin, Bruno, ], [Huard, T...",2007,https://arxiv.org/pdf/0704.0009,
3,704.0015,Christian Stahn,Fermionic superstring loop amplitudes in the p...,High Energy Physics - Theory,The pure spinor formulation of the ten-dimen...,2009-11-13,"[[Stahn, Christian, ]]",2007,https://arxiv.org/pdf/0704.0015,22 page
4,704.0016,Li Tong,Lifetime of doubly charmed baryons,High Energy Physics - Phenomenology,"In this work, we evaluate the lifetimes of t...",2008-12-18,"[[Chang, Chao-Hsi, ], [Li, Tong, ], [Li, Xue-Q...",2008,https://arxiv.org/pdf/0704.0016,17 page


In [8]:
# Summarise dtypes and non-null counts; memory_usage='deep' requests the real
# in-memory size of the object (string/list) columns rather than a shallow estimate.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134307 entries, 0 to 134306
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              134307 non-null  object
 1   submitter       130894 non-null  object
 2   title           134307 non-null  object
 3   categories      134307 non-null  object
 4   abstract        134307 non-null  object
 5   update_date     134307 non-null  object
 6   authors_parsed  134307 non-null  object
 7   year            134307 non-null  int64 
 8   pdf_link        134307 non-null  object
 9   pages           134307 non-null  object
dtypes: int64(1), object(9)
memory usage: 201.8 MB


In [9]:
# Persist the cleaned frame to the working directory for downstream notebooks.
# NOTE(review): the "50" in the filename is hard-coded; it matches
# `years_to_download = 50` in the filtering cell but will not follow that value if it changes.
df.to_pickle('arxiv-last50years-data.pickle')