In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pprint import pprint

In [2]:
# Listing to scrape: both values are interpolated into the arXiv listing URL.
YEAR = '18'            # year segment of the listing path
SUBJECT = 'stat.ML'    # arXiv subject category code

In [3]:
# Build the listing URL for the configured subject category and year.
url = 'https://arxiv.org/list/{subj}/{year}'.format(
    year=YEAR,
    subj=SUBJECT,
)

In [4]:
# Fetch the listing page. `requests` applies no timeout unless one is given,
# so without it an unresponsive server would block this cell indefinitely.
req = requests.get(url, timeout=30)
print("Status code:", req.status_code)
# Stop on a 4xx/5xx answer instead of letting later cells parse an error page.
req.raise_for_status()

Status code: 200


In [5]:
# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(markup=req.text, features="lxml")

In [6]:
# Each paper on the listing is a <dt>/<dd> pair: the <dt> elements are used
# below for the identifier links, the <dd> elements for title, authors and
# subjects. `find_all` is the current name of the deprecated `findAll` alias.
paperlink = soup.find_all('dt')
paperinfo = soup.find_all('dd')

# Both lists feed columns of the same table, so they must line up one-to-one.
assert len(paperlink) == len(paperinfo), "listing has unpaired <dt>/<dd> entries"

In [7]:
# Start from an empty table; the cells below add one column per scraped field.
df = pd.DataFrame()

### Loading paper IDs

In [8]:
ARXIV_PREFIX = 'arXiv:'


def _paper_id(dt):
    """Return the bare arXiv identifier from one <dt> listing entry.

    The second link inside the <dt> carries the identifier as text of the
    form 'arXiv:1801.00085'. The prefix is removed only when it is actually
    present (after trimming surrounding whitespace), so a change in the
    page's whitespace cannot silently cut characters out of the identifier.
    """
    label = dt.find_all('a')[1].text.strip()
    if label.startswith(ARXIV_PREFIX):
        label = label[len(ARXIV_PREFIX):]
    return label


ids = [_paper_id(dt) for dt in paperlink]
ids[:5]

['1801.00085', '1801.00315', '1801.00364', '1801.00507', '1801.00636']

In [9]:
# Store the scraped identifiers in the 'id' column.
df['id'] = ids

### Loading titles

In [11]:
TITLE_LABEL = 'Title:'


def _clean_title(raw):
    """Return the paper title from the raw text of a 'list-title' block.

    The block text starts with the 'Title:' descriptor and is padded with
    whitespace/newlines. The descriptor is dropped only when present and the
    ends are trimmed, instead of cutting a fixed number of characters, so the
    result does not depend on the exact whitespace the page emits. Whitespace
    inside the title is left untouched.
    """
    text = raw.strip()
    if text.startswith(TITLE_LABEL):
        text = text[len(TITLE_LABEL):]
    return text.strip()


titles = [_clean_title(info.find('div', {'class': 'list-title'}).text) for info in paperinfo]
titles[:5]

['Learning Structural Weight Uncertainty for Sequential Decision-Making',
 'Learning Relevant Features of Data with Multi-scale Tensor Networks',
 'Estimation and Inference of Treatment Effects with $L_2$-Boosting in  High-Dimensional Settings',
 'Towards Practical Conditional Risk Minimization',
 'Transferable neural networks for enhanced sampling of protein dynamics']

In [12]:
# Store the scraped titles in the 'title' column.
df['title'] = titles

### Loading authors

In [13]:
def _author_names(info):
    """Return the author names listed in one <dd> entry.

    Only links inside the 'list-authors' block are considered. Collecting
    every link of the <dd> would also pick up links that appear in other
    fields of the entry (for example URLs inside the comments line) and
    report them as authors.
    """
    block = info.find('div', {'class': 'list-authors'})
    if block is None:
        # No authors block in this entry: report no authors rather than crash.
        return []
    return [link.text for link in block.find_all('a')]


authors = [_author_names(info) for info in paperinfo]
authors[:5]

[['Ruiyi Zhang', 'Chunyuan Li', 'Changyou Chen', 'Lawrence Carin'],
 ['E.M. Stoudenmire'],
 ['Ye Luo', 'Martin Spindler'],
 ['Alexander Zimin', 'Christoph Lampert'],
 ['Mohammad M. Sultan', 'Hannah K. Wayment-Steele', 'Vijay S. Pande']]

In [14]:
# Store the per-paper author lists in the 'authors' column (one list per row).
df['authors'] = authors

### Loading primary subjects

In [15]:
def subjid(x):
    """Extract the subject code written in parentheses.

    Example: 'Machine Learning (stat.ML)' -> 'stat.ML'.

    Parameters
    ----------
    x : str
        Subject text such as 'Machine Learning (stat.ML)'.

    Returns
    -------
    str
        The text between the first '(' and the first ')' that follows it.
        If there is no such pair, the input is returned with surrounding
        whitespace removed, instead of a slice with characters cut off.
    """
    start = x.find('(')
    # Look for the closing parenthesis only after the opening one.
    end = x.find(')', start + 1) if start != -1 else -1
    if start == -1 or end == -1:
        return x.strip()
    return x[start + 1:end]

In [16]:
# Primary subject code of every paper, read from its 'primary-subject' span.
primsubj = [
    subjid(span.text)
    for span in (
        info.find('span', attrs={'class': 'primary-subject'}) for info in paperinfo
    )
]
primsubj[:5]

['stat.ML', 'stat.ML', 'stat.ML', 'stat.ML', 'stat.ML']

In [17]:
# Store the primary subject codes in the 'primary subject' column.
df['primary subject'] = primsubj

### Loading secondary subjects

In [18]:
def getsec(x):
    """Return the secondary subject codes from a subjects line.

    The line lists the primary subject first and any secondary subjects
    after it, separated by ';', each with its code in parentheses, e.g.
    'Subjects: Machine Learning (stat.ML); Learning (cs.LG)'.

    Parameters
    ----------
    x : str
        Raw text of the subjects block.

    Returns
    -------
    list of str
        Codes of the entries after the first ';', in order. An empty list
        when the line has no ';' (only a primary subject), so the primary
        code is never reported as a secondary one.
    """
    _primary, sep, rest = x.partition(';')
    if not sep:
        return []

    codes = []
    for entry in rest.split(';'):
        start = entry.find('(')
        end = entry.find(')', start + 1)
        # Skip fragments without a "(code)" pair instead of slicing blindly;
        # this also makes trailing whitespace after the last entry harmless.
        if start != -1 and end != -1:
            codes.append(entry[start + 1:end])
    return codes

In [19]:
# Secondary subject codes of every paper, parsed from its 'list-subjects' block.
secsubj = [
    getsec(block.text)
    for block in (
        info.find('div', attrs={'class': 'list-subjects'}) for info in paperinfo
    )
]
secsubj[:5]

[['cs.AI', 'cs.LG'],
 ['cond-mat.stat-mech', 'cond-mat.str-el', 'cs.LG'],
 ['econ.EM', 'stat.ME'],
 ['cs.LG'],
 ['q-bio.BM']]

In [20]:
# Store the per-paper secondary subject lists in the 'secondary subjects' column.
df['secondary subjects'] = secsubj

### Data preview

In [21]:
# Bare last expression: the notebook renders the assembled table.
df

Unnamed: 0,id,title,authors,primary subject,secondary subjects
0,1801.00085,Learning Structural Weight Uncertainty for Seq...,"[Ruiyi Zhang, Chunyuan Li, Changyou Chen, Lawr...",stat.ML,"[cs.AI, cs.LG]"
1,1801.00315,Learning Relevant Features of Data with Multi-...,[E.M. Stoudenmire],stat.ML,"[cond-mat.stat-mech, cond-mat.str-el, cs.LG]"
2,1801.00364,Estimation and Inference of Treatment Effects ...,"[Ye Luo, Martin Spindler]",stat.ML,"[econ.EM, stat.ME]"
3,1801.00507,Towards Practical Conditional Risk Minimization,"[Alexander Zimin, Christoph Lampert]",stat.ML,[cs.LG]
4,1801.00636,Transferable neural networks for enhanced samp...,"[Mohammad M. Sultan, Hannah K. Wayment-Steele,...",stat.ML,[q-bio.BM]
5,1801.00668,Random Euler Complex-Valued Nonlinear Filters,"[Jiashu Zhang, Sheng Zhang, Defang Li]",stat.ML,[eess.SP]
6,1801.00681,A novel improved fuzzy support vector machine ...,"[Shuheng Wang, Guohao Li, Yifan Bao]",stat.ML,[q-fin.ST]
7,1801.00753,Probabilistic supervised learning,"[Frithjof Gressmann, Franz J. Király, Bilal Ma...",stat.ML,"[cs.LG, math.ST, stat.ME]"
8,1801.00857,Optimal Bayesian Transfer Learning,"[Alireza Karbalayghareh, Xiaoning Qian, Edward...",stat.ML,"[cs.CV, cs.LG]"
9,1801.01061,Intrinsic Gaussian processes on complex constr...,"[Mu Niu, Pokman Cheung, Lizhen Lin, Zhenwen Da...",stat.ML,[cs.LG]
