In [None]:
import pandas as pd
from PyPDF2 import PdfReader
import time
from copy import deepcopy

In [None]:
# Open the classification PDF and record how many pages it has.
reader = PdfReader("msc2020.pdf")
n_pages = len(reader.pages)

# Empty table that will hold one (class, subject) pair per row.
df = pd.DataFrame(columns=['class', 'subject'])

def isMSC_xx(string):
    """Tell whether ``string`` is a head subject such as ``14Gxx``.

    A head subject is exactly five characters long: two numeric characters,
    one uppercase letter, and the literal suffix ``xx``. Such a subject
    itself divides into more refined subjects.

    Returns:
        True when ``string`` has that shape, False otherwise.
    """
    if len(string) != 5:
        return False
    area, letter, suffix = string[:2], string[2], string[3:]
    return area.isnumeric() and letter.isupper() and suffix == 'xx'

# Walk through every page of the PDF and collect one [class, subject] record
# for each line that starts with a head subject such as '14Gxx'.
# Records are gathered in a plain list and the DataFrame is built once at the
# end, instead of appending to the frame row by row.
msc_rows = []
for page in reader.pages:
    # Guard against a page for which no text comes back.
    page_lines = (page.extract_text() or "").split("\n")  # Splitting the page into lines.
    for line in page_lines:
        line_reduced = line.split('[')[0]      # Avoiding expressions such as '[See also...]'
        head = line_reduced.split(' ')[0]
        if isMSC_xx(head):
            # Keep the first three characters (e.g. '14G') as the class and
            # strip the separator whitespace left around the description.
            msc_rows.append([head[:3], line_reduced.removeprefix(head).strip()])

df = pd.DataFrame(msc_rows, columns=['class', 'subject'])
df

https://pypi.org/project/arxiv/

https://github.com/lukasschwab/arxiv.py

https://arxiv.org/category_taxonomy

https://info.arxiv.org/help/api/user-manual.html#_details_of_atom_results_returned

Example

In [None]:
import arxiv

# Construct the default API client.
client = arxiv.Client()

# Search for the 50 most recent articles matching the provided category.
search = arxiv.Search(
    query="cat:math.AG",
    max_results=50,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)

# `results` is a generator; iterate over it directly rather than calling
# `client.results(search)` a second time and leaving this one unused.
results = client.results(search)

for r in results:
    print(r.primary_category, r.categories)

In [None]:
# arXiv category codes under the `math.` archive.
math_categories = [
    'math.AC', 'math.AG', 'math.AP', 'math.AT',
    'math.CA', 'math.CO', 'math.CT', 'math.CV',
    'math.DG', 'math.DS', 'math.FA', 'math.GM',
    'math.GN', 'math.GR', 'math.GT', 'math.HO',
    'math.IT', 'math.KT', 'math.LO', 'math.MG',
    'math.MP', 'math.NA', 'math.NT', 'math.OA',
    'math.OC', 'math.PR', 'math.QA', 'math.RA',
    'math.RT', 'math.SG', 'math.SP', 'math.ST',
]

# The 'class' column of `df` as a plain Python list.
classes = df['class'].tolist()

In [None]:
# Empty table with the columns used for the scraped arXiv records.
raw_data = pd.DataFrame(
    columns=[
        'primary_category',
        'title',
        'authors',
        'url',
        'last_updated',
        'MSC',
    ]
)

In [None]:
time_start = time.time()

# Construct the API client.
# page_size is set to 2000 because that is the documented maximum number of
# results per single API request; the client pages through the remainder of
# each search on its own.
client = arxiv.Client(num_retries=5, page_size=2000, delay_seconds=5)

raw_columns = ['primary_category', 'title', 'authors', 'url', 'last_updated', 'MSC']

# Rows are accumulated in a plain list and turned into a DataFrame once, so
# re-running this cell rebuilds `raw_data` instead of appending duplicates.
scraped_rows = []

try:
    for category in math_categories:

        # Search for the 7000 most recent articles matching the provided category.
        search = arxiv.Search(
            query="cat:" + category,
            max_results=7000,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )

        results = client.results(search)

        # The last entry of `categories` is stored in the 'MSC' column; it is
        # parsed for MSC codes later in the notebook.
        scraped_rows.extend(
            [r.primary_category, r.title, r.authors, r.entry_id, r.updated, r.categories[-1]]
            for r in results
        )

        print(f'The category just scraped: {category}, the time elapsed so far: {time.time()-time_start}')
finally:
    # Build the frame even if a request fails part-way, so that the rows
    # collected so far are still available in `raw_data`.
    raw_data = pd.DataFrame(scraped_rows, columns=raw_columns)

In [None]:
# Write the scraped table to 'raw.csv' without the index column, then show it.
raw_data.to_csv(path_or_buf='raw.csv', index=False)
raw_data

We begin processing the data. 

In [None]:
def _split_arxiv_entry_id(url):
    """Split an entry URL such as 'http://arxiv.org/abs/2401.01234v2' into (identifier, version).

    The version is taken after the LAST 'v' only, so identifiers whose archive
    name itself contains a 'v' (e.g. 'solv-int/9901001v1') are kept intact.
    Everything up to and including '/abs/' is discarded, which covers both
    'http://' and 'https://' forms of the URL.

    Raises:
        ValueError: if the text after the last 'v' is not an integer.
    """
    versioned_id = url.split('/abs/', 1)[-1]
    identifier, _, version = versioned_id.rpartition('v')
    return identifier, int(version)


# Work on a copy so that `raw_data` stays untouched.
data = raw_data.copy()
id_version_pairs = data['url'].map(_split_arxiv_entry_id)
data['version'] = id_version_pairs.map(lambda pair: pair[1])
data['identifier'] = id_version_pairs.map(lambda pair: pair[0])
data

Removing some outrageous preprints: first, inspect the entries with the highest version numbers.

In [None]:
# Show the 40 rows with the highest version numbers.
data.sort_values('version', ascending=False).head(40)

Removing any preprint with more than 10 versions.

In [None]:
# Row labels of every entry whose version number exceeds 10.
too_many_v_list = data.index[data['version'].gt(10)].tolist()

# Identifiers of those entries; every row sharing one of them is dropped.
bad_identifiers = set(data.loc[too_many_v_list, 'identifier'])
data = data[~data['identifier'].isin(bad_identifiers)]

# Highest version first, with a fresh 0..n-1 index.
data = data.sort_values(by=['version'], ascending=False, ignore_index=True)
data

Keeping only the latest version of each preprint (one row per identifier)

In [None]:
# Keep the first row encountered for each identifier and renumber the index.
data = data.drop_duplicates(subset='identifier', keep='first', ignore_index=True)
data

In [None]:
def isMSC(string):
    """Tell whether ``string`` is a five-character MSC code.

    Two shapes are accepted:

    * a full classification such as ``20E18`` (two numeric characters, an
      uppercase letter, two numeric characters);
    * a head subject such as ``16Rxx`` (two numeric characters, an uppercase
      letter, the literal ``xx``).

    Returns:
        True when ``string`` has one of these shapes, False otherwise.
    """
    if len(string) != 5:
        return False
    area, letter, tail = string[0:2], string[2], string[3:5]
    # The last two characters must be validated as well; otherwise arbitrary
    # five-character strings such as '20Eab' would be accepted.
    return area.isnumeric() and letter.isupper() and (tail.isnumeric() or tail == 'xx')

def extractMSC(string):
    """Return the distinct three-character MSC prefixes found in ``string``.

    The text is cut into terms at any whitespace, comma or semicolon, so
    inputs such as ``'14J28, 14C30'``, ``'14J28; 14C30'``, ``'14J28,14C30'``
    and terms separated by newlines are all handled. Every term accepted by
    ``isMSC`` contributes its first three characters (e.g. ``'16R'``).

    Returns:
        A sorted list of unique prefixes, so the result is the same on every
        run; the list is empty when no term qualifies.
    """
    terms = string.replace(',', ' ').replace(';', ' ').split()
    prefixes = {term[0:3] for term in terms if isMSC(term)}
    return sorted(prefixes)
        
# Replace each raw 'MSC' string by the list of prefixes extracted from it.
data['MSC'] = data['MSC'].apply(extractMSC)

# Keep only the rows for which at least one prefix was found.
has_msc = data['MSC'].apply(bool)
data = data[has_msc]

# Sorting based on the primary category.
data = data.sort_values(by=['primary_category'], ignore_index=True)
data

In [None]:
# Write the processed table to 'processed.csv' without the index column.
data.to_csv(path_or_buf='processed.csv', index=False)