In [1]:
import arxiv
import re
import pandas as pd
from data_utils import clean_data, clean_authors

In [2]:
def format_query(author='',title='',cat='',abstract=''):
    """Build an arxiv API query string from at most one value per field.

    Every field that is given a value contributes one ``prefix:value`` term
    (prefixes: ``au``, ``ti``, ``cat``, ``abs``). The terms are joined with
    ' AND ', so a result has to match all specified fields simultaneously.
    To leave a field unspecified, leave the corresponding argument blank
    (an empty string) or pass None.

    e.g. format_query(cat='math.AP') returns 'cat:math.AP', the query used
    to pull all articles carrying the subject tag math.AP
    ('Analysis of PDEs').

    Args:
        author: string to search for in the author field.
        title: string to search for in the title field.
        cat: A valid arxiv subject tag. See the full list of these at:
        https://arxiv.org/category_taxonomy
        abstract: string to search for in the abstract field.

    Returns:
        Query string combining every specified field in the order
        author, title, category, abstract; the empty string when no
        field is specified.
    """
    fields = (('au', author), ('ti', title), ('cat', cat), ('abs', abstract))

    # Decide on the value itself rather than on the rendered 'prefix:value'
    # text: checking whether the rendered tag ends with ':' would also throw
    # away a legitimate value that happens to end in a colon, and would let
    # None through as the literal text 'None'.
    terms = [
        f'{prefix}:{value}'
        for prefix, value in fields
        if value is not None and value != ''
    ]
    return ' AND '.join(terms)



def query_to_df(query,max_results):
    """Returns the results of an arxiv API query in a pandas dataframe.

    Results are requested sorted by their last-updated date.

    Args:
        query: string defining an arxiv query
        formatted according to
        https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction

        max_results: positive integer specifying
        the maximum number of results returned.

    Returns:
        pandas dataframe with one row per returned result and one
        column for each individual piece of metadata of a result
        (every instance attribute of the result object).
        To see a list of these columns and their descriptions,
        see the documentation for the
        Results class of the arxiv package here:
        http://lukasschwab.me/arxiv.py/index.html#Result

        The 'links' and '_raw' columns are dropped and
        the authors column is replaced by
        a single string of each author name
        separated by a comma. All columns have object dtype and
        the rows are labelled 0..n-1. If the query yields no
        results, an empty dataframe is returned.

    """
    search = arxiv.Search(
            query = query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.LastUpdatedDate
            )
    # NOTE(review): newer releases of the arxiv package reportedly favour
    # arxiv.Client().results(search) over Search.results() -- confirm
    # against the installed version before upgrading.
    results = search.results()

    drop_cols = ('authors', 'links', '_raw')

    # Collect one plain dict per result and build the frame a single time;
    # concatenating onto a growing frame inside the loop copies all earlier
    # rows on every iteration.
    records = []
    for result in results:
        record = {k: v for (k, v) in vars(result).items() if k not in drop_cols}
        # 'authors' is re-added last as one comma-separated string of names.
        record['authors'] = ','.join(author.name for author in result.authors)
        records.append(record)

    # dtype=object keeps every value as the Python object the API returned
    # instead of letting pandas infer per-column dtypes.
    df = pd.DataFrame(records, dtype=object)
    return df.reset_index(drop=True)


In [3]:
# Tentative working dataset: up to 1000 of the most recently updated articles
# from each of: math physics (math.MP), PDEs (math.AP),
# representation theory (math.RT), quantum algebra (math.QA)
# and differential geometry (math.DG).
category_tags = ['math.MP', 'math.AP', 'math.RT', 'math.QA', 'math.DG']

# Fetch one frame per subject tag, in the order listed above.
category_frames = []
for category_tag in category_tags:
    query = format_query(cat=category_tag)
    category_frames.append(query_to_df(query=query, max_results=1000))

# Keep each per-category frame reachable under its own name.
physics, pdes, reps, qalgs, dgs = category_frames

# Stack the per-category frames row-wise into the working dataset.
df = pd.concat(category_frames, axis = 0)

In [4]:
# Sanity check: (rows, columns) of the combined frame.
df.shape

(5000, 12)

In [5]:
# List the metadata columns present in the combined frame.
df.columns

Index(['entry_id', 'updated', 'published', 'title', 'summary', 'comment',
       'journal_ref', 'doi', 'primary_category', 'categories', 'pdf_url',
       'authors'],
      dtype='object')

In [6]:
# Discard the identifier / bibliographic columns that the remaining cells do not use.
unused_columns = ['entry_id', 'comment', 'journal_ref', 'doi', 'pdf_url']
df = df.drop(columns=unused_columns)

In [7]:
def extract_date(date) :
    """Turn a date into a fractional-year number rounded to two decimals.

    The string form of ``date`` is split on '-'; the first piece is read
    as the year and the second as the month, and the value returned is
    ``year + month / 12``. e.g. a date whose string form starts with
    '2023-05' gives 2023.42.

    Args:
        date: any object whose ``str()`` starts with 'YYYY-MM-'.

    Returns:
        float, the year plus one twelfth of the month number,
        rounded to 2 decimal places.
    """
    pieces = str(date).split('-')
    year = float(pieces[0])
    month = float(pieces[1])
    return round(year + month / 12, 2)

def get_category(category_str):
    """Return the last two characters of a subject tag, lower-cased.

    e.g. 'math.AP' gives 'ap'.

    NOTE(review): only the final two characters are kept, so tags that
    differ only in what precedes them map to the same code, and longer
    suffixes are truncated -- confirm this is intended.
    """
    suffix = category_str[-2:]
    return suffix.lower()

def get_categories(category_list):
    """Shorten every subject tag in a list that contains a '.'.

    Tags without a '.' are skipped; each remaining tag is passed through
    get_category, and the original order is kept.

    Args:
        category_list: iterable of subject-tag strings.

    Returns:
        list with one shortened code per tag that contains a '.'.
    """
    short_codes = []
    for tag in category_list:
        if '.' not in tag:
            continue
        short_codes.append(get_category(tag))
    return short_codes

def get_authors(authors_str) :
    """Split a comma-separated author string into a list of names.

    The string is first passed through clean_authors (imported from
    data_utils) and the cleaned text is then split on ','.

    Args:
        authors_str: author names joined by commas.

    Returns:
        list of the comma-separated pieces of the cleaned string.
    """
    cleaned = clean_authors(authors_str)
    return list(cleaned.split(','))

In [8]:
# Clean every remaining column with its dedicated helper, one column at a time:
# dates become fractional years, text fields are normalised by clean_data,
# subject tags are shortened, and the author string becomes a list of names.
column_cleaners = {
    'updated': extract_date,
    'published': extract_date,
    'title': clean_data,
    'summary': clean_data,
    'primary_category': get_category,
    'categories': get_categories,
    'authors': get_authors,
}
for column_name, cleaner in column_cleaners.items():
    df[column_name] = df[column_name].apply(cleaner)

In [9]:
# Preview the first five rows of the cleaned frame.
df.head(5)

Unnamed: 0,updated,published,title,summary,primary_category,categories,authors
0,2023.42,2023.42,stationary solutions for the nonlinear schroed...,we construct stationary statistical solutions ...,ap,"[ap, mp, pr]","[benedetta ferrario, margherita zanella]"
1,2023.42,2023.33,entanglement of sections examples looking for ...,quantum information is about the entanglement ...,ph,"[at, mp]","[m h freedman, m b hastings]"
2,2023.42,2022.67,the calogero moser derivative nonlinear schroe...,we study the calogero moser derivative nls equ...,ap,"[ap, mp, si]","[patrick gerard, enno lenzmann]"
3,2023.42,2023.42,global in space stability of singularity forma...,we continue our work on the analysis of spatia...,ap,"[ap, dg, mp]",[irfan glogic]
4,2023.42,2023.42,monitored non adiabatic and coherent controlle...,recently measurement based quantum thermal mac...,ph,[mp],"[abdelkader el makouri, abdallah slaoui, rachi..."


In [10]:
# Persist the cleaned frame as CSV, with the column names written as the header row.
# NOTE(review): the target path has no file extension, and the 'data' directory
# presumably has to exist already -- confirm.
df.to_csv('data/df_experiment',header = True)