In [4]:
import pandas as pd
import arxiv

from data_utils import clean_data, clean_authors

In [2]:
def format_query(author='',title='',cat='',abstract=''):
    """Build an arXiv API query string from up to four simple field filters.

    Each non-blank argument becomes one ``prefix:value`` term and the terms
    are combined with ``AND``, so a result has to match every specified
    field simultaneously. Values are inserted verbatim (no quoting or
    escaping is applied).

    e.g. format_query(cat='math.AP') returns 'cat:math.AP', the string used
    to pull all articles with the subject tag math.AP ('Analysis of PDEs').

    Args:
        author: string to search for in the author field.
        title: string to search for in the title field.
        cat: A valid arxiv subject tag. See the full list of these at:
        https://arxiv.org/category_taxonomy
        abstract: string to search for in the abstract field.

        To leave a field unspecified, leave the corresponding argument
        blank (empty string, whitespace only, or None).

    Returns:
        The query string, with terms in the fixed order author, title,
        category, abstract. An empty string if every field is blank.
    """
    fields = (
        ('au', author),
        ('ti', title),
        ('cat', cat),
        ('abs', abstract),
    )
    # Decide on the *value*, not on the formatted tag: testing the tag with
    # endswith(':') would also discard legitimate values that happen to end
    # in a colon, and would let None through as the literal text 'None'.
    terms = [
        f'{prefix}:{value}'
        for prefix, value in fields
        if value is not None and str(value).strip()
    ]
    return ' AND '.join(terms)



def query_to_df(query,max_results):
    """Returns the results of an arxiv API query in a pandas dataframe.

    Args:
        query: string defining an arxiv query
        formatted according to
        https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction

        max_results: positive integer specifying
        the maximum number of results returned.

    Returns:
        pandas dataframe with one row per returned result and one column
        for each individual piece of metadata of a result (every instance
        attribute of the result object, minus the ones listed below).
        To see a list of these columns and their descriptions,
        see the documentation for the
        Results class of the arxiv package here:
        http://lukasschwab.me/arxiv.py/index.html#Result

        The 'links' and '_raw' attributes are dropped and
        the 'authors' column is replaced by
        a single string of each author name
        separated by a comma; it is the last column.

        Rows keep the order in which the API yields them (the search is
        sorted by last-updated date) and are indexed 0..n-1. If the query
        yields nothing, an empty dataframe is returned.
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.LastUpdatedDate,
    )
    # NOTE(review): newer releases of the arxiv package reportedly deprecate
    # Search.results() in favour of Client.results(search) -- confirm against
    # the installed version before upgrading.
    results = search.results()

    drop_cols = {'authors', 'links', '_raw'}
    records = []
    for result in results:
        record = {k: v for k, v in vars(result).items() if k not in drop_cols}
        # Re-add the authors last, flattened to one comma-separated string.
        record['authors'] = ','.join(author.name for author in result.authors)
        records.append(record)

    # Build the frame once from the collected records instead of
    # concatenating one row at a time (which copies the whole frame on every
    # iteration). dtype=object skips dtype inference so every column stays
    # object-typed; the constructor already provides a 0..n-1 index.
    return pd.DataFrame(records, dtype=object)

In [3]:
## Example: pull the 1000 most recently updated articles
# tagged with the arXiv subject category math.AP ('Analysis of PDEs').
# NOTE(review): presumably this also matches papers that are only
# cross-listed in math.AP, not just those whose primary category is
# math.AP -- confirm against the arXiv API documentation.
# This cell issues a live API request, so its output changes over time.

query = format_query(cat='math.AP')
pdes = query_to_df(query=query,max_results=1000)

# Display the first rows as a quick check of the returned columns.
pdes.head()

Unnamed: 0,entry_id,updated,published,title,summary,comment,journal_ref,doi,primary_category,categories,pdf_url,authors
0,http://arxiv.org/abs/2305.09622v1,2023-05-16 17:20:19+00:00,2023-05-16 17:20:19+00:00,Prescribing nearly constant curvatures on balls,In this paper we address two boundary cases of...,31 pages,,,math.AP,"[math.AP, math.DG, 35J25, 58J32]",http://arxiv.org/pdf/2305.09622v1,"Luca Battaglia,Sergio Cruz Blázquez,Angela Pis..."
1,http://arxiv.org/abs/2305.09618v1,2023-05-16 17:11:55+00:00,2023-05-16 17:11:55+00:00,Port-Hamiltonian formulation of Oseen flows,We present Oseen equations on Lipschitz domain...,19 pages,,,math.AP,"[math.AP, math.DS, math.FA, math.OC]",http://arxiv.org/pdf/2305.09618v1,"Timo Reis,Manuel Schaller"
2,http://arxiv.org/abs/2305.09609v1,2023-05-16 17:01:05+00:00,2023-05-16 17:01:05+00:00,On nonlocal Dirichlet problems with oscillatin...,"In this paper, a class of nonlocal fractional ...",,Discrete Contin. Dyn. Syst. Ser. S 16:6 (2023)...,10.3934/dcdss.2022130,math.AP,"[math.AP, math.FA, Primary: 47J30, 35R11, Seco...",http://arxiv.org/pdf/2305.09609v1,"Boštjan Gabrovšek,Giovanni Molica Bisci,Dušan ..."
3,http://arxiv.org/abs/2305.09582v1,2023-05-16 16:29:20+00:00,2023-05-16 16:29:20+00:00,Twisting in Hamiltonian Flows and Perfect Fluids,We establish a number of results that reveal a...,"32 pages, 7 figures",,,math.AP,"[math.AP, math.DS, physics.flu-dyn]",http://arxiv.org/pdf/2305.09582v1,"Theodore D. Drivas,Tarek M. Elgindi,In-Jee Jeong"
4,http://arxiv.org/abs/2305.09505v1,2023-05-16 14:58:52+00:00,2023-05-16 14:58:52+00:00,Explicit solution of the 1D Schrödinger equation,Evaluation of a product integral with values i...,28 pages,,,math.AP,"[math.AP, math-ph, math.MP, 34L40, 34A05, 34L25]",http://arxiv.org/pdf/2305.09505v1,Peter Gibson


In [5]:
## Clean the title, abstract and author fields of the dataframe.
## Three new columns are added next to the raw ones, which are kept as-is.

# (new column, source column, cleaning function applied element-wise)
cleaning_plan = [
    ('clean_title', 'title', clean_data),
    ('clean_summary', 'summary', clean_data),
    ('clean_authors', 'authors', clean_authors),
]
for new_col, raw_col, cleaner in cleaning_plan:
    pdes[new_col] = pdes[raw_col].apply(cleaner)

pdes.head()

Unnamed: 0,entry_id,updated,published,title,summary,comment,journal_ref,doi,primary_category,categories,pdf_url,authors,clean_title,clean_summary,clean_authors
0,http://arxiv.org/abs/2305.09622v1,2023-05-16 17:20:19+00:00,2023-05-16 17:20:19+00:00,Prescribing nearly constant curvatures on balls,In this paper we address two boundary cases of...,31 pages,,,math.AP,"[math.AP, math.DG, 35J25, 58J32]",http://arxiv.org/pdf/2305.09622v1,"Luca Battaglia,Sergio Cruz Blázquez,Angela Pis...",prescribing nearly constant curvatures on balls,in this paper we address two boundary cases of...,"luca battaglia,sergio cruz blazquez,angela pis..."
1,http://arxiv.org/abs/2305.09618v1,2023-05-16 17:11:55+00:00,2023-05-16 17:11:55+00:00,Port-Hamiltonian formulation of Oseen flows,We present Oseen equations on Lipschitz domain...,19 pages,,,math.AP,"[math.AP, math.DS, math.FA, math.OC]",http://arxiv.org/pdf/2305.09618v1,"Timo Reis,Manuel Schaller",port hamiltonian formulation of oseen flows,we present oseen equations on lipschitz domain...,"timo reis,manuel schaller"
2,http://arxiv.org/abs/2305.09609v1,2023-05-16 17:01:05+00:00,2023-05-16 17:01:05+00:00,On nonlocal Dirichlet problems with oscillatin...,"In this paper, a class of nonlocal fractional ...",,Discrete Contin. Dyn. Syst. Ser. S 16:6 (2023)...,10.3934/dcdss.2022130,math.AP,"[math.AP, math.FA, Primary: 47J30, 35R11, Seco...",http://arxiv.org/pdf/2305.09609v1,"Boštjan Gabrovšek,Giovanni Molica Bisci,Dušan ...",on nonlocal dirichlet problems with oscillatin...,in this paper a class of nonlocal fractional d...,"bostjan gabrovsek,giovanni molica bisci,dusan ..."
3,http://arxiv.org/abs/2305.09582v1,2023-05-16 16:29:20+00:00,2023-05-16 16:29:20+00:00,Twisting in Hamiltonian Flows and Perfect Fluids,We establish a number of results that reveal a...,"32 pages, 7 figures",,,math.AP,"[math.AP, math.DS, physics.flu-dyn]",http://arxiv.org/pdf/2305.09582v1,"Theodore D. Drivas,Tarek M. Elgindi,In-Jee Jeong",twisting in hamiltonian flows and perfect fluids,we establish a number of results that reveal a...,"theodore d drivas,tarek m elgindi,in jee jeong"
4,http://arxiv.org/abs/2305.09505v1,2023-05-16 14:58:52+00:00,2023-05-16 14:58:52+00:00,Explicit solution of the 1D Schrödinger equation,Evaluation of a product integral with values i...,28 pages,,,math.AP,"[math.AP, math-ph, math.MP, 34L40, 34A05, 34L25]",http://arxiv.org/pdf/2305.09505v1,Peter Gibson,explicit solution of the d schroedinger equation,evaluation of a product integral with values i...,peter gibson


In [6]:
## Verify we get desired clean data
# Print the cleaned title, authors and summary of the first rows in full.
# repr() is used so stray whitespace or escape sequences stay visible.
# The loop is bounded by the number of rows actually present, so a query
# that returns fewer than 50 results does not raise a KeyError.

for i in range(min(50, len(pdes))):
    print("i =", i)
    print("CLEAN TITLE")
    print(repr(pdes['clean_title'].iloc[i]))
    print()
    print("CLEAN AUTHORS")
    print(repr(pdes['clean_authors'].iloc[i]))
    print()
    print("CLEAN SUMMARY")
    print(repr(pdes['clean_summary'].iloc[i]))
    print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')

i = 0
CLEAN TITLE
'prescribing nearly constant curvatures on balls'

CLEAN AUTHORS
'luca battaglia,sergio cruz blazquez,angela pistoia'

CLEAN SUMMARY
'in this paper we address two boundary cases of the classical kazdan warner problem more precisely we consider the problem of prescribing the gaussian and boundary geodesic curvature on a disk of r and the scalar and mean curvature on a ball in higher dimensions via a conformal change of the metric we deal with the case of negative interior curvature and positive boundary curvature using a ljapunov schmidt procedure we obtain new existence results when the prescribed functions are close to constants'

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

i = 1
CLEAN TITLE
'port hamiltonian formulation of oseen flows'

CLEAN AUTHORS
'timo reis,manuel schaller'

CLEAN SUMMARY
'we present oseen equations on lipschitz domains in a port hamiltonian context such equations arise for instance by linearization of the navier stokes equa