In [1]:
import arxiv
import re
import pandas as pd
from data_utils import clean_data, clean_authors

In [3]:
def extract_date(date) :
    date_str = str(date)
    return round(float(date_str.split('-')[0])+float(date_str.split('-')[1])/12,2)

def get_category(category_str):
    return category_str[-2:].lower()

def get_categories(category_list):
    return [get_category(category_str) for category_str in category_list if '.' in category_str]

def get_authors(authors_str) :
    return [author for author in clean_authors(authors_str).split(',')]

## USER INTEREST SETS

Here we can each list ArXiv paper IDs to be used as inputs to the model to test its recommendations. See the example `ethan` below, which are Ethan's papers of interest.

Note that there is no need to restrict our interest sets to particular dates, or to not intersect the library we are recommending from (i.e., our dataset that we are pulling from).

In [4]:
ethan = ['1802.03426', '2304.14481', '2303.03190', '2210.13418',
         '2210.12824', '2210.00661', '2007.02390', '1808.05860',
         '2005.12732','1804.05690']
jeeuhn = ['0905.0486', 'math/0006187', '2106.07444', '1402.0490', 
          '1512.08942', '1603.09235', 'math/0510265', 'math/0505056', 
          'math/0604379', '2209.02568']
mike = ['2207.13571','2207.13498','2211.09644','2001.10647',
        '2103.08093','2207.08245', '2207.01677','2205.08744',
        '2008.04406','1912.09845']
jenia = ['2010.14967', '1307.0493', 'quant-ph/0604014', '2201.05140', 
         '1111.1877', 'quant-ph/9912054', '1611.08286', '1507.02858', 
         'math-ph/0107001','1511.01241', 'math-ph/9904020', '2211.15336', 
         '2212.03719']

## Creating the test set 

In [5]:
# read in the dev set for format comparison
dev = pd.read_parquet('final_data/clean_dev_set.parquet')
dev.head()

Unnamed: 0.1,Unnamed: 0,title_raw,abstract_raw,authors,strip_cat,doc_string
0,0,Inverse Approximation Theory for Nonlinear Rec...,We prove an inverse approximation theorem for ...,"Shida Wang,Zhong Li,Qianxiao Li",[Dynamical Systems],inverse approximation theory for nonlinear rec...
1,1,Sharp Spectral Rates for Koopman Operator Lear...,Non-linear dynamical systems can be handily de...,"Vladimir Kostic,Karim Lounici,Pietro Novelli,M...",[Dynamical Systems],sharp spectral rates for koopman operator lear...
2,2,Clustering and Arnoux-Rauzy words,We characterize the clustering of a word under...,"Sébastien Ferenczi,Luca Q. Zamboni",[Dynamical Systems],clustering and arnoux rauzy wordswe characteri...
3,3,Node Embedding from Neural Hamiltonian Orbits ...,"In the graph node embedding problem, embedding...","Qiyu Kang,Kai Zhao,Yang Song,Sijie Wang,Wee Pe...",[Dynamical Systems],node embedding from neural hamiltonian orbits ...
4,4,Equidistribution of iterations of holomorphic ...,In this paper we analyze a certain family of h...,Nils Hemmingsson,[Dynamical Systems],equidistribution of iterations of holomorphic ...


In [55]:
id_list = ethan+jeeuhn+mike+jenia
search = arxiv.Search(id_list=id_list)
results = search.results()

drop_cols = ['authors','links','_raw']
df = pd.DataFrame()

for result in results:
    row_dict = {k : v for (k,v) in vars(result).items() if k not in drop_cols}
    row_dict['authors'] = ','.join([author.name for author in result.authors])
    row = pd.Series(row_dict)
    df = pd.concat([df , row.to_frame().transpose()], axis = 0)

df.reset_index(drop=True,inplace=True)

In [56]:
df['name'] = 'Ethan'
df['name'].iloc[10:20] = 'Jeeuhn'
df['name'].iloc[20:30] = 'Mike'
df['name'].iloc[30:] = 'Jenia'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['name'].iloc[10:20] = 'Jeeuhn'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['name'].iloc[20:30] = 'Mike'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['name'].iloc[30:] = 'Jenia'


In [57]:
cols = ['entry_id', 'title', 'summary', 'categories',
        'authors', 'name']

df = df[cols]

In [58]:
type(df['categories'].iloc[0])

list

In [59]:
from data_utils import category_map
categories = category_map()

def get_cat(lst):
    cats = list(set([categories[cat] for cat in lst if cat in categories.keys()]))
    return cats

In [60]:
get_cat(df['categories'].iloc[0])

['Machine Learning', 'Computational Geometry']

In [61]:
df['strip_cat'] = df['categories'].apply(get_cat)
df.sample(5)

Unnamed: 0,entry_id,title,summary,categories,authors,name,strip_cat
10,http://arxiv.org/abs/0905.0486v3,A geometric construction of colored HOMFLYPT h...,"The aim of this paper is two-fold. First, we g...","[math.GT, math.AG, 17B10, 57T10]","Ben Webster,Geordie Williamson",Jeeuhn,"[Algebraic Geometry, Geometric Topology]"
11,http://arxiv.org/abs/math/0006187v1,The Hard Lefschetz Theorem and the topology of...,We introduce the notion of lef line bundles on...,[math.AG],"Mark Andrea de Cataldo,Luca Migliorini",Jeeuhn,[Algebraic Geometry]
5,http://arxiv.org/abs/2210.00661v1,"Braids, entropies and fibered 2-fold branched ...",It is proved by Sakuma and Brooks that any clo...,[math.GT],"Susumu Hirose,Eiko Kin",Ethan,[Geometric Topology]
37,http://arxiv.org/abs/1507.02858v3,Non-Hermitian propagation of Hagedorn wavepackets,We investigate the time evolution of Hagedorn ...,"[math-ph, math.MP, math.NA, 42C05, 81Q12, 81S10]","Caroline Lasser,Roman Schubert,Stephanie Tropp...",Jenia,"[Mathematical Physics, Numerical Analysis]"
6,http://arxiv.org/abs/2007.02390v1,The (homological) persistence of gerrymandering,"We apply persistent homology, the dominant too...","[math.AT, physics.soc-ph, 55N31]","Moon Duchin,Tom Needham,Thomas Weighill",Ethan,"[Physics and Society, Algebraic Topology]"


In [62]:
df['title_clean'] = df['title'].apply(clean_data)
df['abstract_clean'] = df['summary'].apply(clean_data)
df['doc_string'] = df['title_clean'] + ' ' + df['abstract_clean']
df.sample(5)

Unnamed: 0,entry_id,title,summary,categories,authors,name,strip_cat,title_clean,abstract_clean,doc_string
34,http://arxiv.org/abs/1111.1877v2,Complexified coherent states and quantum evolu...,"The complex geometry underlying the Schr\""odin...","[math-ph, math.MP, quant-ph]","Eva-Maria Graefe,Roman Schubert",Jenia,"[Mathematical Physics, Quantum Physics]",complexified coherent states and quantum evolu...,the complex geometry underlying the schrodinge...,complexified coherent states and quantum evolu...
39,http://arxiv.org/abs/1511.01241v2,Semiclassical states associated to isotropic s...,We define classes of quantum states associated...,"[math.AP, math-ph, math.MP, 58J40, 81Q20]","Victor Guillemin,Alejandro Uribe,Zuoqin Wang",Jenia,"[Mathematical Physics, Analysis of PDEs]",semiclassical states associated to isotropic s...,we define classes of quantum states associated...,semiclassical states associated to isotropic s...
24,http://arxiv.org/abs/2103.08093v2,Around quantum ergodicity,We discuss Shnirelman's Quantum Ergodicity The...,"[math.AP, math-ph, math.MP, math.SP]",Semyon Dyatlov,Mike,"[Spectral Theory, Mathematical Physics, Analys...",around quantum ergodicity,we discuss shnirelmans quantum ergodicity theo...,around quantum ergodicity we discuss shnirelma...
1,http://arxiv.org/abs/2304.14481v1,"Endperiodic maps, splitting sequences, and bra...",We strengthen the unpublished theorem of Gabai...,[math.GT],"Michael P. Landry,Chi Cheuk Tsang",Ethan,[Geometric Topology],endperiodic maps splitting sequences and branc...,we strengthen the unpublished theorem of gabai...,endperiodic maps splitting sequences and branc...
22,http://arxiv.org/abs/2211.09644v1,Asymptotics for the spectral function on Zoll ...,"On a smooth, compact, Riemannian manifold with...","[math.AP, math.SP, 35P20, 58J50]","Yaiza Canzani,Jeffrey Galkowski,Blake Keeler",Mike,"[Spectral Theory, Analysis of PDEs]",asymptotics for the spectral function on zoll ...,on a smooth compact riemannian manifold withou...,asymptotics for the spectral function on zoll ...


In [63]:
print(df['title_clean'].iloc[0])
print(df['abstract_clean'].iloc[0])
print(df['doc_string'].iloc[0])

umap uniform manifold approximation and projection for dimension reduction
umap uniform manifold approximation and projection is a novel manifold learning technique for dimension reduction umap is constructed from a theoretical framework based in riemannian geometry and algebraic topology the result is a practical scalable algorithm that applies to real world data the umap algorithm is competitive with t sne for visualization quality and arguably preserves more of the global structure with superior run time performance furthermore umap has no computational restrictions on embedding dimension making it viable as a general purpose dimension reduction technique for machine learning
umap uniform manifold approximation and projection for dimension reduction umap uniform manifold approximation and projection is a novel manifold learning technique for dimension reduction umap is constructed from a theoretical framework based in riemannian geometry and algebraic topology the result is a practi

In [66]:
final_cols = ['title', 'summary', 'authors', 'strip_cat', 'doc_string', 'name']
df2 = df[final_cols].copy()
df2.head()

Unnamed: 0,title,summary,authors,strip_cat,doc_string,name
0,UMAP: Uniform Manifold Approximation and Proje...,UMAP (Uniform Manifold Approximation and Proje...,"Leland McInnes,John Healy,James Melville","[Machine Learning, Computational Geometry]",umap uniform manifold approximation and projec...,Ethan
1,"Endperiodic maps, splitting sequences, and bra...",We strengthen the unpublished theorem of Gabai...,"Michael P. Landry,Chi Cheuk Tsang",[Geometric Topology],endperiodic maps splitting sequences and branc...,Ethan
2,Train track combinatorics and cluster algebras,The concepts of train track was introduced by ...,Shunsuke Kano,"[Combinatorics, Geometric Topology]",train track combinatorics and cluster algebras...,Ethan
3,Standardly embedded train tracks and pseudo-An...,We show that given a fully-punctured pseudo-An...,"Eriko Hironaka,Chi Cheuk Tsang","[Geometric Topology, Dynamical Systems]",standardly embedded train tracks and pseudo an...,Ethan
4,Class number for pseudo-Anosovs,"Given two automorphisms of a group $G$, one is...","François Dahmani,Mahan Mj",[Group Theory],class number for pseudo anosovs given two auto...,Ethan


In [68]:
df2.rename({'title': 'title_raw',
           'summary': 'abstract_raw'},
           axis=1, inplace=True)

df2.head()

Unnamed: 0,title_raw,abstract_raw,authors,strip_cat,doc_string,name
0,UMAP: Uniform Manifold Approximation and Proje...,UMAP (Uniform Manifold Approximation and Proje...,"Leland McInnes,John Healy,James Melville","[Machine Learning, Computational Geometry]",umap uniform manifold approximation and projec...,Ethan
1,"Endperiodic maps, splitting sequences, and bra...",We strengthen the unpublished theorem of Gabai...,"Michael P. Landry,Chi Cheuk Tsang",[Geometric Topology],endperiodic maps splitting sequences and branc...,Ethan
2,Train track combinatorics and cluster algebras,The concepts of train track was introduced by ...,Shunsuke Kano,"[Combinatorics, Geometric Topology]",train track combinatorics and cluster algebras...,Ethan
3,Standardly embedded train tracks and pseudo-An...,We show that given a fully-punctured pseudo-An...,"Eriko Hironaka,Chi Cheuk Tsang","[Geometric Topology, Dynamical Systems]",standardly embedded train tracks and pseudo an...,Ethan
4,Class number for pseudo-Anosovs,"Given two automorphisms of a group $G$, one is...","François Dahmani,Mahan Mj",[Group Theory],class number for pseudo anosovs given two auto...,Ethan


In [69]:
df2.to_parquet('final_data/clean_test_set.parquet')

In [70]:
df3 = pd.read_parquet('final_data/clean_test_set.parquet')
df3.head()

Unnamed: 0,title_raw,abstract_raw,authors,strip_cat,doc_string,name
0,UMAP: Uniform Manifold Approximation and Proje...,UMAP (Uniform Manifold Approximation and Proje...,"Leland McInnes,John Healy,James Melville","[Machine Learning, Computational Geometry]",umap uniform manifold approximation and projec...,Ethan
1,"Endperiodic maps, splitting sequences, and bra...",We strengthen the unpublished theorem of Gabai...,"Michael P. Landry,Chi Cheuk Tsang",[Geometric Topology],endperiodic maps splitting sequences and branc...,Ethan
2,Train track combinatorics and cluster algebras,The concepts of train track was introduced by ...,Shunsuke Kano,"[Combinatorics, Geometric Topology]",train track combinatorics and cluster algebras...,Ethan
3,Standardly embedded train tracks and pseudo-An...,We show that given a fully-punctured pseudo-An...,"Eriko Hironaka,Chi Cheuk Tsang","[Geometric Topology, Dynamical Systems]",standardly embedded train tracks and pseudo an...,Ethan
4,Class number for pseudo-Anosovs,"Given two automorphisms of a group $G$, one is...","François Dahmani,Mahan Mj",[Group Theory],class number for pseudo anosovs given two auto...,Ethan
