In [1]:
!pip install bertopic
!pip install arxiv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m99.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from

In [2]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

import os
import pandas as pd
import numpy as np
from bertopic import BERTopic
import arxiv
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN

from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Arxiv_Recommender')
!pwd

Mounted at /content/drive
/content/drive/MyDrive/Arxiv_Recommender


In [3]:
import sys
sys.path.append('')
from data_utils import clean_data

In [66]:
def results_to_df(query_results):
  """Convert an iterable of search results into a DataFrame, one row per result.

  Every instance attribute of a result (``vars(result)``) becomes a column,
  except ``links`` and ``_raw`` which are dropped, and ``authors`` which is
  replaced by a single comma-joined string of the authors' ``name`` values
  (placed as the last column).

  Parameters
  ----------
  query_results : iterable
      Objects with instance attributes and an ``authors`` iterable whose
      items expose a ``name`` attribute.

  Returns
  -------
  pandas.DataFrame
      Object-dtype frame indexed 0..n-1; empty when no results are supplied.
  """
  drop_cols = {'authors', 'links', '_raw'}
  records = []

  for result in query_results:
      row_dict = {k : v for (k,v) in vars(result).items() if k not in drop_cols}
      row_dict['authors'] = ','.join(author.name for author in result.authors)
      records.append(row_dict)

  # Build the frame once from all records instead of concatenating row by row
  # (which re-copies the growing frame on every iteration). dtype=object keeps
  # the columns untyped, as the row-wise construction produced.
  return pd.DataFrame(records, dtype=object)


# Train the optimal BERTopic model

In [4]:
# Library metadata and the matching precomputed embedding matrix.
df_lib = pd.read_parquet('data/raw_data/filter_20k.parquet')
df_lib_vecs = pd.read_parquet('data/vector_embeddings/df_lib_vecs_20k_sbert.parquet')
lib_vecs = df_lib_vecs.values

# One "<title> <abstract>" string per library paper.
lib_titles = df_lib['title']
lib_abstracts = df_lib['abstract']
lib_abs = (lib_titles + ' ' + lib_abstracts).to_list()
len(lib_abs)

20000

In [5]:
# Topic representation: bigrams and trigrams only, English stop words removed.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(2, 3))

# Dimensionality reduction applied to the embeddings before clustering;
# random_state is fixed so repeated runs give the same projection.
umap_params = dict(
    n_neighbors=5,
    n_components=5,
    min_dist=0,
    metric='cosine',
    random_state=623,
    low_memory=False,
)
umap_model = UMAP(**umap_params)

# calculate_probabilities=True makes fit_transform return per-topic
# probabilities, which the outlier-reduction step consumes later.
bertopic_model = BERTopic(
    embedding_model='all-MiniLM-L6-v2',
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)

In [6]:
# Fit on the library texts, supplying the precomputed embeddings so the
# documents are not re-embedded during fitting.
lib_topics, lib_probs = bertopic_model.fit_transform(documents=lib_abs, embeddings=lib_vecs)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

2023-06-03 00:00:47,283 - BERTopic - Reduced dimensionality
2023-06-03 00:03:12,360 - BERTopic - Clustered reduced embeddings


In [7]:
# Per-topic summary tables produced by the fitted model.
df_topic_info = bertopic_model.get_topic_info()
df_lib_topic_freqs = bertopic_model.get_topic_freq()

# Number of rows in the topic-frequency table.
# NOTE(review): if the model produced an outlier topic (-1) it is presumably
# counted here as well — confirm before treating this as "number of real topics".
n_topics = len(df_lib_topic_freqs['Topic'])

In [8]:
# Reassign outlier documents using the topic probabilities from fit_transform,
# then refresh the model's topic representations with the new assignment.
new_topics = bertopic_model.reduce_outliers(
    lib_abs,
    lib_topics,
    strategy="probabilities",
    probabilities=lib_probs,
    threshold=0.05,
)
bertopic_model.update_topics(lib_abs, topics=new_topics)

# One-to-one arXiv recommender using BERTopic

In [74]:
# Euclidean (L2) norm
def norm(vec):
  """Return the square root of the sum of the squared entries of ``vec``."""
  squared = vec**2
  return np.sqrt(squared.sum())

# cosine similarity
def cos_sim(vec1,vec2):
  """Return the cosine similarity of two vectors.

  The numerator is the sum of the elementwise product (so the inputs only
  need to broadcast against each other); the denominator is the product of
  their Euclidean norms as computed by ``norm``.
  """
  dot_product = (vec1 * vec2).sum()
  norm_product = norm(vec1) * norm(vec2)
  return dot_product / norm_product

def get_title_abstracts(ids) :
  """Fetch the arXiv entries for ``ids`` and return one text per entry.

  Each text is the entry's title and summary joined by a single space and
  passed through ``clean_data`` (imported from ``data_utils``). The texts are
  returned in the order the search yields its results.
  """
  search_results = arxiv.Search(id_list = ids).results()
  return [clean_data(paper.title + ' ' + paper.summary) for paper in search_results]

def stratify_by_topics(df, vecs, topic_labels, n_topics) :
  """Split a DataFrame and its row-aligned vectors into per-topic groups.

  Parameters
  ----------
  df : pandas.DataFrame
      Rows are selected positionally (``iloc``).
  vecs : numpy.ndarray
      Indexed along its first axis with the same positions as ``df``.
  topic_labels : iterable
      One hashable label per row. Labels outside ``range(n_topics)``
      (e.g. -1) are ignored.
  n_topics : int
      Number of groups to produce, for topics ``0 .. n_topics-1``.

  Returns
  -------
  (dfs_by_topics, vecs_by_topics) : tuple of lists
      The ith entries are the sub-DataFrame and the sub-array of rows labelled
      ``i``. A topic with no rows gets an empty ``pd.DataFrame()`` and an
      empty ``np.array([])``.
  """
  # Bucket the row positions by label in a single pass, instead of rescanning
  # every label once per topic. This also works when topic_labels is a
  # one-shot iterator.
  idxs_by_topic = {}
  for k, topic in enumerate(topic_labels) :
    idxs_by_topic.setdefault(topic, []).append(k)

  dfs_by_topics = []
  vecs_by_topics = []
  for i in range(n_topics) :
    idxs = idxs_by_topic.get(i)
    if idxs :
      dfs_by_topics.append(df.iloc[idxs])
      vecs_by_topics.append(vecs[idxs])
    else :
      dfs_by_topics.append(pd.DataFrame())
      vecs_by_topics.append(np.array([]))
  return dfs_by_topics, vecs_by_topics


def one_on_one_recommender(user_vecs,user_topics,lib_dfs_by_topics,lib_vecs_by_topics, df_topic_info) :
  """Recommend, for each user paper, the most similar library paper of the same topic.

  Parameters
  ----------
  user_vecs : numpy.ndarray
      One embedding per user paper along the first axis.
  user_topics : sequence of int
      Topic id of each user paper; a negative id (outlier) never matches.
  lib_dfs_by_topics, lib_vecs_by_topics : list
      Per-topic library DataFrames (with ``title`` and ``abstract`` columns)
      and the row-aligned embedding arrays, indexed by topic id.
  df_topic_info : pandas.DataFrame
      Must have a ``Name`` column. If it also has a ``Topic`` column the name
      is looked up by topic id; otherwise row ``topic + 1`` is used.

  Returns
  -------
  pandas.DataFrame
      Columns ``categories``, ``title``, ``summary``, indexed 0..n-1 in the
      order of ``user_vecs``. A user paper with no recommendation (outlier
      topic, or no library paper in its topic) gets a row of NaN. Empty input
      yields an empty frame with the same columns.
  """
  columns = ['categories', 'title', 'summary']

  # Map topic id -> name when the ids are available; positional lookup
  # (topic + 1) is only right when the table starts with an outlier row.
  if 'Topic' in df_topic_info.columns :
    topic_names = dict(zip(df_topic_info['Topic'], df_topic_info['Name']))
  else :
    topic_names = None

  records = []
  for i in range(user_vecs.shape[0]):
    user_vec = np.asarray(user_vecs[i]).reshape(-1)
    user_topic = user_topics[i]
    record = dict.fromkeys(columns, np.nan)

    # A negative topic id must not be used as a list index: it would wrap
    # around and silently pick the last topic's papers.
    if user_topic >= 0 :
      topic_vecs = np.asarray(lib_vecs_by_topics[user_topic])
      if topic_vecs.size :
        topic_vecs = np.atleast_2d(topic_vecs)
        # cosine similarity of the user paper against every library paper
        # of the same topic, computed in one vectorised step
        denom = np.linalg.norm(topic_vecs, axis=1) * np.linalg.norm(user_vec)
        sim_scores = (topic_vecs @ user_vec) / denom
        # first index of the highest score
        best = int(np.argmax(sim_scores))
        best_paper = lib_dfs_by_topics[user_topic].iloc[best]
        if topic_names is not None :
          topic_name = topic_names.get(user_topic, np.nan)
        else :
          topic_name = df_topic_info['Name'].iloc[user_topic+1]
        record = {'categories' : topic_name,
                  'title' : best_paper['title'],
                  'summary' : best_paper['abstract']}
    records.append(record)

  # Built once so the columns exist even when nothing (or nobody) matched.
  return pd.DataFrame(records, columns = columns, index = range(len(records)))

In [10]:
# arXiv IDs of the papers each test user supplied as input.
ethan = [
    '1802.03426', '2304.14481', '2303.03190', '2210.13418', '2210.12824',
    '2210.00661', '2007.02390', '1808.05860', '2005.12732', '1804.05690',
]
jeeuhn = [
    '0905.0486', 'math/0006187', '2106.07444', '1402.0490', '1512.08942',
    '1603.09235', 'math/0510265', 'math/0505056', 'math/0604379', '2209.02568',
]
mike = [
    '2207.13571', '2207.13498', '2211.09644', '2001.10647', '2103.08093',
    '2207.08245', '2207.01677', '2205.08744', '2008.04406', '1912.09845',
]
jenia = [
    '2010.14967', '1307.0493', 'quant-ph/0604014', '2201.05140', '1111.1877',
    'quant-ph/9912054', '1611.08286', '1507.02858', 'math-ph/0107001', '1511.01241',
    'math-ph/9904020', '2211.15336', '2212.03719',
]

# Ethan

In [11]:
# Group the library papers and their embeddings by topic id.
# NOTE(review): this uses lib_topics (the labels returned by fit_transform),
# not new_topics from the outlier-reduction cell — confirm that is intended.
lib_dfs_by_topics, lib_vecs_by_topics = stratify_by_topics(
    df=df_lib,
    vecs=lib_vecs,
    topic_labels=lib_topics,
    n_topics=n_topics,
)

In [87]:
# arXiv IDs supplied by this user
user_input_ids = ethan

# fetch one "title + abstract" text per ID (see get_title_abstracts)
user_abs = get_title_abstracts(user_input_ids)

# convert the list of abstracts of user input papers into vectors using
# the embedding backend held by the trained BERTopic model (bertopic_model)
user_vecs = bertopic_model.embedding_model.embed(user_abs)

# infer the topic that each user input paper belongs to;
# the second return value of transform is discarded
user_topics,_ = bertopic_model.transform(user_abs)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2023-06-03 00:44:06,678 - BERTopic - Reduced dimensionality
2023-06-03 00:44:06,794 - BERTopic - Calculated probabilities with HDBSCAN
2023-06-03 00:44:06,795 - BERTopic - Predicted clusters


In [88]:
# Show the user's input papers (categories, title, summary) for reference.
print('For the following input papers :')
input_papers_df = results_to_df(arxiv.Search(id_list = user_input_ids).results())
display(input_papers_df[['categories','title','summary']])

For the following input papers :


Unnamed: 0,categories,title,summary
0,"[stat.ML, cs.CG, cs.LG]",UMAP: Uniform Manifold Approximation and Proje...,UMAP (Uniform Manifold Approximation and Proje...
1,[math.GT],"Endperiodic maps, splitting sequences, and bra...",We strengthen the unpublished theorem of Gabai...
2,"[math.GT, math.CO]",Train track combinatorics and cluster algebras,The concepts of train track was introduced by ...
3,"[math.GT, math.DS]",Standardly embedded train tracks and pseudo-An...,We show that given a fully-punctured pseudo-An...
4,[math.GR],Class number for pseudo-Anosovs,"Given two automorphisms of a group $G$, one is..."
5,[math.GT],"Braids, entropies and fibered 2-fold branched ...",It is proved by Sakuma and Brooks that any clo...
6,"[math.AT, physics.soc-ph, 55N31]",The (homological) persistence of gerrymandering,"We apply persistent homology, the dominant too..."
7,"[physics.soc-ph, math.MG, 91D20 (Primary), 05C...",Discrete geometry for electoral geography,"We discuss the ""compactness,"" or shape analysi..."
8,"[cs.SI, cs.CY, math.CO, 05C90, 05C70, 05C85]",Mathematics of Nested Districts: The Case of A...,"In eight states, a ""nesting rule"" requires tha..."
9,"[math.GT, math.DS, 37B10, 57M50, 58J50,]",You can hear the shape of a billiard table: Sy...,We give a complete characterization of the rel...


In [89]:
# One recommendation row per input paper, in the same order as the inputs.
print('We recommend :')
recommended_df = one_on_one_recommender(
    user_vecs, user_topics, lib_dfs_by_topics, lib_vecs_by_topics, df_topic_info
)
display(recommended_df)

We recommend :


Unnamed: 0,categories,title,summary
0,,,
1,,,
2,92_morse function_abstract weak_morse function...,Laminations and 2filling rays on infinite type...,The loop graph of an infinite type surface i...
3,92_morse function_abstract weak_morse function...,A Linking/ Equivariant Variational Argument in...,"Let be a contact form on , let be its Reeb..."
4,138_partially hyperbolic_anosov flows_divergen...,Anomalous partially hyperbolic diffeomorphisms...,Let be a closed 3manifold which admits an A...
5,,,
6,,,
7,233_brownian map_branched coverings_real branc...,Scaling Limit of Random Planar Quadrangulation...,We discuss the scaling limit of large planar...
8,,,
9,47_outer billiard_periodic trajectories_period...,You can hear the shape of a billiard table: Sy...,We give a complete characterization of the r...


# Jeeuhn

In [90]:
# arXiv IDs supplied by this user
user_input_ids = jeeuhn

# fetch one "title + abstract" text per ID (see get_title_abstracts)
user_abs = get_title_abstracts(user_input_ids)

# convert the list of abstracts of user input papers into vectors using
# the embedding backend held by the trained BERTopic model (bertopic_model)
user_vecs = bertopic_model.embedding_model.embed(user_abs)

# infer the topic that each user input paper belongs to;
# the second return value of transform is discarded
user_topics,_ = bertopic_model.transform(user_abs)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2023-06-03 00:44:08,436 - BERTopic - Reduced dimensionality
2023-06-03 00:44:08,570 - BERTopic - Calculated probabilities with HDBSCAN
2023-06-03 00:44:08,571 - BERTopic - Predicted clusters


In [91]:
# Show the user's input papers (categories, title, summary) for reference.
print('For the following input papers :')
input_papers_df = results_to_df(arxiv.Search(id_list = user_input_ids).results())
display(input_papers_df[['categories','title','summary']])

For the following input papers :


Unnamed: 0,categories,title,summary
0,"[math.GT, math.AG, 17B10, 57T10]",A geometric construction of colored HOMFLYPT h...,"The aim of this paper is two-fold. First, we g..."
1,[math.AG],The Hard Lefschetz Theorem and the topology of...,We introduce the notion of lef line bundles on...
2,"[math.RT, math.AG, math.QA]",From the Hecke Category to the Unipotent Locus,Let $W$ be the Weyl group of a split semisimpl...
3,"[math.SG, math.AG, math.GT]",Legendrian knots and constructible sheaves,We study the unwrapped Fukaya category of Lagr...
4,"[math.SG, math.AG, math.CO, math.GT]",Cluster varieties from Legendrian knots,Many interesting spaces --- including all posi...
5,"[math.AG, math.AT, math.RT]",The Hodge theory of the Decomposition Theorem ...,In its simplest form the Decomposition Theorem...
6,"[math.GT, math.QA, 57M25]",Triply-graded link homology and Hochschild hom...,We trade matrix factorizations and Koszul comp...
7,"[math.QA, math.GT, 57M25]",Matrix factorizations and link homology II,To a presentation of an oriented link as the c...
8,"[math.SG, math.GT, math.RT]",Constructible Sheaves and the Fukaya Category,"Let $X$ be a compact real analytic manifold, a..."
9,"[math.AG, math.RT]",The $P=W$ conjecture for $\mathrm{GL}_n$,We prove the $P=W$ conjecture for $\mathrm{GL}...


In [92]:
# One recommendation row per input paper, in the same order as the inputs.
print('We recommend :')
recommended_df = one_on_one_recommender(
    user_vecs, user_topics, lib_dfs_by_topics, lib_vecs_by_topics, df_topic_info
)
display(recommended_df)

We recommend :


Unnamed: 0,categories,title,summary
0,129_quantum field_topological quantum_quantum ...,On ambiguity in knot polynomials for virtual k...,We claim that HOMFLY polynomials for virtual...
1,12_moduli space_higgs bundles_moduli spaces_li...,Log Picard algebroids and meromorphic line bun...,"We introduce logarithmic Picard algebroids, ..."
2,,,
3,,,
4,,,
5,12_moduli space_higgs bundles_moduli spaces_li...,"On a result of Gelfand, Kapranov, and Zelevinsky",In this paper I give new elementary proofs o...
6,7_vertex operator_conformal field_vertex opera...,Multilinear forms and graded algebras,In this paper we investigate the class of th...
7,129_quantum field_topological quantum_quantum ...,On ambiguity in knot polynomials for virtual k...,We claim that HOMFLY polynomials for virtual...
8,,,
9,,,


# Michael

In [93]:
# arXiv IDs supplied by this user
user_input_ids = mike

# fetch one "title + abstract" text per ID (see get_title_abstracts)
user_abs = get_title_abstracts(user_input_ids)

# convert the list of abstracts of user input papers into vectors using
# the embedding backend held by the trained BERTopic model (bertopic_model)
user_vecs = bertopic_model.embedding_model.embed(user_abs)

# infer the topic that each user input paper belongs to;
# the second return value of transform is discarded
user_topics,_ = bertopic_model.transform(user_abs)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2023-06-03 00:44:10,097 - BERTopic - Reduced dimensionality
2023-06-03 00:44:10,209 - BERTopic - Calculated probabilities with HDBSCAN
2023-06-03 00:44:10,211 - BERTopic - Predicted clusters


In [94]:
# Show the user's input papers (categories, title, summary) for reference.
print('For the following input papers :')
input_papers_df = results_to_df(arxiv.Search(id_list = user_input_ids).results())
display(input_papers_df[['categories','title','summary']])

For the following input papers :


Unnamed: 0,categories,title,summary
0,"[math-ph, math.MP, math.SP]",Scaling asymptotics of spectral Wigner functions,We prove that smooth Wigner-Weyl spectral sums...
1,"[math.SP, math.AP]",$2$-nodal domain theorems for higher dimension...,We prove that the real parts of equivariant (b...
2,"[math.AP, math.SP, 35P20, 58J50]",Asymptotics for the spectral function on Zoll ...,"On a smooth, compact, Riemannian manifold with..."
3,"[math.AP, 58J40, 35A23, 58K35]",Caustics of weakly Lagrangian distributions,We study semiclassical sequences of distributi...
4,"[math.AP, math-ph, math.MP, math.SP]",Around quantum ergodicity,We discuss Shnirelman's Quantum Ergodicity The...
5,"[math.SP, math-ph, math.AP, math.MP]",Classical Wave methods and modern gauge transf...,"In this article, we consider the asymptotic be..."
6,"[math-ph, math.MP, math.SP]",Scaling Asymptotics of Wigner Distributions of...,The main result of this article gives scaling ...
7,"[math.AP, math.DG, math.SP]",A proof of a Melrose's trace formula,We give a new proof ofan extension of the Chaz...
8,"[math-ph, math.MP, 81Q20 (Primary), 35S30 (Sec...",Reduction and Coherent States,We apply a quantum version of dimensional redu...
9,[math.AP],An introduction to microlocal complex deformat...,In this expository article we relate the prese...


In [95]:
# One recommendation row per input paper, in the same order as the inputs.
print('We recommend :')
recommended_df = one_on_one_recommender(
    user_vecs, user_topics, lib_dfs_by_topics, lib_vecs_by_topics, df_topic_info
)
display(recommended_df)

We recommend :


Unnamed: 0,categories,title,summary
0,0_random matrices_random matrix_random matrix ...,Uniform Semiclassical Approximation for the Wi...,A new uniform asymptotic approximation for t...
1,18_lie group_riemannian foliations_singular ri...,A diameter gap for quotients of the unit sphere,We prove that for any isometric action of a ...
2,216_random waves_arithmetic random waves_excis...,Mean of the norm for normalized random waves o...,This article concerns upper bounds for norms...
3,11_schr odinger_odinger operators_schr odinger...,Weyl law for semiclassical resonances with ran...,In this work we consider semiclassical Schr\...
4,,,
5,11_schr odinger_odinger operators_schr odinger...,Short Loops and Pointwise Spectral Asymptotics,We consider pointwise semiclassical spectral...
6,0_random matrices_random matrix_random matrix ...,Uniform Semiclassical Approximation for the Wi...,A new uniform asymptotic approximation for t...
7,68_index theorem_dirac operator_pseudodifferen...,Logarithmic Trace of Toeplitz Projectors,We prove that the trace of the logarithmic t...
8,,,
9,,,


# Jenia

In [96]:
# arXiv IDs supplied by this user
user_input_ids = jenia

# fetch one "title + abstract" text per ID (see get_title_abstracts)
user_abs = get_title_abstracts(user_input_ids)

# convert the list of abstracts of user input papers into vectors using
# the embedding backend held by the trained BERTopic model (bertopic_model)
user_vecs = bertopic_model.embedding_model.embed(user_abs)

# infer the topic that each user input paper belongs to;
# the second return value of transform is discarded
user_topics,_ = bertopic_model.transform(user_abs)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2023-06-03 00:44:12,383 - BERTopic - Reduced dimensionality
2023-06-03 00:44:12,539 - BERTopic - Calculated probabilities with HDBSCAN
2023-06-03 00:44:12,540 - BERTopic - Predicted clusters


In [97]:
# Show the user's input papers (categories, title, summary) for reference.
print('For the following input papers :')
input_papers_df = results_to_df(arxiv.Search(id_list = user_input_ids).results())
display(input_papers_df[['categories','title','summary']])

For the following input papers :


Unnamed: 0,categories,title,summary
0,"[math.AP, math-ph, math.MP, math.SP]",Construction of quasimodes for non-selfadjoint...,We construct quasimodes for some non-selfadjoi...
1,"[math-ph, math.MP, math.SG, 37J05, 81S10, 53C55]",The exponential map of the complexification of...,"Let $(M, \omega, J)$ be a K\""ahler manifold an..."
2,"[quant-ph, math-ph, math.MP]",Time evolution of non-Hermitian Hamiltonian sy...,"We provide time-evolution operators, gauge tra..."
3,"[quant-ph, hep-th, math-ph, math.MP]",An introduction to PT-symmetric quantum mechan...,I will provide a pedagogical introduction to n...
4,"[math-ph, math.MP, quant-ph]",Complexified coherent states and quantum evolu...,"The complex geometry underlying the Schr\""odin..."
5,"[quant-ph, math-ph, math.MP]",Holomorphic Methods in Mathematical Physics,This set of lecture notes gives an introductio...
6,[quant-ph],Unitarity of the time-evolution and observabil...,Here we present an strategy for the derivation...
7,"[math-ph, math.MP, math.NA, 42C05, 81Q12, 81S10]",Non-Hermitian propagation of Hagedorn wavepackets,We investigate the time evolution of Hagedorn ...
8,"[math-ph, hep-th, math.MP, quant-ph]",Pseudo-Hermiticity versus PT Symmetry: The nec...,We introduce the notion of pseudo-Hermiticity ...
9,"[math.AP, math-ph, math.MP, 58J40, 81Q20]",Semiclassical states associated to isotropic s...,We define classes of quantum states associated...


In [98]:
# One recommendation row per input paper, in the same order as the inputs.
print('We recommend :')
recommended_df = one_on_one_recommender(
    user_vecs, user_topics, lib_dfs_by_topics, lib_vecs_by_topics, df_topic_info
)
display(recommended_df)

We recommend :


Unnamed: 0,categories,title,summary
0,,,
1,75_quaternionic contact_complex structures_qua...,The Complex Geometry and Representation Theory...,"Given a measure space , we can construct a n..."
2,,,
3,115_exceptional points_quantum phase transitio...,Symmetry in HartreeFock Theory,symmetry invariance with respect to combine...
4,189_coherent states_number coherent states_num...,Complexified coherent states and quantum evolu...,"The complex geometry underlying the Schr\""od..."
5,,,
6,115_exceptional points_quantum phase transitio...,Energy Observable for a Quantum System with a ...,A nonHermitian operator may serve as the Ham...
7,189_coherent states_number coherent states_num...,Nonclassical behaviour of coherent states for ...,"We construct the coherent states and Schr\""o..."
8,,,
9,,,
