In [130]:
import pandas as pd
import numpy as np
# from scipy import spatial

import nltk
from nltk.corpus import stopwords

import gensim.downloader as api
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from scipy.spatial import distance

# from gensim import corpora, models, similarities
#from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from multipledispatch import dispatch

import arxiv
from data_utils import format_query, query_to_df, clean_data, clean_authors

import time

In [131]:
## Load in the dataset
## This file keeps the stopwords, but removes words of freq=1 in the corpus from the tokens
#df = pd.read_parquet("./data/filter_20k_tokenized_stopwords.parquet")

## This file removes both stopwords and words of freq=1 in the corpus from the tokens
df = pd.read_parquet("./data/filter_20k_tokenized.parquet")
df.head(1)

Unnamed: 0,id,title,abstract,update_date,authors_parsed,strip_cat,clean_title,clean_abstract,clean_authors,abstract_tokenized,abstract_reduced_tokens
182244,1412.3275,Limit cycles bifurcating from a degenerate center,We study the maximum number of limit cycles ...,2014-12-11,"[['Llibre', 'J.', ''], ['Pantazi', 'C.', '']]",[DS],limit cycles bifurcating from a degenerate center,we study the maximum number of limit cycles th...,"[['llibre', 'j', ''], ['pantazi', 'c', '']]","[we, study, the, maximum, number, of, limit, c...","[study, maximum, number, limit, cycles, bifurc..."


In [132]:
df = df.reset_index()
df = df.rename(columns={"index": "original_index"})
df.head(1)

Unnamed: 0,original_index,id,title,abstract,update_date,authors_parsed,strip_cat,clean_title,clean_abstract,clean_authors,abstract_tokenized,abstract_reduced_tokens
0,182244,1412.3275,Limit cycles bifurcating from a degenerate center,We study the maximum number of limit cycles ...,2014-12-11,"[['Llibre', 'J.', ''], ['Pantazi', 'C.', '']]",[DS],limit cycles bifurcating from a degenerate center,we study the maximum number of limit cycles th...,"[['llibre', 'j', ''], ['pantazi', 'c', '']]","[we, study, the, maximum, number, of, limit, c...","[study, maximum, number, limit, cycles, bifurc..."


In [133]:
df['abstract_reduced_tokens'].isnull().values.any()

False

In [134]:
# Rejoin the tokens for use with some models
df['abstract_reduced'] = df['abstract_reduced_tokens'].apply(" ".join)
df.head(1)

Unnamed: 0,original_index,id,title,abstract,update_date,authors_parsed,strip_cat,clean_title,clean_abstract,clean_authors,abstract_tokenized,abstract_reduced_tokens,abstract_reduced
0,182244,1412.3275,Limit cycles bifurcating from a degenerate center,We study the maximum number of limit cycles ...,2014-12-11,"[['Llibre', 'J.', ''], ['Pantazi', 'C.', '']]",[DS],limit cycles bifurcating from a degenerate center,we study the maximum number of limit cycles th...,"[['llibre', 'j', ''], ['pantazi', 'c', '']]","[we, study, the, maximum, number, of, limit, c...","[study, maximum, number, limit, cycles, bifurc...",study maximum number limit cycles bifurcate de...


Now we define the functions for the various models that we will need to compute similarities.

In [135]:
## A function to get the vector norm
def norm(u):
    return np.sqrt(np.sum(np.power(u,2)))

## A function to get the cosine similarity
def cos_sim(u,v):
    if norm(u)*norm(v) > 0:
        return (u.dot(v))/(norm(u)*norm(v))
    else:
        return np.nan

In [136]:
## Auxilary function
"""
    Display the top n most similar article in the dataset to an 
    article that is already in the dataset.
    
    Inputs:
    df: a DataFrame with all of the articles
    df_sim: a DataFrame with the top n most similar articles to article_index 
            and their computed cosine similarities
            
    article_index: an integer that is an index of an article in the DataFrame;
                   should range from 0 to len(df)-1 
"""
def print_similar(df, df_sim, article_index):
    
    print("The top", len(df_sim), "articles most similar to the article \n\n", 
            article_index, ".", df['title'][article_index])
    print("-----------------------------------------------------\n")
    
    i = 1
    for index in df_sim.index.values: 
        print(i, ".", "(", index , ")", df['title'][index], 
          ", Cosine Similiarity=", np.round(df_sim['Cosine Similarity'][index], 3))
        print()
        i = i + 1

### Some functions for returning the top $n$ most similar papers to a paper already in the dataset

In [137]:
def remove_nan(x):
    temp = x.copy()
    for i in range(len(temp)):
        if np.isnan(temp[i]):
            temp[i] = -1
    return temp

## For use with CountVectorizer and TfidVectorizer
"""
    Prints the top n most similar article titles from the dataframe
    to the input article by calculating their cosine similarity.
    
    Inputs:
    df: a DataFrame with all of the articles
    df_vectorized: a dataframe of word frequencies
    article_index: index of the article we want to compare cosine similarities to
    n: number "n" number of most similar articles to search for
"""
@dispatch(pd.core.frame.DataFrame, pd.core.frame.DataFrame, int, int)

def get_n_most_similar(df, df_vectorized, article_index , n):
    
    print("Using a CountVectorizer or TfidVectorizer.\n")
    
    # Calculate the cosine similariy scores for the i-th article in the dataset
    cosine_sim_list = np.zeros(len(df))

    for i in range(len(df)):
        
        if i != article_index:

            text_to_vector_v1 = df_vectorized.iloc[i].values
            text_to_vector_v2 = df_vectorized.iloc[article_index].values
            
            sim_scores = cos_sim(text_to_vector_v1, text_to_vector_v2)
            cosine_sim_list[i] = sim_scores
   
    ## Caution: there may be some cosine sims with value NaN
    cosine_sim_list = remove_nan(cosine_sim_list)
    ## Getting indices of n maximum values
    x = np.argsort(cosine_sim_list)[::-1][:n]
     
    
    ## Create a dataframe with the results
    df_sim = pd.DataFrame(columns=df.columns)
    df_sim['Cosine Similarity'] = []
   
    for index in x:
        df_sim.loc[index] = df.iloc[index]
        # df_sim.loc[index] = df.iloc[index]
       # df_sim.iloc[index]['Cosine Similarity'] = cosine_sim_list[index]
        df_sim.at[index, 'Cosine Similarity'] = cosine_sim_list[index]
          
    return df_sim, article_index

###################################################################

## For use with Word2Vec
## Word2Vec supports a function called n_similarity
"""
    Prints the top n most similar (by cosine similarity)
    article titles to article_index. 
    
    Inputs: 
    model: a trained Word2Vec model
    df: a DataFrame with all of the articles
    article_index: an integer that is an index of an article in the DataFrame;
                   should range from 0 to len(df)-1 
    n: number "n" number of most similar articles to search for
    
"""
@dispatch(gensim.models.keyedvectors.KeyedVectors, pd.core.frame.DataFrame, int, int)

def get_n_most_similar(model, df, article_index , n):
    
    print("Using Word2Vec.\n")
     
    cosine_sim_list = np.zeros(len(df))

    for i in range(len(df)):      
        # Calculate the cosine similariy scores with the i-th article in the dataset
        
        if i != article_index and len(df['abstract_reduced_tokens'][i]) != 0:
        # The difference here is the use of the n_similarity function
            cosine_sim_list[i]  = model.n_similarity(df['abstract_reduced_tokens'][article_index], 
                                                     df['abstract_reduced_tokens'][i])
        
    ## Caution: there may be some cosine sims with value NaN
    cosine_sim_list = remove_nan(cosine_sim_list)
    ## Getting indices of the n maximum values
    x = np.argsort(cosine_sim_list)[::-1][:n]
            
    ## Create a dataframe with the results
    df_sim = pd.DataFrame(columns=df.columns)
    df_sim['Cosine Similarity'] = []
    
    for index in x:
        df_sim.loc[index] = df.iloc[index].copy()
        #df_sim.loc[index] = df.iloc[index]
        #df_sim['Cosine Similarity'].loc[index] = cosine_sim_list[index]
        df_sim.at[index, 'Cosine Similarity'] = cosine_sim_list[index]
        
    return df_sim, article_index

###################################################################

## Doc2Vec supports a function called most_similar 
"""
    Prints the top n most similar (by cosine similarity)
    article titles to article_index.    
    
    Inputs: 
    model: a trained Doc2Vec model
    df: a DataFrame with all of the articles
    article_index: an integer that is an index of an article in the DataFrame;
                   should range from 0 to len(df)-1 
    n: number "n" number of most similar articles to search for
"""
@dispatch(gensim.models.doc2vec.Doc2Vec, pd.core.frame.DataFrame, int, int)

def get_n_most_similar(model, df, article_index , n):
    
    print("Using Doc2Vec\n")
    
    # dv.most_similar returns the same values as d2v_model.dv.similarity(i, j)
    topn = model.dv.most_similar(article_index, topn=n)
   
    article_indices = [x[0] for x in topn]
    cos_sims = [x[1] for x in topn]
    
    ## Create a dataframe with the results
    df_sim = pd.DataFrame(columns=df.columns)
    
    for index in article_indices:
       # df_sim.loc[index] = df.iloc[index].copy()
        df_sim.loc[index] = df.iloc[index]
    
    df_sim['Cosine Similarity'] = cos_sims
        
    return df_sim, article_index

### Some functions for returning the top $n$ most similar papers to a paper NOT already in the dataset

In [138]:
## CountVectorizer

In [139]:
## TfidVectorizer

In [140]:
## For use with Word2Vec
## Word2Vec supports a function called n_similarity

## user_vector: the tokenized and cleaned abstract of the user's input

@dispatch(gensim.models.keyedvectors.KeyedVectors, pd.core.frame.DataFrame, list, int)
def get_n_most_similar(model, df, user_tokens, n):
    
    print("Using Word2Vec\n")
     
    cosine_sim_list = np.zeros(len(df))

    for i in range(len(df)):      
        # Calculate the cosine similariy scores with the i-th article in the dataset
        # The difference here is the use of the n_similarity function
        cosine_sim_list[i]  = model.n_similarity(user_tokens, df['abstract_reduced_tokens'][i])
        
    ## Getting indices of the n maximum values
    x = np.argsort(cosine_sim_list)[::-1][:n]
            
    ## Create a dataframe with the results
    df_sim = pd.DataFrame(columns=df.columns)
    df_sim['Cosine Similarity'] = []
    
    for index in x:
        df_sim.loc[index] = df.iloc[index].copy() 
        df_sim['Cosine Similarity'].loc[index] = cosine_sim_list[index]
        
    return df_sim

###################################################################

## Doc2Vec Recommender
## use the infer_vector function (may not be necessary?)
## Choose the first paper in our dataset

@dispatch(gensim.models.doc2vec.Doc2Vec, pd.core.frame.DataFrame, list, int)

def get_n_most_similar(d2v_model, df, user_vector, n):
    print("Using Doc2Vec\n")
    
    topn = d2v_model.dv.most_similar([user_vector], topn=n)
    
    article_indices = [x[0] for x in topn]
    cos_sims = [x[1] for x in topn]
    
    ## Create a dataframe with the results
    df_sim = pd.DataFrame(columns=df.columns)
    
    for index in article_indices:
        df_sim.loc[index] = df.iloc[index].copy()
    
    df_sim['Cosine Similarity'] = cos_sims
        
    return df_sim

### Functions for finding the top $n$ recommendations based on $m$ user inputs

In [141]:
## Auxiliary function
"""
    A nicer way to display the recommendations to the user.
    
    df_user: the dataset of articles the user has input
    df_results: results of similar articles based on 
"""
def display_results(df_results, df_user):
    print("The top", len(df_results), "articles most similar to the articles: \n\n")   
   ## print(df_user.index.values)
    ## Get the titles
    i = 0
    for i in range(len(df_user)):
        title = df_user.iloc[i]['title']
        print(i+1, ".", title)  
        i = i + 1
        
    print("\n#############################################################\n")
    
    for i in range(len(df_results)):
       
        match_index = df_results.index.values[i]
        title = df_results.loc[match_index]['title']
        authors = df_results.loc[match_index]['authors_parsed']
        abstract = df_results.loc[match_index]['abstract']
      #  link = df_results.loc[match_index]['entry_id'] 
        cos_sim = df_results.loc[match_index]['Cosine Similarity']
        
        print(i+1, ".", "(", match_index, ")", title, 
              "\n [ Cosine Similarity=", np.round(cos_sim, 3) ,"]\n")
     #   print("\n", authors)
        print("\n", abstract)
      #  print("\n", link) 
        print("\n-----------------------------------------------------\n")

In [142]:
## For use with Word2Vec
"""
    Returns a dataframe of the top n most similar (by cosine similarity)
    article titles to the user's inputs. 
    
    Inputs:
    model: a trained Word2Vec model
    df: a DataFrame with all of the articles  
    df_user: a dataframe of the user's article inputs   
    n: number of recommendations to return 
"""
@dispatch(gensim.models.keyedvectors.KeyedVectors, pd.core.frame.DataFrame,
                                              pd.core.frame.DataFrame, int)

def n_recommendations(model, df, df_user, n):
    
    print("Using Word2Vec")
    
    ## Create a DataFrame to store the similarity results     
    df_sim_scores = pd.DataFrame(columns=df.columns)
    df_sim_scores['Cosine Similarity'] = []
    
    for article_index in df_user.index.values: 
    
        user_abstract = df_user['abstract_tokenized'][article_index]
    
        for i in range(len(df)): 
            if len(df['abstract_reduced_tokens'][i]) != 0:
                # Calculate the cosine similarity scores with the i-th article in the dataset
                sim_score = model.n_similarity(user_abstract, df['abstract_reduced_tokens'][i])
             #   df_sim_scores.loc[len(df_sim_scores)] = df.iloc[i].copy()
                df_sim_scores.loc[len(df_sim_scores)] = df.iloc[i]
             #   df_sim_scores['Cosine Similarity'].loc[i] = sim_score
                df_sim_scores.at[i, 'Cosine Similarity'] = sim_score
            
  #  print("after computing, length df_sim_score =", len(df_sim_scores))
    
    ## Sort the cosine similarity scores in the dataframe
    df_sim_scores = df_sim_scores.sort_values(by=['Cosine Similarity'], ascending=False)
        
    ## Now check for duplicate articles indices and keep the last index
    ## By default, it will keep the first row and remove the redundant rows.
        
    df_sim_scores = df_sim_scores.drop_duplicates(subset=['id'])
       
  #  print("\n after dropping,  length df_sim_score =", len(df_sim_scores))
    
     ## Get the first n articles in the dataframe
    df_top_n = df_sim_scores.head(n)

    return df_top_n, df_user

###################################################################

"""
    Returns a dataframe of the top n most similar (by cosine similarity)
    article titles to the user's inputs. 
    
    Inputs:
    model: a trained Doc2Vec model
    df: a DataFrame with all of the articles  
    df_user: a dataframe of the user's article inputs   
    n: number of recommendations to return 
"""
@dispatch(gensim.models.doc2vec.Doc2Vec, pd.core.frame.DataFrame,
                                      pd.core.frame.DataFrame, int)

def n_recommendations(model, df, df_user, n):
    
    print("Using Doc2Vec")
    
    ## Create a DataFrame to store the similarity results     
    df_sim_scores = pd.DataFrame(columns=df.columns)
    df_sim_scores['Cosine Similarity'] = []
    
    for article_index in df_user.index.values: 
    
        user_abstract = df_user['abstract_tokenized'][article_index]
        
      #  print("type(user_abstract)", type(user_abstract))
        
        ## Infer a vector for a new document    
        ## Is this necessary?
        # user_vector = d2v_model.infer_vector(user_abstract)
        
       # print("type(user_vector)", type(user_vector))
    
        for i in range(len(df)):      
        # Calculate the cosine similarity scores with the i-th article in the dataset
               
           # sim_score = model.n_similarity(user_abstract, df['abstract_reduced_tokens'][i])
            
            vector = df['abstract_reduced_tokens'][i] 
            
           # print("df['abstract_reduced_tokens'][i]", type(vector))
            if len(vector) != 0:
                sim_score = model.wv.n_similarity(user_abstract, vector)

              #  sim_score = model.dv.similarity(user_vector, i)
             #   df_sim_scores.loc[len(df_sim_scores)] = df.iloc[i].copy()
                df_sim_scores.loc[len(df_sim_scores)] = df.iloc[i]
              #  df_sim_scores['Cosine Similarity'].loc[i] = sim_score
                df_sim_scores.at[i, 'Cosine Similarity'] = sim_score
            
   # print("after computing, length df_sim_score =", len(df_sim_scores))
    
    ## Sort the cosine similarity scores in the dataframe
    df_sim_scores = df_sim_scores.sort_values(by=['Cosine Similarity'], ascending=False)
        
    ## Now check for duplicate articles indices and keep the last index
    ## By default, it will keep the first row and remove the redundant rows.
        
    df_sim_scores = df_sim_scores.drop_duplicates(subset=['id'])
       
  #  print("\n after dropping,  length df_sim_score =", len(df_sim_scores))
    
     ## Get the first n articles in the dataframe
    df_top_n = df_sim_scores.head(n)

    return df_top_n, df_user

## The models

### CountVectorizer

Creates a matrix with documents and token counts (bag of terms/tokens) therefore it is also known as document term matrix (dtm).

In [143]:
## max_df: When building the vocabulary ignore terms that have a document frequency 
##         strictly higher than the given threshold (corpus-specific stop words).

count_vectorizer = CountVectorizer(analyzer="word", 
                                tokenizer=nltk.word_tokenize,
                                preprocessor=None, 
                               # stop_words='english', 
                                max_features=2500,
                                ngram_range=(1,3))
                                    ##  max_df=.9
    
bow = count_vectorizer.fit_transform(df['abstract_reduced'])

df_bow = pd.DataFrame(bow.toarray(),
                      columns = count_vectorizer.get_feature_names_out())

### TfidVectorizer

In [144]:
tfid_vectorizer = TfidfVectorizer(analyzer="word", 
                                tokenizer=nltk.word_tokenize,
                                preprocessor=None, 
                             #   stop_words='english', 
                                max_features=2500,
                                ngram_range=(1,3))
                                    ##  max_df=.9
    
tfid = tfid_vectorizer.fit_transform(df['abstract_reduced'])

df_tfid = pd.DataFrame(tfid.toarray(),
                      columns = tfid_vectorizer.get_feature_names_out())

### Word2Vec 

We use a pretrained model for better results!

In [145]:
## Load word2vec model, here GoogleNews is used
## The file must be previously downloaded
## The size of the vectors is 300
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', 
                                                        binary=True)

### Doc2Vec 

There are two approaches to the Doc2Vec model, (1) distributed bag of words and (2) distributed memory models.

In [146]:
# Rather than a list of tokenized docs Doc2Vec requires tagged lists of tokens
# that's because the model needs to keep track of the documents.
summaries = [TaggedDocument(doc,[i]) for i, doc in enumerate(df['abstract_reduced_tokens'])]

#### Doc2Vec using Distributed Bag of Words

Word2Vec was trained to model the probability that a randomly chosen word was a context word of the target word. With distributed bag of words the idea is to train a classifier network to model the probability that a given word is in the document, so in essence we're modeling the word distribution of the document rather than the contextual distribution of a target word.

In [147]:
# Now we train the Doc2Vec models

# dm = 1 or 0 (optional) – Defines the training algorithm. 
# If dm=1, ‘distributed memory’ (PV-DM) is used. 
# Otherwise, distributed bag of words (PV-DBOW) is employed.

# vector_size - Dimensionality of the feature vectors.

# window -  The maximum distance between the current and 
# predicted word within a sentence.

# min_count - require words to show up a minimum of 2 times
# iscard words with very few occurrences. (Without a variety of representative 
# examples, retaining such infrequent words can often make a model worse!)


d2v_model_bow = Doc2Vec(documents = summaries,
                    dm = 0, ## use distributed bag of words (PV-DBOW) 
                    vector_size = 300, 
                    window = 2, 
                    min_count = 2,
                    epochs=50)

In [148]:
#### Doc2Vec using Distributed Memory
d2v_model_dm = Doc2Vec(documents = summaries,
                    dm = 1, ## use distributed memory
                    vector_size = 300, 
                    window = 2, 
                    min_count = 2,
                    epochs=50)

#### Sanity check

We can  see how "good" the embedding is by looping through the abstracts and recording the similarity rank of the actual abstract embedding to the inferred embedding.

Then, we can calculate the fraction of documents whose rank was 0.

In [149]:
"""
    To see how "good" the Doc2Vec embedding is loop through the abstracts 
    and record the similarity rank of the actual abstract embedding to the 
    inferred embedding.
    Then, calculate and print the fraction of documents whose rank was 0.
"""
def check_doc2vec_embedding(d2v_model):
    # We'll loop through all of the documents
    # and record the similarity rank to their inferred vector
    summary_ranks = []

    # for each document
    for summary in summaries:
        # get the inferred vector
        inferred_vec = d2v_model.infer_vector(summary.words)
        # find the most similar vectors
        sims = d2v_model.dv.most_similar([inferred_vec], topn=len(summaries))
    
    # loop through those vectors
        for i in range(len(sims)):
            # find the rank of the document
            if summary.tags[0] == sims[i][0]:
                # record it
                summary_ranks.append(i)
                
    # the fraction of documents whose rank was 0           
    rank_0 = np.sum(np.array(summary_ranks)==0)/len(summary_ranks)
    print("The fraction of documents whose rank is 0 is", np.round(rank_0, 4))

In [150]:
print("----------------- Doc2Vec Model (Dist. Bag of Words)-----------------\n")
check_doc2vec_embedding(d2v_model_bow)

----------------- Doc2Vec Model (Dist. Bag of Words)-----------------

The fraction of documents whose rank is 0 is 0.992


In [152]:
print("----------------- Doc2Vec Model (Dist. Memory)-----------------\n")
check_doc2vec_embedding(d2v_model_dm)

----------------- Doc2Vec Model (Dist. Memory)-----------------

The fraction of documents whose rank is 0 is 0.992


These models seem reasonable!

#### Compare results of CountVectorizer, TfidVectorizer, Word2Vec, Doc2Vec for articles within the dataset

In [153]:
## Similarity scores for the the first article in the DataFrame

## Using CountVectorizer
df_cv = get_n_most_similar(df.copy(), df_bow, 1, 5)[0]

# Using TfidVectorizer
df_tf = get_n_most_similar(df.copy(), df_tfid, 0, 5)[0]

# Using Word2Vec
df_wv = get_n_most_similar(w2v_model, df, 0, 5)[0]

# Using Doc2Vec with Distributed BOW
df_dv_bow = get_n_most_similar(d2v_model_bow, df, 0, 5)[0]

# Using Doc2Vec with Distributed Memory
df_dv_dm = get_n_most_similar(d2v_model_dm, df, 0, 5)[0]

Using a CountVectorizer or TfidVectorizer.

Using a CountVectorizer or TfidVectorizer.

Using Word2Vec.

Using Doc2Vec

Using Doc2Vec



In [154]:
print("--------------------- Count Vectorizer ---------------------\n")
df_cv[['original_index', 'title', 'abstract', 'strip_cat', 'authors_parsed', 'Cosine Similarity']]

--------------------- Count Vectorizer ---------------------



Unnamed: 0,original_index,title,abstract,strip_cat,authors_parsed,Cosine Similarity
8939,301296,Grazingsliding bifurcations creating infinitel...,As the parameters of a piecewisesmooth syste...,[DS],"[['Simpson', 'David J. W.', '']]",0.417186
3181,339095,On the bifurcation diagram of the capillarygra...,We study the bifurcation of periodic travell...,[AP],"[['Ehrnström', 'Mats', ''], ['Johnson', 'Mathe...",0.412479
9980,373712,Numerical continuation for a fastreaction syst...,In this paper we investigate the bifurcation...,[AP],"[['Kuehn', 'Christian', ''], ['Soresina', 'Cin...",0.379043
4123,217240,The structure of modelocking regions of piecew...,The modelocking regions of a dynamical syste...,[DS],"[['Simpson', 'David J. W.', '']]",0.369274
14098,75053,"Unfolding a CodimensionTwo, Discontinuous, And...",We present an unfolding of the codimensiontw...,[DS],"[['Simpson', 'D. J. W.', ''], ['Meiss', 'J. D....",0.360492


In [155]:
print("--------------------- Tfid Vectorizer ---------------------\n")
df_tf[['original_index', 'title', 'abstract', 'strip_cat', 'authors_parsed', 'Cosine Similarity']]

--------------------- Tfid Vectorizer ---------------------



Unnamed: 0,original_index,title,abstract,strip_cat,authors_parsed,Cosine Similarity
9508,315961,Planar Semiquasi Homogeneous Polynomial differ...,This paper study the planar semiquasi homoge...,[DS],"[['Tian', 'Yuzhou', ''], ['Liang', 'Haihua', '']]",0.405092
15936,329106,Bifurcation of limit cycles from a quadratic g...,"In this paper, we generalize the PicardFuchs...",[DS],"[['Yang', 'Jihua', '']]",0.325908
17297,169000,Solution of the parametric center problem for ...,The Abel differential equation with is sai...,"[CA, DS]","[['Pakovich', 'Fedor', '']]",0.32451
13706,511903,A sufficient and necessary condition of genera...,The aim of this paper is to give a sufficien...,[DS],"[['Chen', 'Hebai', ''], ['Li', 'Zhijie', ''], ...",0.309687
4796,408265,The local period function for Hamiltonian syst...,In the first part of the paper we develop a ...,[DS],"[['Buzzi', 'Claudio A.', ''], ['Carvalho', 'Ya...",0.301153


In [156]:
print("--------------------- Word2Vec Model ---------------------\n")
df_wv[['original_index', 'title', 'abstract', 'strip_cat', 'authors_parsed', 'Cosine Similarity']]

--------------------- Word2Vec Model ---------------------



Unnamed: 0,original_index,title,abstract,strip_cat,authors_parsed,Cosine Similarity
6108,158258,Topology trivialization and large deviations f...,Finding the global minimum of a cost functio...,"[MP, OC]","[['Fyodorov', 'Yan V', ''], ['Doussal', 'Pierr...",0.884011
13151,107803,Continuous Limits of Classical Repeated Intera...,We consider the physical model of a classica...,"[MP, PR]","[['Deschamps', 'Julien', '']]",0.880297
19029,53103,Phase portraits for quadratic homogeneous poly...,Let X be a homogeneous polynomial vector fie...,[DS],"[['Llibre', 'Jaume', ''], ['Pessoa', 'Claudio'...",0.878772
9508,315961,Planar Semiquasi Homogeneous Polynomial differ...,This paper study the planar semiquasi homoge...,[DS],"[['Tian', 'Yuzhou', ''], ['Liang', 'Haihua', '']]",0.877008
18932,365655,"Invariant tori, actionangle variables and phas...","We study the classical RajeevRanken model, a...","[DS, MP]","[['Krishnaswami', 'Govind S.', ''], ['Vishnu',...",0.875581


In [157]:
print("----------------- Doc2Vec Model (Dist. Bag of Words)-----------------\n")
df_dv_bow[['original_index', 'title', 'abstract', 'strip_cat', 'authors_parsed', 'Cosine Similarity']]

----------------- Doc2Vec Model (Dist. Bag of Words)-----------------



Unnamed: 0,original_index,title,abstract,strip_cat,authors_parsed,Cosine Similarity
6867,63325,Planar polynomial vector fields having a polyn...,We consider in this work planar polynomial d...,"[CA, DS]","[['Garcia', 'Belen', ''], ['Giacomini', 'Hecto...",0.597346
10619,468612,Rational integrals of 2dimensional geodesic fl...,This paper is devoted to searching for Riema...,"[DS, AP, DG]","[['Agapov', 'Sergei', '', '1 and 2'], ['Shubin...",0.531299
10096,325855,Averaging theory at any order for computing li...,This work is devoted to study the existence ...,[DS],"[['Llibre', 'Jaume', ''], ['Novaes', 'Douglas ...",0.495319
15464,240592,Dual morse index estimates and application to ...,"In this paper, we study the multiplicity of ...",[AP],"[['Tang', 'Shanshan', '']]",0.48476
16638,15846,The meromorphic nonintegrability of the threeb...,We study the planar threebody problem and pr...,[DS],"[['Tsygvintsev', 'Alexei', '']]",0.475719


In [158]:
print("----------------- Doc2Vec Model (Dist. Memory)-----------------\n")
df_dv_dm[['original_index', 'title', 'abstract', 'strip_cat', 'authors_parsed', 'Cosine Similarity']]

----------------- Doc2Vec Model (Dist. Memory)-----------------



Unnamed: 0,original_index,title,abstract,strip_cat,authors_parsed,Cosine Similarity
9508,315961,Planar Semiquasi Homogeneous Polynomial differ...,This paper study the planar semiquasi homoge...,[DS],"[['Tian', 'Yuzhou', ''], ['Liang', 'Haihua', '']]",0.499888
4796,408265,The local period function for Hamiltonian syst...,In the first part of the paper we develop a ...,[DS],"[['Buzzi', 'Claudio A.', ''], ['Carvalho', 'Ya...",0.476396
6867,63325,Planar polynomial vector fields having a polyn...,We consider in this work planar polynomial d...,"[CA, DS]","[['Garcia', 'Belen', ''], ['Giacomini', 'Hecto...",0.457078
16329,379056,On the finite cyclicity of open period annuli,"Let be an open, relatively compact period a...",[DS],"[['Gavrilov', 'Lubomir', ''], ['Novikov', 'Dmi...",0.449955
15936,329106,Bifurcation of limit cycles from a quadratic g...,"In this paper, we generalize the PicardFuchs...",[DS],"[['Yang', 'Jihua', '']]",0.447468


## Test the models using new user inputs

Presumably, these articles are not already in the dataset, but we should double check!

In [159]:
## Here are several lists of papers we are interested in
ethan = ['1802.03426', '2304.14481', '2303.03190', '2210.13418',
         '2210.12824', '2210.00661', '2007.02390', '1808.05860',
         '2005.12732','1804.05690']

jeeuhn = ['0905.0486', 'math/0006187', '2106.07444', '1402.0490', 
          '1512.08942', '1603.09235', 'math/0510265', 'math/0505056', 
          'math/0604379', '2209.02568']

mike = ['2207.13571','2207.13498','2211.09644','2001.10647',
        '2103.08093','2207.08245', '2207.01677','2205.08744',
        '2008.04406','1912.09845']

jenia = ['2010.14967', '1307.0493', 'quant-ph/0604014', '2201.05140', 
         '1111.1877', 'quant-ph/9912054', '1611.08286', '1507.02858', 
         'math-ph/0107001','1511.01241', 'math-ph/9904020', '2211.15336', 
         '2212.03719']
jenia = jenia[0:10]

In [160]:
# words can be accessed like so
# print(stopwords.words('english'))

## Tokenize the abstract by splitting on whitespaces
## and get rid of the occasional empty string.
def clear_empty(clean_string):
    return [word for word in clean_string.split(" ") if word != '']

def remove_stop(tokens):
    return [token for token in tokens if token not in stopwords.words('english')]

In [161]:
def user_data(paper_ids_list):
    ## Create the dataframe to store the user's input papers
    df_user = pd.DataFrame(columns=['id','entry_id', 'title', 'authors','abstract'])
    
    list_urls = []
    list_titles = []
    list_authors = []
    list_abstracts = []


    for item in paper_ids_list:
        paper = next(arxiv.Search(id_list=[item]).results())
        list_titles.append(paper.title)
        list_authors.append(paper.authors)
        list_abstracts.append(paper.summary)
        list_urls.append(paper.entry_id)
    
    df_user['id'] = paper_ids_list
    df_user['entry_id'] = list_urls
    df_user['title'] = list_titles
    df_user['authors'] = list_authors
    df_user['abstract'] = list_abstracts
    
    ## Clean the user's data
    df_user['abstract_clean'] = df_user['abstract'].apply(clean_data)
    df_user['abstract_tokenized'] = df_user['abstract_clean'].apply(nltk.word_tokenize)
    df_user['abstract_tokenized'] = df_user['abstract_clean'].apply(clear_empty)
    df_user['abstract_tokenized'] = df_user['abstract_tokenized'].apply(remove_stop)
    
    return df_user

### Ethan's Recommendations

In [162]:
df_ethan = user_data(ethan)
df_ethan

Unnamed: 0,id,entry_id,title,authors,abstract,abstract_clean,abstract_tokenized
0,1802.03426,http://arxiv.org/abs/1802.03426v3,UMAP: Uniform Manifold Approximation and Proje...,"[Leland McInnes, John Healy, James Melville]",UMAP (Uniform Manifold Approximation and Proje...,umap uniform manifold approximation and projec...,"[umap, uniform, manifold, approximation, proje..."
1,2304.14481,http://arxiv.org/abs/2304.14481v1,"Endperiodic maps, splitting sequences, and bra...","[Michael P. Landry, Chi Cheuk Tsang]",We strengthen the unpublished theorem of Gabai...,we strengthen the unpublished theorem of gabai...,"[strengthen, unpublished, theorem, gabai, mosh..."
2,2303.0319,http://arxiv.org/abs/2303.03190v1,Train track combinatorics and cluster algebras,[Shunsuke Kano],The concepts of train track was introduced by ...,the concepts of train track was introduced by ...,"[concepts, train, track, introduced, w, p, thu..."
3,2210.13418,http://arxiv.org/abs/2210.13418v2,Standardly embedded train tracks and pseudo-An...,"[Eriko Hironaka, Chi Cheuk Tsang]",We show that given a fully-punctured pseudo-An...,we show that given a fully punctured pseudo an...,"[show, given, fully, punctured, pseudo, anosov..."
4,2210.12824,http://arxiv.org/abs/2210.12824v2,Class number for pseudo-Anosovs,"[François Dahmani, Mahan Mj]","Given two automorphisms of a group $G$, one is...",given two automorphisms of a group one is int...,"[given, two, automorphisms, group, one, intere..."
5,2210.00661,http://arxiv.org/abs/2210.00661v1,"Braids, entropies and fibered 2-fold branched ...","[Susumu Hirose, Eiko Kin]",It is proved by Sakuma and Brooks that any clo...,it is proved by sakuma and brooks that any clo...,"[proved, sakuma, brooks, closed, orientable, m..."
6,2007.0239,http://arxiv.org/abs/2007.02390v1,The (homological) persistence of gerrymandering,"[Moon Duchin, Tom Needham, Thomas Weighill]","We apply persistent homology, the dominant too...",we apply persistent homology the dominant tool...,"[apply, persistent, homology, dominant, tool, ..."
7,1808.0586,http://arxiv.org/abs/1808.05860v1,Discrete geometry for electoral geography,"[Moon Duchin, Bridget Eileen Tenner]","We discuss the ""compactness,"" or shape analysi...",we discuss the compactness or shape analysis o...,"[discuss, compactness, shape, analysis, electo..."
8,2005.12732,http://arxiv.org/abs/2005.12732v1,Mathematics of Nested Districts: The Case of A...,"[Sophia Caldera, Daryl DeFord, Moon Duchin, Sa...","In eight states, a ""nesting rule"" requires tha...",in eight states a nesting rule requires that e...,"[eight, states, nesting, rule, requires, state..."
9,1804.0569,http://arxiv.org/abs/1804.05690v4,You can hear the shape of a billiard table: Sy...,"[Moon Duchin, Viveka Erlandsson, Christopher J...",We give a complete characterization of the rel...,we give a complete characterization of the rel...,"[give, complete, characterization, relationshi..."


In [163]:
start = time.time()

## Using the pretrained Word2Vec
df_rec_wv, df_ethan_new_wv = n_recommendations(w2v_model, df, df_ethan[0:2], 10)

end = time.time()
res = (end - start)/60
print('Execution time:', res, 'minutes')

Using Word2Vec
Execution time: 5.081173849105835 minutes


In [164]:
print("--------------------- Word2Vec Model ---------------------\n")
display_results(df_rec_wv, df_ethan_new_wv)

--------------------- Word2Vec Model ---------------------

The top 10 articles most similar to the articles: 


1 . UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
2 . Endperiodic maps, splitting sequences, and branched surfaces

#############################################################

1 . ( 7133 ) Area minimizing hypersurfaces modulo : a geometric freeboundary   problem 
 [ Cosine Similarity= 0.888 ]


   We consider area minimizing dimensional currents  in complete  Riemannian manifolds  of dimension . For odd moduli we prove that, away from a closed rectifiable set of codimension , the current in question is, locally, the union of finitely many smooth minimal hypersurfaces coming together at a common  boundary of dimension , and the result is optimal. For even  such structure holds in a neighborhood of any point where at least one tangent cone has dimensional spine. These structural results are indeed the byproduct of a theorem that proves (for any

In [165]:
start = time.time()

## Using Doc2Vec with Distributed Bag of Words
df_rec_d2v_bow, df_ethan_new_bow = n_recommendations(d2v_model_bow, df, df_ethan[0:2], 10)

end = time.time()
res = (end - start)/60
print('Execution time:',res, 'minutes')

Using Doc2Vec
Execution time: 5.084524818261465 minutes


In [166]:

print("----------------- Doc2Vec Model (Dist. Bag of Words)-----------------\n")
display_results(df_rec_d2v_bow, df_ethan_new_bow)

----------------- Doc2Vec Model (Dist. Bag of Words)-----------------

The top 10 articles most similar to the articles: 


1 . UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
2 . Endperiodic maps, splitting sequences, and branched surfaces

#############################################################

1 . ( 16855 ) A factorization theorem for harmonic maps 
 [ Cosine Similarity= 0.319 ]


   Let  be a harmonic map from a Riemann surface to a Riemannian manifold. We prove that if there is a holomorphic diffeomorphism  between open subsets of the surface such that , then  factors through a holomorphic map onto another Riemann surface. If such  is antiholomorphic, we obtain an analogous statement. For minimal maps, this result is well known and is a consequence of the theory of branched immersions of surfaces due to GulliverOssermanRoyden. Our proof relies on various geometric properties of the Hopf differential. 

--------------------------------------------

In [167]:
start = time.time()

## Using Doc2Vec with Distributed Memory
df_rec_d2v_dm, df_ethan_new_dm = n_recommendations(d2v_model_dm, df, df_ethan[0:2], 10)

end = time.time()
res = (end - start)/60
print('Execution time:',res, 'minutes')

Using Doc2Vec
Execution time: 5.3633530855178835 minutes


In [168]:
print("----------------- Doc2Vec Model (Dist. Memory)-----------------\n")
display_results(df_rec_d2v_dm, df_ethan_new_dm)

----------------- Doc2Vec Model (Dist. Memory)-----------------

The top 10 articles most similar to the articles: 


1 . UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
2 . Endperiodic maps, splitting sequences, and branched surfaces

#############################################################

1 . ( 16290 ) Geometry of foliations and flows I: Almost transverse pseudoAnosov   flows and asymptotic behavior of foliations 
 [ Cosine Similarity= 0.887 ]


   Let F be a foliation in a closed 3manifold with negatively curved fundamental group and suppose that F is almost transverse to a quasigeodesic pseudoAnosov flow. We show that the leaves of the foliation in the universal cover extend continuously to the sphere at infinity, hence the limit sets are continuous images of the circle. One important corollary is that if F is a Reebless finite depth foliation in a hyperbolic manifold, then it has the continuous extension property. Such finite depth foliations exi

### Jee Uhn's Recommendations

In [None]:
df_jeeuhn = user_data(jeeuhn)
df_jeeuhn

In [None]:
%%time
## Using the pretrained Word2Vec
df_rec_wv2, df_jeeuhn_new_wv = n_recommendations(w2v_model, df, df_jeeuhn[0:2], 10)

print("--------------------- Word2Vec Model ---------------------\n")
display_results(df_rec_wv2, df_jeeuhn_new_wv)

In [None]:
%%time
## Using Doc2Vec with Distributed Bag of Words
df_rec_d2v_bow2, df_jeeuhn_new_bow = n_recommendations(d2v_model_bow, df, df_jeeuhn[0:2], 10)

print("----------------- Doc2Vec Model (Dist. Bag of Words)-----------------\n")
display_results(df_rec_d2v_bow2, df_jeeuhn_new_bow)

In [None]:
%%time
## Using Doc2Vec with Distributed Memory
df_rec_d2v_dm2, df_jeeuhn_new_dm = n_recommendations(d2v_model_dm, df, df_jeeuhn[0:2], 10)

print("----------------- Doc2Vec Model (Dist. Memory)-----------------\n")
display_results(df_rec_d2v_dm2, df_jeeuhn_new_dm)

### Mike's Recommendations

In [None]:
df_mike = user_data(mike)
df_mike

In [None]:
%%time
## Using the pretrained Word2Vec
df_rec_wv3, df_mike_new_wv = n_recommendations(w2v_model, df, df_mike[0:2], 10)

print("--------------------- Word2Vec Model ---------------------\n")
display_results(df_rec_wv3, df_mike_new_wv)

In [None]:
%%time
## Using Doc2Vec with Distributed Bag of Words
df_rec_d2v_bow3, df_mike_new_bow = n_recommendations(d2v_model_bow, df, df_mike[0:2], 10)

print("----------------- Doc2Vec Model (Dist. Bag of Words)-----------------\n")
display_results(df_rec_d2v_bow3, df_mike_new_bow)

In [None]:
%%time
## Using Doc2Vec with Distributed Memory
df_rec_d2v_dm3, df_mike_new_dm = n_recommendations(d2v_model_dm, df, df_mike[0:2], 10)

print("----------------- Doc2Vec Model (Dist. Memory)-----------------\n")
display_results(df_rec_d2v_dm3, df_mike_new_dm)

### Jenia's Recommendations

In [None]:
df_jenia = user_data(jenia)
df_jenia

In [None]:
%%time
## Using the pretrained Word2Vec
df_rec_wv4, df_jenia_new_wv = n_recommendations(w2v_model, df, df_jenia[0:2], 10)

print("--------------------- Word2Vec Model ---------------------\n")
display_results(df_rec_wv4, df_jenia_new_wv)

In [None]:
%%time
## Using Doc2Vec with Distributed Bag of Words
df_rec_d2v_bow4, df_jenia_new_bow = n_recommendations(d2v_model_bow, df, df_jenia[0:2], 10)

print("----------------- Doc2Vec Model (Dist. Bag of Words)-----------------\n")
display_results(df_rec_d2v_bow4, df_jenia_new_bow)

In [None]:
%%time
## Using Doc2Vec with Distributed Memory
df_rec_d2v_dm4, df_jenia_new_dm = n_recommendations(d2v_model_dm, df, df_jenia[0:2], 10)

print("----------------- Doc2Vec Model (Dist. Memory)-----------------\n")
display_results(df_rec_d2v_dm4, df_jenia_new_dm)