In [1]:
import pandas as pd
import numpy as np
# from scipy import spatial

import nltk
from nltk.corpus import stopwords

import gensim.downloader as api
import gensim

# from gensim import corpora, models, similarities
from gensim.models import Word2Vec

## Applying Word2Vec to a collection of abstracts from ArXiv articles.

In [3]:
# Read the experiment dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="./data/df_experiment")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,entry_id,updated,published,title,summary,primary_category,categories,authors
0,0,http://arxiv.org/abs/2305.11154v1,2023.42,2023.42,non linear operator valued elliptic flows with...,differential equations on spaces of operators ...,ph,['mp'],"['jean bernard bru', 'nathan metraud']"
1,1,http://arxiv.org/abs/2305.11103v1,2023.42,2023.42,blockwise inversion and algorithms for inverti...,using the blockwise matrix inversion inversion...,na,"['na', 'na', 'mp']",['r thiru senthil']
2,2,http://arxiv.org/abs/2305.11054v1,2023.42,2023.42,ising systems measures on the sphere and zonoids,we give an interpretation of a class of discre...,ap,"['ap', 'mp', 'oc']","['andrea braides', 'antonin chambolle']"
3,3,http://arxiv.org/abs/2210.09458v2,2023.42,2022.83,mobility edge for levy matrices,levy matrices are symmetric random matrices wh...,pr,"['pr', 'mp']","['amol aggarwal', 'charles bordenave', 'patric..."
4,4,http://arxiv.org/abs/2205.08765v2,2023.42,2022.42,necessary and sufficient conditions for one di...,this paper deals with necessary and sufficient...,ca,"['ca', 'ft', 'mp']",['pavol quittner']


In [4]:
# 'Unnamed: 0' appears to be a leftover index column from an earlier export.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the column has already been removed.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# words can be accessed like so
# print(stopwords.words('english'))

## Tokenize the abstract by splitting on whitespaces
## and get rid of the occasional empty string.
def clear_empty(clean_string):
    """
        Split clean_string on single space characters and return the
        resulting pieces as a list, leaving out every empty piece
        (these appear when spaces are repeated, leading or trailing).
    """
    pieces = clean_string.split(" ")
    # filter(None, ...) keeps only truthy items; for strings that means non-empty.
    return list(filter(None, pieces))

def remove_stop(tokens, stop_words=None):
    """
        Return the tokens that are not stop words, in their original order.

        tokens     : iterable of string tokens.
        stop_words : optional iterable of words to remove. When omitted,
                     the NLTK English stop word list is used.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call. The previous version re-evaluated
    # stopwords.words('english') for every token and searched it as a list.
    stop_set = frozenset(stop_words)
    return [token for token in tokens if token not in stop_set]

In [6]:
# Tokenize each abstract by splitting on spaces, then drop English stop words.
# The earlier nltk.word_tokenize pass was removed: its result was overwritten
# by the whitespace split on the very next line, so it was never used.
df['summary_tokenized'] = df['summary'].apply(clear_empty).apply(remove_stop)

In [8]:
# Preview the first five rows, now including the summary_tokenized column.
df.head(n=5)

Unnamed: 0,entry_id,updated,published,title,summary,primary_category,categories,authors,summary_tokenized
0,http://arxiv.org/abs/2305.11154v1,2023.42,2023.42,non linear operator valued elliptic flows with...,differential equations on spaces of operators ...,ph,['mp'],"['jean bernard bru', 'nathan metraud']","[differential, equations, spaces, operators, l..."
1,http://arxiv.org/abs/2305.11103v1,2023.42,2023.42,blockwise inversion and algorithms for inverti...,using the blockwise matrix inversion inversion...,na,"['na', 'na', 'mp']",['r thiru senthil'],"[using, blockwise, matrix, inversion, inversio..."
2,http://arxiv.org/abs/2305.11054v1,2023.42,2023.42,ising systems measures on the sphere and zonoids,we give an interpretation of a class of discre...,ap,"['ap', 'mp', 'oc']","['andrea braides', 'antonin chambolle']","[give, interpretation, class, discrete, contin..."
3,http://arxiv.org/abs/2210.09458v2,2023.42,2022.83,mobility edge for levy matrices,levy matrices are symmetric random matrices wh...,pr,"['pr', 'mp']","['amol aggarwal', 'charles bordenave', 'patric...","[levy, matrices, symmetric, random, matrices, ..."
4,http://arxiv.org/abs/2205.08765v2,2023.42,2022.42,necessary and sufficient conditions for one di...,this paper deals with necessary and sufficient...,ca,"['ca', 'ft', 'mp']",['pavol quittner'],"[paper, deals, necessary, sufficient, conditio..."


In [7]:
## Euclidean (L2) norm of a vector
def norm(u):
    """
        Return the square root of the sum of the squared entries of u.
    """
    squared = np.asarray(u) ** 2
    return np.sqrt(squared.sum())

## Cosine similarity between two vectors
def cos_sim(u,v):
    """
        Return u.dot(v) divided by the product of the norms of u and v.
        When that product is not strictly positive (for example when
        either vector is all zeros), np.nan is returned instead.
    """
    scale = norm(u) * norm(v)
    # Guard first: a zero (or NaN) scale has no defined cosine.
    if not scale > 0:
        return np.nan
    return u.dot(v) / scale

#### Use a pretrained model from Gensim

We will fetch the Word2Vec model trained on part of the Google News dataset, covering approximately 3 million words and phrases. The pretrained models available through `gensim` are:

In [9]:
# ['fasttext-wiki-news-subwords-300',
#  'conceptnet-numberbatch-17-06-300',
#  'word2vec-ruscorpora-300',
#  'word2vec-google-news-300',
#  'glove-wiki-gigaword-50',
#  'glove-wiki-gigaword-100',
#  'glove-wiki-gigaword-200',
#  'glove-wiki-gigaword-300',
#  'glove-twitter-25',
#  'glove-twitter-50',
#  'glove-twitter-100',
#  'glove-twitter-200',
#  '__testing_word2vec-matrix-synopsis']

In [10]:
## Load the pretrained GoogleNews word2vec vectors from a local binary file.
## The file is not downloaded here; it must already exist at this path.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../GoogleNews-vectors-negative300.bin',
    binary=True,
)

If using word2vec, one needs to calculate the average vector for all words in every sentence/document and use cosine similarity between vectors:

In [11]:
# Vocabulary of the model as a set, so membership tests are constant time.
index2key_set = {*model.index_to_key}

def avg_feature_vector(sentence, model, num_features, index2key_set):
    """
        Average the vectors of the words in `sentence` that the model knows.

        sentence      : iterable of tokens (already tokenized).
        model         : object indexed as model[word] to obtain a word vector.
        num_features  : length of the float32 accumulator vector.
        index2key_set : collection of words for which model[word] is available.

        Returns the mean of the known words' vectors, or the all-zero
        accumulator when none of the words is in index2key_set.
    """
    known_words = [token for token in sentence if token in index2key_set]

    total = np.zeros((num_features,), dtype='float32')
    for token in known_words:
        total = total + model[token]

    # No known word: return the zero vector rather than dividing by zero.
    if not known_words:
        return total
    return total / len(known_words)

In [12]:
## Sanity check: the first abstract compared with itself
s1_afv, s2_afv = (
    avg_feature_vector(df['summary_tokenized'][row],
                       model=model,
                       num_features=300,
                       index2key_set=index2key_set)
    for row in (0, 0)
)
print(cos_sim(s1_afv, s2_afv))

0.9999999


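The above should be exactly 1. The small discrepancy is most likely rounding error: the averaged vectors are stored as `float32`, which carries only about seven significant decimal digits.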

In [13]:
## Sanity check: the first abstract compared with the second one
s1_afv, s2_afv = (
    avg_feature_vector(df['summary_tokenized'][row],
                       model=model,
                       num_features=300,
                       index2key_set=index2key_set)
    for row in (0, 1)
)
print(cos_sim(s1_afv, s2_afv))

0.79644567


In [15]:
"""
    Prints the top n most similar article titles from the dataframe
    to the input article by calculating their cosine similarity.
"""
def get_most_similar(model, num_features, index2key_set, article_index , n):
    """
        Print the titles of the n articles in the global dataframe `df`
        whose abstracts are most similar to that of the query article.

        Similarity is the cosine similarity between averaged word vectors
        of the 'summary_tokenized' column.

        model         : word-vector model, passed on to avg_feature_vector.
        num_features  : vector length, passed on to avg_feature_vector.
        index2key_set : model vocabulary, passed on to avg_feature_vector.
        article_index : row position of the query article in `df`.
        n             : maximum number of similar articles to print.

        Returns None; the ranking is written to standard output.
    """
    summaries = df['summary_tokenized']
    titles = df['title']

    # num_features is now honoured (it used to be ignored in favour of 300).
    s1_afv = avg_feature_vector(summaries.iloc[article_index],
                                model=model,
                                num_features=num_features,
                                index2key_set=index2key_set)

    cosine_sim_list = np.zeros(len(df))

    for i in range(len(df)):
        # Cosine similarity between the query and the i-th article
        s2_afv = avg_feature_vector(summaries.iloc[i],
                                    model=model,
                                    num_features=num_features,
                                    index2key_set=index2key_set)
        cosine_sim_list[i] = cos_sim(s1_afv, s2_afv)

    # Rank on a copy where undefined (NaN) scores and the query article are
    # pushed to the bottom. np.argsort places NaN last, so reversing the
    # order used to put NaN scores FIRST and report them as best matches.
    ranking_scores = np.where(np.isnan(cosine_sim_list), -np.inf, cosine_sim_list)
    # Exclude the query by index instead of assuming it is ranked first;
    # a duplicate abstract can tie with it.
    ranking_scores[article_index] = -np.inf

    order = np.argsort(ranking_scores)[::-1]
    x = order[np.isfinite(ranking_scores[order])][:n]

    print("The top", n, "articles most similar to the article \n\n", 
            article_index, ".", titles.iloc[article_index])
    print("-----------------------------------------------------\n")

    for rank, index in enumerate(x, start=1):
        print(rank, ".", "(", index , ")", titles.iloc[index], 
          ", Cosine Similiarity=", np.round(cosine_sim_list[index], 3))
        print()

Let's compute the cosine similarity of some randomly selected articles from the dataset to all of the other articles in the dataset.

In [16]:
# Report the n most similar articles for three query articles,
# printing a separator line between consecutive reports.
n = 5

for article_index in (0, 132, 3852):
    # The separator goes between reports, so skip it before the first one.
    if article_index != 0:
        print("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")

    get_most_similar(model=model,
                     num_features=300,
                     index2key_set=index2key_set,
                     article_index=article_index,
                     n=n)

The top 5 articles most similar to the article 

 0 . non linear operator valued elliptic flows with application to quantum field theory
-----------------------------------------------------

1 . ( 158 ) bootstrap approach to  dimensional integrable quantum field theories the case of the sinh gordon model , Cosine Similiarity= 0.921

2 . ( 939 ) equilibrium states for the massive sine gordon theory in the lorentzian signature , Cosine Similiarity= 0.92

3 . ( 175 ) from lindblad master equations to langevin dynamics and back , Cosine Similiarity= 0.919

4 . ( 589 ) measures on compact riemannian manifolds , Cosine Similiarity= 0.916

5 . ( 370 ) the teaching from entanglement d su antiferromagnet to valence bond solid deconfined quantum critical points are not conformal , Cosine Similiarity= 0.915


+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

The top 5 articles most similar to the article 

 132 . quantum operations on conformal nets
---------

### Pending questions:

1. Should we train our own Word2Vec model?
2. Improve tokenizing?
3. Should we try different similarity measures?