In [3]:
import pandas as pd
import numpy as np

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

from sklearn.metrics.pairwise import cosine_similarity

In [4]:
df_info = pd.read_csv('node_information.csv', header=None, names=['ID', 'Year', 'Title', 'Author', 'Journal', 'Abstract'])

# Doc2Vec

In [5]:
# Build a corpus
data = df_info['Title'] + ' ' + df_info['Abstract']

# Tokenization
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [6]:
tagged_data

[TaggedDocument(words=['compactification', 'geometry', 'and', 'duality', 'these', 'are', 'notes', 'based', 'on', 'lectures', 'given', 'at', 'tasi99', 'we', 'review', 'the', 'geometry', 'of', 'the', 'moduli', 'space', 'of', 'n', '2', 'theories', 'in', 'four', 'dimensions', 'from', 'the', 'point', 'of', 'view', 'of', 'superstring', 'compactification', 'the', 'cases', 'of', 'a', 'type', 'iia', 'or', 'type', 'iib', 'string', 'compactified', 'on', 'a', 'calabi-yau', 'threefold', 'and', 'the', 'heterotic', 'string', 'compactified', 'on', 'k3xt2', 'are', 'each', 'considered', 'in', 'detail', 'we', 'pay', 'specific', 'attention', 'to', 'the', 'differences', 'between', 'n', '2', 'theories', 'and', 'n', '2', 'theories', 'the', 'moduli', 'spaces', 'of', 'vector', 'multiplets', 'and', 'the', 'moduli', 'spaces', 'of', 'hypermultiplets', 'are', 'reviewed', 'in', 'the', 'case', 'of', 'hypermultiplets', 'this', 'review', 'is', 'limited', 'by', 'the', 'poor', 'state', 'of', 'our', 'current', 'understan

In [7]:
# Hyper-parameters
max_epochs = 50
vec_size = 50
alpha = 0.01

In [8]:
# Build a model
model = Doc2Vec(vector_size=vec_size,
                window=3,
                alpha=alpha, 
                min_alpha=0.0025,
                min_count=1,
                workers=16)

model.build_vocab(tagged_data)

In [9]:
# Training
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
Model Saved


In [11]:
# Most similar documents
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)

[('262', 0.6739646196365356), ('8372', 0.6453841924667358), ('11832', 0.640739917755127), ('27606', 0.6248440742492676), ('13399', 0.6222867965698242), ('27116', 0.6145443320274353), ('1092', 0.6130225658416748), ('339', 0.6098054647445679), ('9141', 0.6078057289123535), ('8635', 0.6076599359512329)]


In [12]:
# Get document vectors
model.docvecs[1]

array([-0.97916836, -1.5867622 ,  0.23070374, -0.2232663 , -0.6846714 ,
       -3.321913  ,  1.204285  , -0.36113656, -0.06387594,  4.599673  ,
       -0.5356246 ,  1.1699567 ,  0.99085104,  0.20373988, -0.49607956,
       -0.5858151 , -2.707298  ,  0.39857838,  0.5625839 ,  2.0380526 ,
       -0.83718395, -0.7024638 ,  0.60613894, -0.04145955,  2.1662066 ,
        2.6996667 , -0.28348523, -0.8617396 , -0.67849696, -0.21028812,
        0.433852  ,  1.4322782 ,  1.7432699 , -1.6563792 , -2.135546  ,
       -0.13407755, -0.30058935, -2.3381872 ,  2.7907238 ,  0.11397408,
        2.05126   , -2.298971  ,  2.5210521 ,  0.4806553 ,  2.287641  ,
       -1.9737315 , -0.15183257, -0.6696437 , -0.4521602 , -0.5281625 ],
      dtype=float32)

In [13]:
# Create a vector dataframe
df_vector = df_info.copy()
m_v = np.array([model.docvecs[i] for i in range(len(df_info))])
m_v = pd.DataFrame(m_v)
m_v.columns = ['v_' + str(x) for x in m_v.columns]

df_vector = pd.concat([df_vector[['ID']], m_v], axis=1)

In [14]:
# Save the vector
df_vector.to_csv('doc2vec.csv', index=None)

# Cosine Similarity

In [4]:
df_vector = pd.read_csv('doc2vec.csv')

In [13]:
# Generate a similarity matrix
sim = cosine_similarity(df_vector.iloc[:,1:])
df_sim = pd.DataFrame(sim, index=df_vector.iloc[:,0], columns=df_vector.iloc[:,0])