In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
import ast
import numpy as np

In [2]:
# Load the previously saved Doc2Vec model from disk (the file name suggests
# it was trained on Wikipedia text -- not verifiable from this notebook).
model_path = './data/wikiDoc2Vec.model'
model = Doc2Vec.load(model_path)

In [3]:
# Read the cleaned corpus. header=0 treats the file's first row as a header
# and replaces it with the explicit column names 'text' and 'label'.
dat = pd.read_csv(
    './data/cleaned_documents.csv',
    header=0,
    names=['text', 'label'],
)

In [4]:
# Each 'text' cell holds a Python literal stored as a string; parse it back
# into the object it represents (presumably a list of tokens -- TODO confirm).
vectors = list(map(ast.literal_eval, dat['text']))
# Topic label of every document, in the same row order as `vectors`.
labels = dat['label'].values

### Sanity check: manually evaluate the Doc2Vec model. We expect embeddings of documents on similar topics to be close to each other (high cosine similarity).

In [5]:
# Peek at the labels of the first two and the last two documents.
tuple(labels[idx] for idx in (0, 1, -2, -1))

('entertainment', 'entertainment', 'tech', 'tech')

In [6]:
# Infer embeddings for the first two and the last two documents so that their
# pairwise cosine similarities can be compared in the cells below.
#
# `epochs` is used instead of the former `steps` keyword: gensim deprecated
# `steps` in 3.5 and removed it in 4.0, where passing it raises a TypeError.
# `epochs=10` requests the same number of inference passes.
#
# NOTE(review): per the gensim docs, inference is randomised, so the exact
# similarity values may differ slightly from run to run.
v1 = model.infer_vector(vectors[0], epochs=10)   # entertainment
v2 = model.infer_vector(vectors[1], epochs=10)   # entertainment
v3 = model.infer_vector(vectors[-2], epochs=10)  # tech
v4 = model.infer_vector(vectors[-1], epochs=10)  # tech

In [7]:
# Cosine similarity between v1 and v2 (both 'entertainment').
v1.dot(v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

0.58765554

In [8]:
# Cosine similarity between v1 ('entertainment') and v3 ('tech').
v1.dot(v3) / (np.linalg.norm(v1) * np.linalg.norm(v3))

0.35680595

In [9]:
# Cosine similarity between v2 ('entertainment') and v3 ('tech').
v2.dot(v3) / (np.linalg.norm(v2) * np.linalg.norm(v3))

0.29931214

In [10]:
# Cosine similarity between v4 and v3 (both 'tech').
v4.dot(v3) / (np.linalg.norm(v4) * np.linalg.norm(v3))

0.55868834

### We see that embeddings of documents from the same topic have a cosine similarity of roughly 0.56–0.59, whereas embeddings of documents from different topics score lower (roughly 0.30–0.36). This matches our expectation that documents on similar topics lie closer together in the embedding space.


#### Inferring embeddings for all documents and saving them to disk.

In [11]:
# Infer one embedding per document, in corpus order, using the model's
# default inference settings (no explicit epoch count is passed here).
embed = [model.infer_vector(tokens) for tokens in vectors]

In [12]:
# Attach the inferred embeddings to the frame as a new 'embedding' column
# (one entry per row, in the same order as `vectors`).
dat['embedding'] = embed

In [13]:
# Persist the texts, labels and embeddings for later use. The DataFrame index
# is written as the first column because `index` is left at its default.
# NOTE(review): each embedding is presumably written as the array's string
# form rather than a parseable list -- confirm downstream readers can load it.
output_path = './data/embedded_doc2vec_bbc.csv'
dat.to_csv(output_path)