# Doc2Vec

In [1]:
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
from nltk.tokenize import word_tokenize

In [2]:
# define a list of documents.
data = ["This is the first document",
        "This is the second document",
        "This is the third document",
        "This is the fourth document"]

In [3]:
# preproces the documents, and create TaggedDocuments
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()),
                              tags=[str(i)]) for i,
               doc in enumerate(data)]

In [7]:
for i in tagged_data:
    print(i)

TaggedDocument<['this', 'is', 'the', 'first', 'document'], ['0']>
TaggedDocument<['this', 'is', 'the', 'second', 'document'], ['1']>
TaggedDocument<['this', 'is', 'the', 'third', 'document'], ['2']>
TaggedDocument<['this', 'is', 'the', 'fourth', 'document'], ['3']>


In [8]:
# train the Doc2vec model
model = Doc2Vec(vector_size=20,
                min_count=2, epochs=50)

In [9]:
model.build_vocab(tagged_data)
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

In [10]:
# get the document vectors
document_vectors = [model.infer_vector(
    word_tokenize(doc.lower())) for doc in data]

In [11]:
#  print the document vectors of 20 dimensions
for i, doc in enumerate(data):
    print("Document", i+1, ":", doc)
    print("Vector:", document_vectors[i])
    print()

Document 1 : This is the first document
Vector: [-0.01142612  0.02129321  0.00438275  0.01593259 -0.01025523 -0.01399397
  0.02363143 -0.00929494  0.00280987  0.01716064 -0.00214236 -0.00684456
  0.01837697 -0.01073344 -0.01869184  0.01852228 -0.01733654  0.0029314
  0.01985864  0.01297776]

Document 2 : This is the second document
Vector: [ 0.01930039  0.00430071 -0.01087804  0.01039556 -0.01064147 -0.01848278
  0.01715824  0.00135932  0.01734472 -0.00040147  0.0055463  -0.00043264
 -0.01124458 -0.00049775  0.00681238 -0.01269298  0.00231915  0.02265955
  0.02137966  0.01878252]

Document 3 : This is the third document
Vector: [-0.00010546 -0.00347492 -0.0167986  -0.02467314  0.01298489 -0.00870977
  0.02024623 -0.00800551  0.00320132  0.00953845 -0.01783402  0.02428336
  0.02148232  0.00426813  0.00995438  0.01787821 -0.00371053 -0.01987445
  0.01677905 -0.01133107]

Document 4 : This is the fourth document
Vector: [-0.02009611  0.00455549  0.00353938 -0.02240707 -0.00534018 -0.01515

In [25]:
model.docvecs.similarity("0","1")

  model.docvecs.similarity("0","1")


0.27298784