In [2]:
# Paragraph Vector (Doc2Vec) is an unsupervised algorithm that learns fixed-length feature representations
# from variable-length pieces of text, such as sentences, paragraphs, and documents.
# The algorithm represents each document by a dense vector which is trained to predict words in the document.

In [3]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec,TaggedDocument

In [5]:
# checking the training corpus

In [4]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [6]:
# now we will convert the tokenized documents into tagged-documents

In [12]:
# Wrap each tokenized text in a TaggedDocument, tagging it with its position in the corpus.
documents = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(common_texts)
]
documents

[TaggedDocument(words=['human', 'interface', 'computer'], tags=[0]),
 TaggedDocument(words=['survey', 'user', 'computer', 'system', 'response', 'time'], tags=[1]),
 TaggedDocument(words=['eps', 'user', 'interface', 'system'], tags=[2]),
 TaggedDocument(words=['system', 'human', 'system', 'eps'], tags=[3]),
 TaggedDocument(words=['user', 'response', 'time'], tags=[4]),
 TaggedDocument(words=['trees'], tags=[5]),
 TaggedDocument(words=['graph', 'trees'], tags=[6]),
 TaggedDocument(words=['graph', 'minors', 'trees'], tags=[7]),
 TaggedDocument(words=['graph', 'minors', 'survey'], tags=[8])]

In [13]:
# Doc2Vec expects a list of tokens as input for each document.

In [14]:
# now let's build and train basic Doc2Vec model 

In [15]:
# Create the model without a corpus so that constructing it does not already run training:
# passing `documents` to the constructor builds the vocabulary AND trains immediately, so the
# explicit model.train(...) call that follows would train the model a second time.
model = Doc2Vec(vector_size=5, min_count=1, workers=4, epochs=40)
# Scan the corpus once to build the vocabulary; this also sets model.corpus_count used by train().
model.build_vocab(documents)

In [16]:
# Train on the tagged corpus for the number of epochs configured on the model.
# NOTE(review): if the model was constructed with a corpus, gensim has already trained it once,
# so this call would be an additional training pass — confirm that is intended.
model.train(documents,total_examples = model.corpus_count, epochs=model.epochs)

In [17]:
# vector_size=5: each document will be represented by a vector of five floating-point values.
# min_count=1: only terms that occur at least min_count times are kept in the vocabulary.
# workers=4: number of threads used while training to speed up the process.

In [18]:
# Dimensionality of the vectors, as configured through vector_size.
model.vector_size

5

In [20]:
# Number of distinct words kept in the model's vocabulary.
len(model.wv.key_to_index)

12

In [21]:
# Mapping from each vocabulary word to its integer index.
model.wv.key_to_index

{'system': 0,
 'graph': 1,
 'trees': 2,
 'user': 3,
 'minors': 4,
 'eps': 5,
 'time': 6,
 'response': 7,
 'survey': 8,
 'computer': 9,
 'interface': 10,
 'human': 11}

In [23]:
# building a document vector for a new sentence

In [22]:
# Infer a vector for an unseen tokenized sentence with the trained model.
# 'for' is not among the 12 vocabulary words listed above; presumably gensim skips
# out-of-vocabulary tokens during inference — TODO confirm.
vector = model.infer_vector(['user','interface','for','computer'])
vector

array([ 0.00217576, -0.04548197, -0.03271691, -0.09795582, -0.06015328],
      dtype=float32)

In [28]:
model.corpus_count # number of documents the model was trained on

9

### Changing Vector size and min_count

In [26]:
# let us build a Doc2Vec model with a vector_size of 50 and the min_count parameter set to 3

In [29]:
# Build the model in three explicit steps so the corpus is trained on exactly once.
# Passing `documents` to the constructor would already train the model, and the explicit
# train() call below would then run a second, redundant training pass.
model = Doc2Vec(vector_size=50, min_count=3, epochs=40)
# Build the vocabulary (words occurring fewer than min_count times are dropped) and set corpus_count.
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [30]:
# Number of distinct words kept in the vocabulary after applying min_count.
len(model.wv.key_to_index)

4

In [32]:
model.wv.key_to_index # only words that occur at least 3 times in the whole corpus are kept

{'system': 0, 'graph': 1, 'trees': 2, 'user': 3}

In [33]:
# Infer a vector for the same unseen sentence with the min_count=3 model.
# Of these tokens only 'user' is in the 4-word vocabulary shown above.
vector = model.infer_vector(['user','interface','for','computer'])
print(vector)

[ 0.00045011 -0.00428057 -0.00265949 -0.00928813 -0.00611875  0.00621604
 -0.00886304 -0.00055536  0.0060769   0.00585221  0.00437172 -0.00416743
 -0.00106433  0.00950826  0.00108614  0.00951415  0.00881194  0.00165049
  0.00996538 -0.00823202 -0.00739136 -0.00375222  0.00100804  0.00483836
 -0.00078981 -0.00267532  0.00833382 -0.00716673 -0.00849993 -0.00057538
 -0.00774354  0.00789681 -0.00271049  0.01000315 -0.0011322   0.00955895
 -0.00095618  0.00871789 -0.00894057  0.00494313 -0.00680891 -0.00928201
  0.00455901  0.00222068 -0.00302778  0.00211217 -0.00469394 -0.00836468
 -0.0031305  -0.00807102]


In [36]:
# this is our paragraph vector
# the vector has 50 dimensions even though there are only 4 terms in the vocabulary

In [38]:
# there are two popular approaches to building paragraph vectors:
# PV-DM (distributed memory) and PV-DBOW (distributed bag of words)

### The dm parameter for switching between modeling approaches

In [40]:
# when dm=1, the distributed memory (PV-DM) approach is used
# when dm=0, the distributed bag of words (PV-DBOW) approach is used

In [41]:
# Distributed memory (dm=1) model, built in explicit steps so it is trained exactly once.
# Passing `documents` to the constructor would already train the model, and the explicit
# train() call below would then run a second, redundant training pass.
model = Doc2Vec(vector_size=50, min_count=2, epochs=40, dm=1)
# Build the vocabulary and set model.corpus_count before training.
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [42]:
# Infer a vector for the unseen sentence with the dm=1 model from the previous cell.
vector = model.infer_vector(['user','interface','for','computer'])
print(vector)

[-1.9349295e-04 -4.4075958e-03 -3.0497687e-03 -9.3083568e-03
 -6.3402513e-03  6.3978978e-03 -9.1088917e-03 -9.2215640e-05
  5.4598744e-03  6.1668139e-03  4.7003049e-03 -3.6173128e-03
 -8.3109201e-04  9.1717634e-03  1.3813858e-03  8.9835059e-03
  8.8255377e-03  1.9142296e-03  9.3122525e-03 -8.3465176e-03
 -7.7758809e-03 -3.7051956e-03  7.2509644e-04  5.0154338e-03
 -8.8647025e-04 -2.6348645e-03  7.9512224e-03 -7.8090215e-03
 -8.4188618e-03 -8.4397587e-04 -7.1763806e-03  8.3678029e-03
 -3.3651828e-03  9.8766657e-03 -1.1650656e-03  9.9916318e-03
 -1.0872122e-03  8.1198551e-03 -8.7413434e-03  4.8681051e-03
 -7.2452938e-03 -8.8497410e-03  4.3714978e-03  1.6152665e-03
 -3.3612733e-03  2.4591167e-03 -4.4178534e-03 -8.3774794e-03
 -2.7117245e-03 -8.2320301e-03]


In [43]:
# -------

In [45]:
# Distributed bag of words (dm=0) model, built in explicit steps so it is trained exactly once.
# Passing `documents` to the constructor would already train the model, and the explicit
# train() call below would then run a second, redundant training pass.
model = Doc2Vec(vector_size=50, min_count=2, epochs=40, dm=0)
# Build the vocabulary and set model.corpus_count before training.
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [46]:
# Infer a vector for the unseen sentence with the dm=0 model from the previous cell.
vector = model.infer_vector(['user','interface','for','computer'])
print(vector)

[-1.51261990e-03 -4.63405671e-03 -3.65272630e-03 -9.65321902e-03
 -6.80024642e-03  6.81753457e-03 -9.61384457e-03  9.70871712e-04
  4.80194576e-03  6.96728472e-03  5.10881562e-03 -2.65987264e-03
 -3.51611496e-04  8.70065391e-03  2.11652718e-03  8.22698232e-03
  8.14919453e-03  2.08939565e-03  8.67329352e-03 -8.13122466e-03
 -8.40026885e-03 -3.65634984e-03 -3.69181835e-05  5.06948819e-03
 -1.16028322e-03 -2.40938179e-03  7.49177812e-03 -8.96984804e-03
 -8.41754582e-03 -1.05946395e-03 -6.11592317e-03  9.35982727e-03
 -4.61108750e-03  1.02405492e-02 -1.38859160e-03  1.10740885e-02
 -1.57870154e-03  7.43725058e-03 -8.41030292e-03  4.65027709e-03
 -8.00203346e-03 -8.00907519e-03  4.03361348e-03  7.91331637e-04
 -4.42849845e-03  3.30409384e-03 -3.83381685e-03 -8.21210537e-03
 -2.02311412e-03 -8.53967294e-03]


In [49]:
# The distributed memory model takes word vectors into account and comes with two additional parameters,
# dm_concat and dm_mean.


In [52]:
# dm_concat=1 indicates to the algorithm that the context vectors should be concatenated while trying to predict the target word.
# This, of course, leads to building a larger model since multiple word embeddings get concatenated.

# The window size parameter controls the maximum distance between the word under consideration and the word to be predicted.
# The initial learning rate can be specified using the alpha parameter.
# min_alpha is the value the learning rate should drop to over the course of training.

In [51]:
# PV-DM model that concatenates the context vectors (dm_concat=1) with a custom learning-rate range.
# The model is created without a corpus so it is trained exactly once: passing `documents` to the
# constructor would already train it, and the explicit train() call below would then train it again.
model = Doc2Vec(
    vector_size=50,
    min_count=2,
    epochs=40,
    window=2,
    dm=1,
    dm_concat=1,
    alpha=0.3,
    min_alpha=0.05,
)
# Build the vocabulary and set model.corpus_count before training.
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [53]:
# Infer a vector for the unseen sentence with the dm_concat=1 model from the previous cell.
vector=model.infer_vector(['user','interface','for','computer'])
print(vector)


[ 4.50854525e-02 -2.06051767e-01 -1.15051866e-01  1.61393196e-04
  1.48151055e-01  7.69819841e-02  1.94396619e-02 -2.61510372e-01
  1.47296200e-02  1.72400221e-01 -1.03294447e-01 -1.41764833e-02
 -1.79328769e-01 -1.58353925e-01 -1.14882970e-02 -1.46243125e-01
  5.05606569e-02  7.99241960e-02 -3.40881161e-02 -2.03332707e-01
 -1.22323632e-04  1.61011219e-02 -9.09983665e-02  3.37552801e-02
 -2.39929393e-01 -8.70695561e-02 -2.62958497e-01 -2.02713106e-02
  6.73472807e-02 -1.70263305e-01  1.73107237e-01 -2.00216919e-02
  6.06580377e-02 -1.98573694e-01  9.53349099e-02 -1.42446518e-01
  5.88329472e-02 -1.05561122e-01 -1.49081983e-02  4.25416231e-02
  1.99918240e-01 -2.58556753e-02  2.30551079e-01 -2.17566103e-01
  4.46799025e-02 -7.29657933e-02  4.57035787e-02 -1.49397757e-02
  5.45404106e-03 -1.58229351e-01]


In [54]:
# the dm_mean parameter

In [55]:
# Two alternative approaches are to sum or average the context vectors instead of concatenating them

In [56]:
# When the dm_mean parameter is set to 1, the mean of the context word vectors is taken.
# The sum of the context word vectors is taken into account when dm_mean is set to 0.

In [None]:
# PV-DM model that averages the context word vectors (dm_concat=0, dm_mean=1).
# The model is created without a corpus so it is trained exactly once: passing `documents` to the
# constructor would already train it, and the explicit train() call below would then train it again.
model = Doc2Vec(
    vector_size=50,
    min_count=2,
    epochs=40,
    window=2,
    dm=1,
    dm_concat=0,
    dm_mean=1,
    alpha=0.3,
    min_alpha=0.05,
)
# Build the vocabulary and set model.corpus_count before training.
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)