## Vector Representations (Doc2Vec) : Sentence(or list of sents) by Sentence Comparison

DOCUMENTATION 
https://radimrehurek.com/gensim/models/doc2vec.html

### Importing Packages

In [1]:
import numpy as np
import pandas as pd
from os import listdir

# --- NLTK PACKAGE ---
import nltk
# Tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, PunktSentenceTokenizer, RegexpTokenizer
# Stemming and Lemmatizing
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Stopwords
from nltk.corpus import stopwords, state_union, brown, movie_reviews, treebank

# --- GENSIM PACKAGE ---
import gensim, logging
from gensim.models import Word2Vec, doc2vec, Doc2Vec

### Loading Documents(DataSet)

In [6]:
## List of Full names of doc-files in the folder "Documents"

docLabels = []
docLabels = [file for file in listdir("Documents")]

In [7]:
docLabels

['Book_1', 'Book_3', 'Book_2']

In [8]:
data = []

for doc in docLabels:
    path = "Documents/" + doc
    data.append(open(path))

In [9]:
''' KEY--
    
    data        : Contains the list of all documents
    data[0]     : First string document, i.e Book 1
    labels_list : Contains the list of labels('Book_1','Book_2','Book_3')
    words       : List of all words in one document('word1', 'word2', ...., 'wordn')
    labels      : Assigned 'label_list[i]' to words[i]
'''

class DocIterator(object):
    
    ## Initailizes document's list(doc1,doc2...) and its label's list('Book_1','Book_2',...)
    def __init__(self, doc_list, labels_list):
        self.doc_list = doc_list
        self.labels_list = labels_list
    
    ## Assigns label1 to a list of all words in doc1, label2 to all words in doc2, etc.
    def __iter__(self):
        
        for idx, doc in enumerate(self.doc_list):
            yield doc2vec.LabeledSentence(words=doc.read().split(), tags=[self.labels_list[idx]])

### Model

In [10]:
'''The input to Doc2Vec is iteration of objects, and each object consists of two simple lists.
   The two simple lists are: List of all words of a doc & List of labels.
   'iter_docs' is an object that contains all docs, and
   it passes (each document, its doc-label) to to our class "DocIterator" to "yield" a list of those two lists.
'''

iter_docs = DocIterator(data, docLabels)

In [11]:
''' KEY--

    size      : is the dimensionality of the feature vectors = 100; 100 weights or features(w0,w1,w2......w99)
    window    : is the maximum distance between the current and predicted word within a sentence.
    min_count : times the word occurs in all sentences, set to min_count=1 (makes a vocab of all unique words)
    workers   :
    alpha     : 
'''

## Training our model with our input data
model = Doc2Vec(iter_docs, size=500, window=10, min_count=1, workers=11, alpha=0.025, min_alpha=0.025, dbow_words = 1, iter = 20)

##### Saving the model

In [12]:
model.save('doc2vec.model')

##### Loading the model

In [13]:
model_loaded = gensim.models.Doc2Vec.load('doc2vec.model')

### Various Methods

In [15]:
# Comparing it with other documents
print(model.docvecs.most_similar(['Book_1']))

[('Book_2', -0.004399042576551437), ('Book_3', -0.03813797980546951)]


In [16]:
# Most similar document
print(model.docvecs.most_similar(['Book_1'],topn=1)[0][0])

Book_2


In [17]:
# Comparing it with other words
model.most_similar('Azkaban')

[('after...', 0.18889042735099792),
 ('Okay,', 0.18032433092594147),
 ('Amazing!', 0.1711585968732834),
 ('inch-', 0.16916677355766296),
 ('"Scabbers', 0.1685291975736618),
 ('escorted', 0.16335710883140564),
 ('FIFTEEN', 0.156924307346344),
 ('many-armed', 0.15507975220680237),
 ('memorized', 0.15472283959388733),
 ('treating', 0.1536206752061844)]

In [18]:
gensim.matutils.cossim(data[0], data[1])

0.0

In [19]:
# Finding the similarity between a NEW sentence and docs

test = '''At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs.
Dursley on the cheek, and tried to kiss Dudley good-bye but missed,
because Dudley was now having a tantrum and throwing his cereal at the
walls.'''

In [20]:
words = word_tokenize(test)

In [21]:
new_doc_vec = model.infer_vector(words)

In [22]:
print(model.docvecs.most_similar([new_doc_vec]))

[('Book_2', 0.006374867632985115), ('Book_3', -0.022799234837293625), ('Book_1', -0.03998814895749092)]
