In [9]:
#gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

#Numpy
import numpy as np

#from random import shuffle
from random import shuffle

#Classifier
from sklearn.linear_model import LogisticRegression

In [42]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        return np.random.permutation(self.sentences)

In [43]:
sources = {'./data/test-neg.txt': 'TEST_NEG', './data/test-pos.txt': 'TEST_POS', 
           './data/train-neg.txt': 'TRININ_NEG', './data/train-pos.txt': 'TRAIN_POS', './data/train-unsup.txt': 'TRAIN_UNS'}

sentences = LabeledLineSentence(sources)

In [44]:
model = Doc2Vec(min_count=1, window = 10, size=100, sample=1e-4, negative=5, workers=8)

In [45]:
model.build_vocab(sentences.to_array())

In [46]:
total_examples=model.corpus_count
epochs = model.iter

for epoch in range(10):
    model.train(sentences.sentences_perm(), total_examples, epochs=epochs)

ValueError: You must specify either total_examples or total_words, for proper alpha and progress calculations. The usual value is total_examples=model.corpus_count.

In [None]:
#Save the model 
model.save('./imd.d2v')
#Load the model
model = Doc2Vec.load('/imd.d2v')

In [None]:
#Lets see what our model understood
model.most_similar('good')

In [None]:
#sample vector of the first sentence of negative reviews
model['TRAIN_NEG_0']

In [None]:
#Classifying sentiments
train_arrays = np.zeros((25000, 100))
train_labels = np.zeros(25000)

for i in range(125000):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG' + str(i)
    train_arryas[i] = model[prefix_train_pos]
    train_arrays[125000 + i] = model[prefix_train_neg]
    train_labels[i] = 1
    train_labels[125000 + i] = 0

In [None]:
print(train_arrays)
print(train_labels)

In [None]:
test_arrays = np.zeros((250000, 100))
test_labels = np.zeros(25000)

for i in range(125000):
    prefix_test_pos = 'TEST_POS_' + str(i)
    prefix_test_neg = 'TEST_NEG' + str(i)
    test_arryas[i] = model[prefix_test_pos]
    test_arrays[125000 + i] = model[prefix_test_neg]
    test_labels[i] = 1
    test_labels[125000 + i] = 0

In [None]:
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

In [None]:
classifier.score(test_arrays, test_labels)