In [21]:
import gensim
import os
import collections
import smart_open
import random

In [22]:
# Set file names for train and test data.
# The Lee corpus files are looked up under the installed gensim package's
# test/test_data directory; os.path.join builds the paths portably.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [23]:
def read_corpus(fname, tokens_only = False):
    """Lazily yield one preprocessed document per line of ``fname``.

    The file is opened through ``smart_open`` with ISO-8859-1 encoding and
    every line is tokenized with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path of the corpus file; each line is treated as one document.
        tokens_only: If true, yield the bare token list for each line.
            Otherwise yield a ``TaggedDocument`` whose single tag is the
            zero-based line number.

    Yields:
        A token list or a ``TaggedDocument``, depending on ``tokens_only``.
    """
    with smart_open.smart_open(fname, encoding="iso-8859-1") as corpus_file:
        line_no = 0
        for raw_line in corpus_file:
            tokens = gensim.utils.simple_preprocess(raw_line)
            if not tokens_only:
                # Training documents carry their line number as the tag.
                yield gensim.models.doc2vec.TaggedDocument(tokens, [line_no])
            else:
                yield tokens
            line_no += 1

In [24]:
# Materialize both generators up front: tagged documents for training,
# bare token lists for testing.
train_corpus = list(read_corpus(lee_train_file, tokens_only=False))
test_corpus = list(read_corpus(lee_test_file, True))

In [25]:
# Peek at the first two tagged training documents.
train_corpus[:2]

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

## Training the model

In [26]:
# 50-dimensional document vectors, ignore words seen fewer than 2 times, and
# 10 training epochs. `vector_size` / `epochs` are the current parameter names;
# the old `size` / `iter` aliases are deprecated (removed in gensim 4), and
# passing `iter` together with `epochs` was redundant.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=10)



## Building a Vocabulary

In [27]:
# Build the model's vocabulary from the tagged training documents
# (must happen before training).
model.build_vocab(train_corpus)

In [28]:
# Train on the tagged training corpus, reusing the example count and epoch
# count stored on the model; the %time magic reports how long the call took.
%time model.train(train_corpus,total_examples=model.corpus_count,epochs=model.epochs)

Wall time: 672 ms


In [29]:
# Infer a vector for a new document, given as a list of tokens.
model.infer_vector(['only', 'you','can'])

array([-0.03689037,  0.01021045,  0.01466787,  0.0097922 , -0.01397554,
        0.00868028, -0.00362626,  0.00748147, -0.01187754, -0.02990837,
        0.00190234,  0.03037427,  0.00777273,  0.0067167 ,  0.01085766,
       -0.01098123,  0.01252583,  0.01644981,  0.01327741, -0.00938998,
       -0.00165449,  0.00621655,  0.03541192,  0.00948573,  0.0117279 ,
       -0.01229443,  0.00309216, -0.01403829, -0.00278904, -0.01362233,
        0.00546045, -0.01281703, -0.01406664,  0.02900545,  0.01792838,
        0.01111196,  0.00404645, -0.01726588,  0.00786457, -0.00399781,
       -0.01842173, -0.00124189, -0.02715289, -0.01373972, -0.01447477,
       -0.00300985, -0.01174488, -0.01154512, -0.00058045,  0.01800883],
      dtype=float32)

## Assessing the model

In [30]:
# Sanity check: re-infer a vector for every training document and record where
# the document itself lands when all trained document vectors are ranked by
# similarity to that inferred vector.
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    # Position of this document's own tag in the ranked (tag, similarity) list;
    # 0 means the document is most similar to itself. The comprehension variable
    # is named `sim` so it cannot shadow (or, under Python 2, overwrite) `sims`.
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    # Keep the runner-up (second entry of the ranking) for later inspection.
    second_ranks.append(sims[1])

In [31]:
# Tally how often each self-similarity rank occurs
# (rank 0 = the document was ranked most similar to itself).
collections.Counter(ranks)

Counter({0: 28,
         44: 3,
         5: 14,
         15: 8,
         11: 14,
         23: 4,
         1: 25,
         4: 19,
         10: 8,
         21: 3,
         13: 6,
         12: 6,
         7: 14,
         235: 1,
         8: 10,
         33: 2,
         60: 1,
         40: 3,
         90: 1,
         53: 2,
         3: 17,
         24: 5,
         65: 1,
         2: 19,
         25: 5,
         17: 3,
         47: 2,
         29: 2,
         86: 1,
         182: 1,
         9: 12,
         59: 1,
         19: 3,
         39: 1,
         14: 4,
         46: 2,
         45: 1,
         42: 2,
         81: 1,
         18: 3,
         84: 2,
         20: 2,
         49: 2,
         16: 5,
         41: 1,
         108: 1,
         67: 1,
         74: 1,
         68: 2,
         30: 2,
         52: 1,
         6: 4,
         79: 1,
         38: 1,
         58: 1,
         55: 1,
         76: 1,
         66: 1,
         26: 2,
         151: 1,
         148: 1,
         22: 2,
   

In [36]:
# NOTE(review): presumably relies on `doc_id` and `sims` left over from the
# last iteration of the assessment loop -- confirm when re-running from a fresh kernel.
query_text = ' '.join(train_corpus[doc_id].words)
print('Document ({}): <<{}>> \n'.format(doc_id, query_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' %model)
# Show the top, middle and bottom entries of the similarity ranking.
positions = [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims)-1)]
for label, index in positions:
    hit = sims[index]
    hit_text = ' '.join(train_corpus[hit[0]].words)
    print(u'%s %s: <<%s>> \n'% (label, hit, hit_text))

Document (299): <<australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not 

In [38]:
# Pick a random document from the training corpus.
# random.randint includes BOTH endpoints, so the upper bound must be len - 1;
# otherwise it can return len(train_corpus) and the lookups below raise IndexError.
doc_id = random.randint(0, len(train_corpus) - 1)
# Print it next to the runner-up (second-ranked) training document recorded
# for it in `second_ranks`.
print('Train Document ({}): <<{}>>\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: <<{}>>\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (146): <<the australian and south african sides for the first cricket test starting at the adelaide oval today are not expected to be finalised until just before the start of play australian captain steve waugh and his south african counterpart shaun pollock will decide on their lineups after an inspection of the pitch shortly before the start of play the match holds special significance for waugh and his twin brother mark who play their th test together steve waugh is not placing too much relevance on the milestone don want to read too much into it guess and then get too carried away but later on when we retire and look back on it it will be significant it nice for the family mum and dad all the sacrifices they made you know with us growing up and also our brothers so you know it nice for the family he said>>

Similar Document (104, 0.9986782073974609): <<australian cricket captain steve waugh has supported fast bowler brett lee after criticism of his intimidatory bowli

## Testing the model

In [39]:
# Pick a random document from the test corpus and infer a vector from the model.
# random.randint includes BOTH endpoints, so the upper bound must be len - 1;
# otherwise it can return len(test_corpus) and the lookup below raises IndexError.
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (21): «the federal government says changes announced today to the work for the dole scheme will benefit participants and taxpayers federal employment services minister mal brough says that from july those taking part in work for the dole will be able to perform extra hours to complete their mutual obligation more quickly to access training credits»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (83, 0.9854923486709595): «the opposition leader simon crean says child abuse scandal in brisbane has damaged the office of the governor general and its incumbent dr peter hollingworth child advocates have called on dr hollingworth to step down as governor general saying he did not do enough to prevent abuse of children in an anglican school when he was archbishop of brisbane mr crean says he is not calling on dr hollingworth to resign but he says there are still unanswered questions think it has tarnished the office of the governor general the fact