# Doc2Vec Model

In [1]:
import logging
# Configure logging at INFO level; each record is rendered as "<time> : <level> : <message>"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Prepare the Training and Test Data

In [2]:
import os
import gensim
# Build the paths of the Lee train/test corpora under the test data directory of the gensim package
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file, lee_test_file = (
    os.path.join(test_data_dir, corpus_name)
    for corpus_name in ('lee_background.cor', 'lee.cor')
)

# Define a Function to Read and Preprocess Text

In [3]:
import smart_open

def read_corpus(fname, tokens_only=False):
    """Lazily yield one preprocessed document per line of ``fname``.

    The file is opened through ``smart_open`` and decoded as ISO-8859-1; every
    line is tokenized with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path of the corpus file; each line is treated as one document.
        tokens_only: When true, yield the bare token list. Otherwise yield a
            ``TaggedDocument`` whose single tag is the zero-based line number.

    Yields:
        A list of tokens, or a ``TaggedDocument`` wrapping that list.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as corpus_file:
        for line_no, raw_line in enumerate(corpus_file):
            words = gensim.utils.simple_preprocess(raw_line)
            # Training documents carry a tag; the line index serves as the document id
            yield words if tokens_only else gensim.models.doc2vec.TaggedDocument(words, [line_no])

# Read both corpora fully into memory: tagged documents for training, bare token lists for testing
train_corpus = [*read_corpus(lee_train_file)]
test_corpus = [*read_corpus(lee_test_file, tokens_only=True)]

In [4]:
# Peek at the first two tagged training documents
print(train_corpus[:2])

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

In [5]:
# Peek at the first two test documents (plain token lists, no tags)
print(test_corpus[:2])

[['the', 'national', 'executive', 'of', 'the', 'strife', 'torn', 'democrats', 'last', 'night', 'appointed', 'little', 'known', 'west', 'australian', 'senator', 'brian', 'greig', 'as', 'interim', 'leader', 'shock', 'move', 'likely', 'to', 'provoke', 'further', 'conflict', 'between', 'the', 'party', 'senators', 'and', 'its', 'organisation', 'in', 'move', 'to', 'reassert', 'control', 'over', 'the', 'party', 'seven', 'senators', 'the', 'national', 'executive', 'last', 'night', 'rejected', 'aden', 'ridgeway', 'bid', 'to', 'become', 'interim', 'leader', 'in', 'favour', 'of', 'senator', 'greig', 'supporter', 'of', 'deposed', 'leader', 'natasha', 'stott', 'despoja', 'and', 'an', 'outspoken', 'gay', 'rights', 'activist'], ['cash', 'strapped', 'financial', 'services', 'group', 'amp', 'has', 'shelved', 'million', 'plan', 'to', 'buy', 'shares', 'back', 'from', 'investors', 'and', 'will', 'raise', 'million', 'in', 'fresh', 'capital', 'after', 'profits', 'crashed', 'in', 'the', 'six', 'months', 'to'

# Training the Model

In [6]:
# Hyperparameters: 50-dimensional vectors, a minimum word count of 2, and 40 training epochs
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,
    min_count=2,
    epochs=40,
)

2023-04-04 14:23:59,700 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>', 'datetime': '2023-04-04T14:23:59.700019', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [7]:
# Collect word counts from the training corpus and build the model's vocabulary
model.build_vocab(train_corpus)

2023-04-04 14:24:08,973 : INFO : collecting all words and their counts
2023-04-04 14:24:08,975 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-04-04 14:24:09,001 : INFO : collected 6981 word types and 300 unique tags from a corpus of 300 examples and 58152 words
2023-04-04 14:24:09,003 : INFO : Creating a fresh vocabulary
2023-04-04 14:24:09,050 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 3955 unique words (56.65% of original 6981, drops 3026)', 'datetime': '2023-04-04T14:24:09.050643', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 14:24:09,052 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 55126 word corpus (94.80% of original 58152, drops 3026)', 'datetime': '2023-04-04T14:24:09.052629', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.191

In [8]:
# Look up the stored 'count' attribute of a single vocabulary word
print(f"Word 'penalty' appeared {model.wv.get_vecattr('penalty', 'count')} times in the training corpus.")

Word 'penalty' appeared 4 times in the training corpus.


In [9]:
# Train on the tagged corpus, reusing the document count and epoch setting already held by the model
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

2023-04-04 14:24:29,190 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 3955 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-04-04T14:24:29.190043', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-04-04 14:24:29,333 : INFO : EPOCH 0: training on 58152 raw words (42704 effective words) took 0.1s, 315828 effective words/s
2023-04-04 14:24:29,451 : INFO : EPOCH 1: training on 58152 raw words (42628 effective words) took 0.1s, 381048 effective words/s
2023-04-04 14:24:29,558 : INFO : EPOCH 2: training on 58152 raw words (42635 effective words) took 0.1s, 417747 effective words/s
2023-04-04 14:24:29,658 : INFO : EPOCH 3: training on 58152 raw words (42634 effective words) took 0.1s, 451664 effective words/s
2023-04-04 14:24:29,764 : INFO : EPOCH 4: training on 58152 raw words (42633 ef

In [10]:
# Infer a vector for a new, already-tokenized document and display it
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-1.8845938e-01 -2.6537803e-01 -5.5914130e-02  2.5016531e-01
 -1.7719235e-01 -4.7731183e-02  2.9580062e-02  6.0472861e-02
 -2.5424814e-01 -1.6215636e-01  1.4758591e-01 -6.8410158e-02
 -4.0138811e-03 -4.0330701e-02 -9.8202035e-02 -5.8546420e-02
  2.0185548e-01  2.4245872e-01  1.3970153e-01 -1.4516649e-01
  5.5887435e-02  5.0207246e-02  7.6410420e-02  5.5191953e-02
  9.9016510e-02 -2.3333257e-04 -1.5753821e-01  3.1018430e-02
 -1.3048492e-01  4.0233448e-02  4.2507154e-01  6.8206400e-02
  1.5263835e-01  7.6525342e-03  2.1054219e-01  3.7167244e-02
  4.5424066e-02 -2.3485456e-01 -1.0403177e-01 -1.6196141e-02
 -9.2698485e-03 -4.1289590e-02 -6.0131565e-02 -1.1096421e-01
  1.4979953e-01  1.6080467e-02 -5.1995084e-02 -1.1647032e-01
  9.1443807e-02 -5.7463502e-03]


# Assessing the Model

In [11]:
# For every training document, re-infer its vector and record where the document
# itself lands among all document vectors ranked by similarity.
ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    # Position of the document's own tag in the similarity ordering (0 = top match)
    rank = [tag for tag, _similarity in sims].index(doc_id)
    ranks.append(rank)

    # Keep the runner-up (tag, similarity) pair for later inspection
    second_ranks.append(sims[1])

In [12]:
import collections

# Tally how many documents ended up at each self-similarity rank
counter = collections.Counter()
counter.update(ranks)
print(counter)

Counter({0: 292, 1: 8})


In [13]:
# NOTE: `doc_id` and `sims` are not assigned in this cell. They hold whatever the last
# iteration of the ranking loop in an earlier cell left behind, so this report
# describes the final training document processed by that loop.
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# Show the training documents found at a few positions of the similarity ordering;
# each entry of `sims` is a (tag, similarity) pair and the tag indexes `train_corpus`.
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [14]:
# Pick a random document from the training corpus. No vector is inferred here:
# the lookup below reuses the results stored in `second_ranks` by the ranking loop.
import random
doc_id = random.randint(0, len(train_corpus) - 1)

# Print the chosen document next to its second-most-similar training document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]  # a (tag, similarity) pair, not a bare id
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (86): «argentina economy minister domingo cavallo is reported to have resigned in the face of mounting unrest over the country crumbling economy the reports in number of local media outlets could not be officially confirmed the news comes as police used teargas to disperse tens of thousands of people who had massed near the presidential palace in buenos aires and in other parts of the city to protest against the declaration of state of emergency it was declared after mounting popular discontent and widespread looting in the past few days with people over the state of the economy which has been in recession for four years»

Similar Document (223, 0.7362826466560364): «indonesian troop re enforcements have started arriving in central sulawesi as the government attempts to end days of deadly clashes between christians and muslims violence in the last week has claimed at least eight lives and left thousands of people homeless more than police and soldiers are being sent in t

# Testing the Model

In [15]:
# Pick a random document from the test corpus and infer a vector for it from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
# topn=len(model.dv) requests as many results as the model has document vectors,
# so `sims` is a complete ranking rather than a short top-N list
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Print the test document, then the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (2): «the united states government has said it wants to see president robert mugabe removed from power and that it is working with the zimbabwean opposition to bring about change of administration as scores of white farmers went into hiding to escape round up by zimbabwean police senior bush administration official called mr mugabe rule illegitimate and irrational and said that his re election as president in march was won through fraud walter kansteiner the assistant secretary of state for african affairs went on to blame mr mugabe policies for contributing to the threat of famine in zimbabwe»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST (94, 0.6363785266876221): «foreign minister alexander downer says the commonwealth democracy watchdog should put zimbabwe formally on its agenda in the first step to possible suspension from the organisation mr downer says ministers from the commonwealth ministerial action group cmag should review wheth