In [437]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

### The data

The data consists of 356 documents (paragraphs) which have one or more events.
The data was collected from the BiographySampo.
The first training corpus wasn't lemmatized.
Currently used corpus is lemmatized.

In [430]:
with open('new_corpus.csv') as corpus:
    reader = corpus.readlines()
    mydata = list(reader)

Tagging data, creating a model and training it:

In [431]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(mydata)]

In [432]:
#so far the best results for test sentences are with this number of epochs and this vector size: 30 and 17
max_epochs = 30
vec_size = 17
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
model.build_vocab(tagged_data)

In [433]:
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
model.save("d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
Model Saved


### Documents for testing

As an example, such event as marriage was taken: 

1.	"Hän avioitui Pietarissa 2. toukokuuta 1892 venäläisen kenraalimajurin varakkaan perijättären Anastasia Arapovan kanssa."
2.	"Lönnrot meni naimisiin vuonna 1849, 47-vuotiaana, Maria Piponiuksen kanssa, joka oli oululaisen värjärimestarin Elias Piponiuksen tytär."
3.	"Tytär Karin solmi avioliiton ministeri Henrik Ramsayn kanssa ja toimi Suomen Punaisessa Ristissä." (from the corpus)

Test documents are not lemmatized.

Also short sentences (e.g. "Hän sai nimityksen kenraaliksi") were tested, but the results were sigificantly worse than for the long ones.

In [434]:
model= Doc2Vec.load("d2v.model")
test_data = word_tokenize("Hän sai nimityksen kenraaliksi".lower())
v1 = model.infer_vector(test_data)
#print("V1_infer", v1)

test_data_M = word_tokenize("Hän avioitui Pietarissa 2. toukokuuta 1892 venäläisen kenraalimajurin varakkaan perijättären Anastasia Arapovan kanssa.".lower())
v_m = model.infer_vector(test_data_M)

test_data_L = word_tokenize("Lönnrot meni naimisiin vuonna 1849 kappalaisen tyttären Maria Piponiuksen kanssa ja lähti opettamaan kirjallisuutta.".lower())
v_l = model.infer_vector(test_data_L)

test_data_N = word_tokenize("Tytär Karin solmi avioliiton ministeri Henrik Ramsayn kanssa ja toimi Suomen Punaisessa Ristissä.".lower())
v_n = model.infer_vector(test_data_N)

# to find most similar doc using tags
#similar_doc = model.docvecs.most_similar('2')
#print(similar_doc)

# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
#print(model.docvecs['1'])

similar_doc1 = model.docvecs.most_similar([v1])
similar_doc_v_m = model.docvecs.most_similar([v_m],topn=10)
similar_doc_v_l = model.docvecs.most_similar([v_l],topn=10)
similar_doc_v_n = model.docvecs.most_similar([v_n],topn=10)
#print(similar_doc1)

[('27', 0.6963678002357483), ('22', 0.68153977394104), ('76', 0.6719655394554138), ('25', 0.6616845726966858), ('21', 0.6556556820869446), ('157', 0.6308171153068542), ('198', 0.6302379965782166), ('23', 0.624057948589325), ('189', 0.6208353042602539), ('68', 0.6095075607299805)]
[('23', 0.13377512991428375), ('189', 0.08624720573425293), ('22', 0.08456331491470337), ('145', 0.05946550890803337), ('78', 0.05627712234854698), ('148', 0.03999495506286621), ('154', 0.02445557340979576), ('149', 0.023879773914813995), ('143', 0.017699720337986946), ('52', 0.014712880365550518)]
[('22', 0.6840628385543823), ('192', 0.6476923227310181), ('23', 0.6463348269462585), ('315', 0.480753093957901), ('189', 0.4704722464084625), ('21', 0.46617254614830017), ('266', 0.4640052020549774), ('25', 0.4380340576171875), ('137', 0.43303754925727844), ('20', 0.43024328351020813)]


The results are similar for the 1st and the 3rd test documents, but the results for the 2nd differ from them:

In [440]:
print(similar_doc_v_m)
print(similar_doc_v_l)
print(similar_doc_v_n)

[('27', 0.6963678002357483), ('22', 0.68153977394104), ('76', 0.6719655394554138), ('25', 0.6616845726966858), ('21', 0.6556556820869446), ('157', 0.6308171153068542), ('198', 0.6302379965782166), ('23', 0.624057948589325), ('189', 0.6208353042602539), ('68', 0.6095075607299805)]
[('23', 0.13377512991428375), ('189', 0.08624720573425293), ('22', 0.08456331491470337), ('145', 0.05946550890803337), ('78', 0.05627712234854698), ('148', 0.03999495506286621), ('154', 0.02445557340979576), ('149', 0.023879773914813995), ('143', 0.017699720337986946), ('52', 0.014712880365550518)]
[('22', 0.6840628385543823), ('192', 0.6476923227310181), ('23', 0.6463348269462585), ('315', 0.480753093957901), ('189', 0.4704722464084625), ('21', 0.46617254614830017), ('266', 0.4640052020549774), ('25', 0.4380340576171875), ('137', 0.43303754925727844), ('20', 0.43024328351020813)]


Let's look at the similarities for particular words:

In [439]:
print(model.docvecs.most_similar([model['avioitua']]))
print(model.docvecs.most_similar([model['naimisiin']]))
print(model.docvecs.most_similar([model['avioliitto']]))

[('21', 0.8781343698501587), ('23', 0.7849961519241333), ('27', 0.7636508345603943), ('25', 0.7487219572067261), ('22', 0.7202984094619751), ('192', 0.7149435877799988), ('170', 0.6967310309410095), ('76', 0.6512880325317383), ('189', 0.6428390145301819), ('94', 0.6211383938789368)]
[('350', 0.7831231355667114), ('23', 0.7823107242584229), ('180', 0.7546542882919312), ('316', 0.7542555332183838), ('189', 0.7463042736053467), ('22', 0.7123032808303833), ('145', 0.6880351305007935), ('299', 0.68767249584198), ('181', 0.6724562048912048), ('155', 0.6524287462234497)]
[('22', 0.900201141834259), ('23', 0.7920703291893005), ('192', 0.7810046076774597), ('21', 0.7547167539596558), ('27', 0.7032747864723206), ('146', 0.694972813129425), ('14', 0.6924290060997009), ('181', 0.6843852400779724), ('137', 0.6786996126174927), ('315', 0.6674879789352417)]


The problem seems to be with 'naimisiin'