In [1]:
# import all classes of biosses_d2v

from biosses_d2v import PMCOASubsetCorpus, BIOSSESDataset, Doc2VecRunner



In [2]:
# Build the training corpus first; constructing it also downloads the data.
# To keep this demo small, only 10 articles from the '0-9A-B' package are
# used, and lemmatization is turned off.

corpus = PMCOASubsetCorpus(
    packages=["0-9A-B"],
    size=10,
    lemma=False,
)

In [3]:
# Preview what the corpus yields without dumping all of it: each item is a
# TaggedDocument whose token list runs to thousands of words, so printing
# every document in full floods the notebook output.

PREVIEW_CHARS = 500  # how many characters of the document's text form to show

for text in corpus:
    print(str(text)[:PREVIEW_CHARS])
    break  # one document is enough to show the format

TaggedDocument(['when', 'jean', 'todt', 'defeated', 'ari', 'vatanen', 'in', '2009', 'and', 'david', 'ward', 'in', '2013', 'for', 'the', 'presidency', 'of', 'the', 'fdration', 'internationale', 'de', 'lautomobile', '(fia)', 'in', 'campaign', 'races', 'that', 'involved', 'allegations', 'of', 'corruption', 'as', 'well', 'as', 'criticism', 'of', 'transparency', 'issues', 'and', 'malpractice,', 'the', 'fia', 'was', 'definitely', 'no', 'longer', 'merely', 'a', 'sporting', 'body.', 'instead,', 'at', 'the', 'end', 'of', 'todts', 'third', 'presidential', 'period,', 'it', 'had', 'through', 'its', 'arrangements', 'with', 'the', 'mighty', 'coalitions', 'like', 'the', 'united', 'nations', '(un', ')', 'and', 'collaboration', 'with', 'international', 'actors', 'from', 'environmental', 'groups', 'and', 'powerful', 'sponsors', 'of', 'race', 'series', '(', 'formula', 'e,', 'for', 'examplee', 'for', 'electric),', 'like', 'the', 'swiss-swedish', 'multinational', 'corporation', 'abb,', 'become', 'a', 'glob

In [4]:
# Set up the Doc2Vec runner on the corpus built above,
# using a dimensionality of 200 and a minimum count of 3.

runner = Doc2VecRunner(
    corpus,
    vector_size=200,
    min_count=3,
)

In [5]:
# Train the Doc2Vec model with logging switched on.
# With progress_per=1 the log below reports progress at every example.

model = runner.run(
    use_logger=True,
    progress_per=1,
)

2021-09-21 10:48:18,381 : INFO : collecting all words and their counts
2021-09-21 10:48:18,385 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-09-21 10:48:18,391 : INFO : PROGRESS: at example #1, processed 16022 words (3162916/s), 4330 word types, 0 tags
2021-09-21 10:48:18,394 : INFO : PROGRESS: at example #2, processed 23275 words (2607338/s), 5728 word types, 0 tags
2021-09-21 10:48:18,405 : INFO : PROGRESS: at example #3, processed 28323 words (1591848/s), 6901 word types, 0 tags
2021-09-21 10:48:18,410 : INFO : PROGRESS: at example #4, processed 32841 words (2471545/s), 7834 word types, 0 tags
2021-09-21 10:48:18,412 : INFO : PROGRESS: at example #5, processed 36526 words (1618826/s), 8429 word types, 0 tags
2021-09-21 10:48:18,417 : INFO : PROGRESS: at example #6, processed 42109 words (1622883/s), 9391 word types, 0 tags
2021-09-21 10:48:18,421 : INFO : PROGRESS: at example #7, processed 54667 words (3259454/s), 10821 word types, 0 tags
2021-

In [6]:
# Create the BIOSSES dataset object used for benchmarking below;
# constructing it also downloads the dataset.

biosses = BIOSSESDataset()

In [7]:
# With the model trained, we can finally benchmark it on BIOSSES.
# As the output shows, the correlation is really low, since the model was
# trained on only 10 documents.
# lemma=False matches the setting used when the training corpus was built.

p_corr, d2v_scores = biosses.benchmark_with_d2v(model, lemma=False)
print("- Doc2Vec Similarity Scores:")
print(d2v_scores)
print("- Pearson Correlation with Gold Standard Scores", p_corr)

- Doc2Vec Simiarlity Scores:
1      0.974930
2      0.990900
3      0.970816
4      0.974793
5      0.959298
         ...   
96     0.987446
97     0.949742
98     0.965572
99     0.958312
100    0.982795
Length: 100, dtype: float64
- Pearson Correlation with Gold Standard Scores 0.010694678931334696
