In [1]:
import gensim
import codecs
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from collections import OrderedDict
import multiprocessing


from random import shuffle
import datetime

# Number of CPUs reported by the operating system.
cores = multiprocessing.cpu_count()

# Passed as `vector_size` when the Doc2Vec model is constructed further down,
# i.e. the dimensionality of the learned vectors (not a vocabulary size).
VOC_SIZE = 200

In [2]:
# Colab only: mount Google Drive so that paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Arxiv dataset

### Clean triplets (keep file names only, drop directory paths)

In [None]:
# Reduce every path of a triplet line to its last component (the text after
# the final '/'), so each stored triplet holds bare names instead of paths.
# Each stored entry is a complete, newline-terminated output line.
clean_arxiv_triplets = []
with codecs.open("data/arxiv_triplets.txt", encoding="utf-8") as fin:
    for line in fin:
        # split() with no argument tolerates repeated spaces and strips the
        # line ending, so the third id never carries a stray '\n' or '\r\n'.
        paths = line.split()
        if not paths:
            # Blank line: there is no triplet to unpack.
            continue
        idx1, idx2, idx3 = (path.split('/')[-1] for path in paths)
        # Add the terminator explicitly so that the last triplet is also
        # newline-terminated when the input file lacks a trailing newline.
        clean_arxiv_triplets.append(f'{idx1} {idx2} {idx3}\n')

In [None]:
# Dump the cleaned triplets to disk in list order. Entries are written exactly
# as stored; no separator is inserted between them.
with codecs.open("data/clean_arxiv_triplets.txt", "w") as fout:
    fout.writelines(clean_arxiv_triplets)

### Doc2Vec


In [4]:
def docs_iterator(filename, start_from=0):
    """Stream a UTF-8 text file as one ``TaggedDocument`` per line.

    Each line is split on whitespace. The first ``start_from`` tokens are
    dropped and the remaining tokens become the document's words. The
    document carries a single tag: the zero-based number of its line.

    Args:
        filename: Path of the text file to read.
        start_from: Number of leading tokens to skip on every line.

    Yields:
        TaggedDocument: one document per line, in file order.
    """
    with codecs.open(filename, encoding='utf-8') as corpus:
        for tag, raw_line in enumerate(corpus):
            words = gensim.utils.to_unicode(raw_line).split()[start_from:]
            yield TaggedDocument(words, [tag])

In [None]:
class MyCorpus_train:
    """Re-iterable view over the training corpus file.

    Every call to ``__iter__`` reopens the file named by the module-level
    ``train_filename``, so the corpus can be traversed more than once.
    """

    def __iter__(self):
        """Yield one ``TaggedDocument`` per line, tagged with its line number."""
        with codecs.open(train_filename, encoding='utf-8') as corpus:
            for tag, raw_line in enumerate(corpus):
                # All whitespace-separated tokens of the line are kept.
                words = gensim.utils.to_unicode(raw_line).split()
                yield TaggedDocument(words, [tag])

### train vectors

In [None]:
# Training corpus file; read by MyCorpus_train and by the build_vocab call.
train_filename = 'data/arxiv/arxiv_plain.txt'

In [None]:
# dm=0 selects the PV-DBOW training algorithm; negative sampling is on
# (negative=5), hierarchical softmax is off (hs=0) and min_count=0 keeps
# every token regardless of frequency.
# The worker count follows the CPU count detected in the setup cell instead
# of a fixed 8 threads, so the model adapts to the machine it runs on.
dbow = Doc2Vec(dm=0, vector_size=VOC_SIZE, negative=5, hs=0, min_count=0, workers=cores)
# Vocabulary scan only; the training passes run in later cells.
dbow.build_vocab(docs_iterator(train_filename))

In [None]:
# Checkpoint number 0: in notebook order this follows build_vocab and
# precedes the training loops.
dbow.save('train_doc2vec/train0.doc2vec')

In [None]:
# Replace the in-memory model with the checkpoint named train5 (the training
# loops save one file per pass as train{pass number}.doc2vec).
dbow = Doc2Vec.load('train_doc2vec/train5.doc2vec')

In [None]:
# Use this cell when resuming training from an intermediate pass.
alpha = 0.025
min_alpha = 0.0001
passes = 15
# Number of passes already completed (matches the train5 checkpoint).
resume_from = 5

# Step by which alpha is lowered after every pass.
alpha_delta = (alpha - min_alpha) / passes
# Fast-forward alpha past the decay of the passes that were already run.
alpha -= resume_from * alpha_delta

for epoch in range(resume_from, passes):
    step = epoch + 1
    # Both bounds get the same value; presumably this keeps the rate constant
    # inside a single train() call -- confirm against the gensim version used.
    dbow.alpha = dbow.min_alpha = alpha
    dbow.train(MyCorpus_train(), total_examples=dbow.corpus_count, epochs=10)
    print('completed pass %i at alpha %f' % (step, alpha))
    # One checkpoint per pass, numbered from 1.
    dbow.save(f'train_doc2vec/train{step}.doc2vec')
    print('saved step')
    alpha -= alpha_delta

In [None]:
# Manual learning-rate schedule: alpha starts at 0.025 and is lowered by
# alpha_delta after each of the `passes` passes.
alpha = 0.025
min_alpha = 0.0001
passes = 15
alpha_delta = (alpha - min_alpha) / passes

for epoch in range(passes):
    step = epoch + 1
    # Both bounds get the same value; presumably this keeps the rate constant
    # inside a single train() call -- confirm against the gensim version used.
    dbow.alpha = dbow.min_alpha = alpha
    dbow.train(MyCorpus_train(), total_examples=dbow.corpus_count, epochs=10)
    print('completed pass %i at alpha %f' % (step, alpha))
    # One checkpoint per pass, numbered from 1.
    dbow.save(f'train_doc2vec/train{step}.doc2vec')
    print('saved step')
    alpha -= alpha_delta

In [None]:
# Final checkpoint; the MIND section loads a file with this name.
dbow.save('train_doc2vec/train_total.doc2vec')

### test vectors

In [None]:
# Test corpus file; read by MyCorpus_test and when building test_id2tag.
test_filename = 'data/arxiv/test_arxiv_plain.txt'

In [None]:
class MyCorpus_test:
    """Re-iterable view over the test corpus file.

    Every call to ``__iter__`` reopens the file named by the module-level
    ``test_filename``, so the corpus can be traversed more than once.
    """

    def __iter__(self):
        """Yield one ``TaggedDocument`` per line, tagged with its line number."""
        with codecs.open(test_filename, encoding='utf-8') as corpus:
            for tag, raw_line in enumerate(corpus):
                # All whitespace-separated tokens of the line are kept.
                words = gensim.utils.to_unicode(raw_line).split()
                yield TaggedDocument(words, [tag])

In [None]:
# Map the first whitespace-separated token of every line (used as the
# document id when the embeddings are written) to that line's zero-based number.
with codecs.open(test_filename, encoding='utf-8') as fin:
    test_id2tag = {
        line.split()[0]: line_no
        for line_no, line in enumerate(fin)
    }

In [None]:
# Settings handed to infer_vector for every document.
infer_steps = 5
infer_alpha = 0.1
# Read the same file that test_id2tag was built from (test_filename), so that
# a vector's position in the list matches the line number stored in
# test_id2tag. start_from=1 drops the leading token of every line, which
# test_id2tag treats as the document id.
test_vectors = [dbow.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha)
                for doc in docs_iterator(test_filename, start_from=1)]

In [None]:
# One output line per document: 'd-' + id, then the vector components,
# all separated by single spaces.
# The file is written as UTF-8 to match the encoding the ids were read with.
with codecs.open('data/arxiv/embeddings_doc2vec.txt', 'w', encoding='utf-8') as fout:
    # `doc_id` rather than `id`, so the builtin id() is not overwritten in the
    # notebook namespace.
    for doc_id, tag in test_id2tag.items():
        # `tag` is the document's line number, i.e. its index in test_vectors.
        components = ' '.join(map(str, test_vectors[tag]))
        fout.write('d-' + doc_id + ' ' + components + '\n')

# MIND dataset

In [None]:
# We reuse the Doc2Vec model already trained on the arXiv dataset, because the dataset is extremely large - 25 GB

In [3]:
# Local run (disabled): load the model from a path relative to the notebook.
#dbow = Doc2Vec.load('Doc2Vec_model_trained_arxiv/train_total.doc2vec')

# Colab run: load the model from the mounted Google Drive.
dbow = Doc2Vec.load('/content/drive/MyDrive/diploma/train_doc2vec/train_total.doc2vec')

In [5]:
# Training texts; the cells below take each line's first token as the document id.
train_file = '/content/drive/MyDrive/diploma/texts_train.txt'

In [9]:
# Map the first whitespace-separated token of every line (used as the
# document id when the embeddings are written) to that line's zero-based number.
with codecs.open(train_file, encoding='utf-8') as fin:
    train_id2tag = {
        line.split()[0]: line_no
        for line_no, line in enumerate(fin)
    }

In [10]:
# Settings handed to infer_vector for every document.
infer_steps, infer_alpha = 5, 0.1

# One inferred vector per line of train_file, in file order. start_from=1
# drops the leading token of each line, which train_id2tag treats as the id.
train_vectors = [
    dbow.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha)
    for doc in docs_iterator(train_file, start_from=1)
]

In [15]:
# One output line per document: the id, then the vector components,
# all separated by single spaces.
# The file is written as UTF-8 to match the encoding the ids were read with.
with codecs.open('/content/drive/MyDrive/diploma/embeddings_doc2vec_train.txt', 'w', encoding='utf-8') as fout:
    # `doc_id` rather than `id`, so the builtin id() is not overwritten in the
    # notebook namespace.
    for doc_id, tag in train_id2tag.items():
        # `tag` is the document's line number, i.e. its index in train_vectors.
        components = ' '.join(map(str, train_vectors[tag]))
        fout.write(doc_id + ' ' + components + '\n')

In [16]:
# Test texts; the cells below take each line's first token as the document id.
test_file = '/content/drive/MyDrive/diploma/texts_test.txt'

In [17]:
# Map the first whitespace-separated token of every line (used as the
# document id when the embeddings are written) to that line's zero-based number.
with codecs.open(test_file, encoding='utf-8') as fin:
    test_id2tag = {
        line.split()[0]: line_no
        for line_no, line in enumerate(fin)
    }

In [18]:
# Settings handed to infer_vector for every document.
infer_steps, infer_alpha = 5, 0.1

# One inferred vector per line of test_file, in file order. start_from=1
# drops the leading token of each line, which test_id2tag treats as the id.
test_vectors = [
    dbow.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha)
    for doc in docs_iterator(test_file, start_from=1)
]

In [19]:
# One output line per document: the id, then the vector components,
# all separated by single spaces.
# The file is written as UTF-8 to match the encoding the ids were read with.
with codecs.open('/content/drive/MyDrive/diploma/embeddings_doc2vec_test.txt', 'w', encoding='utf-8') as fout:
    # `doc_id` rather than `id`, so the builtin id() is not overwritten in the
    # notebook namespace.
    for doc_id, tag in test_id2tag.items():
        # `tag` is the document's line number, i.e. its index in test_vectors.
        components = ' '.join(map(str, test_vectors[tag]))
        fout.write(doc_id + ' ' + components + '\n')