In [19]:
import gensim
import logging
import os
from gensim.models.doc2vec import TaggedDocument

# read data from all file 
PATH = '../Data'
# print log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# # examples
# documents = []
# sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'doc2vec'],
#              ['this', 'is', 'the', 'second', 'sentence'],
#              ['yet', 'another', 'sentence'],
#              ['one', 'more', 'sentence'],
#              ['and', 'the', 'final', 'sentence']]
# counter = 0;
# for line in sentences:
#     documents.append(TaggedDocument(line, str(counter)))
#     counter+=1;

In [None]:
# read all files 
def readallfile(filepath):
    documents = []
    with open(filepath, 'r', encoding = 'utf8') as f:
        for line in f:
            items = line.split("\t")
            # word[0] is paper ID, word[1] is title content, word[2] is abstract content
            paperID = items[0]
            title = items[1].lower().strip().split(" ")
            abstract = items[2].lower().strip().split(" ")
            documents.append(TaggedDocument(title+abstract, [paperID]))
    f.close()
    print("Done loading files")
    return documents

documents = readallfile(PATH+"/id_title_abstract_processed.txt")


In [24]:
print(len(documents))
print(documents[:2])

46
[TaggedDocument(words=['metal', 'substitutions', 'incarbonic', 'anhydrase:', 'a', 'halide', 'ion', 'probe', 'study', ''], tags=['3']), TaggedDocument(words=['purification', 'and', 'properties', 'of', 'escherichia', 'coli', 'dihydrofolate', 'reductase', 'dihydrofolate', 'reductase', 'has', 'been', 'purified', '40-fold', 'to', 'apparent', 'homogeneity', 'from', 'a', 'trimethoprim-resistant', 'strain', 'of', 'escherichia', 'coli', '(rt', '500)', 'using', 'a', 'procedure', 'that', 'includes', 'methotrexate', 'affinity', 'column', 'chromatography.', 'determinations', 'of', 'the', 'molecular', 'weight', 'of', 'the', 'enzyme', 'based', 'on', 'its', 'amino', 'acid', 'composition,', 'sedimentation', 'velocity,', 'and', 'sodium', 'dodecyl', 'sulfate', 'gel', 'electrophoresis', 'gave', 'values', 'of', '17680,', '17470', 'and', '18300,', 'respectively.', 'an', 'aggregated', 'form', 'of', 'the', 'enzyme', 'with', 'a', 'low', 'specific', 'activity', 'can', 'be', 'separated', 'from', 'the', 'monom

In [25]:
# save the model
def train_and_save_d2v_model(document):
    # train model
    # size is number of vector return, alpha is learning rate, sample is number of sample want to remove
    model = gensim.models.Doc2Vec(document, min_count=5,vector_size=100, epochs=10, workers=8, window=5, sample=1e-3, negative=5)
    # save model
    newfileDir = "../models/doc2v"
    if not os.path.exists(newfileDir):
        os.makedirs(newfileDir)
    print("Saving model")
    model.save(newfileDir+"/model")
    print("Done")
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    
train_and_save_d2v_model(documents)

2018-02-15 19:12:57,568 : INFO : collecting all words and their counts
2018-02-15 19:12:57,570 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-02-15 19:12:57,576 : INFO : collected 1609 word types and 46 unique tags from a corpus of 46 examples and 3976 words
2018-02-15 19:12:57,579 : INFO : Loading a fresh vocabulary
2018-02-15 19:12:57,583 : INFO : min_count=5 retains 113 unique words (7% of original 1609, drops 1496)
2018-02-15 19:12:57,585 : INFO : min_count=5 leaves 1898 word corpus (47% of original 3976, drops 2078)
2018-02-15 19:12:57,589 : INFO : deleting the raw counts dictionary of 1609 items
2018-02-15 19:12:57,592 : INFO : sample=0.001 downsamples 113 most-common words
2018-02-15 19:12:57,594 : INFO : downsampling leaves estimated 750 word corpus (39.6% of prior 1898)
2018-02-15 19:12:57,597 : INFO : estimated required memory for 113 words and 100 dimensions: 174500 bytes
2018-02-15 19:12:57,599 : INFO : resetting layer weights
2018-02-1

2018-02-15 19:12:57,901 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-02-15 19:12:57,906 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-02-15 19:12:57,908 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-02-15 19:12:57,911 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-02-15 19:12:57,913 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-02-15 19:12:57,915 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-02-15 19:12:57,917 : INFO : EPOCH - 9 : training on 3976 raw words (807 effective words) took 0.0s, 28701 effective words/s
2018-02-15 19:12:57,926 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-02-15 19:12:57,933 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-02-15 19:12:57,940 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-02-15 19:12:57,942 : INFO : worker thread finis

Saving model
Done
