In [7]:
import gensim
import logging
import os
import numpy as np
from gensim.models.doc2vec import TaggedDocument
from matplotlib import pyplot

# read data from all file 
PATH = '../Data'
# print log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
# read all files
def readallfile(filepath):
    documents = []
    with open(filepath, 'r', encoding = 'utf8') as f:
        for line in f:
            items = line.split("\t")
            # word[0] is paper ID, word[1] is title content, word[2] is abstract content
            title = items[1].lower().strip().split(" ")
            abstract = items[2].lower().strip().split(" ")
            paper = title;
            if len(abstract)>1:
                paper = np.concatenate((title, abstract), axis=0)
            documents.append(TaggedDocument(paper, items[0]))
    f.close()
    print("Done loading files")
    return documents

documents = readallfile(PATH+"/id_title_abstract_processed.txt")

Done loading files


In [5]:
documents[:2]

[TaggedDocument(words=['metal', 'substitutions', 'incarbonic', 'anhydrase:', 'a', 'halide', 'ion', 'probe', 'study'], tags='3'),
 TaggedDocument(words=array(['purification', 'and', 'properties', 'of', 'escherichia', 'coli',
       'dihydrofolate', 'reductase', 'dihydrofolate', 'reductase', 'has',
       'been', 'purified', '40-fold', 'to', 'apparent', 'homogeneity',
       'from', 'a', 'trimethoprim-resistant', 'strain', 'of',
       'escherichia', 'coli', '(rt', '500)', 'using', 'a', 'procedure',
       'that', 'includes', 'methotrexate', 'affinity', 'column',
       'chromatography.', 'determinations', 'of', 'the', 'molecular',
       'weight', 'of', 'the', 'enzyme', 'based', 'on', 'its', 'amino',
       'acid', 'composition,', 'sedimentation', 'velocity,', 'and',
       'sodium', 'dodecyl', 'sulfate', 'gel', 'electrophoresis', 'gave',
       'values', 'of', '17680,', '17470', 'and', '18300,',
       'respectively.', 'an', 'aggregated', 'form', 'of', 'the', 'enzyme',
       'with', '

In [None]:
# save the model
def train_and_save_d2v_model(document):
    # train model
    # size is number of vector return, alpha is learning rate, sample is number of sample want to remove
    model = gensim.models.Doc2Vec(document, min_count=5,vector_size=100, epochs=10, workers=8, window=5, sample=1e-3, negative=5)
    # save model
    print("Saving model")
    model.save("../models/doc2v/model")
    print("Done")
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    
train_and_save_d2v_model(documents)

2018-02-08 20:23:53,453 : INFO : collecting all words and their counts
2018-02-08 20:23:53,469 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-02-08 20:23:56,195 : INFO : PROGRESS: at example #10000, processed 931761 words (342027/s), 70809 word types, 10 tags
2018-02-08 20:23:58,640 : INFO : PROGRESS: at example #20000, processed 1721718 words (323284/s), 106219 word types, 10 tags
2018-02-08 20:24:00,879 : INFO : PROGRESS: at example #30000, processed 2461861 words (330880/s), 135522 word types, 10 tags
2018-02-08 20:24:02,974 : INFO : PROGRESS: at example #40000, processed 3264273 words (383175/s), 162197 word types, 10 tags
2018-02-08 20:24:05,857 : INFO : PROGRESS: at example #50000, processed 4004884 words (257050/s), 185104 word types, 10 tags
2018-02-08 20:24:08,522 : INFO : PROGRESS: at example #60000, processed 5021623 words (381804/s), 219856 word types, 10 tags
2018-02-08 20:24:13,069 : INFO : PROGRESS: at example #70000, processed 66955

2018-02-08 20:28:13,288 : INFO : PROGRESS: at example #650000, processed 83594404 words (257732/s), 1548839 word types, 10 tags
2018-02-08 20:28:18,994 : INFO : PROGRESS: at example #660000, processed 85249601 words (290370/s), 1574020 word types, 10 tags
2018-02-08 20:28:24,331 : INFO : PROGRESS: at example #670000, processed 87157935 words (357678/s), 1603073 word types, 10 tags
2018-02-08 20:28:29,324 : INFO : PROGRESS: at example #680000, processed 89070691 words (383215/s), 1633310 word types, 10 tags
2018-02-08 20:28:35,495 : INFO : PROGRESS: at example #690000, processed 90939663 words (303014/s), 1662275 word types, 10 tags
2018-02-08 20:28:41,561 : INFO : PROGRESS: at example #700000, processed 92837831 words (313040/s), 1692075 word types, 10 tags
2018-02-08 20:28:46,716 : INFO : PROGRESS: at example #710000, processed 94690444 words (359697/s), 1722287 word types, 10 tags
2018-02-08 20:28:52,196 : INFO : PROGRESS: at example #720000, processed 96541524 words (339209/s), 1751

2018-02-08 20:34:00,453 : INFO : PROGRESS: at example #1290000, processed 195577941 words (316756/s), 3253577 word types, 10 tags
2018-02-08 20:34:06,854 : INFO : PROGRESS: at example #1300000, processed 197445870 words (291917/s), 3279845 word types, 10 tags
2018-02-08 20:34:13,107 : INFO : PROGRESS: at example #1310000, processed 199377096 words (309325/s), 3307247 word types, 10 tags
2018-02-08 20:34:20,696 : INFO : PROGRESS: at example #1320000, processed 201328998 words (257256/s), 3334754 word types, 10 tags
2018-02-08 20:34:27,160 : INFO : PROGRESS: at example #1330000, processed 203258827 words (299145/s), 3361970 word types, 10 tags
2018-02-08 20:34:33,519 : INFO : PROGRESS: at example #1340000, processed 205193557 words (305090/s), 3388553 word types, 10 tags
2018-02-08 20:34:38,062 : INFO : PROGRESS: at example #1350000, processed 206600301 words (309816/s), 3409462 word types, 10 tags
2018-02-08 20:34:44,528 : INFO : PROGRESS: at example #1360000, processed 208548972 words 

2018-02-08 20:40:41,407 : INFO : PROGRESS: at example #1930000, processed 316915632 words (299397/s), 4933128 word types, 10 tags
2018-02-08 20:40:48,714 : INFO : PROGRESS: at example #1940000, processed 318858822 words (265989/s), 4962674 word types, 10 tags
2018-02-08 20:40:54,094 : INFO : PROGRESS: at example #1950000, processed 320791138 words (359335/s), 4991091 word types, 10 tags
2018-02-08 20:40:59,858 : INFO : PROGRESS: at example #1960000, processed 322705646 words (332423/s), 5019243 word types, 10 tags
2018-02-08 20:41:06,523 : INFO : PROGRESS: at example #1970000, processed 324646234 words (291938/s), 5048023 word types, 10 tags
2018-02-08 20:41:13,378 : INFO : PROGRESS: at example #1980000, processed 326580765 words (282833/s), 5076195 word types, 10 tags
2018-02-08 20:41:20,360 : INFO : PROGRESS: at example #1990000, processed 328505474 words (275760/s), 5103914 word types, 10 tags
2018-02-08 20:41:27,414 : INFO : PROGRESS: at example #2000000, processed 330432107 words 

2018-02-08 20:47:17,816 : INFO : PROGRESS: at example #2570000, processed 440909794 words (309953/s), 6729470 word types, 10 tags
2018-02-08 20:47:22,326 : INFO : collected 6748745 word types and 10 unique tags from a corpus of 2576835 examples and 442250772 words
2018-02-08 20:47:22,328 : INFO : Loading a fresh vocabulary
2018-02-08 20:47:36,432 : INFO : min_count=5 retains 988278 unique words (14% of original 6748745, drops 5760467)
2018-02-08 20:47:36,449 : INFO : min_count=5 leaves 434201937 word corpus (98% of original 442250772, drops 8048835)
2018-02-08 20:47:47,942 : INFO : deleting the raw counts dictionary of 6748745 items
2018-02-08 20:47:48,509 : INFO : sample=0.001 downsamples 24 most-common words
2018-02-08 20:47:48,511 : INFO : downsampling leaves estimated 346660436 word corpus (79.8% of prior 434201937)
2018-02-08 20:47:59,128 : INFO : estimated required memory for 988278 words and 100 dimensions: 1284767400 bytes
2018-02-08 20:47:59,130 : INFO : resetting layer weight

2018-02-08 20:49:50,714 : INFO : EPOCH 1 - PROGRESS: at 4.59% examples, 179098 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:49:51,744 : INFO : EPOCH 1 - PROGRESS: at 4.66% examples, 179167 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:49:52,861 : INFO : EPOCH 1 - PROGRESS: at 4.71% examples, 178868 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:49:53,884 : INFO : EPOCH 1 - PROGRESS: at 4.76% examples, 178929 words/s, in_qsize 16, out_qsize 0
2018-02-08 20:49:54,914 : INFO : EPOCH 1 - PROGRESS: at 4.81% examples, 178859 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:49:55,935 : INFO : EPOCH 1 - PROGRESS: at 4.87% examples, 179139 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:49:57,010 : INFO : EPOCH 1 - PROGRESS: at 4.93% examples, 179197 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:49:58,021 : INFO : EPOCH 1 - PROGRESS: at 4.99% examples, 179298 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:49:59,063 : INFO : EPOCH 1 - PROGRESS: at 5.05% examples, 179232 words/s, in_qsize

2018-02-08 20:51:06,598 : INFO : EPOCH 1 - PROGRESS: at 9.26% examples, 178055 words/s, in_qsize 14, out_qsize 1
2018-02-08 20:51:07,631 : INFO : EPOCH 1 - PROGRESS: at 9.34% examples, 178049 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:51:08,701 : INFO : EPOCH 1 - PROGRESS: at 9.41% examples, 177998 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:51:09,701 : INFO : EPOCH 1 - PROGRESS: at 9.48% examples, 178088 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:51:10,720 : INFO : EPOCH 1 - PROGRESS: at 9.55% examples, 178258 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:51:11,767 : INFO : EPOCH 1 - PROGRESS: at 9.62% examples, 178288 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:51:12,769 : INFO : EPOCH 1 - PROGRESS: at 9.69% examples, 178375 words/s, in_qsize 14, out_qsize 1
2018-02-08 20:51:13,858 : INFO : EPOCH 1 - PROGRESS: at 9.77% examples, 178414 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:51:14,868 : INFO : EPOCH 1 - PROGRESS: at 9.88% examples, 178053 words/s, in_qsize

2018-02-08 20:52:21,712 : INFO : EPOCH 1 - PROGRESS: at 16.48% examples, 177431 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:52:22,748 : INFO : EPOCH 1 - PROGRESS: at 16.54% examples, 177742 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:52:23,764 : INFO : EPOCH 1 - PROGRESS: at 16.60% examples, 177772 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:52:24,866 : INFO : EPOCH 1 - PROGRESS: at 16.66% examples, 177810 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:52:25,869 : INFO : EPOCH 1 - PROGRESS: at 16.72% examples, 177926 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:52:26,877 : INFO : EPOCH 1 - PROGRESS: at 16.77% examples, 177963 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:52:27,881 : INFO : EPOCH 1 - PROGRESS: at 16.82% examples, 178007 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:52:28,884 : INFO : EPOCH 1 - PROGRESS: at 16.88% examples, 178161 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:52:29,950 : INFO : EPOCH 1 - PROGRESS: at 16.93% examples, 178114 words/s,

2018-02-08 20:53:36,365 : INFO : EPOCH 1 - PROGRESS: at 20.51% examples, 182926 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:53:37,417 : INFO : EPOCH 1 - PROGRESS: at 20.57% examples, 183045 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:53:38,423 : INFO : EPOCH 1 - PROGRESS: at 20.62% examples, 183138 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:53:39,473 : INFO : EPOCH 1 - PROGRESS: at 20.68% examples, 183231 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:53:40,485 : INFO : EPOCH 1 - PROGRESS: at 20.74% examples, 183432 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:53:41,493 : INFO : EPOCH 1 - PROGRESS: at 20.79% examples, 183521 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:53:42,607 : INFO : EPOCH 1 - PROGRESS: at 20.85% examples, 183651 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:53:43,635 : INFO : EPOCH 1 - PROGRESS: at 20.91% examples, 183806 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:53:44,694 : INFO : EPOCH 1 - PROGRESS: at 20.97% examples, 183945 words/s,

2018-02-08 20:54:51,001 : INFO : EPOCH 1 - PROGRESS: at 24.41% examples, 185239 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:54:52,021 : INFO : EPOCH 1 - PROGRESS: at 24.58% examples, 185234 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:54:53,035 : INFO : EPOCH 1 - PROGRESS: at 24.65% examples, 185208 words/s, in_qsize 16, out_qsize 0
2018-02-08 20:54:54,040 : INFO : EPOCH 1 - PROGRESS: at 24.73% examples, 185251 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:54:55,061 : INFO : EPOCH 1 - PROGRESS: at 24.78% examples, 185249 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:54:56,099 : INFO : EPOCH 1 - PROGRESS: at 24.87% examples, 185217 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:54:57,176 : INFO : EPOCH 1 - PROGRESS: at 24.98% examples, 185141 words/s, in_qsize 16, out_qsize 0
2018-02-08 20:54:58,220 : INFO : EPOCH 1 - PROGRESS: at 25.03% examples, 185112 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:54:59,249 : INFO : EPOCH 1 - PROGRESS: at 25.08% examples, 185105 words/s,

2018-02-08 20:56:05,866 : INFO : EPOCH 1 - PROGRESS: at 28.48% examples, 187193 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:56:06,894 : INFO : EPOCH 1 - PROGRESS: at 28.53% examples, 187218 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:56:07,983 : INFO : EPOCH 1 - PROGRESS: at 28.59% examples, 187306 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:56:09,013 : INFO : EPOCH 1 - PROGRESS: at 28.65% examples, 187390 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:56:10,023 : INFO : EPOCH 1 - PROGRESS: at 28.71% examples, 187479 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:56:11,157 : INFO : EPOCH 1 - PROGRESS: at 28.76% examples, 187444 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:56:12,272 : INFO : EPOCH 1 - PROGRESS: at 28.81% examples, 187432 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:56:13,349 : INFO : EPOCH 1 - PROGRESS: at 28.87% examples, 187455 words/s, in_qsize 14, out_qsize 1
2018-02-08 20:56:14,349 : INFO : EPOCH 1 - PROGRESS: at 28.92% examples, 187510 words/s,

2018-02-08 20:57:20,895 : INFO : EPOCH 1 - PROGRESS: at 32.33% examples, 189252 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:57:21,920 : INFO : EPOCH 1 - PROGRESS: at 32.39% examples, 189368 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:57:22,927 : INFO : EPOCH 1 - PROGRESS: at 32.46% examples, 189473 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:57:23,927 : INFO : EPOCH 1 - PROGRESS: at 32.51% examples, 189533 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:57:25,037 : INFO : EPOCH 1 - PROGRESS: at 32.57% examples, 189567 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:57:26,094 : INFO : EPOCH 1 - PROGRESS: at 32.63% examples, 189651 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:57:27,371 : INFO : EPOCH 1 - PROGRESS: at 32.69% examples, 189576 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:57:28,380 : INFO : EPOCH 1 - PROGRESS: at 32.75% examples, 189530 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:57:29,503 : INFO : EPOCH 1 - PROGRESS: at 32.81% examples, 189446 words/s,

2018-02-08 20:58:36,674 : INFO : EPOCH 1 - PROGRESS: at 37.26% examples, 188643 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:58:37,700 : INFO : EPOCH 1 - PROGRESS: at 37.32% examples, 188674 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:58:38,774 : INFO : EPOCH 1 - PROGRESS: at 37.37% examples, 188704 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:58:39,808 : INFO : EPOCH 1 - PROGRESS: at 37.43% examples, 188732 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:58:40,826 : INFO : EPOCH 1 - PROGRESS: at 37.48% examples, 188779 words/s, in_qsize 16, out_qsize 0
2018-02-08 20:58:41,827 : INFO : EPOCH 1 - PROGRESS: at 37.53% examples, 188805 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:58:42,873 : INFO : EPOCH 1 - PROGRESS: at 37.59% examples, 188883 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:58:43,964 : INFO : EPOCH 1 - PROGRESS: at 37.65% examples, 188948 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:58:44,965 : INFO : EPOCH 1 - PROGRESS: at 37.71% examples, 189015 words/s,

2018-02-08 20:59:51,047 : INFO : EPOCH 1 - PROGRESS: at 41.29% examples, 190723 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:59:52,047 : INFO : EPOCH 1 - PROGRESS: at 41.33% examples, 190694 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:59:53,103 : INFO : EPOCH 1 - PROGRESS: at 41.38% examples, 190697 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:59:54,120 : INFO : EPOCH 1 - PROGRESS: at 41.44% examples, 190712 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:59:55,135 : INFO : EPOCH 1 - PROGRESS: at 41.49% examples, 190728 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:59:56,209 : INFO : EPOCH 1 - PROGRESS: at 41.55% examples, 190761 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:59:57,222 : INFO : EPOCH 1 - PROGRESS: at 41.60% examples, 190813 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:59:58,227 : INFO : EPOCH 1 - PROGRESS: at 41.66% examples, 190855 words/s, in_qsize 15, out_qsize 0
2018-02-08 20:59:59,235 : INFO : EPOCH 1 - PROGRESS: at 41.70% examples, 190873 words/s,

2018-02-08 21:01:06,055 : INFO : EPOCH 1 - PROGRESS: at 45.01% examples, 191805 words/s, in_qsize 15, out_qsize 0
2018-02-08 21:01:07,059 : INFO : EPOCH 1 - PROGRESS: at 45.05% examples, 191799 words/s, in_qsize 15, out_qsize 0
2018-02-08 21:01:08,070 : INFO : EPOCH 1 - PROGRESS: at 45.10% examples, 191824 words/s, in_qsize 15, out_qsize 0
2018-02-08 21:01:09,088 : INFO : EPOCH 1 - PROGRESS: at 45.16% examples, 191858 words/s, in_qsize 15, out_qsize 0
2018-02-08 21:01:10,131 : INFO : EPOCH 1 - PROGRESS: at 45.21% examples, 191863 words/s, in_qsize 15, out_qsize 0
2018-02-08 21:01:11,156 : INFO : EPOCH 1 - PROGRESS: at 45.26% examples, 191882 words/s, in_qsize 16, out_qsize 0
2018-02-08 21:01:12,199 : INFO : EPOCH 1 - PROGRESS: at 45.31% examples, 191856 words/s, in_qsize 15, out_qsize 0
2018-02-08 21:01:13,253 : INFO : EPOCH 1 - PROGRESS: at 45.36% examples, 191891 words/s, in_qsize 15, out_qsize 0
2018-02-08 21:01:14,264 : INFO : EPOCH 1 - PROGRESS: at 45.41% examples, 191926 words/s,

In [None]:
# load existing model
print("Loading model ........")
model = gensim.models.Word2Vec.load('./models/doc2v/model')
