In [61]:
# Pre-processing Notes
import json

# Split the source file into individual notes: each of the first 3000 lines of
# X.txt becomes the "body" of its own JSON file, 'Article <n>.note', numbered
# from 1.
with open('X.txt', 'r') as source:
    for note_number in range(1, 3001):
        line = source.readline()
        note_path = 'Article '+str(note_number)+'.note'
        with open(note_path, 'w') as note_file:
            json.dump({"body": line}, note_file)
print("Done.")

Done.


## Doc2Vec

In [42]:
# Training Doc2Vec
import json
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [44]:
# Extract the body of notes and preprocess the corpus
def read_corpus():
    """Load every note file and return its body as a list of tokens.

    Reads 'Article 1.note' through 'Article 3000.note' from the working
    directory, takes the 'body' field of each JSON note and passes it through
    ``gensim.utils.simple_preprocess``.

    Returns:
        list: one preprocessed token sequence per note, in note-number order.
    """
    def tokenize_note(note_number):
        # Parse a single note file and preprocess its body text.
        with open('Article '+str(note_number)+'.note', 'r') as note_file:
            note = json.load(note_file)
        return gensim.utils.simple_preprocess(note['body'])

    return [tokenize_note(note_number) for note_number in range(1, 3001)]


note_list = read_corpus()
print(len(note_list))

3000


In [4]:
# Wrap each token list in a TaggedDocument whose single tag is the note's
# position in note_list.
documents = []
for tag, tokens in enumerate(note_list):
    documents.append(TaggedDocument(tokens, [tag]))
print(documents[2])

# The model is only configured here; vocabulary building and training happen
# in the following cells.
model = Doc2Vec(
    vector_size=200,
    min_count=2,
    epochs=50,
)

TaggedDocument<['universal', 'feature', 'of', 'the', 'replication', 'of', 'positive', 'strand', 'rna', 'viruses', 'is', 'the', 'association', 'with', 'intracellular', 'membranes', 'carnation', 'italian', 'ringspot', 'virus', 'cirv', 'replication', 'in', 'plants', 'occurs', 'in', 'vesicles', 'derived', 'from', 'the', 'mitochondrial', 'outer', 'membrane', 'the', 'product', 'encoded', 'by', 'cirv', 'orf', 'is', 'required', 'for', 'targeting', 'the', 'virus', 'replication', 'complex', 'to', 'the', 'outer', 'mitochondrial', 'membrane', 'both', 'in', 'plant', 'and', 'yeast', 'cells', 'here', 'the', 'yeast', 'saccharomyces', 'cerevisiae', 'was', 'used', 'as', 'model', 'host', 'to', 'study', 'the', 'effect', 'of', 'cirv', 'on', 'cell', 'survival', 'and', 'death', 'it', 'was', 'shown', 'that', 'does', 'not', 'promote', 'cell', 'death', 'but', 'decreases', 'cell', 'growth', 'rate', 'in', 'addition', 'changed', 'the', 'nature', 'of', 'acetic', 'acid', 'induced', 'cell', 'death', 'in', 'yeast', 'b

In [5]:
# Build the model's vocabulary from the tagged documents
model.build_vocab(documents)

# Sanity check: look up the stored 'count' attribute for one known word.
replication_count = model.wv.get_vecattr('replication', 'count')
print(f"Word 'replication' appeared {replication_count} times in the training corpus.")

Word 'replication' appeared 39 times in the training corpus.


In [6]:
# Training the model
# Trains on the tagged documents, passing the model's own `corpus_count` and
# `epochs` attributes as the example count and number of passes.
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
print("Done.")

Done.


In [12]:
# Save the model
# The model is saved under the relative path "Trained_Embedding_Model"; later
# cells reload it from that same path.
# Unused alternative kept for reference (save to a gensim temp-file path):
#from gensim.test.utils import get_tmpfile
#fname = get_tmpfile("Trained_Embedding_Model")
model.save("Trained_Embedding_Model")

In [17]:
# Now the model can output vectors
# Reload the saved model and infer a vector for one document (index 3) as a
# quick check; `documents` must already exist from the earlier cell that
# builds the TaggedDocument list.
model = Doc2Vec.load("Trained_Embedding_Model")
vector = model.infer_vector(documents[3].words)
print(documents[3].words)
# Print only the shape rather than the full vector.
print(vector.shape)

['dichloropropane', 'dcp', 'and', 'dichloromethane', 'dcm', 'are', 'possible', 'causative', 'agents', 'associated', 'with', 'the', 'development', 'of', 'in', 'employees', 'working', 'in', 'printing', 'plant', 'in', 'osaka', 'japan', 'however', 'few', 'reports', 'have', 'demonstrated', 'an', 'association', 'between', 'these', 'agents', 'and', 'in', 'rodent', 'carcinogenicity', 'studies', 'moreover', 'the', 'combined', 'effects', 'of', 'these', 'compounds', 'have', 'not', 'been', 'fully', 'elucidated', 'in', 'the', 'present', 'study', 'we', 'evaluated', 'the', 'in', 'vivo', 'mutagenicity', 'of', 'dcp', 'and', 'dcm', 'alone', 'or', 'combined', 'in', 'the', 'livers', 'of', 'gpt', 'delta', 'rats', 'six', 'week', 'old', 'male', 'gpt', 'delta', 'rats', 'were', 'treated', 'with', 'dcp', 'dcm', 'or', 'dcp', 'dcm', 'by', 'oral', 'administration', 'for', 'weeks', 'at', 'the', 'dose', 'mgkg', 'body', 'weight', 'dcp', 'and', 'mgkg', 'body', 'weight', 'dcm', 'used', 'in', 'the', 'carcinogenesis', 's

In [63]:
# Save the embeddings of existing files
model = Doc2Vec.load("Trained_Embedding_Model")

# For every note file: load its JSON, attach an 'embeddings' field holding the
# inferred vector (each component stored as a string), and rewrite the file.
for note_number in range(1, 3001):
    note_path = 'Article '+str(note_number)+'.note'

    with open(note_path, 'r') as note_file:
        note = json.load(note_file)

    # documents is 0-based while the note files are numbered from 1.
    vector = model.infer_vector(documents[note_number - 1].words)
    note['embeddings'] = [component for component in vector.astype('str')]

    with open(note_path, 'w') as note_file:
        note_file.write(json.dumps(note))
print("Done.")

Done.


### Assessing the Model

In [8]:
# Assessing the model
# For each training document, re-infer its vector and record where the
# document's own tag lands in the full similarity ranking.
# NOTE: doc_id and sims are deliberately left in scope after the loop; a later
# cell prints results for the last document processed.
ranks, second_ranks = [], []
for doc_id, tagged_doc in enumerate(documents):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    ranked_ids = [tag for tag, _ in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    # Keep the second entry of the ranking as well.
    second_ranks.append(sims[1])

In [9]:
import collections

# Tally how often each rank value occurred. `ranks` is filled by the
# assessment loop; a rank of 0 means the document's own tag was the first
# entry of its `most_similar` result.
counter = collections.Counter(ranks)
print(counter)

Counter({0: 2998, 1: 2})


In [10]:
# Show the similarity results for a single document. `doc_id` and `sims` are
# not defined in this cell: they are the values left over from the last
# iteration of the assessment loop (assuming the cells were run in order).
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(documents[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# Sample the ranking at its first, second, middle and last positions.
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    # Each sims entry is indexed as (document tag, similarity); the tag is used to look up the words.
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(documents[sims[index][0]].words)))


Document (2999): «ge malignancies make up significant and growing segment of newly diagnosed cancers approximately of patients who have ge cancers die within years of diagnosis which means that effective treatments for these malignancies need to be found currently targeted therapies have minimal role in this disease group intensive study of the molecular biology of ge cancers is relatively new and ongoing venture but it has already led to significant increase in our understanding of these malignancies this understanding although still limited has the potential to enhance our ability to develop targeted therapies in conjunction with the ability to identify actionable gene mutations and perform genomic profiling to predict drug resistance several cell surface growth factor receptors have been found to play prominent role in ge cancer cell signaling this discovery has led to the approval of agents within the last few years trastuzumab an anti human epidermal growth factor receptor her mon

In [67]:
# Test the model
import random

# Pick a random document from the corpus and infer a vector from the model.
# Note: note_list is the same corpus the model was trained on, not a held-out
# test set.
# random.randint(0, 3000) includes both endpoints, so it could return 3000 and
# raise IndexError on the 3000-entry note_list. randrange excludes the upper
# bound and follows the actual corpus length.
doc_id = random.randrange(len(note_list))
inferred_vector = model.infer_vector(note_list[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(note_list[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    # Each sims entry is indexed as (document tag, similarity); the tag is used to look up the words.
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(documents[sims[index][0]].words)))


[ 1.0962925e+00 -9.6594751e-01  1.2834587e+00 -9.2835647e-01
  5.8798742e-01 -5.8278233e-01  1.1744876e+00  8.2435536e-01
 -2.4016068e+00 -9.8848581e-01  5.3110230e-01 -3.3889902e-01
 -4.5374417e-01  1.0891911e+00 -1.4333184e+00  1.1973546e+00
  5.2981710e-01  4.4473967e-01 -1.9779749e+00 -8.3651596e-01
 -3.4122637e-01 -1.5228660e-03 -7.0848715e-01 -1.2834401e+00
  3.8667649e-01 -8.8100207e-01 -4.6317659e-02 -5.8532572e-01
 -5.3139591e-01 -5.6672210e-01  1.1643296e+00 -1.5463480e+00
 -2.9876742e-01 -1.0017741e+00  3.0968571e-01  8.6258876e-01
 -2.5702608e-01  5.1764834e-01  2.7180862e-01 -2.9486269e-01
  3.1294107e-01 -5.2312344e-01  6.2677115e-01 -2.4262168e+00
  9.0401977e-01 -2.0781193e+00 -7.7862181e-02 -7.8902155e-01
 -7.2652549e-01  3.6717910e-01  1.1016653e+00 -7.3781401e-01
  1.1475338e+00 -7.0368791e-01  1.2269665e+00  6.2972665e-01
 -3.3473402e-01 -3.3673710e-01 -1.1270872e+00 -6.2429512e-01
  4.6336174e-01  1.4565576e+00 -1.4454970e-01 -1.5094818e-01
 -1.4071305e+00  9.08459

In [None]:
# Function to extract the notes linked to a given note.
# The original cell used a non-Python signature (`String`, `= { ... }`) and
# returned an undefined name, so it could not even be parsed. It is now a valid,
# typed stub that fails loudly until the lookup is implemented.
from typing import List


def get_linked_notes(current_note: str, existing_notes: List[str]) -> List[str]:
    """Return the paths of the existing notes that are linked to a note.

    Args:
        current_note: Path to the new note for which linked notes are wanted.
        existing_notes: Paths to all existing notes, among which relevant
            notes are searched.

    Returns:
        A list of paths to the linked notes.

    Raises:
        NotImplementedError: Always, for now; the search logic has not been
            written yet.
    """
    # TODO: implement the search for linked notes.
    raise NotImplementedError("get_linked_notes has not been implemented yet")

