In [9]:
# Pre-processing Notes
import json
import time

# Split the first 3000 lines of X.txt into one JSON ".note" file per line.
with open('X.txt', 'r') as f:
    for i in range(3000):
        content = f.readline()
        if not content:
            # readline() returns '' only at end of file (a blank line is '\n'),
            # so stop here instead of writing empty notes when X.txt is short.
            break

        title = 'Article ' + str(i + 1)
        schema = {
            "title": title,
            # "body" wraps the same text in an {"ops": [{"insert": ...}]} structure.
            "body": {"ops": [{'insert': content}]},
            "text": content,
            # Whole seconds since the epoch at the time the note is written.
            "updatedAt": int(time.time()),
        }
        with open(title + '.note', 'w') as f0:
            f0.write(json.dumps(schema))
print("Done.")

Done.


## Doc2Vec

In [23]:
# Training Doc2Vec
import json
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [25]:
# Extract the body of notes and preprocess the corpus
from os import listdir
from os.path import isfile, join

def is_note_file(path):
    """Return True if `path` is an existing regular file with a ``.note`` suffix.

    The suffix check requires the literal ``.note`` ending, so a file that is
    merely *named* ``note`` (no extension) is not mistaken for a note file.

    Args:
        path: File path (absolute, or relative to the working directory).

    Returns:
        bool: True for an existing file whose name ends in ``.note``,
        False for directories, missing paths and any other file name.
    """
    return isfile(path) and path.endswith('.note')

def read_corpus():
    """Load and tokenize the text of every ``.note`` file in the working directory.

    Each note is parsed as JSON and its ``'text'`` field is tokenized with
    ``gensim.utils.simple_preprocess``.

    Paths are processed in sorted order: ``listdir()`` returns entries in
    arbitrary order, and callers use the position in the returned list as the
    document tag, so sorting keeps tags stable between runs.

    Returns:
        list[list[str]]: One token list per note file, in sorted path order.
    """
    note_paths = sorted(entry for entry in listdir() if is_note_file(entry))

    note_list = []
    for path in note_paths:
        with open(path, 'r') as f0:
            body = json.load(f0)['text']

        # Preprocess the text
        note_list.append(gensim.utils.simple_preprocess(body))
    return note_list

# Build the tokenized corpus and report how many notes were read
note_list = read_corpus()
corpus_size = len(note_list)
print(corpus_size)

3036


In [26]:
# Tag every token list with its position in note_list
documents = [
    TaggedDocument(words=tokens, tags=[tag])
    for tag, tokens in enumerate(note_list)
]
print(documents[2])

# Untrained model; vocabulary and weights are filled in by later cells
model = Doc2Vec(
    vector_size=200,
    min_count=2,
    epochs=50,
)

TaggedDocument<['in', 'this', 'paper', 'novel', 'high', 'gain', 'op', 'amp', 'design', 'is', 'presented', 'this', 'proposed', 'circuit', 'also', 'exhibits', 'high', 'linearity', 'the', 'architecture', 'of', 'the', 'op', 'amp', 'presented', 'in', 'this', 'paper', 'is', 'based', 'on', 'cross', 'coupled', 'differential', 'pair', 'and', 'positive', 'feedback', 'the', 'proposed', 'op', 'amp', 'is', 'designed', 'in', 'mu', 'cmos', 'process', 'using', 'umc', 'nm', 'library', 'in', 'cadence', 'virtuoso', 'analog', 'design', 'environment', 'the', 'simulation', 'of', 'proposed', 'circuit', 'results', 'in', 'db', 'gain', 'mhz', 'ugb', 'and', 'degree', 'phase', 'margin', 'while', 'dissipating', 'mu', 'power', 'the', 'thd', 'analysis', 'of', 'the', 'circuit', 'shows', 'maximum', 'distortion', 'of', 'db', 'at', 'output', 'peak', 'to', 'peak', 'voltage', 'and', 'khz', 'frequency'], [2]>


In [28]:
# Building model's vocabulary
model.build_vocab(documents)

# Report the corpus frequency of one probe word. The same variable feeds both
# the lookup and the message so the printed word always matches the count.
probe_word = 'algorithm'
print(f"Word '{probe_word}' appeared {model.wv.get_vecattr(probe_word, 'count')} times in the training corpus.")

Word 'replication' appeared 164 times in the training corpus.


In [29]:
# Fit the model on the tagged documents, using the example count recorded
# on the model and the epoch count it was constructed with
model.train(
    documents,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
print("Done.")

Done.


In [30]:
# Persist the trained model under a relative file name
# (an earlier variant used gensim.test.utils.get_tmpfile for a temp path)
model_path = "Trained_Embedding_Model"
model.save(model_path)

In [31]:
# Reload the saved model and infer a vector for one of the tagged documents
model = Doc2Vec.load("Trained_Embedding_Model")
sample_words = documents[3].words
vector = model.infer_vector(sample_words)
print(sample_words)
print(vector.shape)

['objective', 'we', 'investigated', 'the', 'expression', 'of', 'annexin', 'anxa', 'in', 'human', 'cca', 'cell', 'line', 'and', 'its', 'effect', 'on', 'proliferation', 'migration', 'and', 'apoptosis', 'of', 'human', 'cca', 'cells', 'materials', 'and', 'methods', 'expression', 'of', 'anxa', 'was', 'detected', 'by', 'fluorescent', 'quantitative', 'reverse', 'transcriptase', 'polymerase', 'chain', 'reaction', 'qrt', 'pcr', 'and', 'western', 'blotting', 'method', 'in', 'human', 'cca', 'cell', 'lines', 'qbc', 'and', 'rbe', 'shrna', 'plasmids', 'for', 'anxa', 'silencing', 'anxa', 'sh', 'anxa', 'sh', 'anxa', 'sh', 'and', 'negative', 'control', 'plasmid', 'were', 'constructed', 'to', 'infect', 'qbc', 'cells', 'the', 'infection', 'efficiency', 'expression', 'of', 'anxa', 'apoptosis', 'and', 'cell', 'cycle', 'of', 'qbc', 'cell', 'were', 'measured', 'separately', 'results', 'the', 'expression', 'of', 'anxa', 'in', 'qbc', 'cell', 'was', 'significantly', 'higher', 'than', 'rbe', 'cell', 'expressed',

In [32]:
# Save the embeddings of existing files
model = Doc2Vec.load("Trained_Embedding_Model")

list_dir = listdir()
pth_list=[file for file in list_dir if is_note_file(file)]

for path in pth_list:
    with open(path, 'r') as f:
        content = json.loads(f.read())

    # Infer the vector from this note's own text, so each file stores the
    # embedding of its own content rather than that of an unrelated document.
    tokens = gensim.utils.simple_preprocess(content['text'])
    embedding = model.infer_vector(tokens)
    # Components are stored as strings so the vector is JSON-serializable.
    content['embeddings'] = list(embedding.astype('str'))

    # Rewrite the note in place with the added 'embeddings' field.
    with open(path, 'w') as f:
        f.write(json.dumps(content))
print("Done.")

Done.


### Assessing the Model

In [12]:
# Self-similarity check: re-infer each training document and record where its
# own tag lands in the similarity ranking (0 means it is its own best match)
ranks = []
second_ranks = []
for doc_id, document in enumerate(documents):
    inferred_vector = model.infer_vector(document.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    ranked_ids = [docid for docid, sim in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    # Keep the runner-up (tag, similarity) pair for later inspection
    second_ranks.append(sims[1])

In [13]:
import collections

# Tally how many documents landed at each self-similarity rank
counter = collections.Counter()
counter.update(ranks)
print(counter)

Counter({0: 3000})


In [14]:
# Show the last assessed document next to a few points of its similarity ranking
query_text = ' '.join(documents[doc_id].words)
print('Document ({}): «{}»\n'.format(doc_id, query_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

checkpoints = [
    ('MOST', 0),
    ('SECOND-MOST', 1),
    ('MEDIAN', len(sims)//2),
    ('LEAST', len(sims) - 1),
]
for label, index in checkpoints:
    match = sims[index]
    match_text = ' '.join(documents[match[0]].words)
    print(u'%s %s: «%s»\n' % (label, match, match_text))


Document (2999): «ge malignancies make up significant and growing segment of newly diagnosed cancers approximately of patients who have ge cancers die within years of diagnosis which means that effective treatments for these malignancies need to be found currently targeted therapies have minimal role in this disease group intensive study of the molecular biology of ge cancers is relatively new and ongoing venture but it has already led to significant increase in our understanding of these malignancies this understanding although still limited has the potential to enhance our ability to develop targeted therapies in conjunction with the ability to identify actionable gene mutations and perform genomic profiling to predict drug resistance several cell surface growth factor receptors have been found to play prominent role in ge cancer cell signaling this discovery has led to the approval of agents within the last few years trastuzumab an anti human epidermal growth factor receptor her mon

In [15]:
# Test the model
import random

# Pick a random document from the corpus and infer a vector from the model.
# randrange(len(note_list)) yields a valid index for any corpus size;
# randint's upper bound is inclusive, so a fixed bound can overshoot the list.
doc_id = random.randrange(len(note_list))
inferred_vector = model.infer_vector(note_list[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(note_list[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(documents[sims[index][0]].words)))


Test Document (1367): «the control of rhipicephalus microplus is essential to prevent cattle discomfort and economic losses however increased resistance and acaricides inefficiency lead producers to adopt strategies that could result in the accumulation of chemical residues in meat and milk with possibilities of poisoning in animals and people this scenario demonstrates the necessity of research into the identification of novel effective and environmentally safe therapeutic options for cattle tick control the objectives of this study were to develop and assess the efficacy of microplus biotherapic and of eugenol for the control of microplus in artificially infested calves eighteen male month old holstein calves were divided into three groups of six animals in group the animals did not receive medication control group in group the animals received ml of microplus biotherapic at dilution ch centesimal hahnemannian orally administered twice daily and in group they received single applicat

In [None]:
from typing import List


def get_linked_notes(current_note: str, existing_notes: List[str]) -> List[str]:
    """Find the existing notes that are linked to a given note.

    Args:
        current_note: Path to the new note for which linked notes are wanted.
        existing_notes: Paths to all existing notes, among which relevant
            notes are searched.

    Returns:
        A list of paths to the linked notes.

    Raises:
        NotImplementedError: Always, until the lookup below is written.
    """
    # TODO: fill in the lookup that selects the linked notes.
    raise NotImplementedError("get_linked_notes is not implemented yet")

