In [14]:
import json

import fasttext
import numpy as np
import spacy
from numpy import linalg as LA
from rank_bm25 import BM25Okapi
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer

# Star imports are kept last and in their original relative order so that any
# names they shadow resolve exactly as before.
from nltk.stem.porter import *
from sklearn.metrics.pairwise import *

Parameters and Setup

In [15]:
# Number of MeSH terms each generator retrieves per entity mention
# (passed unchanged to whichever candidate generator is selected).
num_of_candid=500
# Selects the candidate generator to run and the name of the output file.
# Any other value makes the generation cell print 'Invalid candidate generator'
# and nothing is written to disk.
candid_gen_type = 'fasttext' # Pick between 'fasttext', 'tf-idf', 'bm25'

In [8]:
# Load the MeSH vocabulary. Each line is split on "=" and only the lower-cased
# text before the first "=" is kept as the term (the remainder is presumably
# the MeSH ID -- it is not used in this notebook).
# The context manager guarantees the file handle is closed after reading.
with open("mesh_2018_ID.txt", "r") as file:
    mesh = file.readlines()

meshWords = [line.split("=")[0].lower() for line in mesh]
meshID = []  # never populated here; kept so existing references to the name still resolve

# Map each term to its position in meshWords.
# If a term occurs more than once, the last position wins.
wordToIdx = {word: idx for idx, word in enumerate(meshWords)}

# Load the training and testing articles together with their gold MeSH labels.
with open('data.json') as json_file:
    data = json.load(json_file)

articles = [d['article'] for d in data['train']]
groundtruth_mesh = [d['mesh_labels'] for d in data['train']]
testing_articles = [d['article'] for d in data['test']]
test_mesh = [d['mesh_labels'] for d in data['test']]


Candidate Generator 1: TF-IDF

In [9]:
# Character n-gram (2 to 5 characters) TF-IDF model fitted on the MeSH terms.
# `stop_words` is deliberately not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char' it had no effect other than
# triggering an "unused parameter" warning.
tfif_char_vectorizer = TfidfVectorizer(decode_error='ignore',
                                       analyzer='char', ngram_range=(2, 5),
                                       max_features=100000)

# One TF-IDF row per MeSH term, in the same order as meshWords.
umls_tfidf = tfif_char_vectorizer.fit_transform(meshWords).tocsr()

In [10]:

# Shared spaCy pipeline. The candidate generators run it on each article and
# use the detected entities (doc.ents) as the mentions to look up.
# NOTE(review): "en_core_sci_sm" is presumably the scispaCy small biomedical
# model and must be installed separately -- confirm in the environment setup.
nlp = spacy.load("en_core_sci_sm")

def candidMesh(groundtruth_mesh, article, num_of_candid):
    """Generate MeSH candidates for one article using char n-gram TF-IDF.

    Every entity mention that spaCy finds in ``article`` is vectorised with
    the fitted ``tfif_char_vectorizer`` and compared (cosine distance) with
    every MeSH term in ``umls_tfidf``. The ``num_of_candid`` closest terms of
    each mention are pooled into a single set.

    Parameters
    ----------
    groundtruth_mesh : list
        Gold labels of the article. Not used by the generator; the parameter
        is kept so existing callers keep working.
    article : str
        Raw article text.
    num_of_candid : int
        Number of nearest MeSH terms to keep per mention.

    Returns
    -------
    set of str
        Union of the nearest MeSH terms over all mentions. Empty when the
        article has no entity mentions or ``num_of_candid`` is not positive.
    """
    mentions = [str(ent) for ent in nlp(article).ents]
    # Without this guard an entity-free article reaches pairwise_distances
    # with zero rows, which raises instead of yielding "no candidates".
    if not mentions or num_of_candid <= 0:
        return set()

    mention_tfidf = tfif_char_vectorizer.transform(mentions).tocsr()
    pair_scores = pairwise_distances(mention_tfidf, umls_tfidf,
                                     metric='cosine', n_jobs=-1)

    n_terms = pair_scores.shape[1]
    if num_of_candid >= n_terms:
        # Asking for at least as many candidates as there are terms: every
        # term qualifies (np.argpartition would reject kth == n_terms).
        return set(meshWords[:n_terms])

    # After partitioning on kth=num_of_candid, the first num_of_candid columns
    # of each row hold the indices of that mention's closest terms (unordered).
    nearest = np.argpartition(pair_scores, num_of_candid, axis=1)[:, :num_of_candid]
    return {meshWords[idx] for idx in np.unique(nearest)}

Candidate Generator 2: BM25

In [11]:
def _bm25_index():
    """Return ``(corpus, index)`` for the MeSH terms, building the index once.

    The result is cached on the function and rebuilt only when ``meshWords``
    is rebound to a different list (e.g. the loading cell was re-run).
    """
    cached = getattr(_bm25_index, "_cache", None)
    if cached is None or cached[0] is not meshWords:
        corpus = [str(m) for m in meshWords]
        # rank_bm25 does no preprocessing: each document must already be a
        # list of tokens. A raw string would be iterated character by character.
        index = BM25Okapi([term.split() for term in corpus])
        cached = (meshWords, corpus, index)
        _bm25_index._cache = cached
    return cached[1], cached[2]


def bm_candidate_gen(article, num_of_candid):
    """Generate MeSH candidates for one article using BM25 over word tokens.

    Each entity mention that spaCy finds in ``article`` is lower-cased (the
    MeSH terms are stored lower-cased), split on whitespace and scored against
    every MeSH term; the ``num_of_candid`` best-scoring terms per mention are
    pooled.

    Parameters
    ----------
    article : str
        Raw article text.
    num_of_candid : int
        Number of top-ranked MeSH terms to keep per mention.

    Returns
    -------
    set of str
        Union of the top-ranked MeSH terms over all mentions; empty when the
        article has no entity mentions.
    """
    mentions = [str(ent) for ent in nlp(article).ents]
    if not mentions:
        return set()

    # Building the index is the expensive part, so it is shared across calls
    # instead of being rebuilt for every article.
    corpus, bm25 = _bm25_index()

    mentionSet = set()
    for mention in mentions:
        query_tokens = mention.lower().split()
        if not query_tokens:
            # A whitespace-only mention carries no query terms.
            continue
        mentionSet.update(bm25.get_top_n(query_tokens, corpus, n=num_of_candid))
    return mentionSet

Candidate Generator 3: fastText

In [19]:

# Train subword (2-5 character n-gram) fastText embeddings on the MeSH word file.
model_fasttext = fasttext.train_unsupervised(
    'mesh-words.txt',
    minn=2,
    maxn=5,
    dim=300,
    epoch=25,
    verbose=2,
)

# One embedding per MeSH term, in the same order as meshWords, indexed in a
# KD-tree so that nearest terms can be looked up by vector.
mesh_vectors_fasttext = [model_fasttext.get_word_vector(term) for term in meshWords]
tree = spatial.KDTree(mesh_vectors_fasttext)

In [20]:
# MeSH terms as an array so they can be fancy-indexed with KD-tree results.
mesh_np = np.array(meshWords)

def fasttext_candidate_gen(article, num_of_candid):
    """Generate MeSH candidates for one article using fastText embeddings.

    Each entity mention that spaCy finds in ``article`` is embedded with
    ``model_fasttext`` and its ``num_of_candid`` nearest MeSH terms are looked
    up in the KD-tree ``tree``; the results of all mentions are pooled.

    Parameters
    ----------
    article : str
        Raw article text.
    num_of_candid : int
        Number of nearest MeSH terms to keep per mention.

    Returns
    -------
    set of str
        Union of the nearest MeSH terms over all mentions. Empty when the
        article has no entity mentions or ``num_of_candid`` is not positive.
    """
    mentions = [str(ent) for ent in nlp(article).ents]
    # KDTree.query rejects an empty batch, so handle "no entities" up front.
    if not mentions:
        return set()

    # Asking for more neighbours than there are terms makes the query return
    # the out-of-range index n for the missing ones, so cap k at the vocabulary size.
    k = min(num_of_candid, len(mesh_np))
    if k <= 0:
        return set()

    mention_vectors = np.array(
        [model_fasttext.get_word_vector(mention) for mention in mentions]
    )
    _, neighbour_idx = tree.query(mention_vectors, k=k)

    # With k == 1 the query drops the neighbour axis; restore it so each row is
    # always a sequence of term indices (otherwise the terms themselves would
    # be split into characters when collected into a set).
    neighbour_idx = np.asarray(neighbour_idx).reshape(len(mentions), -1)

    return set(mesh_np[neighbour_idx].ravel().tolist())

Creating Candidate Sets from one of the three candidate generators

In [28]:
# Build the train/test candidate sets with the generator named by candid_gen_type.
candidate_mesh_list_train = []
candidate_mesh_list_test = []
print('Running candidate generation | type:',candid_gen_type)

if candid_gen_type == 'tf-idf':
    # candidMesh additionally takes the gold labels of each article.
    candidate_mesh_list_train.extend(
        candidMesh(groundtruth_mesh[idx], text, num_of_candid)
        for idx, text in enumerate(articles)
    )
    candidate_mesh_list_test.extend(
        candidMesh(test_mesh[idx], text, num_of_candid)
        for idx, text in enumerate(testing_articles)
    )

elif candid_gen_type in ('bm25', 'fasttext'):
    # Both generators share the (article, num_of_candid) signature and the
    # same progress reporting, so they go through one loop.
    generate = bm_candidate_gen if candid_gen_type == 'bm25' else fasttext_candidate_gen
    for idx, text in enumerate(articles):
        candidate_mesh_list_train.append(generate(text, num_of_candid))
        if idx % 10 == 0:
            print('On training article', idx)
    candidate_mesh_list_test.extend(
        generate(text, num_of_candid) for text in testing_articles
    )

else:
    print('Invalid candidate generator')

Running candidate generation | type: fasttext
On training article 0
On training article 10
On training article 20
On training article 30
On training article 40
On training article 50
On training article 60
On training article 70
On training article 80
On training article 90


In [29]:
# json cannot serialise sets, so turn every candidate set into a list.
# sorted() is used instead of list() because set iteration order for strings
# changes between kernel sessions; sorting makes the saved file reproducible.
candidate_mesh_list_train = [sorted(s) for s in candidate_mesh_list_train]
candidate_mesh_list_test = [sorted(s) for s in candidate_mesh_list_test]

In [30]:
# Write the candidate lists to the JSON file that belongs to the selected
# generator. Nothing is written for an unrecognised candid_gen_type.
data = {
    'train': candidate_mesh_list_train,
    'test': candidate_mesh_list_test,
}

output_names = {
    'tf-idf': 'candidate_data.json',
    'bm25': 'candidate_data_bm25.json',
    'fasttext': 'candidate_data_fasttext.json',
}

found = candid_gen_type in output_names
if found:
    filename = output_names[candid_gen_type]
    with open(filename,'w') as outfile:
        json.dump(data,outfile)