In [5]:
import numpy as np
import pandas as pd
import random, pickle, argparse, json, os, urllib2
from collections import OrderedDict
from operator import itemgetter
from sklearn.ensemble import RandomForestClassifier

In [33]:
# functions
########################################################################################
def query_from(q, f):
    q = q+'&from='+str(f)
    response = urllib2.urlopen(q)
    data = json.load(response)
    subject_ids = np.array(range(len(data['hits'])), dtype=np.object)
    for (i, hit) in enumerate(data['hits']):
        subject_ids[i] = hit['id']
    return subject_ids

def query(q):
    response = urllib2.urlopen(q)
    data = json.load(response)
    nb_requests = 1 + data['total'] / 1000
    if nb_requests > 10: # maximum number of pages due to API pagination restrection
        nb_requests = 10
    subject_ids = query_from(q, 0)
    for i in range(nb_requests)[1:]:
        f = i * 1000
        next_request = query_from(q, f)
        subject_ids = np.hstack((subject_ids, next_request))
    return subject_ids.tolist()

def find_intersection(list_a, list_b):
    return list(set(list_a) & set(list_b))

def term2url(string):
    string = string.split(' ')
    res = '%22'
    for s in string:
        res = res + s + '%20'
    res = res[:-3]
    res = res + '%22'
    return res

def babel_synset(synset):
    q = 'https://api.istex.fr/document/?q=(('
    for syn in synset:
        syn = term2url(syn)
        q = q + 'title:' + syn + '%20OR%20abstract:' + syn + '%20OR%20'
    q = q[:-8]
    q = q + ')%20AND%20(qualityIndicators.abstractWordCount:[35%20500]%20AND%20qualityIndicators.pdfPageCount:[3%2060]%20AND%20publicationDate:[1990%202016]%20AND%20language:(%22eng%22%20OR%20%22unknown%22)%20AND%20genre:(%22research_article%22%20OR%20%22conference[eBooks]%22%20OR%20%22article%22%20)%20))&size=1000&output=id'
    return q

def babel_subj_keyword(topic):
    q = 'https://api.istex.fr/document/?q=(('
    topic = term2url(topic)
    q = q+ 'subject.value:' + topic + '%20OR%20keywords:' + topic
    q = q + ')%20AND%20(qualityIndicators.abstractWordCount:[35%20500]%20AND%20qualityIndicators.pdfPageCount:[3%2060]%20AND%20publicationDate:[1990%202016]%20AND%20language:(%22eng%22%20OR%20%22unknown%22)%20AND%20genre:(%22research_article%22%20OR%20%22conference[eBooks]%22%20OR%20%22article%22%20)%20))&size=1000&output=id'
    return q
 
def babel_title_abst(topic):
    q = 'https://api.istex.fr/document/?q=(('
    topic = term2url(topic)
    q = q+ 'title:' + topic + '%20OR%20abstract:' + topic
    q = q + ')%20AND%20(qualityIndicators.abstractWordCount:[35%20500]%20AND%20qualityIndicators.pdfPageCount:[3%2060]%20AND%20publicationDate:[1990%202016]%20AND%20language:(%22eng%22%20OR%20%22unknown%22)%20AND%20genre:(%22research_article%22%20OR%20%22conference[eBooks]%22%20OR%20%22article%22%20)%20))&size=1000&output=id'
    return q
 
def babelnet_syn_get_input(topic, synset):
    results = query(babel_synset(synset))
    _gs = query(babel_subj_keyword(topic))
    results = find_intersection(results, inversed_index.keys())
    _abst_title = query(babel_title_abst(topic))
    test_set = _inter = {x for x in _gs if x not in _abst_title}
    test_set = find_intersection(test_set, inversed_index.keys())
    results = list(results)
    test = list(test_set)
    print 'initial_corpus size:', len(find_intersection(_abst_title, inversed_index.keys()))
    return results, test

def babelnet_eval(topic, synset):
    babelnet_results, test = babelnet_syn_get_input(topic, synset)
    print 'results size of the topic "' + topic + '":', len(babelnet_results) 
    print 'ground truth size', len(test)
    babel_test_intersection = find_intersection(test,babelnet_results)
    babel_test_intersection_size = len(babel_test_intersection)
    print 'intersection with the ground truth:', babel_test_intersection_size
    
def top_thresh(ordered_dict_pickle, thresh):
    ranked_all = pickle.load(open(ordered_dict_pickle, 'rb'))
    ranked_all_np = np.array(ranked_all.items())
    ranked_all_df = pd.DataFrame(data=ranked_all_np, index=None, columns=['istex_id', 'score'])
    ranked_all_df['score'] = ranked_all_df[['score']].astype(float)
    return ranked_all_df[ranked_all_df['score'] > thresh]

def top_thresh_lst(res_lst_pickle, thresh):
    ranked_all = pickle.load(open(res_lst_pickle, 'rb'))
    if type(ranked_all) is OrderedDict:
        ranked_all = ranked_all.items()[:100000]
    ranked_all_np = np.array(ranked_all)
    ranked_all_df = pd.DataFrame(data=ranked_all_np, index=None, columns=['istex_id', 'score'])
    ranked_all_df['score'] = ranked_all_df[['score']].astype(float)
    return ranked_all_df[ranked_all_df['score'] > thresh]

def babelnet_eval_PR(topic, synset):
    babelnet_results, test = babelnet_syn_get_input(topic, synset)
    print 'babelnet results size of the topic "' + topic + '":', len(babelnet_results) 
    print 'ground truth size', len(test)
    babel_test_intersection = find_intersection(test,babelnet_results)
    babel_test_intersection_size = len(babel_test_intersection)
    print 'intersection with the ground truth:', babel_test_intersection_size
    precision = babel_test_intersection_size / float(len(babelnet_results))
    recall = babel_test_intersection_size / float(len(test))
    if babel_test_intersection_size is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0.0 
    print "F1", F1
    print 'babel precision: ', precision
    print 'babel recall: ', recall
    

#Evaluate 3SH results at treshold
def eval_all_at_thresh(ordered_dict_pickle, topic, synset, thresh):
    babelnet_eval_PR(topic, synset)
    _, test = babelnet_syn_get_input(topic, synset)
    t = len(test)
    top_res = top_thresh(ordered_dict_pickle, thresh)
    n = len(top_res)
    if n > 10000:
        thresh = thresh + 0.1
        top_res = top_thresh(ordered_dict_pickle, thresh)
    elif n < 1000:
        thresh = thresh - 0.1
        top_res = top_thresh(ordered_dict_pickle, thresh)
    n = len(top_res)
    res = list(top_res['istex_id'])
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    return precision, recall, n, t, thresh

#Evaluate 3SH results list at treshold
def eval_all_at_thresh_lst(res_pickle, topic, synset, thresh=0.75):
    babelnet_eval_PR(topic, synset)
    _, test = babelnet_syn_get_input(topic, synset)
    t = len(test)
    top_res = top_thresh_lst(res_pickle, thresh)
    n = len(top_res)
    if n > 10000:
        thresh = thresh + 0.1
        top_res = top_thresh_lst(res_pickle, thresh)
        if len(top_res) > 10000:
            thresh = thresh + 0.05
            top_res = top_thresh_lst(res_pickle, thresh)
    elif n < 1000:
        thresh = thresh - 0.1
        top_res = top_thresh_lst(res_pickle, thresh)
        if len(top_res) > 1000:
            thresh = thresh - 0.05
            top_res = top_thresh_lst(res_pickle, thresh)
    n = len(top_res)
    res = list(top_res['istex_id'])
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    F1 = 2 * (precision * recall) / (precision + recall)
    print "length of s3h results: ", n, "length of test set", t
    print "F1: ", F1
    print "precision s3h: ", precision, "recall s3h: ", recall

def get_fusion_res3(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = fus * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

def get_fusion_res4(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = fus * 2 * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

def get_fusion_res5(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(range(100000))
    fus = fus * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

def get_fusion_res6(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(range(100000))
    fus = fus * 3 * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (2*i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

#Evaluate fusion df
def eval_all(fusion_df, topic, synset, res_pickle, thresh=0.75):
    eval_all_at_thresh_lst(res_pickle, topic, synset, thresh)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = len(babelnet_res)
    t = len(test)

    print "length of results: ", n, "length of test set", t
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall

    n = 2 * len(babelnet_res)
    print "for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall

    n = 3 * len(babelnet_res)
    print " for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall
    
    n = 4 * len(babelnet_res)
    print " for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall
    
    n = 5 * len(babelnet_res)
    print " for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1  
    print "precision fusion: ", precision, "recall fusion: ",recall

In [7]:
#loading SDV of istex articles
inv_index = json.load(open('../RecSys_Exp_files/182_381_vec150_results/output_paragraph_inversed_index.json','rb'))
print 'original inversed_index'
print inv_index.items()[:3]
inversed_index = dict()
for (k, v) in inv_index.items():
    key = k.split('_')[1]
    inversed_index[key] = v
print 'processed inversed_index'
print inversed_index.items()[:3]

original inversed_index
[(u'ISTEX_D89FA3AC3521074D46F4245762153DF497BFFA1F', 2002320), (u'ISTEX_18EAF4D6A126B077EB38667801D1B7292F32FF49', 2483732), (u'ISTEX_5F91044435FCC4FABB9F02E31467DCFE75F4A7BE', 1429049)]
processed inversed_index
[(u'FCF1393F9B8136AC08FB67E88F94F3CF62C17288', 3517138), (u'482E1102A1114327A744FD2ADB4D9F8FF7E9A70B', 751643), (u'A81022B6295AE66F68A10222C3B94A06B033C1BA', 3983232)]


In [4]:
def get_fusion_res(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = (fus + len(babel_results)) / 2
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

In [11]:
def get_fusion_res3(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = fus * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

In [17]:
bab = 'organ transplant, transplant, transplantation, Organ transplantation, Medical Transplantation, Transplant Surgery, Black market organs, First transplant, First transplantation, Intestinal transplant, Live organ transplants, Mixed chimerism, Organ and Tissue Donor, Organ doner, Organ farming, Organ transplantation in different countries, Organ transplantation therapy, Organ transplants, Skin transplant, Tissue transplant, Transplant Tourism and Organ Trafficking, Transplantation medicine, Transplantation surgery, Transplantation therapy, Transplanted organs, Transplantology'
synset = bab.split(',')
topic = 'Transplantation'
s3h_res_pickle = "results/res_Transplantation"
Transplantation_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [20]:
bab = 'spectroscopy, spectrographic analysis, spectrometry, spectroscopic analysis, spectrum analysis, Fingerprint region, Laser spectroscopy, Optical spectroscopy, Atomic line, Atomic line spectra, Atomic spectra, Atomic spectral line, Electromagnetic spectroscopy, Emission spectrochemical analysis, Spectrochemical Analysis, Spectrography, Spectrology, Spectroscopic, Spectroscopist, Spectroscopists'
synset = bab.split(',')
topic = 'Spectroscopy'
s3h_res_pickle = "results/res__Spectroscopy"
Spectroscopy_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [24]:
bab = ' surgery, operation, surgical operation, surgical procedure, surgical process, Chirurgery, Chirurgical, Chirurgy, Complications of surgery, Corrective surgery, Elliptical excision, Emergency Surgery, Post-operation, Post-operative, Specialties in surgery, Sterile drapes, surgeon, Surgeons, Surgeries, Surgery in general practice, Surgery operation, Surgery specialties, Surgical, Surgical excision, Surgical excision of malignant lesions, Surgical specialties, Surgical technique, Surgically'
synset = bab.split(',')
topic = 'Surgery'
s3h_res_pickle = "results/res__Surgery"
Surgery_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [25]:
bab = 'optics, Classical optics, Light physics, Optical, Optical system'
synset = bab.split(',')
topic = 'Optics'
s3h_res_pickle = "results/res__Optics"
Optics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [26]:
bab = 'literature, Literary art, Literary, Literary arts, Literary work, Literary works, LiteraryArt, Literature by region, Literatures, Litterature, Ltierature, Prose fiction'
synset = bab.split(',')
topic = 'Literature'
s3h_res_pickle = "results/res__Literature"
Literature_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [27]:
bab = 'toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists'
synset = bab.split(',')
topic = 'Toxicology'
s3h_res_pickle = "results/res__Toxicology"
Toxicology_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [28]:
bab = 'artificial intelligence, AI, artilect, Artifical intelligence, Cognitive systems, Digital being, Machine intelligence, A.I., A I, AI-D, AI ethics, AI implications, AI Robotics, AI scripting, Aretificial intelligence, Artificial-intelligence, Artificial conciousness, Artificial inteligence, Artificial intellect, Artificial intellegence, Artificial Intelligence., Artificial intelligence for development, Artificial Intelligence Program, Artificial intelligences, Artificially-intelligent, Artificially intelligent, Artificual intelligence, Cognitive simulation, Cognitive system, Commonsense AI, Computational Rationality, Computer AI, Intelligent machine, Machine thought, Machine understanding, Ontology based approach, Pseudo intelligence, Semi-AI, Semi AI, Simulated intelligence, Soft AI, Sub-symbolic, Subsymbolic, The Artificial Intelligence, The Theory of Artificial Intelligence'
synset = bab.split(',')
topic = 'artificial intelligence'
s3h_res_pickle = "results/AI_results.pickle"
AI_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

KeyboardInterrupt: 

In [29]:
bab = 'cybernetics, Cybernetic, Cybernetic system, cybernetician, Cyberneticians, cyberneticist, Tha Masta'
synset = bab.split(',')
topic = 'Cybernetics'
s3h_res_pickle = "results/res__Cybernetics"
Cybernetics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [None]:
bab = 'information system, data system, information systems, system info, Business computing, Computer information system, Computer information systems, Elements of Information System, Information in Computer Science, Information systems and technology, Information systems discipline, Information systems theory, Informationssystem'
synset = bab.split(',')
topic = 'Information Systems'
s3h_res_pickle = "results/infosys_results.pickle"
Information_Systems_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [30]:
bab = 'immunology, immunobiology, immunological, immunologist, Clinical immunology, Classical immunology, Evolutionary immunology, Immunologic, Immunologists, Imunologys'
synset = bab.split(',')
topic = 'Immunology'
s3h_res_pickle = "results/res__Immunology"
Immunology_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [None]:
bab = 'infection, infectious disease, communicable diseases, contagion, Infectious diseases, Acute infection, AIDS-related bacterial infections, AIDS-related viral infections, Anti-infective, Anti-infectives, Antiinfective, Bacterial Infections, Communicable disease, Contagious diseases, Definition to contagious, Infect, Infecting, Infectiology, Infections, Infectious, Infectious disease epidemiology, Infectious disease medicine, Infectology, Local infection, Primary infection, Rochalimea infections, Secondary infection, Tropical bacterial infections, Tropical infections, Viral Infections, Wound colonization, Wound infection'
synset = bab.split(',')
topic = 'Infectious Diseases'
s3h_res_pickle = "results/Infectious_results.pickle"
Infectious_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [31]:
bab = 'Biomaterial, Biomaterials Engineering, Bio material, Biomaterials'
synset = bab.split(',')
topic = 'Biomaterials'
s3h_res_pickle = "results/res__Biomaterials"
Biomaterials_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [32]:
bab = 'ceramics, Ceramic art, Art pottery, Art ware, Ceramic artist, Ceramic paint, Ceramics art, Fine art pot, Vase painting'
synset = bab.split(',')
topic = 'Ceramics'
s3h_res_pickle = "results/res__Ceramics"
Ceramics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [33]:
topic = "biophysics"
synset_text = "biophysics, Biological physics, Biophysical, Biophysicists, History of biophysics"
synset = synset_text.split(',')
s3h_res_pickle = 'results/res__biophysics'
biophysics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [12]:
ind_dict = OrderedDict((k,i) for i,k in enumerate(biophysics_fusion_res))
babel_results, test = babelnet_syn_get_input(topic, synset)
inter = set(ind_dict).intersection(babel_results)
indices = [ind_dict[x] for x in inter ]
cutt = np.array(indices).max()
cutt

99999

In [None]:
slic = len(indices) * 0.8
indices_slic = indices[:int(slic)]
#indices_slic

In [None]:
fusion_res = fusion_df.sort_values("fusin_rank")["istex_id"].tolist()
#fusion_res[:5000]

In [5]:
#Evaluate fusion df with manual cut
def eval_fusin_at_thresh_lst_1k(fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = len(babelnet_res)
    if n < 1000:
        n = 1000
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    return precision, recall, n, t

In [6]:
#Evaluate fusion df
def eval_fusin_at_5k(fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = 5000#len(babelnet_res)
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    print "precision fusion: " precision, "recall fusion: ",recall
    print "length of results: ", n, "length of test set", t

In [7]:
#Evaluate fusion df
def eval_fusin_at_2_bablesize((fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = 2 * len(babelnet_res)
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    print "precision fusion: " precision, "recall fusion: ",recall
    print "length of results: ", n, "length of test set", t

In [8]:
#Evaluate fusion df
def eval_fusin_at_bablesize(fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = len(babelnet_res)
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    print "precision fusion: " precision, "recall fusion: ",recall
    print "length of results: ", n, "length of test set", t

In [9]:
def get_fusion_res_2(s3h_res_pickle, topic, synset):
    topic_s3h_top100k_results = pickle.load(open(s3h_res_pickle,'rb'))
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = (fus + 100000) / 2
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

In [14]:
#Evaluate fusion df
def eval_all(fusion_df, topic, synset, res_pickle, tresh=0.75):
    eval_all_at_thresh_lst(res_pickle, topic, synset, thresh)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = len(babelnet_res)
    t = len(test)

    print "length of results: ", n, "length of test set", t
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    F1 = 2 * (precision * recall) / (precision + recall)
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall

    n = 2 * len(babelnet_res)
    print "for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    F1 = 2 * (precision * recall) / (precision + recall)
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall

    n = 3 * len(babelnet_res)
    print " for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    F1 = 2 * (precision * recall) / (precision + recall)
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall
    
    

In [41]:
bab = 'toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists'
synset = bab.split(',')
topic = 'Toxicology'
get_fusion_res_2("results/res__Toxicology", topic, synset)
eval_fusin_at_thresh_lst(Toxicology_fusion_res, topic, synset)

babelnet results size of the topic "Toxicology": 5563
ground truth size 1552
intersection with the ground truth: 146
babel precision:  0.0262448319252
babel recall:  0.0940721649485


(0.004853496314937983, 0.017396907216494846, 5563, 1552)

In [52]:
bab = 'toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists'
synset = bab.split(',')
topic = 'Toxicology'
get_fusion_res_2("results/res__Toxicology", topic, synset)
eval_fusin_at_thresh_lst(Toxicology_fusion_res, topic, synset)
eval_fusin_at_thresh_lst2(Toxicology_fusion_res, topic, synset)

babelnet results size of the topic "Toxicology": 5563
ground truth size 1552
intersection with the ground truth: 146
babel precision:  0.0262448319252
babel recall:  0.0940721649485


(0.004853496314937983, 0.017396907216494846, 5563, 1552)

In [42]:
bab = 'Biomaterial, Biomaterials Engineering, Bio material, Biomaterials'
synset = bab.split(',')
topic = 'Biomaterials'
get_fusion_res_2("results/res__Biomaterials", topic, synset)
eval_fusin_at_thresh_lst(Biomaterials_fusion_res, topic, synset)

babelnet results size of the topic "Biomaterials": 3649
ground truth size 1020
intersection with the ground truth: 92
babel precision:  0.0252123869553
babel recall:  0.0901960784314


(0.00356261989586188, 0.012745098039215686, 3649, 1020)

In [54]:
bab = 'Biomaterial, Biomaterials Engineering, Bio material, Biomaterials'
synset = bab.split(',')
topic = 'Biomaterials'
get_fusion_res_2("results/res__Biomaterials", topic, synset)
eval_fusin_at_thresh_lst(Biomaterials_fusion_res, topic, synset)
eval_fusin_at_thresh_lst2(Toxicology_fusion_res, topic, synset)

babelnet results size of the topic "Biomaterials": 3649
ground truth size 1020
intersection with the ground truth: 92
babel precision:  0.0252123869553
babel recall:  0.0901960784314
babelnet results size of the topic "Biomaterials": 3649
ground truth size 1020
intersection with the ground truth: 92
babel precision:  0.0252123869553
babel recall:  0.0901960784314


(0.0, 0.0, 7298, 1020)

In [55]:
bab = 'immunology, immunobiology, immunological, immunologist, Clinical immunology, Classical immunology, Evolutionary immunology, Immunologic, Immunologists, Imunologys'
synset = bab.split(',')
topic = 'Immunology'
get_fusion_res_2("results/res__Immunology", topic, synset)
eval_fusin_at_thresh_lst(Immunology_fusion_res, topic, synset)
eval_fusin_at_thresh_lst2(Immunology_fusion_res, topic, synset)

babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021
babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021


(0.010356208034596564, 0.03816313692598029, 17574, 4769)

In [58]:
eval_fusin_at_thresh_lst(Immunology_fusion_res, topic, synset)

babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021
0.0124 0.0130006290627 5000 4769


In [61]:
eval_fusin_at_thresh_lst3(Immunology_fusion_res, topic, synset)

babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021
0.0119494708091 0.0220171943804 8787 4769


In [None]:
bab = 'immunology, immunobiology, immunological, immunologist, Clinical immunology, Classical immunology, Evolutionary immunology, Immunologic, Immunologists, Imunologys'
synset = bab.split(',')
topic = 'Immunology'
get_fusion_res_2("results/res__Immunology", topic, synset)
eval_fusin_at_thresh_lst2(Immunology_fusion_res, topic, synset)

In [46]:
bab = 'cybernetics, Cybernetic, Cybernetic system, cybernetician, Cyberneticians, cyberneticist, Tha Masta'
synset = bab.split(',')
topic = 'Cybernetics'
get_fusion_res_2("", topic, synset)
eval_fusin_at_thresh_lst2(Cybernetics_fusion_res, topic, synset)

babelnet results size of the topic "Cybernetics": 511
ground truth size 596
intersection with the ground truth: 47
babel precision:  0.0919765166341
babel recall:  0.0788590604027


(0.004, 0.006711409395973154, 1000, 596)

In [63]:
bab = 'cybernetics, Cybernetic, Cybernetic system, cybernetician, Cyberneticians, cyberneticist, Tha Masta'
synset = bab.split(',')
topic = 'Cybernetics'
Cybernetics_fusion_res = get_fusion_res_2("results/res__Cybernetics", topic, synset)
eval_fusin_at_thresh_lst2(Cybernetics_fusion_res, topic, synset)

babelnet results size of the topic "Cybernetics": 511
ground truth size 596
intersection with the ground truth: 47
babel precision:  0.0919765166341
babel recall:  0.0788590604027
0.0283757338552 0.0486577181208 1022 596


In [None]:
eval_all_at_thresh('results/AI_results.pickle', topic , synset, thresh = 0.75)

In [47]:
bab = 'literature, Literary art, Literary, Literary arts, Literary work, Literary works, LiteraryArt, Literature by region, Literatures, Litterature, Ltierature, Prose fiction'
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Literature'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Literature_fusion_res, topic, synset)

babelnet results size of the topic "Literature": 7357
ground truth size 860
intersection with the ground truth: 107
babel precision:  0.0145439717276
babel recall:  0.124418604651


(0.0020388745412532284, 0.01744186046511628, 7357, 860)

In [48]:
bab = 'optics, Classical optics, Light physics, Optical, Optical system'
synset = bab.split(',')
topic = 'Optics'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Optics_fusion_res, topic, synset)

babelnet results size of the topic "Optics": 8730
ground truth size 3286
intersection with the ground truth: 271
babel precision:  0.0310423825888
babel recall:  0.0824710894705


(0.015005727376861398, 0.039866098600121726, 8730, 3286)

In [49]:
bab = 'surgery, operation, surgical operation, surgical procedure, surgical process, Chirurgery, Chirurgical, Chirurgy, Complications of surgery, Corrective surgery, Elliptical excision, Emergency Surgery, Post-operation, Post-operative, Specialties in surgery, Sterile drapes, surgeon, Surgeons, Surgeries, Surgery in general practice, Surgery operation, Surgery specialties, Surgical, Surgical excision, Surgical excision of malignant lesions, Surgical specialties, Surgical technique, Surgically'
synset = bab.split(',')
topic = 'Surgery'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Surgery_fusion_res, topic, synset)

babelnet results size of the topic "Surgery": 8271
ground truth size 6412
intersection with the ground truth: 565
babel precision:  0.0683109660259
babel recall:  0.0881160324392


(0.027687099504292104, 0.03571428571428571, 8271, 6412)

In [50]:
bab = 'spectroscopy, spectrographic analysis, spectrometry, spectroscopic analysis, spectrum analysis, Fingerprint region, Laser spectroscopy, Optical spectroscopy, Atomic line, Atomic line spectra, Atomic spectra, Atomic spectral line, Electromagnetic spectroscopy, Emission spectrochemical analysis, Spectrochemical Analysis, Spectrography, Spectrology, Spectroscopic, Spectroscopist, Spectroscopists'
synset = bab.split(',')
topic = 'Spectroscopy'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Spectroscopy_fusion_res, topic, synset)

babelnet results size of the topic "Spectroscopy": 8513
ground truth size 7294
intersection with the ground truth: 287
babel precision:  0.0337131446024
babel recall:  0.0393474088292


(0.025138024198284977, 0.029339182890046615, 8513, 7294)

In [11]:
bab = "Condensed matter physics, Condensed-matter physics, Condensed matter, Bulk matter, Condenced matter, Condensed matter physicist, Condensed matter system, Condensed matter theory, Condensed phase, History of condensed matter physics, Physics of condensed matter, Theoretical condensed matter physics"
synset = bab.split(',')
topic = 'Condensed Matter'
ID_fusion_res = get_fusion_res3("results/res__Condensed_matter", topic, synset)
eval_all(ID_fusion_res, topic, synset, "results/res__Condensed_matter", thresh=0.75)

initial_corpus size: 581
initial_corpus size: 581
babelnet results size of the topic "Condensed Matter": 1523
ground truth size 1514
intersection with the ground truth: 1
babel precision:  0.000656598818122
babel recall:  0.000660501981506
initial_corpus size: 581
length of s3h results:  9988 length of test set 1514
F1:  0.0153016866632
precision s3h:  0.00881057268722 recall s3h:  0.0581241743725
initial_corpus size: 581
length of results:  1523 length of test set 1514
F1:  0.00329272308199
precision fusion:  0.00328299409061 recall fusion:  0.00330250990753
for length of results:  3046
F1:  0.00394736842105
precision fusion:  0.00295469468155 recall fusion:  0.00594451783355
 for length of results:  4569
F1:  0.00493177708368
precision fusion:  0.00328299409061 recall fusion:  0.00990752972259


In [17]:
bab = "emergency medicine, Emergency care, Emergency med, Emergency treatment, Emergent condition, Emergentology, Er physician"
synset = bab.split(',')
topic = 'Emergency medicine'
res_pickle = "results/Emergence_results.pickle"
ID_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(ID_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 810
initial_corpus size: 810
babelnet results size of the topic "Emergency medicine": 1379
ground truth size 285
intersection with the ground truth: 14
F1 0.0168269230769
babel precision:  0.010152284264
babel recall:  0.0491228070175
initial_corpus size: 810
length of s3h results:  2793 length of test set 285
F1:  0.00779727095517
precision s3h:  0.00429645542427 recall s3h:  0.0421052631579


URLError: <urlopen error [Errno -3] Temporary failure in name resolution>

In [18]:
eval_all(ID_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 810
babelnet results size of the topic "Emergency medicine": 1379
ground truth size 285
intersection with the ground truth: 14
F1 0.0168269230769
babel precision:  0.010152284264
babel recall:  0.0491228070175
initial_corpus size: 810
length of s3h results:  2793 length of test set 285
F1:  0.00779727095517
precision s3h:  0.00429645542427 recall s3h:  0.0421052631579
initial_corpus size: 810
length of results:  1379 length of test set 285
F1:  0.00120192307692
precision fusion:  0.000725163161711 recall fusion:  0.00350877192982
for length of results:  2758
F1:  0.00131449227736
precision fusion:  0.000725163161711 recall fusion:  0.00701754385965
 for length of results:  4137
F1:  0.00226142017187
precision fusion:  0.00120860526952 recall fusion:  0.0175438596491
 for length of results:  5516
F1:  0.00206860886054
precision fusion:  0.00108774474257 recall fusion:  0.0210526315789
 for length of results:  6895
F1:  0.00222841225627
precision fusion:  0.001160261

In [19]:
bab = "conservation biology, Animal conservation, Biological Conservation, Conservation, Biodiversity conservation, Conservation biologist, Conservation biologists, Conservation of natural resources, Conservation of wildlife, Conservation priority, Conservation science, Earth biologist, Earth biology, Ecological conservation, History of conservation biology, Wildlife Conservation"
synset = bab.split(',')
topic = 'Biodiversity Conservation'
res_pickle = "results/Biodiversity_Conservation_results.pickle"
BC_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(BC_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 344
initial_corpus size: 344
babelnet results size of the topic "Biodiversity Conservation": 7481
ground truth size 55
intersection with the ground truth: 12
F1 0.0031847133758
babel precision:  0.00160406362786
babel recall:  0.218181818182
initial_corpus size: 344
length of s3h results:  3207 length of test set 55
F1:  0.00613120784795
precision s3h:  0.00311817898347 recall s3h:  0.181818181818
initial_corpus size: 344
length of results:  7481 length of test set 55
F1:  0.00132696390658
precision fusion:  0.000668359844941 recall fusion:  0.0909090909091
for length of results:  14962
F1:  0.000665911966438
precision fusion:  0.00033417992247 recall fusion:  0.0909090909091
 for length of results:  22443
F1:  0.000622277535781
precision fusion:  0.000311901260972 recall fusion:  0.127272727273
 for length of results:  29924
F1:  0.00046699356216
precision fusion:  0.000233925945729 recall fusion:  0.127272727273
 for length of results:  37405
F1:  0.0003737319807

In [22]:
bab = "organ transplant, transplant, transplantation, Organ transplantation, Medical Transplantation, Transplant Surgery, Black market organs, First transplant, First transplantation, Intestinal transplant, Live organ transplants, Mixed chimerism, Organ and Tissue Donor, Organ doner, Organ farming, Organ transplantation in different countries, Organ transplantation therapy, Organ transplants, Skin transplant, Tissue transplant, Transplant Tourism and Organ Trafficking, Transplantation medicine, Transplantation surgery, Transplantation therapy, Transplanted organs, Transplantology"
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Transplantation'
res_pickle = "results/res_Transplantation"
Tr_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(Tr_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 8999
initial_corpus size: 8999
babelnet results size of the topic "Transplantation": 8975
ground truth size 4997
intersection with the ground truth: 910
F1 0.130260521042
babel precision:  0.10139275766
babel recall:  0.182109265559
initial_corpus size: 8999
length of s3h results:  8281 length of test set 4997
F1:  0.105889441181
precision s3h:  0.0848931288492 recall s3h:  0.140684410646
initial_corpus size: 8999
length of results:  8975 length of test set 4997
F1:  0.0586888061838
precision fusion:  0.0456824512535 recall fusion:  0.0820492295377
for length of results:  17950
F1:  0.0446245696605
precision fusion:  0.0285236768802 recall fusion:  0.102461476886
 for length of results:  26925
F1:  0.0412881398409
precision fusion:  0.0244753946147 recall fusion:  0.131879127476
 for length of results:  35900
F1:  0.0357972467418
precision fusion:  0.0203899721448 recall fusion:  0.146487892736
 for length of results:  44875
F1:  0.0319217196022
precision fusion:  

In [23]:
bab = 'surgery, operation, surgical operation, surgical procedure, surgical process, Chirurgery, Chirurgical, Chirurgy, Complications of surgery, Corrective surgery, Elliptical excision, Emergency Surgery, Post-operation, Post-operative, Specialties in surgery, Sterile drapes, surgeon, Surgeons, Surgeries, Surgery in general practice, Surgery operation, Surgery specialties, Surgical, Surgical excision, Surgical excision of malignant lesions, Surgical specialties, Surgical technique, Surgically'
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Surgery'
res_pickle = "results/res__Surgery"
Sr_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(Sr_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 8455
initial_corpus size: 8455
babelnet results size of the topic "Surgery": 8271
ground truth size 6412
intersection with the ground truth: 565
F1 0.07695974937
babel precision:  0.0683109660259
babel recall:  0.0881160324392
initial_corpus size: 8455
length of s3h results:  13185 length of test set 6412
F1:  0.0702148288003
precision s3h:  0.0521805081532 recall s3h:  0.107298814722
initial_corpus size: 8455
length of results:  8271 length of test set 6412
F1:  0.0377307089832
precision fusion:  0.0334905090074 recall fusion:  0.0432002495321
for length of results:  16542
F1:  0.0407772065871
precision fusion:  0.0282916213275 recall fusion:  0.072988147224
 for length of results:  24813
F1:  0.0491273018415
precision fusion:  0.0309112158949 recall fusion:  0.119619463506
 for length of results:  33084
F1:  0.0480554992911
precision fusion:  0.0286845605126 recall fusion:  0.148003742982
 for length of results:  41355
F1:  0.045847551657
precision fusion:  0.026

In [24]:
bab = 'literature, Literary art, Literary, Literary arts, Literary work, Literary works, LiteraryArt, Literature by region, Literatures, Litterature, Ltierature, Prose fiction'
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Literature'
res_pickle = "results/res__Literature"
Lit_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(Lit_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 7941
initial_corpus size: 7941
babelnet results size of the topic "Literature": 7357
ground truth size 860
intersection with the ground truth: 107
F1 0.0260435682122
babel precision:  0.0145439717276
babel recall:  0.124418604651
initial_corpus size: 7941
length of s3h results:  8723 length of test set 860
F1:  0.00918292810185
precision s3h:  0.00504413619168 recall s3h:  0.0511627906977
initial_corpus size: 7941
length of results:  7357 length of test set 860
F1:  0.00340756967263
precision fusion:  0.00190294957184 recall fusion:  0.0162790697674
for length of results:  14714
F1:  0.0039809939643
precision fusion:  0.00210683702596 recall fusion:  0.0360465116279
 for length of results:  22071
F1:  0.00383759975579
precision fusion:  0.00199356621811 recall fusion:  0.0511627906977
 for length of results:  29428
F1:  0.00442419440042
precision fusion:  0.00227674323773 recall fusion:  0.0779069767442
 for length of results:  36785
F1:  0.00435648824545
precision

In [27]:
bab = "Condensed matter physics, Condensed-matter physics, Condensed matter, Bulk matter, Condenced matter, Condensed matter physicist, Condensed matter system, Condensed matter theory, Condensed phase, History of condensed matter physics, Physics of condensed matter, Theoretical condensed matter physics"
synset = bab.split(',')
topic = 'Condensed Matter'
ID_fusion_res = get_fusion_res5("results/res__Condensed_matter", topic, synset)
eval_all(ID_fusion_res, topic, synset, "results/res__Condensed_matter", thresh=0.75)

initial_corpus size: 581
initial_corpus size: 581
babelnet results size of the topic "Condensed Matter": 1523
ground truth size 1514
intersection with the ground truth: 1
F1 0.000658544616398
babel precision:  0.000656598818122
babel recall:  0.000660501981506
initial_corpus size: 581
length of s3h results:  9988 length of test set 1514
F1:  0.0153016866632
precision s3h:  0.00881057268722 recall s3h:  0.0581241743725
initial_corpus size: 581
length of results:  1523 length of test set 1514
F1:  0.00197563384919
precision fusion:  0.00196979645437 recall fusion:  0.00198150594452
for length of results:  3046
F1:  0.00789473684211
precision fusion:  0.0059093893631 recall fusion:  0.0118890356671
 for length of results:  4569
F1:  0.00986355416735
precision fusion:  0.00656598818122 recall fusion:  0.0198150594452
 for length of results:  6092
F1:  0.0128845648172
precision fusion:  0.008043335522 recall fusion:  0.0323645970938
 for length of results:  7615
F1:  0.0148975791434
precisi

In [28]:
bab = "emergency medicine, Emergency care, Emergency med, Emergency treatment, Emergent condition, Emergentology, Er physician"
synset = bab.split(',')
topic = 'Emergency medicine'
res_pickle = "results/Emergence_results.pickle"
Em_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Em_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 810
initial_corpus size: 810
babelnet results size of the topic "Emergency medicine": 1379
ground truth size 285
intersection with the ground truth: 14
F1 0.0168269230769
babel precision:  0.010152284264
babel recall:  0.0491228070175
initial_corpus size: 810
length of s3h results:  2793 length of test set 285
F1:  0.00779727095517
precision s3h:  0.00429645542427 recall s3h:  0.0421052631579
initial_corpus size: 810
length of results:  1379 length of test set 285
F1:  0.0144230769231
precision fusion:  0.00870195794054 recall fusion:  0.0421052631579
for length of results:  2758
F1:  0.0144594150509
precision fusion:  0.00797679477883 recall fusion:  0.0771929824561
 for length of results:  4137
F1:  0.0117593848937
precision fusion:  0.0062847474015 recall fusion:  0.0912280701754
 for length of results:  5516
F1:  0.00999827615928
precision fusion:  0.00525743292241 recall fusion:  0.101754385965
 for length of results:  6895
F1:  0.00974930362117
precision fusi

In [29]:
bab = "conservation biology, Animal conservation, Biological Conservation, Conservation, Biodiversity conservation, Conservation biologist, Conservation biologists, Conservation of natural resources, Conservation of wildlife, Conservation priority, Conservation science, Earth biologist, Earth biology, Ecological conservation, History of conservation biology, Wildlife Conservation"
synset = bab.split(',')
topic = 'Biodiversity Conservation'
res_pickle = "results/Biodiversity_Conservation_results.pickle"
BC_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(BC_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 344
initial_corpus size: 344
babelnet results size of the topic "Biodiversity Conservation": 7481
ground truth size 55
intersection with the ground truth: 12
F1 0.0031847133758
babel precision:  0.00160406362786
babel recall:  0.218181818182
initial_corpus size: 344
length of s3h results:  3207 length of test set 55
F1:  0.00613120784795
precision s3h:  0.00311817898347 recall s3h:  0.181818181818
initial_corpus size: 344
length of results:  7481 length of test set 55
F1:  0.00530785562633
precision fusion:  0.00267343937976 recall fusion:  0.363636363636
for length of results:  14962
F1:  0.0031963774389
precision fusion:  0.00160406362786 recall fusion:  0.436363636364
 for length of results:  22443
F1:  0.00231131656147
precision fusion:  0.0011584903979 recall fusion:  0.472727272727
 for length of results:  29924
F1:  0.00186797424864
precision fusion:  0.000935703782917 recall fusion:  0.509090909091
 for length of results:  37405
F1:  0.00154831820609
precis

In [30]:
bab = "organ transplant, transplant, transplantation, Organ transplantation, Medical Transplantation, Transplant Surgery, Black market organs, First transplant, First transplantation, Intestinal transplant, Live organ transplants, Mixed chimerism, Organ and Tissue Donor, Organ doner, Organ farming, Organ transplantation in different countries, Organ transplantation therapy, Organ transplants, Skin transplant, Tissue transplant, Transplant Tourism and Organ Trafficking, Transplantation medicine, Transplantation surgery, Transplantation therapy, Transplanted organs, Transplantology"
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Transplantation'
res_pickle = "results/res_Transplantation"
Tr_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Tr_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 8999
initial_corpus size: 8999
babelnet results size of the topic "Transplantation": 8975
ground truth size 4997
intersection with the ground truth: 910
F1 0.130260521042
babel precision:  0.10139275766
babel recall:  0.182109265559
initial_corpus size: 8999
length of s3h results:  8281 length of test set 4997
F1:  0.105889441181
precision s3h:  0.0848931288492 recall s3h:  0.140684410646
initial_corpus size: 8999
length of results:  8975 length of test set 4997
F1:  0.130546807902
precision fusion:  0.101615598886 recall fusion:  0.182509505703
for length of results:  17950
F1:  0.156708937988
precision fusion:  0.100167130919 recall fusion:  0.359815889534
 for length of results:  26925
F1:  0.143098803333
precision fusion:  0.0848282265552 recall fusion:  0.457074244547
 for length of results:  35900
F1:  0.128224564149
precision fusion:  0.0730362116992 recall fusion:  0.524714828897
 for length of results:  44875
F1:  0.114252486365
precision fusion:  0.063487

In [31]:
bab = 'surgery, operation, surgical operation, surgical procedure, surgical process, Chirurgery, Chirurgical, Chirurgy, Complications of surgery, Corrective surgery, Elliptical excision, Emergency Surgery, Post-operation, Post-operative, Specialties in surgery, Sterile drapes, surgeon, Surgeons, Surgeries, Surgery in general practice, Surgery operation, Surgery specialties, Surgical, Surgical excision, Surgical excision of malignant lesions, Surgical specialties, Surgical technique, Surgically'
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Surgery'
res_pickle = "results/res__Surgery"
Sr_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Sr_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 8455
initial_corpus size: 8455
babelnet results size of the topic "Surgery": 8271
ground truth size 6412
intersection with the ground truth: 565
F1 0.07695974937
babel precision:  0.0683109660259
babel recall:  0.0881160324392
initial_corpus size: 8455
length of s3h results:  13185 length of test set 6412
F1:  0.0702148288003
precision s3h:  0.0521805081532 recall s3h:  0.107298814722
initial_corpus size: 8455
length of results:  8271 length of test set 6412
F1:  0.0753252060206
precision fusion:  0.0668601136501 recall fusion:  0.0862445414847
for length of results:  16542
F1:  0.0864337370393
precision fusion:  0.0599685648652 recall fusion:  0.154709918902
 for length of results:  24813
F1:  0.0912089671737
precision fusion:  0.0573892717527 recall fusion:  0.222083593263
 for length of results:  33084
F1:  0.0949969617176
precision fusion:  0.0567041470197 recall fusion:  0.292576419214
 for length of results:  41355
F1:  0.0963007934348
precision fusion:  0.05

In [32]:
bab = "faith, religion, religious belief, belief, creed, Faiths, Religions, Religious beliefs, Allegory of faith, Co-religionism, Co-religionist, Co-religionists, Coreligionism, Coreligionist, Coreligionists, Creating Stories, Dereligionization, Faithful, Faithfully, Fictitous, Magical thinking/Revised, Magickal thinking, Relegious, Relgion, Relig, Relig., Religionistic, Religionistical, Religionistically, Religionists, Religious concepts, Religious faith, Religious issues, Religious tradition, Religious traditions, Religiousity, Religon, Relligion, Totalitarian religious group"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Religion'
res_pickle = "results/res__Religion"
Re_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Re_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 4073
initial_corpus size: 4073
babelnet results size of the topic "Religion": 6956
ground truth size 587
intersection with the ground truth: 98
F1 0.0259843563569
babel precision:  0.0140885566417
babel recall:  0.166950596252
initial_corpus size: 4073
length of s3h results:  17721 length of test set 587
F1:  0.011579637317
precision s3h:  0.00598160374697 recall s3h:  0.180579216354
initial_corpus size: 4073
length of results:  6956 length of test set 587
F1:  0.0201511335013
precision fusion:  0.0109258194365 recall fusion:  0.129471890971
for length of results:  13912
F1:  0.016139044072
precision fusion:  0.00841000575043 recall fusion:  0.199318568995
 for length of results:  20868
F1:  0.0132370076905
precision fusion:  0.00680467701744 recall fusion:  0.241908006814
 for length of results:  27824
F1:  0.0114744289184
precision fusion:  0.00585825186889 recall fusion:  0.277683134583
 for length of results:  34780
F1:  0.0107444793169
precision fusion:  0.005

In [34]:
bab = " physiology, animal physiology, History of physiology, Institutes of Medicine, Phisiology, Physiologic, Physiological, Physiologists"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Physiology'
res_pickle = "results/res__Physiology"
Phys_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Phys_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 8525
initial_corpus size: 8525
babelnet results size of the topic "Physiology": 8494
ground truth size 2761
intersection with the ground truth: 123
F1 0.0218569524656
babel precision:  0.0144808099835
babel recall:  0.0445490764216
initial_corpus size: 8525
length of s3h results:  2208 length of test set 2761
F1:  0.00120748641578
precision s3h:  0.00135869565217 recall s3h:  0.00108656283955
initial_corpus size: 8525
length of results:  8494 length of test set 2761
F1:  0.00941803642825
precision fusion:  0.00623969861078 recall fusion:  0.0191959434987
for length of results:  16988
F1:  0.00951946934022
precision fusion:  0.00553331763598 recall fusion:  0.0340456356393
 for length of results:  25482
F1:  0.00977233296746
precision fusion:  0.00541558747351 recall fusion:  0.0499818906193
 for length of results:  33976
F1:  0.00881944633476
precision fusion:  0.00476807157994 recall fusion:  0.0586743933357
 for length of results:  42470
F1:  0.0083128827574
prec

In [35]:
bab = " pathology, Pathology as a medical specialty, General pathology, Autopsy Surgeon, Pathoanatomy, Pathobiology, Pathologic processes, Pathological, Pathological case, Pathologically, Pathologies, Pathologism, Pathologisms, Pathology as a science, Study of disease"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Pathology'
res_pickle = "results/res__Pathology"
Path_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Path_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 8384
initial_corpus size: 8384
babelnet results size of the topic "Pathology": 8544
ground truth size 2726
intersection with the ground truth: 174
F1 0.0308784383319
babel precision:  0.0203651685393
babel recall:  0.063829787234
initial_corpus size: 8384
length of s3h results:  7065 length of test set 2726
F1:  0.0157287304668
precision s3h:  0.0108987968861 recall s3h:  0.0282465150404
initial_corpus size: 8384
length of results:  8544 length of test set 2726
F1:  0.0204081632653
precision fusion:  0.0134597378277 recall fusion:  0.0421863536317
for length of results:  17088
F1:  0.0212980720703
precision fusion:  0.0123478464419 recall fusion:  0.0774027879677
 for length of results:  25632
F1:  0.0214401579801
precision fusion:  0.0118601747815 recall fusion:  0.111518708731
 for length of results:  34176
F1:  0.020703484906
precision fusion:  0.0111774344569 recall fusion:  0.140132061629
 for length of results:  42720
F1:  0.0203758306562
precision fusion:  0

In [36]:
bab = "mycology, fungology, History of mycology, Micology, Mycological, Mycologists, Study of fungi"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Mycology'
res_pickle = "results/res__Mycology"
Myc_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Myc_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 179
initial_corpus size: 179
babelnet results size of the topic "Mycology": 542
ground truth size 530
intersection with the ground truth: 3
F1 0.00559701492537
babel precision:  0.00553505535055
babel recall:  0.00566037735849
initial_corpus size: 179
length of s3h results:  4269 length of test set 530
F1:  0.0041675349031
precision s3h:  0.00234246896229 recall s3h:  0.0188679245283
initial_corpus size: 179
length of results:  542 length of test set 530
F1:  0.00373134328358
precision fusion:  0.00369003690037 recall fusion:  0.00377358490566
for length of results:  1084
F1:  0.00371747211896
precision fusion:  0.00276752767528 recall fusion:  0.00566037735849
 for length of results:  1626
F1:  0.00556586270872
precision fusion:  0.00369003690037 recall fusion:  0.011320754717
 for length of results:  2168
F1:  0.00444773906597
precision fusion:  0.00276752767528 recall fusion:  0.011320754717
 for length of results:  2710
F1:  0.00432098765432
precision fusion:  

In [37]:
bab = " immunology, immunobiology, immunological, immunologist, Clinical immunology, Classical immunology, Evolutionary immunology, Immunologic, Immunologists, Imunology"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Immunology'
res_pickle = "results/res__Immunology"
Imm_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Imm_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 1514
initial_corpus size: 1514
babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
F1 0.0299498377102
babel precision:  0.023102310231
babel recall:  0.0425665758021
initial_corpus size: 1514
length of s3h results:  4262 length of test set 4769
F1:  0.0301184807884
precision s3h:  0.0319099014547 recall s3h:  0.0285175089117
initial_corpus size: 1514
length of results:  8787 length of test set 4769
F1:  0.0346709943936
precision fusion:  0.0267440537157 recall fusion:  0.0492765778989
for length of results:  17574
F1:  0.0360739381462
precision fusion:  0.0229316035052 recall fusion:  0.0845040889075
 for length of results:  26361
F1:  0.0352714423386
precision fusion:  0.0208262205531 recall fusion:  0.115118473475
 for length of results:  35148
F1:  0.0333191372097
precision fusion:  0.0189199954478 recall fusion:  0.139442231076
 for length of results:  43935
F1:  0.031208935611
precision fusion:  

In [38]:
bab = "Biomaterial, Biomaterials Engineering, Bio material, Biomaterials"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Biomaterials'
res_pickle = "results/res__Biomaterials"
bioma_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(bioma_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2442
initial_corpus size: 2442
babelnet results size of the topic "Biomaterials": 3649
ground truth size 1020
intersection with the ground truth: 92
F1 0.0394088669951
babel precision:  0.0252123869553
babel recall:  0.0901960784314
initial_corpus size: 2442
length of s3h results:  6157 length of test set 1020
F1:  0.0359481677581
precision s3h:  0.0209517622219 recall s3h:  0.126470588235
initial_corpus size: 2442
length of results:  3649 length of test set 1020
F1:  0.0334118654958
precision fusion:  0.0213757193752 recall fusion:  0.0764705882353
for length of results:  7298
F1:  0.040153883145
precision fusion:  0.0228829816388 recall fusion:  0.163725490196
 for length of results:  10947
F1:  0.0402774295981
precision fusion:  0.0220151639719 recall fusion:  0.236274509804
 for length of results:  14596
F1:  0.038550204918
precision fusion:  0.0206220882434 recall fusion:  0.295098039216
 for length of results:  18245
F1:  0.0366467687516
precision fusion:  0.

In [39]:
bab = "nursing, Nursing Science, Staff nurse, Adult nursing, Flightnurse, nurse, Nursing History, Nursing Officer, Nursing practice, Nursing skills, Nursing staff, Nursing Student, Nursing unit, Nurxing, Practice of nursing"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Nursing'
res_pickle = "results/Nursing_results.pickle"
Nurs_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Nurs_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 8160
initial_corpus size: 8160
babelnet results size of the topic "Nursing": 8252
ground truth size 3282
intersection with the ground truth: 276
F1 0.0478585052887
babel precision:  0.0334464372273
babel recall:  0.0840950639854
initial_corpus size: 8160
length of s3h results:  12509 length of test set 3282
F1:  0.117661959344
precision s3h:  0.0742665280998 recall s3h:  0.283059110299
initial_corpus size: 8160
length of results:  8252 length of test set 3282
F1:  0.0508063117739
precision fusion:  0.0355065438682 recall fusion:  0.0892748324193
for length of results:  16504
F1:  0.124229253007
precision fusion:  0.0744667959283 recall fusion:  0.374466788544
 for length of results:  24756
F1:  0.115557457736
precision fusion:  0.0654386815317 recall fusion:  0.493601462523
 for length of results:  33008
F1:  0.103224028658
precision fusion:  0.0567438196801 recall fusion:  0.570688604509
 for length of results:  41260
F1:  0.0924969691527
precision fusion:  0.0499