In [6]:
import numpy as np
import pandas as pd
import random, pickle, argparse, json, os, urllib2
from collections import OrderedDict
from operator import itemgetter
from sklearn.ensemble import RandomForestClassifier

In [48]:
# functions
########################################################################################
def query_from(q, f):
    q = q+'&from='+str(f)
    response = urllib2.urlopen(q)
    data = json.load(response)
    subject_ids = np.array(range(len(data['hits'])), dtype=np.object)
    for (i, hit) in enumerate(data['hits']):
        subject_ids[i] = hit['id']
    return subject_ids

def query(q):
    response = urllib2.urlopen(q)
    data = json.load(response)
    nb_requests = 1 + data['total'] / 1000
    if nb_requests > 10: # maximum number of pages due to API pagination restrection
        nb_requests = 10
    subject_ids = query_from(q, 0)
    for i in range(nb_requests)[1:]:
        f = i * 1000
        next_request = query_from(q, f)
        subject_ids = np.hstack((subject_ids, next_request))
    return subject_ids.tolist()

def find_intersection(list_a, list_b):
    return list(set(list_a) & set(list_b))

def term2url(string):
    string = string.split(' ')
    res = '%22'
    for s in string:
        res = res + s + '%20'
    res = res[:-3]
    res = res + '%22'
    return res

def babel_synset(synset):
    q = 'https://api.istex.fr/document/?q=(('
    for syn in synset:
        syn = term2url(syn)
        q = q + 'title:' + syn + '%20OR%20abstract:' + syn + '%20OR%20'
    q = q[:-8]
    q = q + ')%20AND%20(qualityIndicators.abstractWordCount:[35%20500]%20AND%20qualityIndicators.pdfPageCount:[3%2060]%20AND%20publicationDate:[1990%202016]%20AND%20language:(%22eng%22%20OR%20%22unknown%22)%20AND%20genre:(%22research_article%22%20OR%20%22conference[eBooks]%22%20OR%20%22article%22%20)%20))&size=1000&output=id'
    return q

def babel_subj_keyword(topic):
    q = 'https://api.istex.fr/document/?q=(('
    topic = term2url(topic)
    q = q+ 'subject.value:' + topic + '%20OR%20keywords:' + topic
    q = q + ')%20AND%20(qualityIndicators.abstractWordCount:[35%20500]%20AND%20qualityIndicators.pdfPageCount:[3%2060]%20AND%20publicationDate:[1990%202016]%20AND%20language:(%22eng%22%20OR%20%22unknown%22)%20AND%20genre:(%22research_article%22%20OR%20%22conference[eBooks]%22%20OR%20%22article%22%20)%20))&size=1000&output=id'
    return q
 
def babel_title_abst(topic):
    q = 'https://api.istex.fr/document/?q=(('
    topic = term2url(topic)
    q = q+ 'title:' + topic + '%20OR%20abstract:' + topic
    q = q + ')%20AND%20(qualityIndicators.abstractWordCount:[35%20500]%20AND%20qualityIndicators.pdfPageCount:[3%2060]%20AND%20publicationDate:[1990%202016]%20AND%20language:(%22eng%22%20OR%20%22unknown%22)%20AND%20genre:(%22research_article%22%20OR%20%22conference[eBooks]%22%20OR%20%22article%22%20)%20))&size=1000&output=id'
    return q
 
def babelnet_syn_get_input(topic, synset):
    results = query(babel_synset(synset))
    _gs = query(babel_subj_keyword(topic))
    results = find_intersection(results, inversed_index.keys())
    _abst_title = query(babel_title_abst(topic))
    test_set = _inter = {x for x in _gs if x not in _abst_title}
    test_set = find_intersection(test_set, inversed_index.keys())
    results = list(results)
    test = list(test_set)
    print 'initial_corpus size:', len(find_intersection(_abst_title, inversed_index.keys()))
    return results, test

def babelnet_eval(topic, synset):
    babelnet_results, test = babelnet_syn_get_input(topic, synset)
    print 'results size of the topic "' + topic + '":', len(babelnet_results) 
    print 'ground truth size', len(test)
    babel_test_intersection = find_intersection(test,babelnet_results)
    babel_test_intersection_size = len(babel_test_intersection)
    print 'intersection with the ground truth:', babel_test_intersection_size
    
def top_thresh(ordered_dict_pickle, thresh):
    ranked_all = pickle.load(open(ordered_dict_pickle, 'rb'))
    ranked_all_np = np.array(ranked_all.items())
    ranked_all_df = pd.DataFrame(data=ranked_all_np, index=None, columns=['istex_id', 'score'])
    ranked_all_df['score'] = ranked_all_df[['score']].astype(float)
    return ranked_all_df[ranked_all_df['score'] > thresh]

def top_thresh_lst(res_lst_pickle, thresh):
    ranked_all = pickle.load(open(res_lst_pickle, 'rb'))
    if type(ranked_all) is OrderedDict:
        ranked_all = ranked_all.items()[:100000]
    ranked_all_np = np.array(ranked_all)
    ranked_all_df = pd.DataFrame(data=ranked_all_np, index=None, columns=['istex_id', 'score'])
    ranked_all_df['score'] = ranked_all_df[['score']].astype(float)
    return ranked_all_df[ranked_all_df['score'] > thresh]

def babelnet_eval_PR(topic, synset):
    babelnet_results, test = babelnet_syn_get_input(topic, synset)
    print 'babelnet results size of the topic "' + topic + '":', len(babelnet_results) 
    print 'ground truth size', len(test)
    babel_test_intersection = find_intersection(test,babelnet_results)
    babel_test_intersection_size = len(babel_test_intersection)
    print 'intersection with the ground truth:', babel_test_intersection_size
    precision = babel_test_intersection_size / float(len(babelnet_results))
    recall = babel_test_intersection_size / float(len(test))
    if babel_test_intersection_size is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0.0 
    print "F1", F1
    print 'babel precision: ', precision
    print 'babel recall: ', recall
    

#Evaluate 3SH results at treshold
def eval_all_at_thresh(ordered_dict_pickle, topic, synset, thresh):
    babelnet_eval_PR(topic, synset)
    _, test = babelnet_syn_get_input(topic, synset)
    t = len(test)
    top_res = top_thresh(ordered_dict_pickle, thresh)
    n = len(top_res)
    if n > 10000:
        thresh = thresh + 0.1
        top_res = top_thresh(ordered_dict_pickle, thresh)
    elif n < 1000:
        thresh = thresh - 0.1
        top_res = top_thresh(ordered_dict_pickle, thresh)
    n = len(top_res)
    res = list(top_res['istex_id'])
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    return precision, recall, n, t, thresh

#Evaluate 3SH results list at treshold
def eval_all_at_thresh_lst(res_pickle, topic, synset, thresh=0.75):
    babelnet_eval_PR(topic, synset)
    _, test = babelnet_syn_get_input(topic, synset)
    t = len(test)
    top_res = top_thresh_lst(res_pickle, thresh)
    n = len(top_res)
    if n > 10000:
        thresh = thresh + 0.1
        top_res = top_thresh_lst(res_pickle, thresh)
        if len(top_res) > 10000:
            thresh = thresh + 0.05
            top_res = top_thresh_lst(res_pickle, thresh)
    elif n < 1000:
        thresh = thresh - 0.1
        top_res = top_thresh_lst(res_pickle, thresh)
        if len(top_res) > 1000:
            thresh = thresh - 0.05
            top_res = top_thresh_lst(res_pickle, thresh)
    n = len(top_res)
    res = list(top_res['istex_id'])
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    F1 = 2 * (precision * recall) / (precision + recall)
    print "length of s3h results: ", n, "length of test set", t
    print "F1: ", F1
    print "precision s3h: ", precision, "recall s3h: ", recall

def get_fusion_res3(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = fus * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

def get_fusion_res4(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = fus * 2 * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

def get_fusion_res5(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(range(100000))
    fus = fus * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

def get_fusion_res6(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(range(100000))
    fus = fus * 3 * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (2*i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

#Evaluate fusion df
def eval_all(fusion_df, topic, synset, res_pickle, thresh=0.75):
    eval_all_at_thresh_lst(res_pickle, topic, synset, thresh)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = len(babelnet_res)
    t = len(test)

    print "length of results: ", n, "length of test set", t
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall

    n = 2 * len(babelnet_res)
    print "for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall

    n = 3 * len(babelnet_res)
    print " for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall
    
    n = 4 * len(babelnet_res)
    print " for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall
    
    n = 5 * len(babelnet_res)
    print " for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1  
    print "precision fusion: ", precision, "recall fusion: ",recall

In [8]:
#loading SDV of istex articles
inv_index = json.load(open('../RecSys_Exp_files/182_381_vec150_results/output_paragraph_inversed_index.json','rb'))
print 'original inversed_index'
print inv_index.items()[:3]
inversed_index = dict()
for (k, v) in inv_index.items():
    key = k.split('_')[1]
    inversed_index[key] = v
print 'processed inversed_index'
print inversed_index.items()[:3]

original inversed_index
[(u'ISTEX_D89FA3AC3521074D46F4245762153DF497BFFA1F', 2002320), (u'ISTEX_18EAF4D6A126B077EB38667801D1B7292F32FF49', 2483732), (u'ISTEX_5F91044435FCC4FABB9F02E31467DCFE75F4A7BE', 1429049)]
processed inversed_index
[(u'FCF1393F9B8136AC08FB67E88F94F3CF62C17288', 3517138), (u'482E1102A1114327A744FD2ADB4D9F8FF7E9A70B', 751643), (u'A81022B6295AE66F68A10222C3B94A06B033C1BA', 3983232)]


In [4]:
def get_fusion_res(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = (fus + len(babel_results)) / 2
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

In [11]:
def get_fusion_res3(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = fus * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

In [17]:
bab = 'organ transplant, transplant, transplantation, Organ transplantation, Medical Transplantation, Transplant Surgery, Black market organs, First transplant, First transplantation, Intestinal transplant, Live organ transplants, Mixed chimerism, Organ and Tissue Donor, Organ doner, Organ farming, Organ transplantation in different countries, Organ transplantation therapy, Organ transplants, Skin transplant, Tissue transplant, Transplant Tourism and Organ Trafficking, Transplantation medicine, Transplantation surgery, Transplantation therapy, Transplanted organs, Transplantology'
synset = bab.split(',')
topic = 'Transplantation'
s3h_res_pickle = "results/res_Transplantation"
Transplantation_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [20]:
bab = 'spectroscopy, spectrographic analysis, spectrometry, spectroscopic analysis, spectrum analysis, Fingerprint region, Laser spectroscopy, Optical spectroscopy, Atomic line, Atomic line spectra, Atomic spectra, Atomic spectral line, Electromagnetic spectroscopy, Emission spectrochemical analysis, Spectrochemical Analysis, Spectrography, Spectrology, Spectroscopic, Spectroscopist, Spectroscopists'
synset = bab.split(',')
topic = 'Spectroscopy'
s3h_res_pickle = "results/res__Spectroscopy"
Spectroscopy_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [24]:
bab = ' surgery, operation, surgical operation, surgical procedure, surgical process, Chirurgery, Chirurgical, Chirurgy, Complications of surgery, Corrective surgery, Elliptical excision, Emergency Surgery, Post-operation, Post-operative, Specialties in surgery, Sterile drapes, surgeon, Surgeons, Surgeries, Surgery in general practice, Surgery operation, Surgery specialties, Surgical, Surgical excision, Surgical excision of malignant lesions, Surgical specialties, Surgical technique, Surgically'
synset = bab.split(',')
topic = 'Surgery'
s3h_res_pickle = "results/res__Surgery"
Surgery_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [25]:
bab = 'optics, Classical optics, Light physics, Optical, Optical system'
synset = bab.split(',')
topic = 'Optics'
s3h_res_pickle = "results/res__Optics"
Optics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [26]:
bab = 'literature, Literary art, Literary, Literary arts, Literary work, Literary works, LiteraryArt, Literature by region, Literatures, Litterature, Ltierature, Prose fiction'
synset = bab.split(',')
topic = 'Literature'
s3h_res_pickle = "results/res__Literature"
Literature_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [27]:
bab = 'toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists'
synset = bab.split(',')
topic = 'Toxicology'
s3h_res_pickle = "results/res__Toxicology"
Toxicology_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [28]:
bab = 'artificial intelligence, AI, artilect, Artifical intelligence, Cognitive systems, Digital being, Machine intelligence, A.I., A I, AI-D, AI ethics, AI implications, AI Robotics, AI scripting, Aretificial intelligence, Artificial-intelligence, Artificial conciousness, Artificial inteligence, Artificial intellect, Artificial intellegence, Artificial Intelligence., Artificial intelligence for development, Artificial Intelligence Program, Artificial intelligences, Artificially-intelligent, Artificially intelligent, Artificual intelligence, Cognitive simulation, Cognitive system, Commonsense AI, Computational Rationality, Computer AI, Intelligent machine, Machine thought, Machine understanding, Ontology based approach, Pseudo intelligence, Semi-AI, Semi AI, Simulated intelligence, Soft AI, Sub-symbolic, Subsymbolic, The Artificial Intelligence, The Theory of Artificial Intelligence'
synset = bab.split(',')
topic = 'artificial intelligence'
s3h_res_pickle = "results/AI_results.pickle"
AI_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

KeyboardInterrupt: 

In [29]:
bab = 'cybernetics, Cybernetic, Cybernetic system, cybernetician, Cyberneticians, cyberneticist, Tha Masta'
synset = bab.split(',')
topic = 'Cybernetics'
s3h_res_pickle = "results/res__Cybernetics"
Cybernetics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [None]:
bab = 'information system, data system, information systems, system info, Business computing, Computer information system, Computer information systems, Elements of Information System, Information in Computer Science, Information systems and technology, Information systems discipline, Information systems theory, Informationssystem'
synset = bab.split(',')
topic = 'Information Systems'
s3h_res_pickle = "results/infosys_results.pickle"
Information_Systems_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [30]:
bab = 'immunology, immunobiology, immunological, immunologist, Clinical immunology, Classical immunology, Evolutionary immunology, Immunologic, Immunologists, Imunologys'
synset = bab.split(',')
topic = 'Immunology'
s3h_res_pickle = "results/res__Immunology"
Immunology_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [None]:
bab = 'infection, infectious disease, communicable diseases, contagion, Infectious diseases, Acute infection, AIDS-related bacterial infections, AIDS-related viral infections, Anti-infective, Anti-infectives, Antiinfective, Bacterial Infections, Communicable disease, Contagious diseases, Definition to contagious, Infect, Infecting, Infectiology, Infections, Infectious, Infectious disease epidemiology, Infectious disease medicine, Infectology, Local infection, Primary infection, Rochalimea infections, Secondary infection, Tropical bacterial infections, Tropical infections, Viral Infections, Wound colonization, Wound infection'
synset = bab.split(',')
topic = 'Infectious Diseases'
s3h_res_pickle = "results/Infectious_results.pickle"
Infectious_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [31]:
bab = 'Biomaterial, Biomaterials Engineering, Bio material, Biomaterials'
synset = bab.split(',')
topic = 'Biomaterials'
s3h_res_pickle = "results/res__Biomaterials"
Biomaterials_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [32]:
bab = 'ceramics, Ceramic art, Art pottery, Art ware, Ceramic artist, Ceramic paint, Ceramics art, Fine art pot, Vase painting'
synset = bab.split(',')
topic = 'Ceramics'
s3h_res_pickle = "results/res__Ceramics"
Ceramics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [33]:
topic = "biophysics"
synset_text = "biophysics, Biological physics, Biophysical, Biophysicists, History of biophysics"
synset = synset_text.split(',')
s3h_res_pickle = 'results/res__biophysics'
biophysics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [12]:
ind_dict = OrderedDict((k,i) for i,k in enumerate(biophysics_fusion_res))
babel_results, test = babelnet_syn_get_input(topic, synset)
inter = set(ind_dict).intersection(babel_results)
indices = [ind_dict[x] for x in inter ]
cutt = np.array(indices).max()
cutt

99999

In [None]:
slic = len(indices) * 0.8
indices_slic = indices[:int(slic)]
#indices_slic

In [None]:
fusion_res = fusion_df.sort_values("fusin_rank")["istex_id"].tolist()
#fusion_res[:5000]

In [5]:
#Evaluate fusion df with manual cut
def eval_fusin_at_thresh_lst_1k(fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = len(babelnet_res)
    if n < 1000:
        n = 1000
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    return precision, recall, n, t

In [6]:
#Evaluate fusion df
def eval_fusin_at_5k(fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = 5000#len(babelnet_res)
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    print "precision fusion: " precision, "recall fusion: ",recall
    print "length of results: ", n, "length of test set", t

In [7]:
#Evaluate fusion df
def eval_fusin_at_2_bablesize((fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = 2 * len(babelnet_res)
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    print "precision fusion: " precision, "recall fusion: ",recall
    print "length of results: ", n, "length of test set", t

In [8]:
#Evaluate fusion df
def eval_fusin_at_bablesize(fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = len(babelnet_res)
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    print "precision fusion: " precision, "recall fusion: ",recall
    print "length of results: ", n, "length of test set", t

In [9]:
def get_fusion_res_2(s3h_res_pickle, topic, synset):
    topic_s3h_top100k_results = pickle.load(open(s3h_res_pickle,'rb'))
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = (fus + 100000) / 2
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

In [14]:
#Evaluate fusion df
def eval_all(fusion_df, topic, synset, res_pickle, tresh=0.75):
    eval_all_at_thresh_lst(res_pickle, topic, synset, thresh)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = len(babelnet_res)
    t = len(test)

    print "length of results: ", n, "length of test set", t
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    F1 = 2 * (precision * recall) / (precision + recall)
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall

    n = 2 * len(babelnet_res)
    print "for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    F1 = 2 * (precision * recall) / (precision + recall)
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall

    n = 3 * len(babelnet_res)
    print " for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    F1 = 2 * (precision * recall) / (precision + recall)
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall
    
    

In [41]:
bab = 'toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists'
synset = bab.split(',')
topic = 'Toxicology'
get_fusion_res_2("results/res__Toxicology", topic, synset)
eval_fusin_at_thresh_lst(Toxicology_fusion_res, topic, synset)

babelnet results size of the topic "Toxicology": 5563
ground truth size 1552
intersection with the ground truth: 146
babel precision:  0.0262448319252
babel recall:  0.0940721649485


(0.004853496314937983, 0.017396907216494846, 5563, 1552)

In [52]:
bab = 'toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists'
synset = bab.split(',')
topic = 'Toxicology'
get_fusion_res_2("results/res__Toxicology", topic, synset)
eval_fusin_at_thresh_lst(Toxicology_fusion_res, topic, synset)
eval_fusin_at_thresh_lst2(Toxicology_fusion_res, topic, synset)

babelnet results size of the topic "Toxicology": 5563
ground truth size 1552
intersection with the ground truth: 146
babel precision:  0.0262448319252
babel recall:  0.0940721649485


(0.004853496314937983, 0.017396907216494846, 5563, 1552)

In [42]:
bab = 'Biomaterial, Biomaterials Engineering, Bio material, Biomaterials'
synset = bab.split(',')
topic = 'Biomaterials'
get_fusion_res_2("results/res__Biomaterials", topic, synset)
eval_fusin_at_thresh_lst(Biomaterials_fusion_res, topic, synset)

babelnet results size of the topic "Biomaterials": 3649
ground truth size 1020
intersection with the ground truth: 92
babel precision:  0.0252123869553
babel recall:  0.0901960784314


(0.00356261989586188, 0.012745098039215686, 3649, 1020)

In [54]:
bab = 'Biomaterial, Biomaterials Engineering, Bio material, Biomaterials'
synset = bab.split(',')
topic = 'Biomaterials'
get_fusion_res_2("results/res__Biomaterials", topic, synset)
eval_fusin_at_thresh_lst(Biomaterials_fusion_res, topic, synset)
eval_fusin_at_thresh_lst2(Toxicology_fusion_res, topic, synset)

babelnet results size of the topic "Biomaterials": 3649
ground truth size 1020
intersection with the ground truth: 92
babel precision:  0.0252123869553
babel recall:  0.0901960784314
babelnet results size of the topic "Biomaterials": 3649
ground truth size 1020
intersection with the ground truth: 92
babel precision:  0.0252123869553
babel recall:  0.0901960784314


(0.0, 0.0, 7298, 1020)

In [55]:
bab = 'immunology, immunobiology, immunological, immunologist, Clinical immunology, Classical immunology, Evolutionary immunology, Immunologic, Immunologists, Imunologys'
synset = bab.split(',')
topic = 'Immunology'
get_fusion_res_2("results/res__Immunology", topic, synset)
eval_fusin_at_thresh_lst(Immunology_fusion_res, topic, synset)
eval_fusin_at_thresh_lst2(Immunology_fusion_res, topic, synset)

babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021
babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021


(0.010356208034596564, 0.03816313692598029, 17574, 4769)

In [58]:
eval_fusin_at_thresh_lst(Immunology_fusion_res, topic, synset)

babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021
0.0124 0.0130006290627 5000 4769


In [61]:
eval_fusin_at_thresh_lst3(Immunology_fusion_res, topic, synset)

babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021
0.0119494708091 0.0220171943804 8787 4769


In [None]:
bab = 'immunology, immunobiology, immunological, immunologist, Clinical immunology, Classical immunology, Evolutionary immunology, Immunologic, Immunologists, Imunologys'
synset = bab.split(',')
topic = 'Immunology'
get_fusion_res_2("results/res__Immunology", topic, synset)
eval_fusin_at_thresh_lst2(Immunology_fusion_res, topic, synset)

In [46]:
bab = 'cybernetics, Cybernetic, Cybernetic system, cybernetician, Cyberneticians, cyberneticist, Tha Masta'
synset = bab.split(',')
topic = 'Cybernetics'
get_fusion_res_2("", topic, synset)
eval_fusin_at_thresh_lst2(Cybernetics_fusion_res, topic, synset)

babelnet results size of the topic "Cybernetics": 511
ground truth size 596
intersection with the ground truth: 47
babel precision:  0.0919765166341
babel recall:  0.0788590604027


(0.004, 0.006711409395973154, 1000, 596)

In [63]:
bab = 'cybernetics, Cybernetic, Cybernetic system, cybernetician, Cyberneticians, cyberneticist, Tha Masta'
synset = bab.split(',')
topic = 'Cybernetics'
Cybernetics_fusion_res = get_fusion_res_2("results/res__Cybernetics", topic, synset)
eval_fusin_at_thresh_lst2(Cybernetics_fusion_res, topic, synset)

babelnet results size of the topic "Cybernetics": 511
ground truth size 596
intersection with the ground truth: 47
babel precision:  0.0919765166341
babel recall:  0.0788590604027
0.0283757338552 0.0486577181208 1022 596


In [None]:
eval_all_at_thresh('results/AI_results.pickle', topic , synset, thresh = 0.75)

In [47]:
bab = 'literature, Literary art, Literary, Literary arts, Literary work, Literary works, LiteraryArt, Literature by region, Literatures, Litterature, Ltierature, Prose fiction'
synset = bab.split(',')
topic = 'Literature'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Literature_fusion_res, topic, synset)

babelnet results size of the topic "Literature": 7357
ground truth size 860
intersection with the ground truth: 107
babel precision:  0.0145439717276
babel recall:  0.124418604651


(0.0020388745412532284, 0.01744186046511628, 7357, 860)

In [48]:
bab = 'optics, Classical optics, Light physics, Optical, Optical system'
synset = bab.split(',')
topic = 'Optics'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Optics_fusion_res, topic, synset)

babelnet results size of the topic "Optics": 8730
ground truth size 3286
intersection with the ground truth: 271
babel precision:  0.0310423825888
babel recall:  0.0824710894705


(0.015005727376861398, 0.039866098600121726, 8730, 3286)

In [49]:
bab = ' surgery, operation, surgical operation, surgical procedure, surgical process, Chirurgery, Chirurgical, Chirurgy, Complications of surgery, Corrective surgery, Elliptical excision, Emergency Surgery, Post-operation, Post-operative, Specialties in surgery, Sterile drapes, surgeon, Surgeons, Surgeries, Surgery in general practice, Surgery operation, Surgery specialties, Surgical, Surgical excision, Surgical excision of malignant lesions, Surgical specialties, Surgical technique, Surgically'
synset = bab.split(',')
topic = 'Surgery'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Surgery_fusion_res, topic, synset)

babelnet results size of the topic "Surgery": 8271
ground truth size 6412
intersection with the ground truth: 565
babel precision:  0.0683109660259
babel recall:  0.0881160324392


(0.027687099504292104, 0.03571428571428571, 8271, 6412)

In [50]:
bab = 'spectroscopy, spectrographic analysis, spectrometry, spectroscopic analysis, spectrum analysis, Fingerprint region, Laser spectroscopy, Optical spectroscopy, Atomic line, Atomic line spectra, Atomic spectra, Atomic spectral line, Electromagnetic spectroscopy, Emission spectrochemical analysis, Spectrochemical Analysis, Spectrography, Spectrology, Spectroscopic, Spectroscopist, Spectroscopists'
synset = bab.split(',')
topic = 'Spectroscopy'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Spectroscopy_fusion_res, topic, synset)

babelnet results size of the topic "Spectroscopy": 8513
ground truth size 7294
intersection with the ground truth: 287
babel precision:  0.0337131446024
babel recall:  0.0393474088292


(0.025138024198284977, 0.029339182890046615, 8513, 7294)

In [None]:
bab = 'artificial intelligence, AI, artilect, Artifical intelligence, Cognitive systems, Digital being, Machine intelligence, A.I., A I, AI-D, AI ethics, AI implications, AI Robotics, AI scripting, Aretificial intelligence, Artificial-intelligence, Artificial conciousness, Artificial inteligence, Artificial intellect, Artificial intellegence, Artificial Intelligence., Artificial intelligence for development, Artificial Intelligence Program, Artificial intelligences, Artificially-intelligent, Artificially intelligent, Artificual intelligence, Cognitive simulation, Cognitive system, Commonsense AI, Computational Rationality, Computer AI, Intelligent machine, Machine thought, Machine understanding, Ontology based approach, Pseudo intelligence, Semi-AI, Semi AI, Simulated intelligence, Soft AI, Sub-symbolic, Subsymbolic, The Artificial Intelligence, The Theory of Artificial Intelligence'
synset = bab.split(',')
topic = 'artificial intelligence'
AI_fusion_res = get_fusion_res3("results/AI_results.pickle", topic, synset)
eval_all(AI_fusion_res, topic, synset, "results/AI_results.pickle", tresh=0.75)

In [10]:
bab = 'infection, infectious disease, communicable diseases, contagion, Infectious diseases, Acute infection, AIDS-related bacterial infections, AIDS-related viral infections, Anti-infective, Anti-infectives, Antiinfective, Bacterial Infections, Communicable disease, Contagious diseases, Definition to contagious, Infect, Infecting, Infectiology, Infections, Infectious, Infectious disease epidemiology, Infectious disease medicine, Infectology, Local infection, Primary infection, Rochalimea infections, Secondary infection, Tropical bacterial infections, Tropical infections, Viral Infections, Wound colonization, Wound infection'
synset = bab.split(',')
topic = 'Infectious Diseases'
ID_fusion_res = get_fusion_res3("results/Infectious_results.pickle", topic, synset)
eval_all(ID_fusion_res, topic, synset, "results/Infectious_results.pickle", thresh=0.75)

initial_corpus size: 2588


URLError: <urlopen error [Errno -3] Temporary failure in name resolution>

In [15]:
bab = "respiratory system, systema respiratorium, Respiration organ, Breathing organ, Breathing system, Development of respiratory system, Human Respiration, Human respiratory system, Pulmonary respiration, Pulmonary system, Respatory system, Respiration of human, Respiration system, Respiratory, Respiratory organs, Respiratory system agents, Respiratory system disorders, Respiratory systems, Respitory System, Subglottic airway, The respiratory system"
synset = bab.split(',')
topic = 'Respiratory system'
ID_fusion_res = get_fusion_res3("results/Respiratory_system_results.pickle", topic, synset)
eval_all(ID_fusion_res, topic, synset, "results/Respiratory_system_results.pickle", thresh=0.75)

initial_corpus size: 795
initial_corpus size: 795
babelnet results size of the topic "Respiratory system": 8968
ground truth size 189
intersection with the ground truth: 18
F1 0.00393141858687
babel precision:  0.00200713648528
babel recall:  0.0952380952381
initial_corpus size: 795
length of s3h results:  1120 length of test set 189
F1:  0.0061115355233
precision s3h:  0.00357142857143 recall s3h:  0.021164021164


URLError: <urlopen error [Errno -3] Temporary failure in name resolution>

In [16]:
eval_all(ID_fusion_res, topic, synset, "results/Respiratory_system_results.pickle", thresh=0.75)

initial_corpus size: 795
babelnet results size of the topic "Respiratory system": 8968
ground truth size 189
intersection with the ground truth: 18
F1 0.00393141858687
babel precision:  0.00200713648528
babel recall:  0.0952380952381
initial_corpus size: 795
length of s3h results:  1120 length of test set 189
F1:  0.0061115355233
precision s3h:  0.00357142857143 recall s3h:  0.021164021164
initial_corpus size: 795
length of results:  8968 length of test set 189
F1:  0.00174729714972
precision fusion:  0.000892060660125 recall fusion:  0.042328042328
for length of results:  17936
F1:  0.00110344827586
precision fusion:  0.000557537912578 recall fusion:  0.0529100529101
 for length of results:  26904
F1:  0.000959657476101
precision fusion:  0.000483199524234 recall fusion:  0.0687830687831
 for length of results:  35872
F1:  0.000831923684867
precision fusion:  0.000418153434434 recall fusion:  0.0793650793651
 for length of results:  44840
F1:  0.000843900597393
precision fusion:  0.00

In [17]:
bab = "international affairs, international relations, world affairs, International politics, Intrel, Diplomacy and Statecraft, Diplomatic relationship, Global relations, International relation, International Relation Studies, International trade relations, Interstate relations, Levels of Analysis in international relations, Study of International Relations"
synset = bab.split(',')
topic = 'international relations'
res_pickle = "results/international_relations_results.pickle"
IR_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(IR_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 724
initial_corpus size: 724
babelnet results size of the topic "international relations": 983
ground truth size 60
intersection with the ground truth: 5
F1 0.00958772770853
babel precision:  0.00508646998983
babel recall:  0.0833333333333
initial_corpus size: 724
length of s3h results:  5867 length of test set 60
F1:  0.00269951071368
precision s3h:  0.0013635588887 recall s3h:  0.133333333333
initial_corpus size: 724
length of results:  983 length of test set 60
F1:  0.00191754554171
precision fusion:  0.00101729399797 recall fusion:  0.0166666666667
for length of results:  1966
F1:  0.000987166831194
precision fusion:  0.000508646998983 recall fusion:  0.0166666666667
 for length of results:  2949
F1:  0.000664672648721
precision fusion:  0.000339097999322 recall fusion:  0.0166666666667
 for length of results:  3932
F1:  0.000501002004008
precision fusion:  0.000254323499491 recall fusion:  0.0166666666667
 for length of results:  4915
F1:  0.000402010050251
pr

In [18]:
bab = "neuroimaging, Brain imaging, Brain scanning, Brain function map, Brain scan, Brain scans, Functional neurological mapping, Neuro-imaging, Neuroradiography"
synset = bab.split(',')
topic = 'Neuroimaging'
res_pickle = "results/Neuroimaging_results.pickle"
Ne_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(Ne_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2497
initial_corpus size: 2497
babelnet results size of the topic "Neuroimaging": 3679
ground truth size 522
intersection with the ground truth: 52
F1 0.0247560104737
babel precision:  0.0141342756184
babel recall:  0.0996168582375
initial_corpus size: 2497
length of s3h results:  9294 length of test set 522
F1:  0.0350448247759
precision s3h:  0.0185065633742 recall s3h:  0.329501915709
initial_corpus size: 2497
length of results:  3679 length of test set 522
F1:  0.00476077124494
precision fusion:  0.00271812992661 recall fusion:  0.0191570881226
for length of results:  7358
F1:  0.00304568527919
precision fusion:  0.00163087795597 recall fusion:  0.0229885057471
 for length of results:  11037
F1:  0.00294143091963
precision fusion:  0.00154027362508 recall fusion:  0.0325670498084
 for length of results:  14716
F1:  0.00301876886731
precision fusion:  0.0015629247078 recall fusion:  0.044061302682
 for length of results:  18395
F1:  0.00296030025903
precision fu

In [19]:
bab = "Toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists"
synset = bab.split(',')
topic = 'Toxicology'
res_pickle = "results/res__Toxicology"
To_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(To_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2550
initial_corpus size: 2550
babelnet results size of the topic "Toxicology": 5563
ground truth size 1552
intersection with the ground truth: 146
F1 0.0410400562193
babel precision:  0.0262448319252
babel recall:  0.0940721649485
initial_corpus size: 2550
length of s3h results:  8547 length of test set 1552
F1:  0.0243588474106
precision s3h:  0.014391014391 recall s3h:  0.0792525773196
initial_corpus size: 2550
length of results:  5563 length of test set 1552
F1:  0.010681658468
precision fusion:  0.00683084666547 recall fusion:  0.0244845360825
for length of results:  11126
F1:  0.00914970815586
precision fusion:  0.00521301456049 recall fusion:  0.0373711340206
 for length of results:  16689
F1:  0.00964859382709
precision fusion:  0.00527293426808 recall fusion:  0.0567010309278
 for length of results:  22252
F1:  0.00957822214754
precision fusion:  0.0051231349991 recall fusion:  0.0734536082474
 for length of results:  27815
F1:  0.00912588960398
precision 

In [31]:
bab = "Toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists"
synset = bab.split(',')
topic = 'Toxicology'
res_pickle = "results/res__Toxicology"
To_fusion_res = get_fusion_res4(res_pickle, topic, synset)
eval_all(To_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2550
initial_corpus size: 2550
babelnet results size of the topic "Toxicology": 5563
ground truth size 1552
intersection with the ground truth: 146
F1 0.0410400562193
babel precision:  0.0262448319252
babel recall:  0.0940721649485
initial_corpus size: 2550
length of s3h results:  8547 length of test set 1552
F1:  0.0243588474106
precision s3h:  0.014391014391 recall s3h:  0.0792525773196
initial_corpus size: 2550
length of results:  5563 length of test set 1552
F1:  0.0137737174982
precision fusion:  0.008808197016 recall fusion:  0.0315721649485
for length of results:  11126
F1:  0.0110427512226
precision fusion:  0.00629156929714 recall fusion:  0.0451030927835
 for length of results:  16689
F1:  0.0107450249438
precision fusion:  0.005872131344 recall fusion:  0.0631443298969
 for length of results:  22252
F1:  0.00999831961015
precision fusion:  0.00534783390257 recall fusion:  0.076675257732
 for length of results:  27815
F1:  0.00973882248783
precision fusio

In [35]:
bab = "Toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists"
synset = bab.split(',')
topic = 'Toxicology'
res_pickle = "results/res__Toxicology"
To_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(To_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2550
initial_corpus size: 2550
babelnet results size of the topic "Toxicology": 5563
ground truth size 1552
intersection with the ground truth: 146
F1 0.0410400562193
babel precision:  0.0262448319252
babel recall:  0.0940721649485
initial_corpus size: 2550
length of s3h results:  8547 length of test set 1552
F1:  0.0243588474106
precision s3h:  0.014391014391 recall s3h:  0.0792525773196
initial_corpus size: 2550
length of results:  5563 length of test set 1552
F1:  0.0309205903022
precision fusion:  0.0197735035053 recall fusion:  0.0708762886598
for length of results:  11126
F1:  0.0310774570121
precision fusion:  0.0177062735934 recall fusion:  0.126932989691
 for length of results:  16689
F1:  0.0288361383696
precision fusion:  0.0157588830967 recall fusion:  0.169458762887
 for length of results:  22252
F1:  0.0261300621744
precision fusion:  0.0139762717958 recall fusion:  0.200386597938
 for length of results:  27815
F1:  0.0237000715088
precision fusion:  

In [40]:
bab = "Toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists"
synset = bab.split(',')
topic = 'Toxicology'
res_pickle = "results/res__Toxicology"
To_fusion_res = get_fusion_res6(res_pickle, topic, synset)
eval_all(To_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2550
initial_corpus size: 2550
babelnet results size of the topic "Toxicology": 5563
ground truth size 1552
intersection with the ground truth: 146
F1 0.0410400562193
babel precision:  0.0262448319252
babel recall:  0.0940721649485
initial_corpus size: 2550
length of s3h results:  8547 length of test set 1552
F1:  0.0243588474106
precision s3h:  0.014391014391 recall s3h:  0.0792525773196
initial_corpus size: 2550
length of results:  5563 length of test set 1552
F1:  0.0309205903022
precision fusion:  0.0197735035053 recall fusion:  0.0708762886598
for length of results:  11126
F1:  0.0310774570121
precision fusion:  0.0177062735934 recall fusion:  0.126932989691
 for length of results:  16689
F1:  0.0288361383696
precision fusion:  0.0157588830967 recall fusion:  0.169458762887
 for length of results:  22252
F1:  0.0261300621744
precision fusion:  0.0139762717958 recall fusion:  0.200386597938
 for length of results:  27815
F1:  0.0237000715088
precision fusion:  

In [41]:
bab = "respiratory system, systema respiratorium, Respiration organ, Breathing organ, Breathing system, Development of respiratory system, Human Respiration, Human respiratory system, Pulmonary respiration, Pulmonary system, Respatory system, Respiration of human, Respiration system, Respiratory, Respiratory organs, Respiratory system agents, Respiratory system disorders, Respiratory systems, Respitory System, Subglottic airway, The respiratory system"
synset = bab.split(',')
topic = 'Respiratory system'
ID_fusion_res = get_fusion_res5("results/Respiratory_system_results.pickle", topic, synset)
eval_all(ID_fusion_res, topic, synset, "results/Respiratory_system_results.pickle", thresh=0.75)

initial_corpus size: 795
initial_corpus size: 795
babelnet results size of the topic "Respiratory system": 8968
ground truth size 189
intersection with the ground truth: 18
F1 0.00393141858687
babel precision:  0.00200713648528
babel recall:  0.0952380952381
initial_corpus size: 795
length of s3h results:  1120 length of test set 189
F1:  0.0061115355233
precision s3h:  0.00357142857143 recall s3h:  0.021164021164
initial_corpus size: 795
length of results:  8968 length of test set 189
F1:  0.00502347930545
precision fusion:  0.00256467439786 recall fusion:  0.121693121693
for length of results:  17936
F1:  0.00342068965517
precision fusion:  0.00172836752899 recall fusion:  0.164021164021
 for length of results:  26904
F1:  0.00250987339903
precision fusion:  0.00126375260184 recall fusion:  0.179894179894
 for length of results:  35872
F1:  0.00216300158066
precision fusion:  0.00108719892953 recall fusion:  0.206349206349
 for length of results:  44840
F1:  0.00195429612028
precisio

In [42]:
bab = "international affairs, international relations, world affairs, International politics, Intrel, Diplomacy and Statecraft, Diplomatic relationship, Global relations, International relation, International Relation Studies, International trade relations, Interstate relations, Levels of Analysis in international relations, Study of International Relations"
synset = bab.split(',')
topic = 'international relations'
res_pickle = "results/international_relations_results.pickle"
IR_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(IR_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 724
initial_corpus size: 724
babelnet results size of the topic "international relations": 983
ground truth size 60
intersection with the ground truth: 5
F1 0.00958772770853
babel precision:  0.00508646998983
babel recall:  0.0833333333333
initial_corpus size: 724
length of s3h results:  5867 length of test set 60
F1:  0.00269951071368
precision s3h:  0.0013635588887 recall s3h:  0.133333333333
initial_corpus size: 724
length of results:  983 length of test set 60
F1:  0.00958772770853
precision fusion:  0.00508646998983 recall fusion:  0.0833333333333
for length of results:  1966
F1:  0.00691016781836
precision fusion:  0.00356052899288 recall fusion:  0.116666666667
 for length of results:  2949
F1:  0.00465270854104
precision fusion:  0.00237368599525 recall fusion:  0.116666666667
 for length of results:  3932
F1:  0.00400801603206
precision fusion:  0.00203458799593 recall fusion:  0.133333333333
 for length of results:  4915
F1:  0.00402010050251
precision fu

In [44]:
bab = "neuroimaging, Brain imaging, Brain scanning, Brain function map, Brain scan, Brain scans, Functional neurological mapping, Neuro-imaging, Neuroradiography"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Neuroimaging'
res_pickle = "results/Neuroimaging_results.pickle"
Ne_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Ne_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2497
initial_corpus size: 2497
babelnet results size of the topic "Neuroimaging": 3679
ground truth size 522
intersection with the ground truth: 52
F1 0.0247560104737
babel precision:  0.0141342756184
babel recall:  0.0996168582375
initial_corpus size: 2497
length of s3h results:  9294 length of test set 522
F1:  0.0350448247759
precision s3h:  0.0185065633742 recall s3h:  0.329501915709
initial_corpus size: 2497
length of results:  3679 length of test set 522
F1:  0.0252320875982
precision fusion:  0.014406088611 recall fusion:  0.10153256705
for length of results:  7358
F1:  0.0378172588832
precision fusion:  0.0202500679532 recall fusion:  0.285440613027
 for length of results:  11037
F1:  0.0358162470802
precision fusion:  0.0187550964936 recall fusion:  0.396551724138
 for length of results:  14716
F1:  0.0312376952356
precision fusion:  0.0161728730633 recall fusion:  0.455938697318
 for length of results:  18395
F1:  0.0284400274885
precision fusion:  0.0146

In [45]:
bab = 'literature, Literary art, Literary, Literary arts, Literary work, Literary works, LiteraryArt, Literature by region, Literatures, Litterature, Ltierature, Prose fiction'
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Literature'
res_pickle = "results/res__Literature"
Lit_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Lit_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 7941
initial_corpus size: 7941
babelnet results size of the topic "Literature": 7357
ground truth size 860
intersection with the ground truth: 107
F1 0.0260435682122
babel precision:  0.0145439717276
babel recall:  0.124418604651
initial_corpus size: 7941
length of s3h results:  8723 length of test set 860
F1:  0.00918292810185
precision s3h:  0.00504413619168 recall s3h:  0.0511627906977
initial_corpus size: 7941
length of results:  7357 length of test set 860
F1:  0.0136302786905
precision fusion:  0.00761179828735 recall fusion:  0.0651162790698
for length of results:  14714
F1:  0.0137408501348
precision fusion:  0.0072719858638 recall fusion:  0.124418604651
 for length of results:  22071
F1:  0.0109894902098
precision fusion:  0.00570884871551 recall fusion:  0.146511627907
 for length of results:  29428
F1:  0.00990491283677
precision fusion:  0.00509718635313 recall fusion:  0.174418604651
 for length of results:  36785
F1:  0.00940363926152
precision fusio

In [46]:
bab = "sociology, Sociological, Sociologists, Marketing sociologist, Marketing sociology, Scientific sociology, Social physics, Socialogy, Sociol, Sociological inquiry, Sociological term, Sociological terms, sociologist, Sociology versus social theory, Sociology vs. Social Theory, Sosiology, Study of culture"
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Sociology'
res_pickle = "results/res__Sociology"
Soc_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Soc_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2356
initial_corpus size: 2356
babelnet results size of the topic "Sociology": 4718
ground truth size 698
intersection with the ground truth: 53
F1 0.0195716395864
babel precision:  0.0112335735481
babel recall:  0.0759312320917
initial_corpus size: 2356
length of s3h results:  13403 length of test set 698
F1:  0.00439685128714
precision s3h:  0.00231291501903 recall s3h:  0.0444126074499
initial_corpus size: 2356
length of results:  4718 length of test set 698
F1:  0.0162481536189
precision fusion:  0.00932598558711 recall fusion:  0.0630372492837
for length of results:  9436
F1:  0.0100651272943
precision fusion:  0.00540483255617 recall fusion:  0.0730659025788
 for length of results:  14154
F1:  0.00794505790466
precision fusion:  0.00416843295182 recall fusion:  0.0845272206304
 for length of results:  18872
F1:  0.00674501788452
precision fusion:  0.00349724459517 recall fusion:  0.0945558739255
 for length of results:  23590
F1:  0.00601119894598
precision f

In [47]:
bab = "robotics, Autonomous Systems, Flying robots, Robotic, Robotic leg, Robotic legs, Socionics, Under water robotics, Unmanned systems"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Robotics'
res_pickle = "results/res__Robotics"
Ro_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Ro_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 1231
initial_corpus size: 1231
babelnet results size of the topic "Robotics": 3705
ground truth size 747
intersection with the ground truth: 242
F1 0.108715184187
babel precision:  0.0653171390013
babel recall:  0.323962516734
initial_corpus size: 1231
length of s3h results:  3122 length of test set 747
F1:  0.0558283794262
precision s3h:  0.0345932094811 recall s3h:  0.144578313253
initial_corpus size: 1231
length of results:  3705 length of test set 747
F1:  0.0974842767296
precision fusion:  0.0585695006748 recall fusion:  0.290495314592
for length of results:  7410
F1:  0.0649748682114
precision fusion:  0.0357624831309 recall fusion:  0.354752342704
 for length of results:  11115
F1:  0.0512561119541
precision fusion:  0.0273504273504 recall fusion:  0.406961178046
 for length of results:  14820
F1:  0.0423973790711
precision fusion:  0.0222672064777 recall fusion:  0.441767068273
 for length of results:  18525
F1:  0.0367372353674
precision fusion:  0.0191093

In [49]:
bab = "psychiatry, psychopathology, psychological medicine, Adult Psychiatry, Criticism of psychiatry, Ethics of psychiatry, History of psychiatry, Mental Pathology, Modern psychiatry, Personalistic disease theories, Psichiatry, Psychaitry, Psychiatric, Psychiatric medicine, Psychiatric syndrome, Psychiatric treatment, psychiatrist, Psychiatrists, Psychological pathology, Psychopathological, Psychopathologist, Psycopathological, Psycopathology, Pyschiatric care, Pyschiatry"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Psychiatry'
res_pickle = "results/res__Psychiatry"
Psych_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Psych_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2145
initial_corpus size: 2145
babelnet results size of the topic "Psychiatry": 7448
ground truth size 1700
intersection with the ground truth: 250
F1 0.054656755575
babel precision:  0.0335660580021
babel recall:  0.147058823529
initial_corpus size: 2145
length of s3h results:  5746 length of test set 1700
F1:  0.0252484555466
precision s3h:  0.0163592064045 recall s3h:  0.0552941176471
initial_corpus size: 2145
length of results:  7448 length of test set 1700
F1:  0.0476606908614
precision fusion:  0.0292696025779 recall fusion:  0.128235294118
for length of results:  14896
F1:  0.0384429983128
precision fusion:  0.0214151450054 recall fusion:  0.187647058824
 for length of results:  22344
F1:  0.0318582598569
precision fusion:  0.0171410669531 recall fusion:  0.225294117647
 for length of results:  29792
F1:  0.0286421948431
precision fusion:  0.015138292159 recall fusion:  0.265294117647
 for length of results:  37240
F1:  0.0255778120185
precision fusion:  0.0

In [50]:
bab = " pediatrics, paediatrics, pediatric medicine, pedology, pediatrician, paediatrician, pædiatrics, Paediatric, Paediatricians, Pediatric, Pediatricians, Pediatrist, Pediatry, Pædiatric, Pædiatrician"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Pediatrics'
res_pickle = "results/res__Pediatrics"
Ped_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Ped_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 565
initial_corpus size: 565
babelnet results size of the topic "Pediatrics": 8233
ground truth size 747
intersection with the ground truth: 223
F1 0.0496659242762
babel precision:  0.0270861168468
babel recall:  0.298527443106
initial_corpus size: 565
length of s3h results:  8303 length of test set 747
F1:  0.0117127071823
precision s3h:  0.00638323497531 recall s3h:  0.0709504685408
initial_corpus size: 565
length of results:  8233 length of test set 747
F1:  0.0307349665924
precision fusion:  0.0167618122191 recall fusion:  0.184738955823
for length of results:  16466
F1:  0.0197525126358
precision fusion:  0.0103243046277 recall fusion:  0.227576974565
 for length of results:  24699
F1:  0.016191149886
precision fusion:  0.00834041864043 recall fusion:  0.275769745649
 for length of results:  32932
F1:  0.0141334362659
precision fusion:  0.0072270132394 recall fusion:  0.318607764391
 for length of results:  41165
F1:  0.0127409811033
precision fusion:  0.00648

In [51]:
bab = "oncology, oncologist, AllergoOncology, Cancer care, Medical oncology, Oncologic, Oncological, Oncologists, Oncology research, Oncology unit, Pediatric cancers"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Oncology'
res_pickle = "results/res__Oncology"
On_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(On_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 3922
initial_corpus size: 3922
babelnet results size of the topic "Oncology": 5705
ground truth size 2937
intersection with the ground truth: 107
F1 0.024762786392
babel precision:  0.0187554776512
babel recall:  0.0364317330609
initial_corpus size: 3922
length of s3h results:  11875 length of test set 2937
F1:  0.0421280043208
precision s3h:  0.0262736842105 recall s3h:  0.106230847804
initial_corpus size: 3922
length of results:  5705 length of test set 2937
F1:  0.0242999305716
precision fusion:  0.0184049079755 recall fusion:  0.0357507660878
for length of results:  11410
F1:  0.0398689621524
precision fusion:  0.0250657318142 recall fusion:  0.0973782771536
 for length of results:  17115
F1:  0.0443846000399
precision fusion:  0.0260005842828 recall fusion:  0.151515151515
 for length of results:  22820
F1:  0.0459680863455
precision fusion:  0.0259421560035 recall fusion:  0.201566224038
 for length of results:  28525
F1:  0.046723030958
precision fusion:  0.

In [52]:
bab = " mechanics, History of mechanics, Mechanical processes, Particle mechanics, Theoretical mechanics"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Mechanics'
res_pickle = "results/res__Mechanics"
Mecha_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Mecha_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 8210
initial_corpus size: 8210
babelnet results size of the topic "Mechanics": 8222
ground truth size 4640
intersection with the ground truth: 1
F1 0.000155496812315
babel precision:  0.000121624908781
babel recall:  0.000215517241379
initial_corpus size: 8210
length of s3h results:  9556 length of test set 4640
F1:  0.028740490279
precision s3h:  0.0213478442863 recall s3h:  0.0439655172414
initial_corpus size: 8210
length of results:  8222 length of test set 4640
F1:  0.000932980873892
precision fusion:  0.000729749452688 recall fusion:  0.00129310344828
for length of results:  16444
F1:  0.0247581104155
precision fusion:  0.015872050596 recall fusion:  0.05625
 for length of results:  24666
F1:  0.0337814781956
precision fusion:  0.0200681099489 recall fusion:  0.106681034483
 for length of results:  32888
F1:  0.035813259433
precision fusion:  0.0204329846753 recall fusion:  0.144827586207
 for length of results:  41110
F1:  0.0356284153005
precision fusion:  0

In [53]:
bab = "biophysics, Biological physics, Biophysical, Biophysicists, History of biophysics"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'biophysics'
res_pickle = "results/res__biophysics"
biophy_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(biophy_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 275
initial_corpus size: 275
babelnet results size of the topic "biophysics": 3674
ground truth size 323
intersection with the ground truth: 19
F1 0.00950713034776
babel precision:  0.00517147523136
babel recall:  0.0588235294118
initial_corpus size: 275
length of s3h results:  9602 length of test set 323
F1:  0.00584382871537
precision s3h:  0.00302020412414 recall s3h:  0.0897832817337
initial_corpus size: 275
length of results:  3674 length of test set 323
F1:  0.00800600450338
precision fusion:  0.00435492651062 recall fusion:  0.0495356037152
for length of results:  7348
F1:  0.00677877721288
precision fusion:  0.00353837778987 recall fusion:  0.0804953560372
 for length of results:  11022
F1:  0.00564125165271
precision fusion:  0.00290328434041 recall fusion:  0.0990712074303
 for length of results:  14696
F1:  0.00519342166589
precision fusion:  0.00265378334241 recall fusion:  0.120743034056
 for length of results:  18370
F1:  0.00492162841705
precision fu