In [1]:
import numpy as np
import pandas as pd
import random, pickle, argparse, json, os, urllib2
from collections import OrderedDict
from operator import itemgetter
from sklearn.ensemble import RandomForestClassifier

In [59]:
# functions
########################################################################################
def query_from(q, f):
    q = q+'&from='+str(f)
    response = urllib2.urlopen(q)
    data = json.load(response)
    subject_ids = np.array(range(len(data['hits'])), dtype=np.object)
    for (i, hit) in enumerate(data['hits']):
        subject_ids[i] = hit['id']
    return subject_ids

def query(q):
    response = urllib2.urlopen(q)
    data = json.load(response)
    nb_requests = 1 + data['total'] / 1000
    if nb_requests > 10: # maximum number of pages due to API pagination restrection
        nb_requests = 10
    subject_ids = query_from(q, 0)
    for i in range(nb_requests)[1:]:
        f = i * 1000
        next_request = query_from(q, f)
        subject_ids = np.hstack((subject_ids, next_request))
    return subject_ids.tolist()

def find_intersection(list_a, list_b):
    return list(set(list_a) & set(list_b))

def term2url(string):
    string = string.split(' ')
    res = '%22'
    for s in string:
        res = res + s + '%20'
    res = res[:-3]
    res = res + '%22'
    return res

def babel_synset(synset):
    q = 'https://api.istex.fr/document/?q=(('
    for syn in synset:
        syn = term2url(syn)
        q = q + 'title:' + syn + '%20OR%20abstract:' + syn + '%20OR%20'
    q = q[:-8]
    q = q + ')%20AND%20(qualityIndicators.abstractWordCount:[35%20500]%20AND%20qualityIndicators.pdfPageCount:[3%2060]%20AND%20publicationDate:[1990%202016]%20AND%20language:(%22eng%22%20OR%20%22unknown%22)%20AND%20genre:(%22research_article%22%20OR%20%22conference[eBooks]%22%20OR%20%22article%22%20)%20))&size=1000&output=id'
    return q

def babel_subj_keyword(topic):
    q = 'https://api.istex.fr/document/?q=(('
    topic = term2url(topic)
    q = q+ 'subject.value:' + topic + '%20OR%20keywords:' + topic
    q = q + ')%20AND%20(qualityIndicators.abstractWordCount:[35%20500]%20AND%20qualityIndicators.pdfPageCount:[3%2060]%20AND%20publicationDate:[1990%202016]%20AND%20language:(%22eng%22%20OR%20%22unknown%22)%20AND%20genre:(%22research_article%22%20OR%20%22conference[eBooks]%22%20OR%20%22article%22%20)%20))&size=1000&output=id'
    return q
 
def babel_title_abst(topic):
    q = 'https://api.istex.fr/document/?q=(('
    topic = term2url(topic)
    q = q+ 'title:' + topic + '%20OR%20abstract:' + topic
    q = q + ')%20AND%20(qualityIndicators.abstractWordCount:[35%20500]%20AND%20qualityIndicators.pdfPageCount:[3%2060]%20AND%20publicationDate:[1990%202016]%20AND%20language:(%22eng%22%20OR%20%22unknown%22)%20AND%20genre:(%22research_article%22%20OR%20%22conference[eBooks]%22%20OR%20%22article%22%20)%20))&size=1000&output=id'
    return q
 
def babelnet_syn_get_input(topic, synset):
    results = query(babel_synset(synset))
    _gs = query(babel_subj_keyword(topic))
    results = find_intersection(results, inversed_index.keys())
    _abst_title = query(babel_title_abst(topic))
    test_set = _inter = {x for x in _gs if x not in _abst_title}
    test_set = find_intersection(test_set, inversed_index.keys())
    results = list(results)
    test = list(test_set)
    print 'initial_corpus size:', len(find_intersection(_abst_title, inversed_index.keys()))
    return results, test

def babelnet_eval(topic, synset):
    babelnet_results, test = babelnet_syn_get_input(topic, synset)
    print 'results size of the topic "' + topic + '":', len(babelnet_results) 
    print 'ground truth size', len(test)
    babel_test_intersection = find_intersection(test,babelnet_results)
    babel_test_intersection_size = len(babel_test_intersection)
    print 'intersection with the ground truth:', babel_test_intersection_size
    
def top_thresh(ordered_dict_pickle, thresh):
    ranked_all = pickle.load(open(ordered_dict_pickle, 'rb'))
    ranked_all_np = np.array(ranked_all.items())
    ranked_all_df = pd.DataFrame(data=ranked_all_np, index=None, columns=['istex_id', 'score'])
    ranked_all_df['score'] = ranked_all_df[['score']].astype(float)
    return ranked_all_df[ranked_all_df['score'] > thresh]

def top_thresh_lst(res_lst_pickle, thresh):
    ranked_all = pickle.load(open(res_lst_pickle, 'rb'))
    if type(ranked_all) is OrderedDict:
        ranked_all = ranked_all.items()[:100000]
    ranked_all_np = np.array(ranked_all)
    ranked_all_df = pd.DataFrame(data=ranked_all_np, index=None, columns=['istex_id', 'score'])
    ranked_all_df['score'] = ranked_all_df[['score']].astype(float)
    return ranked_all_df[ranked_all_df['score'] > thresh]

def babelnet_eval_PR(topic, synset):
    babelnet_results, test = babelnet_syn_get_input(topic, synset)
    print 'babelnet results size of the topic "' + topic + '":', len(babelnet_results) 
    print 'ground truth size', len(test)
    babel_test_intersection = find_intersection(test,babelnet_results)
    babel_test_intersection_size = len(babel_test_intersection)
    print 'intersection with the ground truth:', babel_test_intersection_size
    precision = babel_test_intersection_size / float(len(babelnet_results))
    recall = babel_test_intersection_size / float(len(test))
    if babel_test_intersection_size is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0.0 
    print "F1", F1
    print 'babel precision: ', precision
    print 'babel recall: ', recall
    

#Evaluate 3SH results at treshold
def eval_all_at_thresh(ordered_dict_pickle, topic, synset, thresh):
    babelnet_eval_PR(topic, synset)
    _, test = babelnet_syn_get_input(topic, synset)
    t = len(test)
    top_res = top_thresh(ordered_dict_pickle, thresh)
    n = len(top_res)
    if n > 10000:
        thresh = thresh + 0.1
        top_res = top_thresh(ordered_dict_pickle, thresh)
    elif n < 1000:
        thresh = thresh - 0.1
        top_res = top_thresh(ordered_dict_pickle, thresh)
    n = len(top_res)
    res = list(top_res['istex_id'])
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    return precision, recall, n, t, thresh

#Evaluate 3SH results list at treshold
def eval_all_at_thresh_lst(res_pickle, topic, synset, thresh=0.75):
    babelnet_eval_PR(topic, synset)
    _, test = babelnet_syn_get_input(topic, synset)
    t = len(test)
    top_res = top_thresh_lst(res_pickle, thresh)
    n = len(top_res)
    if n > 10000:
        thresh = thresh + 0.1
        top_res = top_thresh_lst(res_pickle, thresh)
        if len(top_res) > 10000:
            thresh = thresh + 0.05
            top_res = top_thresh_lst(res_pickle, thresh)
    elif n < 1000:
        thresh = thresh - 0.1
        top_res = top_thresh_lst(res_pickle, thresh)
        if len(top_res) > 1000:
            thresh = thresh - 0.05
            top_res = top_thresh_lst(res_pickle, thresh)
    n = len(top_res)
    res = list(top_res['istex_id'])
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    F1 = 2 * (precision * recall) / (precision + recall)
    print "length of s3h results: ", n, "length of test set", t
    print "F1: ", F1
    print "precision s3h: ", precision, "recall s3h: ", recall

def get_fusion_res3(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = fus * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

def get_fusion_res4(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = fus * 2 * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

def get_fusion_res5(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(range(100000))
    fus = fus * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

def get_fusion_res6(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    if type(topic_s3h_results) is OrderedDict:
        topic_s3h_results = topic_s3h_results.items()
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(range(100000))
    fus = fus * 3 * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (2*i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

#Evaluate fusion df
def eval_all(fusion_df, topic, synset, res_pickle, thresh=0.75):
    eval_all_at_thresh_lst(res_pickle, topic, synset, thresh)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = len(babelnet_res)
    t = len(test)

    print "length of results: ", n, "length of test set", t
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall

    n = 2 * len(babelnet_res)
    print "for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall

    n = 3 * len(babelnet_res)
    print " for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall
    
    n = 4 * len(babelnet_res)
    print " for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1 
    print "precision fusion: ", precision, "recall fusion: ",recall
    
    n = 5 * len(babelnet_res)
    print " for length of results: ", n
    res = fusion_df["istex_id"].tolist()[:n]
    matches = len(find_intersection(test,res))
    recall = matches / float(t)
    precision = matches / float(n)
    if matches is not 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0
    print "F1: ", F1  
    print "precision fusion: ", precision, "recall fusion: ",recall

In [3]:
#loading SDV of istex articles
inv_index = json.load(open('../RecSys_Exp_files/182_381_vec150_results/output_paragraph_inversed_index.json','rb'))
print 'original inversed_index'
print inv_index.items()[:3]
inversed_index = dict()
for (k, v) in inv_index.items():
    key = k.split('_')[1]
    inversed_index[key] = v
print 'processed inversed_index'
print inversed_index.items()[:3]

original inversed_index
[(u'ISTEX_D89FA3AC3521074D46F4245762153DF497BFFA1F', 2002320), (u'ISTEX_18EAF4D6A126B077EB38667801D1B7292F32FF49', 2483732), (u'ISTEX_5F91044435FCC4FABB9F02E31467DCFE75F4A7BE', 1429049)]
processed inversed_index
[(u'FCF1393F9B8136AC08FB67E88F94F3CF62C17288', 3517138), (u'482E1102A1114327A744FD2ADB4D9F8FF7E9A70B', 751643), (u'A81022B6295AE66F68A10222C3B94A06B033C1BA', 3983232)]


In [4]:
def get_fusion_res(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = (fus + len(babel_results)) / 2
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

In [11]:
def get_fusion_res3(s3h_res_pickle, topic, synset):
    topic_s3h_results = pickle.load(open(s3h_res_pickle,'rb'))
    topic_s3h_top100k_results = topic_s3h_results[:100000]
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = fus * len(babel_results)
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

In [17]:
bab = 'organ transplant, transplant, transplantation, Organ transplantation, Medical Transplantation, Transplant Surgery, Black market organs, First transplant, First transplantation, Intestinal transplant, Live organ transplants, Mixed chimerism, Organ and Tissue Donor, Organ doner, Organ farming, Organ transplantation in different countries, Organ transplantation therapy, Organ transplants, Skin transplant, Tissue transplant, Transplant Tourism and Organ Trafficking, Transplantation medicine, Transplantation surgery, Transplantation therapy, Transplanted organs, Transplantology'
synset = bab.split(',')
topic = 'Transplantation'
s3h_res_pickle = "results/res_Transplantation"
Transplantation_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [20]:
bab = 'spectroscopy, spectrographic analysis, spectrometry, spectroscopic analysis, spectrum analysis, Fingerprint region, Laser spectroscopy, Optical spectroscopy, Atomic line, Atomic line spectra, Atomic spectra, Atomic spectral line, Electromagnetic spectroscopy, Emission spectrochemical analysis, Spectrochemical Analysis, Spectrography, Spectrology, Spectroscopic, Spectroscopist, Spectroscopists'
synset = bab.split(',')
topic = 'Spectroscopy'
s3h_res_pickle = "results/res__Spectroscopy"
Spectroscopy_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [24]:
bab = ' surgery, operation, surgical operation, surgical procedure, surgical process, Chirurgery, Chirurgical, Chirurgy, Complications of surgery, Corrective surgery, Elliptical excision, Emergency Surgery, Post-operation, Post-operative, Specialties in surgery, Sterile drapes, surgeon, Surgeons, Surgeries, Surgery in general practice, Surgery operation, Surgery specialties, Surgical, Surgical excision, Surgical excision of malignant lesions, Surgical specialties, Surgical technique, Surgically'
synset = bab.split(',')
topic = 'Surgery'
s3h_res_pickle = "results/res__Surgery"
Surgery_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [25]:
bab = 'optics, Classical optics, Light physics, Optical, Optical system'
synset = bab.split(',')
topic = 'Optics'
s3h_res_pickle = "results/res__Optics"
Optics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [26]:
bab = 'literature, Literary art, Literary, Literary arts, Literary work, Literary works, LiteraryArt, Literature by region, Literatures, Litterature, Ltierature, Prose fiction'
synset = bab.split(',')
topic = 'Literature'
s3h_res_pickle = "results/res__Literature"
Literature_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [27]:
bab = 'toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists'
synset = bab.split(',')
topic = 'Toxicology'
s3h_res_pickle = "results/res__Toxicology"
Toxicology_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [28]:
bab = 'artificial intelligence, AI, artilect, Artifical intelligence, Cognitive systems, Digital being, Machine intelligence, A.I., A I, AI-D, AI ethics, AI implications, AI Robotics, AI scripting, Aretificial intelligence, Artificial-intelligence, Artificial conciousness, Artificial inteligence, Artificial intellect, Artificial intellegence, Artificial Intelligence., Artificial intelligence for development, Artificial Intelligence Program, Artificial intelligences, Artificially-intelligent, Artificially intelligent, Artificual intelligence, Cognitive simulation, Cognitive system, Commonsense AI, Computational Rationality, Computer AI, Intelligent machine, Machine thought, Machine understanding, Ontology based approach, Pseudo intelligence, Semi-AI, Semi AI, Simulated intelligence, Soft AI, Sub-symbolic, Subsymbolic, The Artificial Intelligence, The Theory of Artificial Intelligence'
synset = bab.split(',')
topic = 'artificial intelligence'
s3h_res_pickle = "results/AI_results.pickle"
AI_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

KeyboardInterrupt: 

In [29]:
bab = 'cybernetics, Cybernetic, Cybernetic system, cybernetician, Cyberneticians, cyberneticist, Tha Masta'
synset = bab.split(',')
topic = 'Cybernetics'
s3h_res_pickle = "results/res__Cybernetics"
Cybernetics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [None]:
bab = 'information system, data system, information systems, system info, Business computing, Computer information system, Computer information systems, Elements of Information System, Information in Computer Science, Information systems and technology, Information systems discipline, Information systems theory, Informationssystem'
synset = bab.split(',')
topic = 'Information Systems'
s3h_res_pickle = "results/infosys_results.pickle"
Information_Systems_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [30]:
bab = 'immunology, immunobiology, immunological, immunologist, Clinical immunology, Classical immunology, Evolutionary immunology, Immunologic, Immunologists, Imunologys'
synset = bab.split(',')
topic = 'Immunology'
s3h_res_pickle = "results/res__Immunology"
Immunology_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [None]:
bab = 'infection, infectious disease, communicable diseases, contagion, Infectious diseases, Acute infection, AIDS-related bacterial infections, AIDS-related viral infections, Anti-infective, Anti-infectives, Antiinfective, Bacterial Infections, Communicable disease, Contagious diseases, Definition to contagious, Infect, Infecting, Infectiology, Infections, Infectious, Infectious disease epidemiology, Infectious disease medicine, Infectology, Local infection, Primary infection, Rochalimea infections, Secondary infection, Tropical bacterial infections, Tropical infections, Viral Infections, Wound colonization, Wound infection'
synset = bab.split(',')
topic = 'Infectious Diseases'
s3h_res_pickle = "results/Infectious_results.pickle"
Infectious_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [31]:
bab = 'Biomaterial, Biomaterials Engineering, Bio material, Biomaterials'
synset = bab.split(',')
topic = 'Biomaterials'
s3h_res_pickle = "results/res__Biomaterials"
Biomaterials_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [32]:
bab = 'ceramics, Ceramic art, Art pottery, Art ware, Ceramic artist, Ceramic paint, Ceramics art, Fine art pot, Vase painting'
synset = bab.split(',')
topic = 'Ceramics'
s3h_res_pickle = "results/res__Ceramics"
Ceramics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [33]:
topic = "biophysics"
synset_text = "biophysics, Biological physics, Biophysical, Biophysicists, History of biophysics"
synset = synset_text.split(',')
s3h_res_pickle = 'results/res__biophysics'
biophysics_fusion_res = get_fusion_res(s3h_res_pickle, topic, synset)

In [12]:
ind_dict = OrderedDict((k,i) for i,k in enumerate(biophysics_fusion_res))
babel_results, test = babelnet_syn_get_input(topic, synset)
inter = set(ind_dict).intersection(babel_results)
indices = [ind_dict[x] for x in inter ]
cutt = np.array(indices).max()
cutt

99999

In [None]:
slic = len(indices) * 0.8
indices_slic = indices[:int(slic)]
#indices_slic

In [None]:
fusion_res = fusion_df.sort_values("fusin_rank")["istex_id"].tolist()
#fusion_res[:5000]

In [5]:
#Evaluate fusion df with manual cut
def eval_fusin_at_thresh_lst_1k(fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = len(babelnet_res)
    if n < 1000:
        n = 1000
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    return precision, recall, n, t

In [6]:
#Evaluate fusion df
def eval_fusin_at_5k(fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = 5000#len(babelnet_res)
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    print "precision fusion: " precision, "recall fusion: ",recall
    print "length of results: ", n, "length of test set", t

In [7]:
#Evaluate fusion df
def eval_fusin_at_2_bablesize((fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = 2 * len(babelnet_res)
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    print "precision fusion: " precision, "recall fusion: ",recall
    print "length of results: ", n, "length of test set", t

In [8]:
#Evaluate fusion df
def eval_fusin_at_bablesize(fusion_df, topic, synset):
    babelnet_eval_PR(topic, synset)
    babelnet_res, test = babelnet_syn_get_input(topic, synset)
    n = len(babelnet_res)
    t = len(test)
    res = fusion_df["istex_id"].tolist()[:n]
    recall = len(find_intersection(test,res))/float(t)
    precision = len(find_intersection(test,res))/float(n)
    print "precision fusion: " precision, "recall fusion: ",recall
    print "length of results: ", n, "length of test set", t

In [9]:
def get_fusion_res_2(s3h_res_pickle, topic, synset):
    topic_s3h_top100k_results = pickle.load(open(s3h_res_pickle,'rb'))
    babel_results, test = babelnet_syn_get_input(topic, synset)
    fus = np.array(np.ones(100000))
    fus = (fus + 100000) / 2
    for i, s3h in enumerate(topic_s3h_top100k_results):
        for j, bab in enumerate(babel_results):
            if s3h[0] == bab:
                fus[i] = (i + j) / 2
    fusion_df = pd.DataFrame(data=topic_s3h_top100k_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    fusion_res = fusion_df.sort_values("fus_rank")#["istex_id"].tolist()
    return fusion_res

In [41]:
bab = 'toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists'
synset = bab.split(',')
topic = 'Toxicology'
get_fusion_res_2("results/res__Toxicology", topic, synset)
eval_fusin_at_thresh_lst(Toxicology_fusion_res, topic, synset)

babelnet results size of the topic "Toxicology": 5563
ground truth size 1552
intersection with the ground truth: 146
babel precision:  0.0262448319252
babel recall:  0.0940721649485


(0.004853496314937983, 0.017396907216494846, 5563, 1552)

In [52]:
bab = 'toxicology, Chemical toxicology, History of toxicology, Toxicological, Toxicologists'
synset = bab.split(',')
topic = 'Toxicology'
get_fusion_res_2("results/res__Toxicology", topic, synset)
eval_fusin_at_thresh_lst(Toxicology_fusion_res, topic, synset)
eval_fusin_at_thresh_lst2(Toxicology_fusion_res, topic, synset)

babelnet results size of the topic "Toxicology": 5563
ground truth size 1552
intersection with the ground truth: 146
babel precision:  0.0262448319252
babel recall:  0.0940721649485


(0.004853496314937983, 0.017396907216494846, 5563, 1552)

In [42]:
bab = 'Biomaterial, Biomaterials Engineering, Bio material, Biomaterials'
synset = bab.split(',')
topic = 'Biomaterials'
get_fusion_res_2("results/res__Biomaterials", topic, synset)
eval_fusin_at_thresh_lst(Biomaterials_fusion_res, topic, synset)

babelnet results size of the topic "Biomaterials": 3649
ground truth size 1020
intersection with the ground truth: 92
babel precision:  0.0252123869553
babel recall:  0.0901960784314


(0.00356261989586188, 0.012745098039215686, 3649, 1020)

In [54]:
bab = 'Biomaterial, Biomaterials Engineering, Bio material, Biomaterials'
synset = bab.split(',')
topic = 'Biomaterials'
get_fusion_res_2("results/res__Biomaterials", topic, synset)
eval_fusin_at_thresh_lst(Biomaterials_fusion_res, topic, synset)
eval_fusin_at_thresh_lst2(Toxicology_fusion_res, topic, synset)

babelnet results size of the topic "Biomaterials": 3649
ground truth size 1020
intersection with the ground truth: 92
babel precision:  0.0252123869553
babel recall:  0.0901960784314
babelnet results size of the topic "Biomaterials": 3649
ground truth size 1020
intersection with the ground truth: 92
babel precision:  0.0252123869553
babel recall:  0.0901960784314


(0.0, 0.0, 7298, 1020)

In [55]:
bab = 'immunology, immunobiology, immunological, immunologist, Clinical immunology, Classical immunology, Evolutionary immunology, Immunologic, Immunologists, Imunologys'
synset = bab.split(',')
topic = 'Immunology'
get_fusion_res_2("results/res__Immunology", topic, synset)
eval_fusin_at_thresh_lst(Immunology_fusion_res, topic, synset)
eval_fusin_at_thresh_lst2(Immunology_fusion_res, topic, synset)

babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021
babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021


(0.010356208034596564, 0.03816313692598029, 17574, 4769)

In [58]:
eval_fusin_at_thresh_lst(Immunology_fusion_res, topic, synset)

babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021
0.0124 0.0130006290627 5000 4769


In [61]:
eval_fusin_at_thresh_lst3(Immunology_fusion_res, topic, synset)

babelnet results size of the topic "Immunology": 8787
ground truth size 4769
intersection with the ground truth: 203
babel precision:  0.023102310231
babel recall:  0.0425665758021
0.0119494708091 0.0220171943804 8787 4769


In [None]:
bab = 'immunology, immunobiology, immunological, immunologist, Clinical immunology, Classical immunology, Evolutionary immunology, Immunologic, Immunologists, Imunologys'
synset = bab.split(',')
topic = 'Immunology'
get_fusion_res_2("results/res__Immunology", topic, synset)
eval_fusin_at_thresh_lst2(Immunology_fusion_res, topic, synset)

In [46]:
bab = 'cybernetics, Cybernetic, Cybernetic system, cybernetician, Cyberneticians, cyberneticist, Tha Masta'
synset = bab.split(',')
topic = 'Cybernetics'
get_fusion_res_2("", topic, synset)
eval_fusin_at_thresh_lst2(Cybernetics_fusion_res, topic, synset)

babelnet results size of the topic "Cybernetics": 511
ground truth size 596
intersection with the ground truth: 47
babel precision:  0.0919765166341
babel recall:  0.0788590604027


(0.004, 0.006711409395973154, 1000, 596)

In [63]:
bab = 'cybernetics, Cybernetic, Cybernetic system, cybernetician, Cyberneticians, cyberneticist, Tha Masta'
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Cybernetics'
Cybernetics_fusion_res = get_fusion_res_2("results/res__Cybernetics", topic, synset)
eval_fusin_at_thresh_lst2(Cybernetics_fusion_res, topic, synset)

babelnet results size of the topic "Cybernetics": 511
ground truth size 596
intersection with the ground truth: 47
babel precision:  0.0919765166341
babel recall:  0.0788590604027
0.0283757338552 0.0486577181208 1022 596


In [None]:
eval_all_at_thresh('results/AI_results.pickle', topic , synset, thresh = 0.75)

In [47]:
bab = 'literature, Literary art, Literary, Literary arts, Literary work, Literary works, LiteraryArt, Literature by region, Literatures, Litterature, Ltierature, Prose fiction'
synset = bab.split(',')
topic = 'Literature'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Literature_fusion_res, topic, synset)

babelnet results size of the topic "Literature": 7357
ground truth size 860
intersection with the ground truth: 107
babel precision:  0.0145439717276
babel recall:  0.124418604651


(0.0020388745412532284, 0.01744186046511628, 7357, 860)

In [48]:
bab = 'optics, Classical optics, Light physics, Optical, Optical system'
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Optics'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Optics_fusion_res, topic, synset)

babelnet results size of the topic "Optics": 8730
ground truth size 3286
intersection with the ground truth: 271
babel precision:  0.0310423825888
babel recall:  0.0824710894705


(0.015005727376861398, 0.039866098600121726, 8730, 3286)

In [49]:
bab = ' surgery, operation, surgical operation, surgical procedure, surgical process, Chirurgery, Chirurgical, Chirurgy, Complications of surgery, Corrective surgery, Elliptical excision, Emergency Surgery, Post-operation, Post-operative, Specialties in surgery, Sterile drapes, surgeon, Surgeons, Surgeries, Surgery in general practice, Surgery operation, Surgery specialties, Surgical, Surgical excision, Surgical excision of malignant lesions, Surgical specialties, Surgical technique, Surgically'
synset = bab.split(',')
topic = 'Surgery'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Surgery_fusion_res, topic, synset)

babelnet results size of the topic "Surgery": 8271
ground truth size 6412
intersection with the ground truth: 565
babel precision:  0.0683109660259
babel recall:  0.0881160324392


(0.027687099504292104, 0.03571428571428571, 8271, 6412)

In [50]:
bab = 'spectroscopy, spectrographic analysis, spectrometry, spectroscopic analysis, spectrum analysis, Fingerprint region, Laser spectroscopy, Optical spectroscopy, Atomic line, Atomic line spectra, Atomic spectra, Atomic spectral line, Electromagnetic spectroscopy, Emission spectrochemical analysis, Spectrochemical Analysis, Spectrography, Spectrology, Spectroscopic, Spectroscopist, Spectroscopists'
synset = bab.split(',')
topic = 'Spectroscopy'
get_fusion_res_2(s3h_res_pickle, topic, synset)
eval_fusin_at_thresh_lst(Spectroscopy_fusion_res, topic, synset)

babelnet results size of the topic "Spectroscopy": 8513
ground truth size 7294
intersection with the ground truth: 287
babel precision:  0.0337131446024
babel recall:  0.0393474088292


(0.025138024198284977, 0.029339182890046615, 8513, 7294)

In [24]:
bab = 'artificial intelligence, AI, artilect, Artifical intelligence, Cognitive systems, Digital being, Machine intelligence, A.I., A I, AI-D, AI ethics, AI implications, AI Robotics, AI scripting, Aretificial intelligence, Artificial-intelligence, Artificial conciousness, Artificial inteligence, Artificial intellect, Artificial intellegence, Artificial Intelligence., Artificial intelligence for development, Artificial Intelligence Program, Artificial intelligences, Artificially-intelligent, Artificially intelligent, Artificual intelligence, Cognitive simulation, Cognitive system, Commonsense AI, Computational Rationality, Computer AI, Intelligent machine, Machine thought, Machine understanding, Ontology based approach, Pseudo intelligence, Semi-AI, Semi AI, Simulated intelligence, Soft AI, Sub-symbolic, Subsymbolic, The Artificial Intelligence, The Theory of Artificial Intelligence'
synset = bab.split(',')
topic = 'artificial intelligence'
AI_fusion_res = get_fusion_res3("results/AI_results.pickle", topic, synset)
eval_all(AI_fusion_res, topic, synset, "results/AI_results.pickle", tresh=0.75)

TypeError: unhashable type

In [26]:
AI_fusion_res = get_fusion_res3("results/AI_results.pickle", topic, synset)

initial_corpus size: 2091


In [29]:
eval_all(AI_fusion_res, topic, synset, "results/AI_results.pickle", thresh=0.75)

initial_corpus size: 2091
babelnet results size of the topic "artificial intelligence": 7903
ground truth size 657
intersection with the ground truth: 43
babel precision:  0.00544097178287
babel recall:  0.0654490106545
initial_corpus size: 2091
length of s3h results:  8566 length of test set 657
precision s3h:  0.00933924819052 recall s3h:  0.121765601218
initial_corpus size: 2091
length of results:  7903 length of test set 657
F1:  0.00514018691589
precision fusion:  0.00278375300519 recall fusion:  0.0334855403349
for length of results:  15806
F1:  0.00413047439713
precision fusion:  0.00215108186765 recall fusion:  0.0517503805175
 for length of results:  23709
F1:  0.00418616104408
precision fusion:  0.00215108186765 recall fusion:  0.0776255707763


In [41]:
bab = "remote sensing, Infrared remote sensing, Passive remote sensing, Remote-sensing, Remote-Sensing Image, Remote Sensing Satellites, Remote sensor"
synset = bab.split(',')
topic = 'Remote sensing'
AI_fusion_res = get_fusion_res3("results/Remote_sensing_results.pickle", topic, synset)
eval_all(AI_fusion_res, topic, synset, "results/Remote_sensing_results.pickle", thresh=0.75)

initial_corpus size: 4642
initial_corpus size: 4642
babelnet results size of the topic "Remote sensing": 4685
ground truth size 1192
intersection with the ground truth: 3
F1 0.00102092904543
babel precision:  0.000640341515475
babel recall:  0.00251677852349
initial_corpus size: 4642
length of s3h results:  7655 length of test set 1192
F1:  0.0580987905505
precision s3h:  0.0335728282169 recall s3h:  0.215604026846


error: [Errno 104] Connection reset by peer

In [42]:
eval_all(AI_fusion_res, topic, synset, "results/Remote_sensing_results.pickle", thresh=0.75)

initial_corpus size: 4642
babelnet results size of the topic "Remote sensing": 4685
ground truth size 1192
intersection with the ground truth: 3
F1 0.00102092904543
babel precision:  0.000640341515475
babel recall:  0.00251677852349
initial_corpus size: 4642
length of s3h results:  7655 length of test set 1192
F1:  0.0580987905505
precision s3h:  0.0335728282169 recall s3h:  0.215604026846
initial_corpus size: 4642
length of results:  4685 length of test set 1192
F1:  0.00204185809086
precision fusion:  0.00128068303095 recall fusion:  0.00503355704698
for length of results:  9370
F1:  0.00511266805529
precision fusion:  0.00288153681964 recall fusion:  0.0226510067114
 for length of results:  14055
F1:  0.00511576047747
precision fusion:  0.00277481323372 recall fusion:  0.0327181208054
 for length of results:  18740
F1:  0.00581978727674
precision fusion:  0.00309498399146 recall fusion:  0.0486577181208
 for length of results:  23425
F1:  0.00544339277735
precision fusion:  0.002860

In [43]:
bab = "substance abuse, drug abuse, habit, addiction, dependency, Abuse potential, Abusing drugs, Abusive drug use, Anti-drug, Cannabis abuse, Drug-abuse, Drug misuse, Drug prevention, Drugs of abuse, Illegal drug abuse, Illegal drug use, Misuse of drugs, Narcotic abuse theory, Nondependent abuse of drugs, Prescription drug abuse, Prescription Drug Misuse"
synset = bab.split(',')
topic = 'Substance abuse'
res_pickle = "results/Substance_results.pickle"
SA_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(SA_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2658
initial_corpus size: 2658
babelnet results size of the topic "Substance abuse": 7893
ground truth size 466
intersection with the ground truth: 66
F1 0.0157913626032
babel precision:  0.00836183960471
babel recall:  0.141630901288
initial_corpus size: 2658
length of s3h results:  4183 length of test set 466
F1:  0.0172080017208
precision s3h:  0.00956251494143 recall s3h:  0.0858369098712
initial_corpus size: 2658
length of results:  7893 length of test set 466
F1:  0.00550305060414
precision fusion:  0.0029139744077 recall fusion:  0.049356223176
for length of results:  15786
F1:  0.0040610386414
precision fusion:  0.00209045990118 recall fusion:  0.0708154506438
 for length of results:  23679
F1:  0.00397597846345
precision fusion:  0.00202711263145 recall fusion:  0.103004291845
 for length of results:  31572
F1:  0.00330857107185
precision fusion:  0.00167870264792 recall fusion:  0.113733905579
 for length of results:  39465
F1:  0.00315544313942
precision

In [44]:
bab = "information system, data system, information systems, system info, Business computing, Computer information system, Computer information systems, Elements of Information System, Information in Computer Science, Information systems and technology, Information systems discipline, Information systems theory, Informationssystem"
synset = bab.split(',')
topic = 'Information Systems'
res_pickle = "results/infosys_results.pickle"
IS_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(IS_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 5033
initial_corpus size: 5033
babelnet results size of the topic "Information Systems": 8440
ground truth size 1313
intersection with the ground truth: 298
F1 0.0611094022352
babel precision:  0.035308056872
babel recall:  0.226961157654
initial_corpus size: 5033
length of s3h results:  6820 length of test set 1313
F1:  0.0174597319562
precision s3h:  0.0104105571848 recall s3h:  0.0540746382331
initial_corpus size: 5033
length of results:  8440 length of test set 1313
F1:  0.00984312519225
precision fusion:  0.00568720379147 recall fusion:  0.036557501904
for length of results:  16880
F1:  0.00747540262738
precision fusion:  0.00402843601896 recall fusion:  0.0517897943641
 for length of results:  25320
F1:  0.00750948071941
precision fusion:  0.00394944707741 recall fusion:  0.0761614623001
 for length of results:  33760
F1:  0.00672882274114
precision fusion:  0.00349526066351 recall fusion:  0.0898705255141
 for length of results:  42200
F1:  0.00611311562062


In [46]:
bab = "thermodynamics, Thermo-dynamics, Thermodynamic functions, Applied thermodynamics, Classical thermodynamics, entropy, Macroscopic thermodynamics, Phenomenological thermodynamics, second law of thermodynamics, Termodynamics, Thermal behavior, Thermics, Thermodymanics, Thermodynamic, Thermodynamic function, Thermodynamic law, Thermodynamic Laws"
synset = bab.split(',')
topic = 'Thermodynamics'
res_pickle = "results/res__Thermodynamics"
TD_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(TD_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 6351
initial_corpus size: 6351
babelnet results size of the topic "Thermodynamics": 8375
ground truth size 2727
intersection with the ground truth: 392
F1 0.0706179066835
babel precision:  0.0468059701493
babel recall:  0.143747708104
initial_corpus size: 6351
length of s3h results:  7119 length of test set 2727
F1:  0.02985984156
precision s3h:  0.0206489675516 recall s3h:  0.0539053905391
initial_corpus size: 6351
length of results:  8375 length of test set 2727
F1:  0.0216177265358
precision fusion:  0.014328358209 recall fusion:  0.04400440044
for length of results:  16750
F1:  0.0203316732556
precision fusion:  0.0118208955224 recall fusion:  0.0726072607261
 for length of results:  25125
F1:  0.0193881947436
precision fusion:  0.0107462686567 recall fusion:  0.0990099009901
 for length of results:  33500
F1:  0.01799762608
precision fusion:  0.00973134328358 recall fusion:  0.119545287862
 for length of results:  41875
F1:  0.0169947535985
precision fusion:  

In [47]:
bab = "spectroscopy, spectrographic analysis, spectrometry, spectroscopic analysis, spectrum analysis, Fingerprint region, Laser spectroscopy, Optical spectroscopy, Atomic line, Atomic line spectra, Atomic spectra, Atomic spectral line, Electromagnetic spectroscopy, Emission spectrochemical analysis, Spectrochemical Analysis, Spectrography, Spectrology, Spectroscopic, Spectroscopist, Spectroscopists"
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'spectroscopy'
res_pickle = "results/res__Spectroscopy"
BC_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(BC_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 8607
initial_corpus size: 8607
babelnet results size of the topic "spectroscopy": 8513
ground truth size 7294
intersection with the ground truth: 287
F1 0.0363130258746
babel precision:  0.0337131446024
babel recall:  0.0393474088292
initial_corpus size: 8607
length of s3h results:  9511 length of test set 7294
F1:  0.0443915501339
precision s3h:  0.0392177478709 recall s3h:  0.0511379215794
initial_corpus size: 8607
length of results:  8513 length of test set 7294
F1:  0.0283418738534
precision fusion:  0.0263126982262 recall fusion:  0.0307101727447
for length of results:  17026
F1:  0.0352796052632
precision fusion:  0.0251967578997 recall fusion:  0.0588154647656
 for length of results:  25539
F1:  0.0418481405903
precision fusion:  0.0269000352402 recall fusion:  0.0941870030162
 for length of results:  34052
F1:  0.0417452716103
precision fusion:  0.0253435921532 recall fusion:  0.118316424458
 for length of results:  42565
F1:  0.0423193405403
precision fusi

In [48]:
bab = 'optics, Classical optics, Light physics, Optical, Optical system'
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Optics'
res_pickle = "results/res__Optics"
Opt_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(Opt_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 5349
initial_corpus size: 5349
babelnet results size of the topic "Optics": 8730
ground truth size 3286
intersection with the ground truth: 271
F1 0.0451065246338
babel precision:  0.0310423825888
babel recall:  0.0824710894705
initial_corpus size: 5349
length of s3h results:  4958 length of test set 3286
F1:  0.0499757399321
precision s3h:  0.0415490116983 recall s3h:  0.0626902008521
initial_corpus size: 5349
length of results:  8730 length of test set 3286
F1:  0.0291278295606
precision fusion:  0.0200458190149 recall fusion:  0.0532562385879
for length of results:  17460
F1:  0.0264147305505
precision fusion:  0.0156930126002 recall fusion:  0.0833840535606
 for length of results:  26190
F1:  0.0269371692224
precision fusion:  0.0151584574265 recall fusion:  0.120815581254
 for length of results:  34920
F1:  0.0245511176255
precision fusion:  0.01343069874 recall fusion:  0.142726719416
 for length of results:  43650
F1:  0.0236492244759
precision fusion:  0.01

In [49]:
bab = 'cybernetics, Cybernetic, Cybernetic system, cybernetician, Cyberneticians, cyberneticist, Tha Masta'
bab = bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Cybernetics'
res_pickle = "results/res__Cybernetics"
Cy_fusion_res = get_fusion_res3(res_pickle, topic, synset)
eval_all(Cy_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 291
initial_corpus size: 291
babelnet results size of the topic "Cybernetics": 511
ground truth size 596
intersection with the ground truth: 47
F1 0.0849141824752
babel precision:  0.0919765166341
babel recall:  0.0788590604027
initial_corpus size: 291
length of s3h results:  9797 length of test set 596
F1:  0.0144327913018
precision s3h:  0.00765540471573 recall s3h:  0.125838926174
initial_corpus size: 291
length of results:  511 length of test set 596
F1:  0.00722673893406
precision fusion:  0.00782778864971 recall fusion:  0.00671140939597
for length of results:  1022
F1:  0.0061804697157
precision fusion:  0.00489236790607 recall fusion:  0.00838926174497
 for length of results:  1533
F1:  0.00469704086426
precision fusion:  0.00326157860404 recall fusion:  0.00838926174497
 for length of results:  2044
F1:  0.00378787878788
precision fusion:  0.00244618395303 recall fusion:  0.00838926174497
 for length of results:  2555
F1:  0.00317359568391
precision fusion

In [52]:
bab = 'artificial intelligence, AI, artilect, Artifical intelligence, Cognitive systems, Digital being, Machine intelligence, A.I., A I, AI-D, AI ethics, AI implications, AI Robotics, AI scripting, Aretificial intelligence, Artificial-intelligence, Artificial conciousness, Artificial inteligence, Artificial intellect, Artificial intellegence, Artificial Intelligence., Artificial intelligence for development, Artificial Intelligence Program, Artificial intelligences, Artificially-intelligent, Artificially intelligent, Artificual intelligence, Cognitive simulation, Cognitive system, Commonsense AI, Computational Rationality, Computer AI, Intelligent machine, Machine thought, Machine understanding, Ontology based approach, Pseudo intelligence, Semi-AI, Semi AI, Simulated intelligence, Soft AI, Sub-symbolic, Subsymbolic, The Artificial Intelligence, The Theory of Artificial Intelligence'
synset = bab.split(',')
topic = 'artificial intelligence'
AI_fusion_res = get_fusion_res5("results/AI_results.pickle", topic, synset)
eval_all(AI_fusion_res, topic, synset, "results/AI_results.pickle", thresh=0.75)

initial_corpus size: 2091
initial_corpus size: 2091
babelnet results size of the topic "artificial intelligence": 7903
ground truth size 657
intersection with the ground truth: 43
F1 0.010046728972
babel precision:  0.00544097178287
babel recall:  0.0654490106545
initial_corpus size: 2091
length of s3h results:  8566 length of test set 657
F1:  0.0173479345115
precision s3h:  0.00933924819052 recall s3h:  0.121765601218
initial_corpus size: 2091
length of results:  7903 length of test set 657
F1:  0.0200934579439
precision fusion:  0.0108819435657 recall fusion:  0.130898021309
for length of results:  15806
F1:  0.0153070521776
precision fusion:  0.00797165633304 recall fusion:  0.191780821918
 for length of results:  23709
F1:  0.0125584831322
precision fusion:  0.00645324560294 recall fusion:  0.232876712329
 for length of results:  31612
F1:  0.0112801760203
precision fusion:  0.00575730735164 recall fusion:  0.27701674277
 for length of results:  39515
F1:  0.00985761226725
precisi

In [53]:
bab = "substance abuse, drug abuse, habit, addiction, dependency, Abuse potential, Abusing drugs, Abusive drug use, Anti-drug, Cannabis abuse, Drug-abuse, Drug misuse, Drug prevention, Drugs of abuse, Illegal drug abuse, Illegal drug use, Misuse of drugs, Narcotic abuse theory, Nondependent abuse of drugs, Prescription drug abuse, Prescription Drug Misuse"
synset = bab.split(',')
topic = 'Substance abuse'
res_pickle = "results/Substance_results.pickle"
SA_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(SA_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2658
initial_corpus size: 2658
babelnet results size of the topic "Substance abuse": 7893
ground truth size 466
intersection with the ground truth: 66
F1 0.0157913626032
babel precision:  0.00836183960471
babel recall:  0.141630901288
initial_corpus size: 2658
length of s3h results:  4183 length of test set 466
F1:  0.0172080017208
precision s3h:  0.00956251494143 recall s3h:  0.0858369098712
initial_corpus size: 2658
length of results:  7893 length of test set 466
F1:  0.0212944132073
precision fusion:  0.0112758140124 recall fusion:  0.190987124464
for length of results:  15786
F1:  0.0162441545656
precision fusion:  0.00836183960471 recall fusion:  0.283261802575
 for length of results:  23679
F1:  0.0124249326983
precision fusion:  0.00633472697327 recall fusion:  0.321888412017
 for length of results:  31572
F1:  0.0106123977776
precision fusion:  0.00538451792728 recall fusion:  0.364806866953
 for length of results:  39465
F1:  0.00976684781248
precision fus

In [54]:
bab = "information system, data system, information systems, system info, Business computing, Computer information system, Computer information systems, Elements of Information System, Information in Computer Science, Information systems and technology, Information systems discipline, Information systems theory, Informationssystem"
synset = bab.split(',')
topic = 'Information Systems'
res_pickle = "results/infosys_results.pickle"
IS_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(IS_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 5033
initial_corpus size: 5033
babelnet results size of the topic "Information Systems": 8440
ground truth size 1313
intersection with the ground truth: 298
F1 0.0611094022352
babel precision:  0.035308056872
babel recall:  0.226961157654
initial_corpus size: 5033
length of s3h results:  6820 length of test set 1313
F1:  0.0174597319562
precision s3h:  0.0104105571848 recall s3h:  0.0540746382331
initial_corpus size: 5033
length of results:  8440 length of test set 1313
F1:  0.0467548446632
precision fusion:  0.0270142180095 recall fusion:  0.173648134044
for length of results:  16880
F1:  0.036057824438
precision fusion:  0.0194312796209 recall fusion:  0.249809596344
 for length of results:  25320
F1:  0.030188112492
precision fusion:  0.0158767772512 recall fusion:  0.306169078446
 for length of results:  33760
F1:  0.0262310039061
precision fusion:  0.0136255924171 recall fusion:  0.35034272658
 for length of results:  42200
F1:  0.0227058580194
precision fusio

In [55]:
bab = "thermodynamics, Thermo-dynamics, Thermodynamic functions, Applied thermodynamics, Classical thermodynamics, entropy, Macroscopic thermodynamics, Phenomenological thermodynamics, second law of thermodynamics, Termodynamics, Thermal behavior, Thermics, Thermodymanics, Thermodynamic, Thermodynamic function, Thermodynamic law, Thermodynamic Laws"
synset = bab.split(',')
topic = 'Thermodynamics'
res_pickle = "results/res__Thermodynamics"
TD_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(TD_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 6351
initial_corpus size: 6351
babelnet results size of the topic "Thermodynamics": 8375
ground truth size 2727
intersection with the ground truth: 392
F1 0.0706179066835
babel precision:  0.0468059701493
babel recall:  0.143747708104
initial_corpus size: 6351
length of s3h results:  7119 length of test set 2727
F1:  0.02985984156
precision s3h:  0.0206489675516 recall s3h:  0.0539053905391
initial_corpus size: 6351
length of results:  8375 length of test set 2727
F1:  0.0590884525311
precision fusion:  0.0391641791045 recall fusion:  0.120278694536
for length of results:  16750
F1:  0.0507264979206
precision fusion:  0.0294925373134 recall fusion:  0.181151448478
 for length of results:  25125
F1:  0.0449518885538
precision fusion:  0.0249154228856 recall fusion:  0.229556288962
 for length of results:  33500
F1:  0.0407982996108
precision fusion:  0.0220597014925 recall fusion:  0.270993766043
 for length of results:  41875
F1:  0.0380700417022
precision fusion: 

In [58]:
bab = "rehabilitation, Neurocognitive Rehabilitation, Neurological rehabilitation, Neuropsychological rehabilitation, Rehabilitation Neuropsychology"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Rehabilitation'
res_pickle = "results/res__Rehabilitation"
Reh_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Reh_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 7458
initial_corpus size: 7458
babelnet results size of the topic "Rehabilitation": 7449
ground truth size 773
intersection with the ground truth: 0
F1 0.0
babel precision:  0.0
babel recall:  0.0
initial_corpus size: 7458
length of s3h results:  7137 length of test set 773
F1:  0.0111251580278
precision s3h:  0.00616505534538 recall s3h:  0.0569210866753
initial_corpus size: 7458
length of results:  7449 length of test set 773


ZeroDivisionError: float division by zero

In [60]:
eval_all(Reh_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 7458
babelnet results size of the topic "Rehabilitation": 7449
ground truth size 773
intersection with the ground truth: 0
F1 0.0
babel precision:  0.0
babel recall:  0.0
initial_corpus size: 7458
length of s3h results:  7137 length of test set 773
F1:  0.0111251580278
precision s3h:  0.00616505534538 recall s3h:  0.0569210866753
initial_corpus size: 7458
length of results:  7449 length of test set 773
F1:  0
precision fusion:  0.0 recall fusion:  0.0
for length of results:  14898
F1:  0.010209941931
precision fusion:  0.00536984830179 recall fusion:  0.103492884864
 for length of results:  22347
F1:  0.0113321799308
precision fusion:  0.00586208439612 recall fusion:  0.169469598965
 for length of results:  29796
F1:  0.0112532303968
precision fusion:  0.00577258692442 recall fusion:  0.222509702458
 for length of results:  37245
F1:  0.0111000052607
precision fusion:  0.00566518995838 recall fusion:  0.272962483829


In [61]:
bab = "psychology, psychological science, Human psychology, Psychological, Psychologically, Criticism of psychology, Human trait, Hyde event, Phsycology, Physcology, Professional psychology, Psychogenics, Psychologic, Psychological sciences, Psychological terms, Psychological theories, Psychological theory, psychologist, Psychologists, Psychology/rewrite, Psycologic, Psycological, Psycology, Pyhscology, Self-report study, WEIRD"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Psychology'
res_pickle = "results/res__Psychology"
Psych_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Psych_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 6857
initial_corpus size: 6857
babelnet results size of the topic "Psychology": 7187
ground truth size 1871
intersection with the ground truth: 134
F1 0.0295871053213
babel precision:  0.0186447752887
babel recall:  0.071619454837
initial_corpus size: 6857
length of s3h results:  12847 length of test set 1871
F1:  0.00937627395026
precision s3h:  0.00537090371293 recall s3h:  0.0368786745056
initial_corpus size: 6857
length of results:  7187 length of test set 1871
F1:  0.0143519540737
precision fusion:  0.00904410741617 recall fusion:  0.0347407803314
for length of results:  14374
F1:  0.0134195136965
precision fusion:  0.00758313621817 recall fusion:  0.058257616248
 for length of results:  21561
F1:  0.0123762376238
precision fusion:  0.00672510551459 recall fusion:  0.0774986638161
 for length of results:  28748
F1:  0.0122799568895
precision fusion:  0.00653958536246 recall fusion:  0.100481026189
 for length of results:  35935
F1:  0.0129080040205
precision f

In [62]:
bab = "philosophy, Philosophic, Philosophical, Philosophical Subdisciplines, Philosophically, Philosophized, Applied philosophy, Branch of philosophy, Branches of philosophy, DefinitionOfPhilosophy, Definitions of philosophy, Filosofy, Philisophical, Philisophy, Philo-sophy, Philosophae, philosopher, Philosophers, Philosophhy, Philosophiae, PhilosophicalSubdisciplines, Philosophicians, Philosophies, Philosophise, Philosophised, Philosophiser, Philosophisers, Philosophises, Philosophising, Philosophize, Philosophizer, Philosophizers, Philosophizes, Philosophizing, PhilosophyAndLogic, Philosphical, Philosphy, Phylosophy, Roman ideals, Sage"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Philosophy'
res_pickle = "results/res__Philosophy"
Phil_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Phil_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 6692
initial_corpus size: 6692
babelnet results size of the topic "Philosophy": 7116
ground truth size 677
intersection with the ground truth: 110
F1 0.0282304632362
babel precision:  0.0154581225408
babel recall:  0.162481536189
initial_corpus size: 6692
length of s3h results:  11104 length of test set 677
F1:  0.0152788388083
precision s3h:  0.00810518731988 recall s3h:  0.1329394387
initial_corpus size: 6692
length of results:  7116 length of test set 677
F1:  0.0218144488644
precision fusion:  0.0119449128724 recall fusion:  0.125553914328
for length of results:  14232
F1:  0.0187806023207
precision fusion:  0.00983698707139 recall fusion:  0.206794682422
 for length of results:  21348
F1:  0.0174347332577
precision fusion:  0.00899381675098 recall fusion:  0.283604135894
 for length of results:  28464
F1:  0.016059846951
precision fusion:  0.00822091062395 recall fusion:  0.34564254062
 for length of results:  35580
F1:  0.0141765727997
precision fusion:  0.00

In [63]:
bab = "ophthalmology, All India Ophthalmological Conference, Clincial ophthalmology, Clincial opthalmology, Clinical ophtalmology, Clinical Ophthalmology, Clinical Opthalmology, General ophthalmic services, Oculists, Oftamology, Ofthamology, Ophthalmic surgeon, Ophthalmologic, Ophthalmological, Ophthalmologicals, Ophthalmologists, Ophthamologist, Ophthamology, Optamology, Opthalmological, Opthalmologist, Opthalmology, Opthamologist, Opthamology, Opthomologist, Society for clincial ophthalmology, Society for clincial opthalmology, Society for clinical ophthalmology, Society for clinical opthalmology, Vision care"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Ophthalmology'
res_pickle = "results/res__Ophthalmology"
Oph_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Oph_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 958
initial_corpus size: 958
babelnet results size of the topic "Ophthalmology": 2096
ground truth size 459
intersection with the ground truth: 25
F1 0.0195694716243
babel precision:  0.011927480916
babel recall:  0.0544662309368
initial_corpus size: 958
length of s3h results:  5793 length of test set 459
F1:  0.00447856685861
precision s3h:  0.0024167098222 recall s3h:  0.0305010893246
initial_corpus size: 958
length of results:  2096 length of test set 459
F1:  0.0133072407045
precision fusion:  0.0081106870229 recall fusion:  0.037037037037
for length of results:  4192
F1:  0.0107503762632
precision fusion:  0.00596374045802 recall fusion:  0.0544662309368
 for length of results:  6288
F1:  0.00800355713651
precision fusion:  0.00429389312977 recall fusion:  0.0588235294118
 for length of results:  8384
F1:  0.00633269252516
precision fusion:  0.00333969465649 recall fusion:  0.0610021786492
 for length of results:  10480
F1:  0.00639912240607
precision fusion: 

In [64]:
bab = "microscopy, 3D-SIM-microscopy, Amateur microscopy, Bioimaging, Infrared microscopy, IR microscopy, Laser microscopy, Light microscopy, Microscopic examination, Microscopically, Microscopist, Oblique illumination, Polarized light microscope"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Microscopy'
res_pickle = "results/res__Microscopy"
Micr_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Micr_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 8418
initial_corpus size: 8418
babelnet results size of the topic "Microscopy": 8547
ground truth size 6819
intersection with the ground truth: 253
F1 0.0329298451126
babel precision:  0.029601029601
babel recall:  0.0371022144009
initial_corpus size: 8418
length of s3h results:  10573 length of test set 6819
F1:  0.0501379944802
precision s3h:  0.0412371134021 recall s3h:  0.0639389939874
initial_corpus size: 8418
length of results:  8547 length of test set 6819
F1:  0.039958349603
precision fusion:  0.035919035919 recall fusion:  0.045021264115
for length of results:  17094
F1:  0.0637310249655
precision fusion:  0.044577044577 recall fusion:  0.111746590409
 for length of results:  25641
F1:  0.0711645101664
precision fusion:  0.045045045045 recall fusion:  0.169379674439
 for length of results:  34188
F1:  0.0751578998708
precision fusion:  0.0450742950743 recall fusion:  0.225986214988
 for length of results:  42735
F1:  0.0747871009404
precision fusion:  0.04

In [65]:
bab = "ceramics, Ceramic art, Art pottery, Art ware, Ceramic artist, Ceramic paint, Ceramics art, Fine art pot, Vase painting"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Ceramics'
res_pickle = "results/res__Ceramics"
Cer_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(Cer_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 8256
initial_corpus size: 8256
babelnet results size of the topic "Ceramics": 8249
ground truth size 1276
intersection with the ground truth: 0
F1 0.0
babel precision:  0.0
babel recall:  0.0
initial_corpus size: 8256
length of s3h results:  8245 length of test set 1276
F1:  0.0212162587963
precision s3h:  0.012249848393 recall s3h:  0.0791536050157
initial_corpus size: 8256
length of results:  8249 length of test set 1276
F1:  0
precision fusion:  0.0 recall fusion:  0.0
for length of results:  16498
F1:  0.0178913019017
precision fusion:  0.00963753182204 recall fusion:  0.12460815047
 for length of results:  24747
F1:  0.0198286131499
precision fusion:  0.010425506122 recall fusion:  0.202194357367
 for length of results:  32996
F1:  0.0196078431373
precision fusion:  0.0101830524912 recall fusion:  0.263322884013
 for length of results:  41245
F1:  0.0185790550551
precision fusion:  0.00957691841435 recall fusion:  0.309561128527


In [67]:
bab = "infection, infectious disease, communicable diseases, contagion, Infectious diseases, Acute infection, AIDS-related bacterial infections, AIDS-related viral infections, Anti-infective, Anti-infectives, Antiinfective, Bacterial Infections, Communicable disease, Contagious diseases, Definition to contagious, Infect, Infecting, Infectiology, Infections, Infectious, Infectious disease epidemiology, Infectious disease medicine, Infectology, Local infection, Primary infection, Rochalimea infections, Secondary infection, Tropical bacterial infections, Tropical infections, Viral Infections, Wound colonization, Wound infection"
bab.replace('&', 'and')
synset = bab.split(',')
topic = 'Infectious Diseases'
res_pickle = "results/Infectious_results.pickle"
InfD_fusion_res = get_fusion_res5(res_pickle, topic, synset)
eval_all(InfD_fusion_res, topic, synset, res_pickle, thresh=0.75)

initial_corpus size: 2588
initial_corpus size: 2588
babelnet results size of the topic "Infectious Diseases": 8343
ground truth size 1375
intersection with the ground truth: 50
F1 0.0102901831653
babel precision:  0.00599304806425
babel recall:  0.0363636363636
initial_corpus size: 2588
length of s3h results:  1957 length of test set 1375
F1:  0.00180072028812
precision s3h:  0.00153295861012 recall s3h:  0.00218181818182
initial_corpus size: 2588
length of results:  8343 length of test set 1375
F1:  0.00967277217534
precision fusion:  0.00563346518039 recall fusion:  0.0341818181818
for length of results:  16686
F1:  0.015060074193
precision fusion:  0.00815054536737 recall fusion:  0.0989090909091
 for length of results:  25029
F1:  0.0168156339948
precision fusion:  0.00886971113508 recall fusion:  0.161454545455
 for length of results:  33372
F1:  0.0166345295997
precision fusion:  0.00865995445283 recall fusion:  0.210181818182
 for length of results:  41715
F1:  0.0157345091669
p