This notebook is designed as a prototype for generating a ranked list of topics per article from a ranked list of articles per topic.

In [2]:
import numpy as np
import pandas as pd
import random, pickle, argparse, json, os, urllib2
from collections import OrderedDict
from operator import itemgetter

In [16]:
def query_from(q, f):
    """Fetch one page of hit ids from a JSON search endpoint.

    Parameters
    ----------
    q : str
        Query URL without a ``from`` offset.
    f : int
        Offset of the first hit to return; appended to ``q`` as ``&from=<f>``.

    Returns
    -------
    numpy.ndarray
        1-D object array holding the ``id`` field of every entry of the
        response's ``hits`` list, in response order.
    """
    response = urllib2.urlopen(q + '&from=' + str(f))
    try:
        data = json.load(response)
    finally:
        # Release the connection even when the payload is not valid JSON.
        response.close()
    hits = data['hits']
    # Builtin ``object`` replaces the ``np.object`` alias, which newer NumPy
    # releases no longer provide; the resulting dtype is the same.
    subject_ids = np.empty(len(hits), dtype=object)
    for i, hit in enumerate(hits):
        subject_ids[i] = hit['id']
    return subject_ids

def query(q):
    """Collect the ids of the documents matching ``q`` across result pages.

    A first request reads the ``total`` hit count; the pages are then fetched
    one by one through ``query_from`` and concatenated.

    Parameters
    ----------
    q : str
        Query URL without a ``from`` offset.

    Returns
    -------
    list
        Document ids in the order the pages returned them, capped at
        ``max_pages * page_size`` entries.
    """
    page_size = 1000  # step between page offsets (the query URLs use size=1000)
    max_pages = 10    # maximum number of pages due to the API pagination restriction

    response = urllib2.urlopen(q)
    try:
        data = json.load(response)
    finally:
        response.close()

    # Ceiling division: an exact multiple of ``page_size`` no longer triggers
    # an extra, empty request. ``//`` keeps the result an int on Python 2 and 3.
    nb_requests = (data['total'] + page_size - 1) // page_size
    # Always issue at least one request, never more than the API allows.
    nb_requests = min(max(nb_requests, 1), max_pages)

    pages = [query_from(q, page * page_size) for page in range(nb_requests)]
    return np.hstack(pages).tolist()

def find_intersection(list_a, list_b):
    """Return the elements found in both iterables, as a list.

    Both inputs are turned into sets first, so duplicates are dropped and the
    order of the returned list is arbitrary.
    """
    members_a = set(list_a)
    members_b = set(list_b)
    return list(members_a.intersection(members_b))

# Endpoint plus the opening parentheses shared by every query built below.
_ISTEX_QUERY_PREFIX = 'https://api.istex.fr/document/?q=(('

# Closes the term clause, then appends the filters shared by every query
# (abstract word count, PDF page count, publication date, language, genre)
# and the paging/output options (size=1000, output=id).
_ISTEX_QUERY_FILTERS = ')%20AND%20(qualityIndicators.abstractWordCount:[35%20500]%20AND%20qualityIndicators.pdfPageCount:[3%2060]%20AND%20publicationDate:[1990%202016]%20AND%20language:(%22eng%22%20OR%20%22unknown%22)%20AND%20genre:(%22research_article%22%20OR%20%22conference[eBooks]%22%20OR%20%22article%22%20)%20))&size=1000&output=id'


def _istex_query(term_clause):
    """Wrap an already URL-encoded term clause into a complete query URL."""
    return _ISTEX_QUERY_PREFIX + term_clause + _ISTEX_QUERY_FILTERS


def term2url(string):
    """Encode a term as a quoted phrase for use inside a query URL.

    Spaces become ``%20`` and the whole term is wrapped in ``%22`` (an encoded
    double quote). No other character is escaped.
    """
    return '%22' + string.replace(' ', '%20') + '%22'


def babel_synset(synset):
    """Build the query URL matching any synonym in the title or the abstract.

    Parameters
    ----------
    synset : iterable of str
        Synonyms; each contributes ``title:<syn> OR abstract:<syn>``.

    Returns
    -------
    str
        Query URL with all synonym clauses OR-ed together.

    Raises
    ------
    ValueError
        If ``synset`` is empty. The previous implementation then sliced eight
        characters off the URL prefix and returned a corrupt URL.
    """
    clauses = []
    for syn in synset:
        phrase = term2url(syn)
        clauses.append('title:' + phrase + '%20OR%20abstract:' + phrase)
    if not clauses:
        raise ValueError('synset must contain at least one term')
    return _istex_query('%20OR%20'.join(clauses))


def babel_subj_keyword(topic):
    """Build the query URL matching ``topic`` in ``subject.value`` or ``keywords``."""
    phrase = term2url(topic)
    return _istex_query('subject.value:' + phrase + '%20OR%20keywords:' + phrase)


def babel_title_abst(topic):
    """Build the query URL matching ``topic`` in the title or the abstract."""
    phrase = term2url(topic)
    return _istex_query('title:' + phrase + '%20OR%20abstract:' + phrase)

def babelnet_syn_get_input(topic, synset):
    """Query the API for a topic and split the hits into results and a test set.

    Relies on the notebook-level ``inversed_index`` dict: only ids that are
    keys of that dict are kept.

    Parameters
    ----------
    topic : str
        Topic searched in ``subject.value``/``keywords`` and in title/abstract.
    synset : iterable of str
        Synonyms searched in title/abstract.

    Returns
    -------
    (list, list)
        ``results``: indexed ids matching any synonym in title/abstract.
        ``test``: indexed ids matching the topic as subject/keyword but NOT
        matching it in title/abstract.

    Also prints the number of indexed ids matching the topic in title/abstract.
    """
    known_ids = inversed_index.keys()

    # Same request order as before: synonyms, subject/keyword, title/abstract.
    synset_ids = query(babel_synset(synset))
    subject_ids = query(babel_subj_keyword(topic))
    # A set makes the exclusion test below O(1) per id instead of a list scan.
    title_abstract_ids = set(query(babel_title_abst(topic)))

    results = find_intersection(synset_ids, known_ids)
    held_out = {x for x in subject_ids if x not in title_abstract_ids}
    test = find_intersection(held_out, known_ids)

    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print('initial_corpus size: %d' % len(find_intersection(title_abstract_ids, known_ids)))
    return results, test

def get_fusion_res(s3h_res_pickle, topic, synset, top_n=100000):
    """Fuse a pickled ranking with the ranking returned by the API queries.

    Parameters
    ----------
    s3h_res_pickle : str
        Path of a pickle holding either an ``OrderedDict`` or a sequence of
        ``(istex_id, score)`` pairs, best first.
    topic : str
        Topic forwarded to ``babelnet_syn_get_input``.
    synset : iterable of str
        Synonyms forwarded to ``babelnet_syn_get_input``.
    top_n : int, optional
        Number of leading pickled results to keep (default 100000, the value
        that used to be hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns ``istex_id``, ``3sh_score`` and ``fus_rank``, sorted by
        ascending ``fus_rank``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load result files
    # that come from a trusted source.
    with open(s3h_res_pickle, 'rb') as handle:
        topic_s3h_results = pickle.load(handle)
    if isinstance(topic_s3h_results, OrderedDict):
        # list() keeps the result sliceable on Python 3 as well.
        topic_s3h_results = list(topic_s3h_results.items())
    top_results = topic_s3h_results[:top_n]

    babel_results, _ = babelnet_syn_get_input(topic, synset)
    # Position of every API hit; one dict lookup per row replaces the former
    # scan of the whole list for each row.
    babel_rank = {doc_id: j for j, doc_id in enumerate(babel_results)}

    # Sized from the data so the column assignment below cannot fail when the
    # pickle holds fewer than ``top_n`` entries.
    # Default rank of a row absent from the API hits: position * number of hits.
    # NOTE(review): an unmatched row at position 0 therefore gets rank 0 and
    # sorts first -- confirm this is intended.
    fus = np.arange(len(top_results), dtype=np.int64) * len(babel_results)
    for i, s3h in enumerate(top_results):
        j = babel_rank.get(s3h[0])
        if j is not None:
            # Rows found in both rankings get the floor of the mean position.
            fus[i] = (i + j) // 2

    fusion_df = pd.DataFrame(data=top_results, columns=["istex_id", "3sh_score"])
    fusion_df["fus_rank"] = fus
    return fusion_df.sort_values("fus_rank")

In [14]:
# Loading SDV of ISTEX articles: read the inverted index from disk.
# The context manager closes the file once the JSON has been parsed.
with open('../RecSys_Exp_files/182_381_vec150_results/output_paragraph_inversed_index.json', 'rb') as index_file:
    inv_index = json.load(index_file)
print('original inversed_index')
print(list(inv_index.items())[:3])
# Keys look like 'ISTEX_<id>'; keep only the part after the prefix so they can
# be compared with the ids returned by the API.
inversed_index = {k.split('_')[1]: v for (k, v) in inv_index.items()}
print('processed inversed_index')
print(list(inversed_index.items())[:3])

original inversed_index
[(u'ISTEX_D89FA3AC3521074D46F4245762153DF497BFFA1F', 2002320), (u'ISTEX_18EAF4D6A126B077EB38667801D1B7292F32FF49', 2483732), (u'ISTEX_5F91044435FCC4FABB9F02E31467DCFE75F4A7BE', 1429049)]
processed inversed_index
[(u'FCF1393F9B8136AC08FB67E88F94F3CF62C17288', 3517138), (u'482E1102A1114327A744FD2ADB4D9F8FF7E9A70B', 751643), (u'A81022B6295AE66F68A10222C3B94A06B033C1BA', 3983232)]


In [17]:
# Comma-separated synonym list for the topic.
bab = "nursing, Nursing Science, Staff nurse, Adult nursing, Flightnurse, nurse, Nursing History, Nursing Officer, Nursing practice, Nursing skills, Nursing staff, Nursing Student, Nursing unit, Nurxing, Practice of nursing"
# str.replace returns a new string; the result must be assigned back,
# otherwise the call has no effect.
bab = bab.replace('&', 'and')
# NOTE(review): every term after the first keeps its leading space -- confirm
# the search API ignores it.
synset = bab.split(',')
topic = 'Nursing'
s3h_res_pickle = "results/Nursing_results.pickle"
nursing_AE2TS = get_fusion_res(s3h_res_pickle, topic, synset)

initial_corpus size: 8160


In [21]:
# Preview the first five rows of the nursing fusion table (lowest fus_rank first).
nursing_AE2TS.head(5)

Unnamed: 0,istex_id,3sh_score,fus_rank
28,49630CDB3DC2ABEB1D9209F79CF6424C73171689,1.0,99
165,C4E914079936089EF0B179AFF5A655E3BC2A068B,1.0,106
233,39C8CDD40F50BE64D33FA53857280FDDD017E41A,1.0,127
188,964D0413B5C20E25E268B2BA677B64E0E8EFB3FC,1.0,154
220,D1F760638AB9537A33BB1177EB56B572E95B6663,1.0,169


In [22]:
# Comma-separated synonym list for the topic.
bab = "surgery, operation, surgical operation, surgical procedure, surgical process, Chirurgery, Chirurgical, Chirurgy, Complications of surgery, Corrective surgery, Elliptical excision, Emergency Surgery, Post-operation, Post-operative, Specialties in surgery, Sterile drapes, surgeon, Surgeons, Surgeries, Surgery in general practice, Surgery operation, Surgery specialties, Surgical, Surgical excision, Surgical excision of malignant lesions, Surgical specialties, Surgical technique, Surgically"
# str.replace returns a new string; the result must be assigned back,
# otherwise the call has no effect.
bab = bab.replace('&', 'and')
# NOTE(review): every term after the first keeps its leading space -- confirm
# the search API ignores it.
synset = bab.split(',')
topic = 'Surgery'
s3h_res_pickle = "results/res__Surgery"
Surgery_AE2TS = get_fusion_res(s3h_res_pickle, topic, synset)

initial_corpus size: 8455


In [51]:
# Keep the first 10000 ids of each fused ranking, then count the ids they share.
surgery_lst = Surgery_AE2TS["istex_id"].head(10000).tolist()
nursing_lst = nursing_AE2TS["istex_id"].head(10000).tolist()
len(find_intersection(surgery_lst, nursing_lst))

17

In [45]:
# List the ids shared by the surgery and nursing top lists.
# NOTE(review): the saved output shows 4 ids while the count cell reports 17;
# the execution counters (In [45] vs In [51]) indicate this output is stale --
# re-run the notebook top to bottom to refresh it.
find_intersection(surgery_lst,nursing_lst)

[u'DF75946C62E39EEBCA5CA56A7FFD387985996824',
 u'DA240C2D5DB51ADEE11E610A354B15C00AF64017',
 u'E8D1746D0985822B8886156A3E3F540110772D65',
 u'DBBF3B93D3AE905C37E2CDFDABB55768EB4CE416']

In [33]:
# Comma-separated synonym list for the topic.
bab = "organ transplant, transplant, transplantation, Organ transplantation, Medical Transplantation, Transplant Surgery, Black market organs, First transplant, First transplantation, Intestinal transplant, Live organ transplants, Mixed chimerism, Organ and Tissue Donor, Organ doner, Organ farming, Organ transplantation in different countries, Organ transplantation therapy, Organ transplants, Skin transplant, Tissue transplant, Transplant Tourism and Organ Trafficking, Transplantation medicine, Transplantation surgery, Transplantation therapy, Transplanted organs, Transplantology"
# str.replace returns a new string; the result must be assigned back,
# otherwise the call has no effect.
bab = bab.replace('&', 'and')
# NOTE(review): every term after the first keeps its leading space -- confirm
# the search API ignores it.
synset = bab.split(',')
topic = 'Transplantation'
s3h_res_pickle = "results/res_Transplantation"
Transplantation_AE2TS = get_fusion_res(s3h_res_pickle, topic, synset)

initial_corpus size: 8999


In [49]:
# Keep the first 10000 ids of the transplantation fused ranking.
transplantation_lst = Transplantation_AE2TS["istex_id"].head(10000).tolist()

In [52]:
# Number of ids shared by the transplantation and surgery top lists.
print(len(find_intersection(transplantation_lst, surgery_lst)))


76


In [53]:
# Ids shared by the nursing and transplantation top lists: compute the
# intersection once, print its size, then display it as the cell output.
nursing_transplantation_common = find_intersection(nursing_lst, transplantation_lst)
print(len(nursing_transplantation_common))
nursing_transplantation_common

6


[u'0F7BAEDE8860CC6ADFB22D63051AD01434692BB4',
 u'A4D27148ACEF37BF3CD23175B04B589603F737F7',
 u'7D46620DB62AE8B1E3DB17655413321FAB422763',
 u'191CDEE0F7ABFB096A6E2727656F47FFFB4E6136',
 u'FF1770048446CCE47249FC070E6941A90ABCCE80',
 u'8A776E3C9AC913283F13E71CD78FB388A58827BA']