In [1]:
import json

import requests
from tqdm.notebook import tqdm
from collections import Counter

In [2]:
# Every scheme is routed through the same proxy endpoint.
http_proxy = https_proxy = ftp_proxy = "http://172.16.2.30:8080"

# Scheme -> proxy URL mapping.
proxyDict = dict(http=http_proxy, https=https_proxy, ftp=ftp_proxy)

In [15]:
import re
import string

def preprocess(s):
    """Remove digits and ASCII punctuation from `s`, then trim surrounding whitespace.

    Args:
        s: input string.

    Returns:
        The cleaned string. Whitespace inside the string is left untouched.
    """
    withoutDigits = re.sub(r'\d+', '', s)
    punctuationTable = str.maketrans('', '', string.punctuation)
    return withoutDigits.translate(punctuationTable).strip()

def getEntitiesAndSpots(text, rhoThreshold = 0.1, long_text = 0):
    """Annotate `text` with the TagMe web service and count the confident annotations.

    Args:
        text: the text sent to the service as the `text` query parameter.
        rhoThreshold: only annotations whose `rho` is strictly greater than
            this value are kept.
        long_text: forwarded unchanged as the `long_text` query parameter.

    Returns:
        A pair `(spots, entities)`. Each is a list of `(value, count)` tuples in
        first-seen order, built from the `spot` and `title` fields of the kept
        annotations respectively.

    Raises:
        requests.HTTPError: if the service answers with an error status.
        requests.Timeout: if the service does not answer in time.
    """
    import os  # local import: only needed for the optional token override below

    url = 'https://tagme.d4science.org/tagme/tag'
    # NOTE(review): this credential is hard-coded in the notebook. Prefer supplying it
    # through the TAGME_GCUBE_TOKEN environment variable; the literal is kept only as
    # a backward-compatible fallback.
    token = os.environ.get('TAGME_GCUBE_TOKEN', '42aa36f7-4770-4574-8ef8-45138f3ba072-843339462')
    params = {'lang': 'en', 'include_abstract': 'false', 'include_categories': 'true', 'gcube-token': token, 'text': text, 'long_text': long_text}

    # A timeout keeps one stalled request from hanging a whole annotation loop.
    r = requests.get(url = url, params = params, timeout = 60)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    r.raise_for_status()
    data = r.json()

    # A response without an 'annotations' key is treated as "nothing found".
    kept = [annotation for annotation in data.get('annotations', []) if annotation['rho'] > rhoThreshold]

    # Counter preserves first-seen order, so .items() matches the annotation order.
    spots = list(Counter(annotation['spot'] for annotation in kept).items())
    entities = list(Counter(annotation['title'] for annotation in kept).items())
    return spots, entities

In [61]:
# Read the retrieval results from disk. Every line is parsed as JSON, but `data` is
# overwritten on each iteration, so only the object from the last line survives.
with open('./data/papersForEntity.json', 'r') as file:
    for line in file:
        data = json.loads(line)
# Pull out the two result collections stored under these keys.
embeddingResults = data['embeddingResults']
esResults = data['esResults']

In [62]:
# Union of every id that appears in any embedding-based or ES-based result list.
paperIDs = set().union(*embeddingResults, *esResults)

In [64]:
# id -> {'title', 'abstract'} for every paper whose id is in `paperIDs`.
records = {}
PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
i = 0

with open(PapersOutFileName, 'r') as file:
    for line in file:
        # Only the odd-indexed lines (2nd, 4th, ...) are parsed; the others are skipped.
        isParsedLine = (i % 2 == 1)
        i += 1
        if not isParsedLine:
            continue
        data = json.loads(line)
        paperId = data['id']
        if paperId in paperIDs:
            records[paperId] = dict(title=data['title'], abstract=data['abstract'])


In [89]:
# Annotate "title abstract" of every selected paper.
# dictForBody[paperId] = {'entities': entityCounterList, 'spots': spotsCounterList}
dictForBody = {}
for key in tqdm(records.keys()):
    paper = records[key]
    text = ' '.join([paper['title'], paper['abstract']])
    spots, entities = getEntitiesAndSpots(text, long_text = 5, rhoThreshold = 0.12)
    dictForBody[key] = dict(entities = entities, spots = spots)

# Persist the whole mapping as a single JSON document.
with open('./data/BodyEntitiesPerPaper.json', 'w') as outfile:
    outfile.write(json.dumps(dictForBody))

HBox(children=(FloatProgress(value=0.0, max=430.0), HTML(value='')))




In [None]:
# Annotate only the title of every selected paper.
# dictForTitles[paperId] = {'entities': entityCounterList, 'spots': spotsCounterList}
dictForTitles = {}
for key in tqdm(records.keys()):
    text = records[key]['title']
    spots, entities = getEntitiesAndSpots(text, rhoThreshold = 0.1)
    dictForTitles[key] = dict(entities = entities, spots = spots)

# Persist the whole mapping as a single JSON document.
with open('./data/TitleEntitiesPerPaper.json', 'w') as outfile:
    outfile.write(json.dumps(dictForTitles))

HBox(children=(FloatProgress(value=0.0, max=430.0), HTML(value='')))

In [29]:
# Reload the title-level annotations from disk. Each parsed line overwrites
# `dictForTitles`, so only the object from the last line is kept.
with open('./data/TitleEntitiesPerPaper.json', 'r') as file:
    for line in file:
        dictForTitles = json.loads(line)

In [30]:
# One list of entity titles (first element of each stored pair) per paper.
entityList = []
for tmpDict in dictForTitles.values():
    entityList.append([entityTuple[0] for entityTuple in tmpDict['entities']])

# De-duplicate the titles across all papers.
entitySet = set().union(*entityList)

In [31]:
# Reload the body-level annotations; each parsed line overwrites `dictForBody`,
# so only the object from the last line is kept.
with open('./data/BodyEntitiesPerPaper.json', 'r') as file:
    for line in file:
        dictForBody = json.loads(line)

# Add the entity titles from the body file to the existing `entitySet`.
entityList = []
for tmpDict in dictForBody.values():
    entityList.append([entityTuple[0] for entityTuple in tmpDict['entities']])
entitySet.update(*entityList)

In [32]:
# Not the last expression of the cell, so this size is computed but not displayed.
len(entitySet)
# Freeze the de-duplicated titles into a list so they can be sliced into batches.
entityList = [*entitySet]

In [19]:
import tensorflow as tf
import tensorflow_hub as hub
# Location of the encoder module handed to `hub.load`.
module_url = "./module/UnivTrans" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)


def embed(inputText):
    """Call the loaded `model` on `inputText` and return its output unchanged."""
    encoded = model(inputText)
    return encoded

In [35]:
# Batch size: `entityList` is cut into consecutive slices of at most `n` items.
n = 100
entities = []
for start in range(0, len(entityList), n):
    entities.append(entityList[start:start + n])

In [37]:
# Embed every entity batch and write one JSON object per line:
# {'entity': <preprocessed title>, 'embedding': <list of floats>}.
count = 0
with open('./data/entity_USE_Embeddings.json', 'w') as outfile:
    for entitySubList in tqdm(entities):
        # The preprocessed title (digits/punctuation removed) is both embedded and stored.
        entitySubList = [preprocess(entity) for entity in entitySubList]
        embeddings = embed(entitySubList).numpy().tolist()
        for embedding, entity in zip(embeddings, entitySubList):
            outDict = {'entity': entity, 'embedding': embedding}
            count += 1
            json.dump(outDict, outfile)
            outfile.write('\n')

# Sanity check: exactly one line was written per entity.
# (The comparison must be `==`; `assert count = ...` is a SyntaxError.)
assert count == len(entityList)

HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))




## FoS processing

In [5]:
from collections import defaultdict

# Count how many records carry each FoS tag.
# NOTE(review): this loop expects `records` to yield dict-like items with a 'fos' key,
# whereas an earlier cell builds `records` as an id-keyed dict of title/abstract —
# confirm which `records` object is live when this cell runs.
fosCount = defaultdict(int)
fosList = []

for record in tqdm(records):
    for fos in record['fos']:
        fosCount[fos] = fosCount[fos] + 1
    

    

HBox(children=(FloatProgress(value=0.0, max=475839.0), HTML(value='')))




In [6]:
# (count, tag) pairs, highest count first; ties on count are broken by the tag,
# also in descending order.
fosList = sorted(((count, tag) for tag, count in fosCount.items()), reverse=True)

In [7]:
# (len(records) / count, tag) for every tag seen more than 50 times, largest ratio first.
tagIdf = sorted(
    ((len(records) / count, tag) for tag, count in fosCount.items() if count > 50),
    reverse=True,
)

In [8]:
# Number of distinct FoS tags.
len(fosCount)

35325

## Generate FastText embeddings for the keywords

In [10]:
import json
import pandas as pd
import re
import fasttext 
import numpy as np
from numpy import dot
from numpy.linalg import norm


# Path of the fastText binary model, relative to the working directory.
fasttextModelPath = 'crawl-300d-2M-subword.bin'
fasttextModel = fasttext.load_model(fasttextModelPath)







In [11]:
# One JSON object per line: the FoS tag as-is, its fastText vector and its count.
with open('./data/dblp_fos_FT_embeddings.json', 'w') as outfile:
    for count, fos in tqdm(fosList):
        outDict = {
            'fos': fos,
            # Stored as a plain list; convert back with np.asarray when reading.
            'embedding': fasttextModel.get_word_vector(fos).tolist(),
            'count': count,
        }
        outfile.write(json.dumps(outDict) + '\n')

HBox(children=(FloatProgress(value=0.0, max=35325.0), HTML(value='')))




In [12]:
# Phrase-style fastText vectors: punctuation is removed and the remaining words are
# joined with '_' before the lookup. The unmodified tag is what gets stored under 'fos'.
translator = str.maketrans('', '', string.punctuation)
with open('./data/dblp_fos_FT_Phrase_embeddings.json', 'w') as outfile:
    for count, fos in tqdm(fosList):
        # split() already discards leading/trailing whitespace, so no separate
        # strip() is needed (a bare `fos.strip()` would discard its result anyway).
        phraseToken = '_'.join(fos.translate(translator).split())
        # Stored as a plain list; convert back with np.asarray when reading.
        embedding = fasttextModel.get_word_vector(phraseToken).tolist()
        outDict = {'fos': fos, 'embedding': embedding, 'count': count}
        json.dump(outDict, outfile)
        outfile.write('\n')


HBox(children=(FloatProgress(value=0.0, max=35325.0), HTML(value='')))




In [13]:
import tensorflow as tf
import tensorflow_hub as hub

In [14]:
# Same module path and `embed` helper as defined earlier in this notebook; running
# this cell reloads the model and rebinds both names.
module_url = "./module/UnivTrans" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)


def embed(inputText):
    """Call the loaded `model` on `inputText` and return its output unchanged."""
    encoded = model(inputText)
    return encoded

In [15]:
# Sentence-encoder vectors for every FoS tag: punctuation is removed and runs of
# whitespace are collapsed to single spaces before embedding. The unmodified tag is
# what gets stored under 'fos'.
translator = str.maketrans('', '', string.punctuation)
with open('./data/dblp_fos_USE_embeddings.json', 'w') as outfile:
    for count, fos in tqdm(fosList):
        # split() already discards leading/trailing whitespace, so no separate
        # strip() is needed (a bare `fos.strip()` would discard its result anyway).
        cleanedFos = ' '.join(fos.translate(translator).split())
        # Stored as a plain list; convert back with np.asarray when reading.
        embedding = embed([cleanedFos])[0].numpy().tolist()
        outDict = {'fos': fos, 'embedding': embedding, 'count': count}
        json.dump(outDict, outfile)
        outfile.write('\n')


HBox(children=(FloatProgress(value=0.0, max=35325.0), HTML(value='')))




In [151]:
def cosineSimilarity(vecA, vecB):
    """Return the cosine of the angle between two 1-D vectors.

    Uses `dot` and `norm`, which the FastText import cell brings in from numpy.
    No guard is applied for zero-length vectors.
    """
    return dot(vecA, vecB) / (norm(vecA) * norm(vecB))


# Similarity between two near-identical phrases.
cosineSimilarity(fasttextModel.get_word_vector('natural language user interface'),fasttextModel.get_word_vector('natural language interface'))

0.88841444

In [154]:
# Similarity between a short phrase and a longer phrase that contains it.
# NOTE(review): `cosineSimilarity` must already be defined when this cell runs —
# confirm its definition precedes this cell on a fresh kernel.
shortPhraseVector = fasttextModel.get_word_vector('natural language')
longPhraseVector = fasttextModel.get_word_vector('natural language user interface')
cosineSimilarity(shortPhraseVector, longPhraseVector)

0.81671053

In [156]:
import spacy
import pytextrank

# Example text: the abstract of one record.
# NOTE(review): `records` is indexed with the integer 10109 here — confirm which
# `records` object (list vs. id-keyed dict) is live when this cell runs.
text = records[10109]['abstract']

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# Add PyTextRank as the last component of the spaCy pipeline.
# NOTE(review): `pytextrank.TextRank()` with `tr.PipelineComponent` looks like the
# older PyTextRank / spaCy registration style — confirm against the installed versions.
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

# Run the full pipeline (including the textrank component) on the example text.
doc = nlp(text)

# Print every phrase in `doc._.phrases`: rank, count and text on one line,
# followed by the list of chunks for that phrase.
for p in doc._.phrases:
    print("{:.4f} {:5d}  {}".format(p.rank, p.count, p.text))
    print(p.chunks)

0.1129     1  chosen guideline ontology
[chosen guideline ontology]
0.1114     1  text formal representation
[text formal representation]
0.1033     1  semi-structured text
[semi-structured text]
0.1032     1  asbru gem ontologies
[Asbru GEM ontologies]
0.0934     2  digital electronic guideline library
[Digital Electronic Guideline Library, Digital Electronic Guideline Library]
0.0824     1  major tool
[major tool]
0.0755     2  representation
[representation, representations]
0.0704     1  semantic markup
[semantic markup]
0.0676     1  clinicians
[clinicians]
0.0672     1  retrospective assessment
[retrospective assessment]
0.0670     1  gradual conversion
[gradual conversion]
0.0652     1  medical care
[medical care]
0.0647     1  clinical guidelines
[Clinical Guidelines]
0.0630     1  demonstrated feasibility
[demonstrated feasibility]
0.0622     1  domain knowledge
[domain knowledge]
0.0617     1  hybrid meta-ontology
[hybrid meta-ontology]
0.0582     1  asbru
[Asbru]
0.0566     

In [157]:
# Scratch check: rebinding `b` afterwards leaves `a` bound to the original string.
a = b = "hello"
b = "efg"
a

'hello'