In [6]:
import json

In [7]:
TEST_JSON = "../data/input/biorxiv_medrxiv/biorxiv_medrxiv/0015023cc06b5362d332b3baf348d11567ca2fbb.json"
TEST_JSON_2 = "../data/input/biorxiv_medrxiv/biorxiv_medrxiv/00340eea543336d54adda18236424de6a5e91c9d.json"


def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened in a ``with`` block so the handle is closed as soon
    as parsing finishes, and it is decoded explicitly as UTF-8 instead of
    the platform default encoding.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


test_json = _load_json(TEST_JSON)

# Each path is parsed on its own, so the four entries are independent dicts
# (two copies of each sample paper), exactly as before.
papers_list = [_load_json(path)
               for path in (TEST_JSON, TEST_JSON_2, TEST_JSON, TEST_JSON_2)]

<h1 style="color:red">Knowledge Graph</h1>

# Setup Code

`!python -m spacy download en_core_web_lg`

In [None]:
!pip install spacy
!python -m spacy download en_core_web_lg

In [104]:
import spacy
import functools
from functools import reduce, partial
import numpy as np
import operator
import requests

In [9]:
import os
import re
import json
import time
import pandas as pd

In [10]:
import warnings
# Install a blanket "ignore" filter: no Python warnings are shown from here on.
# NOTE(review): this presumably also hides the warning that requests emits for
# verify=False calls further down - confirm that silencing it is intended.
warnings.filterwarnings("ignore")

In [11]:
# Part-of-speech labels that qualify a token as an entity; get_entities()
# compares each token's `pos_` attribute against this list.
ENTITY_INDICATORS = ["NOUN", "PROPN"]

## Helper Functions

In [12]:
def rm_dups(relations):
    """Remove duplicate triples from a list, keeping first-seen order.

    Args:
        relations: iterable of hashable items (the triples are tuples).

    Returns:
        A list holding one copy of every distinct item, ordered by first
        appearance in ``relations``.
    """
    # dict keys are unique and keep insertion order. list(set(...)) also
    # removed duplicates, but its order changes from run to run because
    # string hashes are randomised, which made the output irreproducible.
    return list(dict.fromkeys(relations))

In [13]:
def regex_filter(text):
    """Strip space-prefixed numbers and punctuation from ``text``.

    First every run of digits that directly follows a space is deleted
    (together with that space); afterwards every character that is not an
    ASCII letter, a digit, a space or a hyphen is deleted.
    """
    without_numbers = re.sub(r" \d+", "", text)
    cleaned = re.sub(r"[^A-Za-z0-9 -]+", "", without_numbers)
    return cleaned

In [14]:
def construct_name(author):
    """Build a lower-cased "first last" name from an author record.

    ``author`` is a mapping with string values under "first" and "last".
    """
    name_parts = (author[key].lower() for key in ("first", "last"))
    return " ".join(name_parts)

In [15]:
def collect_text(text):
    """Merge a list of text segments into one cleaned string.

    Each element of ``text`` is a mapping whose 'text' entry is passed
    through regex_filter(); the cleaned pieces are joined with one space.
    """
    cleaned_segments = [regex_filter(segment['text']) for segment in text]
    return " ".join(cleaned_segments)
    

In [16]:
def collect_bib_text(bib):
    """Merge the titles of all bibliography entries into one cleaned string.

    ``bib`` is a mapping whose values each carry a 'title' entry; every
    title is passed through regex_filter() and the results are joined with
    one space.
    """
    cleaned_titles = []
    for reference in bib.values():
        cleaned_titles.append(regex_filter(reference['title']))
    return " ".join(cleaned_titles)

In [17]:
def get_entities(text):
    """Return the lower-cased text of every entity token in ``text``.

    ``text`` is an iterable of tokens exposing ``text`` and ``pos_``; a
    token counts as an entity when its ``pos_`` is listed in
    ENTITY_INDICATORS.
    """
    entities = []
    for token in text:
        if token.pos_ in ENTITY_INDICATORS:
            entities.append(token.text.lower())
    return entities

---

In [18]:
def get_relation(nlp, paper):
    """
    Read paper and collect triple relations.

    Args:
        nlp: callable that turns a string into an iterable of tokens
            (each exposing ``text`` and ``pos_``).
        paper: parsed paper dict with the keys 'paper_id', 'metadata'
            (holding 'authors' and 'title'), 'abstract', 'body_text' and
            'bib_entries'.

    Returns:
        A list of ``(entity, paper_id, relation)`` tuples. Duplicates are
        removed within each section (authors, title, abstract, body,
        bibliography) and the sections appear in that order.
    """
    paperid = paper['paper_id']

    def entity_triples(section_text, relation):
        """Tag every entity found in ``section_text`` with ``relation``."""
        return [(entity, paperid, relation)
                for entity in get_entities(nlp(section_text))]

    # Authors come straight from the metadata; they are not run through nlp.
    authors_list = list(map(construct_name, paper["metadata"]["authors"]))
    authors_triples = [(author, paperid, "has_author") for author in authors_list]

    sections = [
        authors_triples,
        entity_triples(paper['metadata']['title'], "has_title"),
        entity_triples(collect_text(paper['abstract']), "has_abstract"),
        entity_triples(collect_text(paper['body_text']), "has_body"),
        # Bibliography entities used to be tagged "has_body" as well, which
        # made them indistinguishable from body-text entities downstream.
        entity_triples(collect_bib_text(paper['bib_entries']), "has_bibliography"),
    ]

    triples = []
    for section_triples in sections:
        triples.extend(rm_dups(section_triples))

    return triples

In [19]:
def get_mesh_relations(kg_data):
    """
    Fill knowledge graph with supporting references from MeSH

    For every row of ``kg_data`` the MeSH term-lookup service is queried
    with the row's entity; each returned label becomes a new triple that
    reuses the row's paper id and relation.

    Args:
        kg_data: DataFrame with the columns 'entity', 'paper_id' and
            'relation'.

    Returns:
        DataFrame with the columns 'entity', 'paper_id' and 'relation'
        holding the MeSH-derived triples (empty when nothing matched).

    Raises:
        requests.HTTPError: if the lookup service answers with an error
            status.
    """
    mesh_url = "https://id.nlm.nih.gov/mesh/lookup/term"
    columns = ["entity", "paper_id", "relation"]
    total = kg_data.shape[0]

    triples = []
    for idx, (_, row) in enumerate(kg_data.iterrows()):
        # `params` lets requests URL-encode the entity instead of splicing
        # raw text into the query string.
        # NOTE(review): verify=False disables TLS certificate checks; it is
        # kept for compatibility but should be dropped if the environment
        # allows it.
        req = requests.get(mesh_url,
                           params={"label": row['entity'],
                                   "match": "contains",
                                   "limit": 5},
                           headers={"content-type": "application/json"},
                           verify=False,
                           timeout=30)
        req.raise_for_status()

        # Parse the body as JSON. The previous eval() executed whatever text
        # the remote end sent back.
        matches = req.json()
        triples.extend((ref['label'].lower(), row['paper_id'], row['relation'])
                       for ref in matches)

        if idx % 100 == 0:
            # Report progress against the frame that was passed in, not a
            # notebook-level global.
            print("        {idx}/{size}".format(idx=idx, size=total))

    # Passing the column names up front also works when no triple was found.
    mesh_df = pd.DataFrame(triples, columns=columns)

    return mesh_df

In [20]:
def create_knowledge_graph(paper_list):
    """Create Knowledge Graph

    Args:
        paper_list: iterable of parsed paper dicts, as accepted by
            get_relation().

    Returns:
        DataFrame with the columns 'entity', 'paper_id' and 'relation':
        the triples extracted from the papers followed by the triples
        added by the MeSH lookup, with a fresh 0..n-1 index.

    Note:
        The spaCy pipeline is read from the notebook-level name ``nlp``,
        which must be defined before this function is called.
    """
    # Construct KG from nlp pipeline. Use the argument that was passed in;
    # the body previously read the notebook-level `papers_list` instead.
    relations = [get_relation(nlp, paper) for paper in paper_list]
    rows = [triple for paper_triples in relations for triple in paper_triples]
    kg_df = pd.DataFrame(rows, columns=["entity", "paper_id", "relation"])

    # fill in KG with connects in MESH ontology
    mesh_df = get_mesh_relations(kg_df)

    # combine the two sets of triples
    return pd.concat([kg_df, mesh_df],
                     axis=0).reset_index(drop=True)
    
    

---

In [None]:
# Load the spaCy model by name; the resulting `nlp` object is the pipeline
# used by the knowledge-graph cells below.
model = "en_core_web_lg"
nlp = spacy.load(model)

In [None]:
# Build a stop-word filter and append it as the last step of the `nlp` pipeline.
# Every value in the 'words' column of the CSV is flagged as a stop word in
# the pipeline's vocabulary.
com_wrds_df = pd.read_csv("../data/external_data/commom_words.csv", header=0)
for word in com_wrds_df.iterrows():
    # iterrows() yields (index, row) pairs, so word[1] is the row itself.
    nlp.vocab[(word[1]['words'])].is_stop = True

def remove_stopwords(doc):
    """Pipeline component that drops stop-word tokens.

    Returns a plain list of the tokens whose ``is_stop`` flag is not set,
    rather than the ``doc`` object it received.
    """
    return [token for token in doc if not token.is_stop]

# NOTE(review): passing a function object to add_pipe looks like the spaCy 2.x
# API - confirm against the installed spaCy version.
nlp.add_pipe(remove_stopwords, name="filter_stopwords", last=True)

In [None]:
# One list of (entity, paper_id, relation) triples per paper.
relations = [get_relation(nlp, paper) for paper in papers_list]

# Concatenate the per-paper lists into one flat list and tabulate it.
all_triples = reduce(operator.concat, relations)
kg_df = pd.DataFrame(all_triples)
kg_df.columns = ["entity", "paper_id", "relation"]

In [None]:
# Bind the MeSH triples to `mesh_df`: that is the name the following concat
# cell reads. They were previously stored as `full_knowledge_graph`, which
# left `mesh_df` undefined on a fresh kernel.
mesh_df = get_mesh_relations(kg_df)

In [None]:
# Stack the paper-derived triples (kg_df) on top of the MeSH-derived ones
# (mesh_df) and renumber the rows from 0.
# NOTE(review): `mesh_df` must already exist at notebook level when this runs.
full_knowledge_graph = pd.concat([kg_df, mesh_df],axis=0).reset_index(drop=True)

In [None]:
# Write the combined triples to a CSV in the working directory, without the
# row index column.
full_knowledge_graph.to_csv("sampleresult1.csv", index=False)

<h1 style="color:red">BERT Embeddings</h1>

In [None]:
!pip install spacy-transformers

In [None]:
!python -m spacy download en_trf_bertbaseuncased_lg

In [None]:
!pip install -U spacy[cuda92]

In [1]:
import spacy
import torch
import numpy

In [2]:
# Ask spaCy to run on a GPU; the recorded output of this cell is False.
spacy.prefer_gpu()

False

In [3]:
# Currently a no-op: the call that would switch torch's default tensor type
# to CUDA is commented out, so nothing happens even when a GPU is detected.
if torch.cuda.is_available():
    pass
    #torch.set_default_tensor_type("torch.cuda.FloatTensor")

In [4]:
# Load the transformer model by name; `bert_nlp` is the pipeline later passed
# to generate_embedddings().
bert_model = "en_trf_bertbaseuncased_lg"
bert_nlp = spacy.load(bert_model)

In [121]:
def generate_embedddings(paper_list, embed_pipeline):
    """Embed the title of every paper and return the vectors as a table.

    Args:
        paper_list: iterable of parsed paper dicts, each providing
            'paper_id' and ['metadata']['title'].
        embed_pipeline: callable that maps a string to an object whose
            ``tensor`` attribute supports ``.sum(axis=0)``.

    Returns:
        DataFrame indexed by 'paper_id' with one row per paper and one
        column per embedding dimension. Empty when ``paper_list`` is empty.
    """

    def get_id_title(paper):
        """Pick the (paper id, title) pair out of a paper dict."""
        return (paper['paper_id'],
                paper['metadata']['title'])

    def get_embed(emb_mdl, text):
        """Return (paper id, title embedding) for an (id, title) pair."""
        # Summing gives us sentence level embedding
        # https://colab.research.google.com/github/explosion/spacy-pytorch-transformers/blob/master/examples/Spacy_Transformers_Demo.ipynb
        return (text[0], emb_mdl(text[1]).tensor.sum(axis=0))

    # processing embeddings - iterate the argument that was passed in; the
    # body previously read the notebook-level `papers_list` instead.
    title_text = map(get_id_title, paper_list)
    title_embeds = [get_embed(embed_pipeline, id_title) for id_title in title_text]

    # converting to dataframe: one row per paper, one column per vector
    # component, indexed by paper id
    paper_ids = [paper_id for paper_id, _ in title_embeds]
    vectors = [vector for _, vector in title_embeds]
    embd_df = pd.DataFrame(vectors, index=pd.Index(paper_ids, name='paper_id'))

    return embd_df

    



In [122]:
# Embed the titles of the sample papers with the BERT pipeline loaded above.
embd_df = generate_embedddings(papers_list, bert_nlp)

In [124]:
# Save the title embeddings, keeping the paper_id index as the first column.
# NOTE(review): nothing in this notebook creates "sandbox_output/" - confirm
# the directory exists before running.
embd_df.to_csv("sandbox_output/bert_title_embeddings.csv", index=True)