### Script Overview
This script creates a toy dataset from INDRA covid19, hosted on emma.indra.bio 

Emma puts together this graph on daily basis via a cron job that pulls in literature, does NER,  train new ML model..
It incorporates daily updates from CORD-19 and also searches the Internet, and runs about 6 text mining systems on those

The script converts the graph to BEL format via pybel library. 
The pybel library can be used to further process the graph and generate toy dataset outputs. 

In [1]:
import sys
import os

In [2]:
sys.path.insert(0,"/home/lani_lichtenstein/indra/")
print(sys.path)


['/home/lani_lichtenstein/indra/', '/usr/lib/python36.zip', '/usr/lib/python3.6', '/usr/lib/python3.6/lib-dynload', '', '/home/lani_lichtenstein/.local/lib/python3.6/site-packages', '/usr/local/lib/python3.6/dist-packages', '/usr/lib/python3/dist-packages', '/home/lani_lichtenstein/.local/lib/python3.6/site-packages/IPython/extensions', '/home/lani_lichtenstein/.ipython']


In [3]:
import getpass
import os
import sys
import time

import matplotlib.pyplot as plt
import pandas as pd
import pykeen
import torch
from pykeen.pipeline import pipeline
import pybel
import pybel_tools
import indra


%matplotlib inline

In [4]:
print(sys.version)

3.6.9 (default, Jul 17 2020, 12:50:27) 
[GCC 8.4.0]


In [5]:
print(time.asctime())

Wed Aug 19 09:40:49 2020


In [6]:
print(getpass.getuser())

lani_lichtenstein


In [7]:
print(indra.__path__) # check using local installation

['/home/lani_lichtenstein/indra/indra']


In [8]:
print(pybel.get_version(with_git_hash=True))

0.14.10-UNHASHED


In [None]:
#import requests
#from indra.statements import stmts_from_json
#from indra.tools import assemble_corpus as ac
#from indra.assemblers.pybel import PybelAssembler
#model_url = 'https://emmaa.s3.amazonaws.com/assembled/covid19/latest_statements_covid19.json'
#stmts_json = requests.get(model_url).json()
#stmts = stmts_from_json(stmts_json)
#filtered_stmts = ac.filter_belief(stmts, 0.9)
#pa = PybelAssembler(filtered_stmts)
#pybel_graph = pa.make_model()

In [None]:
from pybel.io.emmaa import get_statements_from_emmaa
from indra.tools import assemble_corpus as ac
from indra.assemblers.pybel import PybelAssembler

stmts = get_statements_from_emmaa('covid19')
filtered_stmts = ac.filter_belief(stmts, 0.9)

In [None]:
len(filtered_stmts)

In [None]:
from indra.assemblers.html.assembler import _format_evidence_text
from tqdm import tqdm
from indra.statements import Evidence

In [None]:
# explore filtered statements 
filtered_stmts_with_nlp_evidence=[]
for st in tqdm(filtered_stmts):
    
    #print(type(st.evidence[0].text)) # each statement has a list of objects of type Evidence https://indra.readthedocs.io/en/latest/_modules/indra/statements/evidence.html

            
    statement_evidence =_format_evidence_text(st)
    for i, statement_evidence_tmp in enumerate(statement_evidence):
            #st.evidence[i]["text_annotated"]=statement_evidence_tmp
        evjson=st.evidence[i].to_json()                
        evjson['annotations']['text_nlp']=statement_evidence_tmp # create new dict element in annotations    
        evobj=Evidence._from_json(evjson)
        st.evidence[i] = evobj

    filtered_stmts_with_nlp_evidence.append(st)        
    # explore evidence
    # evjson=st.evidence[0].to_json()
    # evannotations=evjson['annotations']
    # for key in evannotations.keys():
     #   print(key)


In [None]:
st=filtered_stmts_with_nlp_evidence[0]
x=st.evidence[0].to_json()
for key in x.keys():
    print(key)
print("\n")
y=x['annotations']
for key in y.keys():
    print(key)

In [None]:
for st in filtered_stmts_with_nlp_evidence:
    x=st.evidence[0].to_json()['annotations']
    for key in x.keys():
        print(key)
    #print(x["text_nlp"])
    
        print("new \n")

In [None]:
pa = PybelAssembler(filtered_stmts_with_nlp_evidence)
pybel_graph = pa.make_model()

In [None]:
# Convert Indra graph to Pybel
#https://emmaa.indra.bio/dashboard/covid19?tab=model

#pybel_covid_graph=pybel.from_emmaa('covid19', date="2020-04-23-17-44-57") 

In [None]:
pybel_graph.summarize() # summarise 

In [None]:
import pickle
pickle.dump(pybel_graph, open( "pybel_graph.p", "wb"))

#### Approach B - Generate Toy Dataset with Raw Text and Evidence

In [None]:
# use local repo cloned from github to access to_triple function
# this is not yet in pypi version, so need to access local cloned location
#sys.path.insert(0,"/home/username/pybel/src/") # If you are using a local version of the file

#from pybel.io.triples import api
# not working - IGNORE
#import imp
#imp.find_module("pybel")
#triples_api = imp.load_source('api', "/home/lani_lichtenstein/pybel/src/pybel/io/triples/api.py")
#import importlib
#importlib.reload(pybel)
pybel.__path__

In [9]:
import pickle
pybel_graph=pickle.load(open( "pybel_graph.p", "rb"))

In [None]:
import logging
from pybel.dsl import BaseConcept
from tqdm import tqdm
uniq_key_list_annotations=[]

for u,v,data in tqdm(pybel_graph.edges(data=True)):
            
    if 'annotations' in data.keys():
        #print("Explore relation \n")
        #print(data['relation'])
        #print("\n")
        annotations=data['annotations']
        #print(annotations)
        
        for key in annotations.keys():
            if key not in uniq_key_list_annotations:
                uniq_key_list_annotations.append(key)
                
        #for key,val in annotations.items():
        #    print(key)
        #    print(type(annotations[key]))


In [None]:
uniq_key_list_annotations


In [None]:
import logging
from pybel.dsl import BaseConcept
from tqdm import tqdm
#from pybel.io.triples import api

uniq_key_list_annotations=[]
column_list=["Source", "Target", "Relation", "Evidence", "Citation", "Text_NLP"]
indra_df=pd.DataFrame(columns=column_list)

for u,v,data in tqdm(pybel_graph.edges(data=True)):

    source='NaN'
    target='NaN'
    evidence='NaN'
    relation='NaN'
    annotations='NaN'
    text_nlp='NaN'
    
    #h,r,t=to_triple(u,v,data) https://github.com/pybel/pybel/blob/master/src/pybel/io/triples/api.py
    
    if isinstance(u, BaseConcept):
        source=u.name
        #source_obo=u.obo
        #print(entity)
        #print(u.name)
        #print(u.obo)
        #print("\n")
        
    if isinstance(v, BaseConcept):
        target=v.name
        
    if 'evidence' in data.keys():  # look also at pybel.has_edge_evidence() 
        #print("Explore evidence \n")
        #print(data['evidence'])
        evidence=data["evidence"]
        #print(type(evidence))
    
    if 'relation' in data.keys():
        #print("Explore relation \n")
        #print(data['relation'])
        #print("\n")
        relation=data['relation']
        
    if 'annotations' in data.keys():
        #print("Explore relation \n")
        #print(data['relation'])
        #print("\n")
        annotations=data['annotations']
        #print(annotations)
        
        for key in annotations.keys():
            if key not in uniq_key_list_annotations:
                uniq_key_list_annotations.append(key)
        
        if 'evidence_annotations' in annotations.keys():
            evidence_annotations=annotations['evidence_annotations']
            if 'text_nlp' in evidence_annotations.keys():
                text_nlp=evidence_annotations['text_nlp']
            
    if 'citation' in data.keys():
        #print("Explore relation \n")
        #print(data['relation'])
        #print("\n")
        citation=data['citation']
        
    tmp=pd.Series([source, target, relation, evidence, citation, text_nlp], index=column_list)
    indra_df=indra_df.append(tmp, ignore_index=True)


In [None]:
import pickle
pickle.dump(indra_df, open( "indra_df.p", "wb"))

In [5]:
import pickle
indra_df=pickle.load(open( "indra_df.p", "rb"))

In [10]:
# explore
indra_df.shape
indra_df.head()

Unnamed: 0,Source,Target,Relation,Evidence,Citation,Text_NLP
0,MS,Mucins,directlyIncreases,"In the current study, we investigated the NanS...","{'db': 'PubMed', 'db_id': '30340996'}","{'source_api': 'reach', 'pmid': '30340996', 't..."
1,MS,Mucins,directlyIncreases,"In the current study, we investigated the NanS...","{'db': 'Other', 'db_id': 'reach:Unknown'}","{'source_api': 'reach', 'pmid': '30340996', 't..."
2,MS,Mucins,directlyIncreases,"In the current study, we investigated the NanS...","{'db': 'PubMed', 'db_id': '30340996'}","{'source_api': 'reach', 'pmid': '30340996', 't..."
3,MS,MUC3B,increases,MS analysis of peptides recovered from an in-g...,"{'db': 'PubMed', 'db_id': '23118947'}","{'source_api': 'reach', 'pmid': '32214933', 't..."
4,MS,MUC3B,increases,MALDI MS allows the identification of higher m...,"{'db': 'PubMed', 'db_id': '32214933'}","{'source_api': 'reach', 'pmid': '32214933', 't..."


In [None]:
#pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz

In [6]:
import spacy
import scispacy
import en_core_sci_md
from spacy.matcher import Matcher
import numpy as np
from tqdm import tqdm

In [7]:

# correct code
# nlp=spacy.load("en_core_sci_md")

# problem with not recognising path
nlp=spacy.load("/home/lani_lichtenstein/.local/lib/python3.6/site-packages/en_core_sci_md/en_core_sci_md-0.2.5/")

In [13]:
indra_df.shape

(133401, 6)

In [103]:

def get_text_position(txt_nlp, position_flag):
    ''' Function get cell position for source, target start and stop'''
    
    matcher = Matcher(nlp.vocab, validate=True)

    nlp_doc = txt_nlp
    new_doc = txt_nlp.text
    new_doc_nlp=[]

    names = []
    
    if position_flag== 0: # source start
        pattern = [{"TEXT": {"REGEX": "<"}}, {"TEXT": "span"}, {"TEXT": "class=\"badge"}, {"TEXT": "badge-subject\""},{"TEXT": {"REGEX": ">"}}]
    elif position_flag == 1:
        pattern = [{"TEXT": {"REGEX": "<"}}, {"TEXT": "/span"}, {"TEXT": ">"}]
    elif position_flag == 2:
        pattern = [{"TEXT": {"REGEX": "<"}}, {"TEXT": "span"}, {"TEXT": "class=\"badge"}, {"TEXT": "badge-object\""},{"TEXT": {"REGEX": ">"}}]
    elif position_flag == 3:
        pattern = [{"TEXT": {"REGEX": "<"}}, {"TEXT": "/span"}, {"TEXT": ">"}]

    matcher.add('source_start', None, pattern) 
    matches = matcher(nlp_doc) 

    for match_id, start, end in matches[0:1]: # just use first match 
        span = nlp_doc[start:end] 
        names.append(span.text) 

    for name in names: 
        new_doc = new_doc.replace(name,'',1) # replace first instance only
        new_doc = new_doc.replace('  ',' ') # replace double whitespace with one whitespace
        new_doc_nlp=nlp(new_doc)
    
    return(matches,new_doc_nlp)

In [78]:
indra_df_new=indra_df.copy()

indra_df_new['source_start']=np.NaN
indra_df_new['source_end']=np.NaN
indra_df_new['target_start']=np.NaN
indra_df_new['target_end']=np.NaN
indra_df_new['annotation_text']=None

In [79]:
indra_df_new.columns

Index(['Source', 'Target', 'Relation', 'Evidence', 'Citation', 'Text_NLP',
       'source_start', 'source_end', 'target_start', 'target_end',
       'annotation_text'],
      dtype='object')

In [87]:


for i in tqdm(range(indra_df.shape[0])): 
    #print(i)

    if indra_df.Text_NLP[i]=="NaN" or 'text' not in indra_df.Text_NLP[i].keys():
        continue
        
    txt=indra_df.Text_NLP[i]['text']
            
    source_start=np.NaN
    source_end=np.NaN
    target_start=np.NaN
    target_end=np.NaN

    if txt == None:
        continue    
    txt=txt.replace("</span", " </span") # so tokenizer can split the makrker of the source/target span into separate token
    txt_nlp=nlp(txt)
    evidence_nlp=nlp(indra_df.Evidence[i])

    
    # Two cases - source appears first, target appears first. 
    matches,new_doc_nlp=get_text_position(txt_nlp=txt_nlp,position_flag=0) # source start
    if len(matches) > 0:
        source_start=matches[0][1]
    matches,new_doc_nlp=get_text_position(txt_nlp=txt_nlp,position_flag=2) # source start
    if len(matches) > 0:
        target_start=matches[0][1]

    if source_start < target_start:
        option_start="A"
    else:
        option_start="B"
    
    if option_start == "A":
        
        # get source start
        if txt_nlp==[]: # empty list
            continue
        matches,new_doc_nlp=get_text_position(txt_nlp=txt_nlp,position_flag=0)
        if len(matches) > 0:
            source_start=matches[0][1]

        # get source end 
        if new_doc_nlp==[]:
            continue
        matches,new_doc_nlp=get_text_position(txt_nlp=new_doc_nlp,position_flag=1)
        if len(matches) > 0:
            source_end=matches[0][1] 

        # get target start
        if new_doc_nlp==[]:
            continue
        matches,new_doc_nlp=get_text_position(txt_nlp=new_doc_nlp,position_flag=2) # get target start
        if len(matches) > 0:
            target_start=matches[0][1]

        # get target end
        if new_doc_nlp==[]:
            continue
        matches,new_doc_nlp=get_text_position(txt_nlp=new_doc_nlp,position_flag=3)
        if len(matches) > 0:
            print(matches)
            target_end=matches[0][1]

    if option_start == "B": # target appears before source in text

        # get target start
        if txt_nlp==[]: # empty list
            continue
        matches,new_doc_nlp=get_text_position(txt_nlp=txt_nlp,position_flag=2) # pick up target pattern
        if len(matches) > 0:
            target_start=matches[0][1]

        # get target end 
        if new_doc_nlp==[]:
            continue
        matches,new_doc_nlp=get_text_position(txt_nlp=new_doc_nlp,position_flag=3) # pick up '</span>' to end target
        if len(matches) > 0:
            print(target_start)
            print(matches)
            target_end=matches[0][1]

        # get source start
        if new_doc_nlp==[]:
            continue
        matches,new_doc_nlp=get_text_position(txt_nlp=new_doc_nlp,position_flag=0) # get source start
        if len(matches) > 0:
            source_start=matches[0][1]

        # get source end
        if new_doc_nlp==[]:
            continue
        matches,new_doc_nlp=get_text_position(txt_nlp=new_doc_nlp,position_flag=1)  # pick up '</span>' to end source
        if len(matches) > 0:
            source_end=matches[0][1]

    indra_df_new.loc[i,"annotation_text"] = new_doc_nlp.text
    indra_df_new.loc[i,"source_start"] = source_start
    indra_df_new.loc[i,"source_end"] = source_end
    indra_df_new.loc[i,"target_start"] = target_start
    indra_df_new.loc[i,"target_end"] = target_end


  0%|          | 2/133401 [00:00<4:32:43,  8.15it/s]

22
[(10708931376594966169, 23, 26), (10708931376594966169, 50, 53)]
22
[(10708931376594966169, 23, 26), (10708931376594966169, 50, 53)]


  0%|          | 4/133401 [00:00<4:31:43,  8.18it/s]

22
[(10708931376594966169, 23, 26), (10708931376594966169, 50, 53)]
[(10708931376594966169, 10, 13)]


  0%|          | 6/133401 [00:00<4:34:21,  8.10it/s]

[(10708931376594966169, 10, 13)]
[(10708931376594966169, 10, 13)]


  0%|          | 8/133401 [00:00<4:35:36,  8.07it/s]

[(10708931376594966169, 8, 11)]
[(10708931376594966169, 8, 11)]


  0%|          | 10/133401 [00:01<4:38:02,  8.00it/s]

[(10708931376594966169, 8, 11)]
[(10708931376594966169, 8, 11)]


  0%|          | 12/133401 [00:01<4:28:09,  8.29it/s]

[(10708931376594966169, 8, 11)]
[(10708931376594966169, 27, 30)]


  0%|          | 14/133401 [00:01<4:13:02,  8.79it/s]

[(10708931376594966169, 27, 30)]
[(10708931376594966169, 27, 30)]
[(10708931376594966169, 5, 8)]


  0%|          | 16/133401 [00:01<3:53:12,  9.53it/s]

[(10708931376594966169, 5, 8)]
[(10708931376594966169, 5, 8)]


  0%|          | 19/133401 [00:02<3:31:21, 10.52it/s]

[(10708931376594966169, 25, 28)]
[(10708931376594966169, 25, 28)]


  0%|          | 22/133401 [00:02<4:18:56,  8.58it/s]

[(10708931376594966169, 25, 28)]
[(10708931376594966169, 25, 28)]


  0%|          | 24/133401 [00:02<4:30:55,  8.21it/s]

[(10708931376594966169, 7, 10), (10708931376594966169, 24, 27), (10708931376594966169, 36, 39)]
[(10708931376594966169, 7, 10), (10708931376594966169, 24, 27), (10708931376594966169, 36, 39)]


  0%|          | 25/133401 [00:02<4:21:51,  8.49it/s]

[(10708931376594966169, 7, 10), (10708931376594966169, 24, 27), (10708931376594966169, 36, 39)]





KeyboardInterrupt: 

In [88]:
indra_df_new.columns

Index(['Source', 'Target', 'Relation', 'Evidence', 'Citation', 'Text_NLP',
       'source_start', 'source_end', 'target_start', 'target_end',
       'annotation_text'],
      dtype='object')

In [89]:
indra_df_new.head()

Unnamed: 0,Source,Target,Relation,Evidence,Citation,Text_NLP,source_start,source_end,target_start,target_end,annotation_text
0,MS,Mucins,directlyIncreases,"In the current study, we investigated the NanS...","{'db': 'PubMed', 'db_id': '30340996'}","{'source_api': 'reach', 'pmid': '30340996', 't...",41.0,42.0,22.0,23.0,"In the current study, we investigated the NanS..."
1,MS,Mucins,directlyIncreases,"In the current study, we investigated the NanS...","{'db': 'Other', 'db_id': 'reach:Unknown'}","{'source_api': 'reach', 'pmid': '30340996', 't...",41.0,42.0,22.0,23.0,"In the current study, we investigated the NanS..."
2,MS,Mucins,directlyIncreases,"In the current study, we investigated the NanS...","{'db': 'PubMed', 'db_id': '30340996'}","{'source_api': 'reach', 'pmid': '30340996', 't...",41.0,42.0,22.0,23.0,"In the current study, we investigated the NanS..."
3,MS,MUC3B,increases,MS analysis of peptides recovered from an in-g...,"{'db': 'PubMed', 'db_id': '23118947'}","{'source_api': 'reach', 'pmid': '32214933', 't...",1.0,2.0,9.0,10.0,MALDI MS allows the identification of higher m...
4,MS,MUC3B,increases,MALDI MS allows the identification of higher m...,"{'db': 'PubMed', 'db_id': '32214933'}","{'source_api': 'reach', 'pmid': '32214933', 't...",1.0,2.0,9.0,10.0,MALDI MS allows the identification of higher m...


In [97]:
#Check example - at row 22 
#nlp.tokenizer.explain(indra_df_new.annotation_text[3])

#nlp(indra_df_new.Evidence[22])
start=indra_df_new.source_start[22]
end=indra_df_new.source_end[22]
nlp(indra_df_new['annotation_text'][22])[start:end]



mucin

In [99]:

#nlp(indra_df_new.Evidence[22])
start=indra_df_new.target_start[22]
end=indra_df_new.target_end[22]
nlp(indra_df_new['annotation_text'][22])[start:end]


biofilm formation

In [96]:
indra_df_new['Text_NLP'][22]['text']

'Our finding that <span class="badge badge-subject">mucin</span> decreases <span class="badge badge-object">biofilm formation</span> is consistent with studies, which showed that <span class="badge badge-subject">mucin</span> significantly decreased <span class="badge badge-object">biofilm formation</span> by Streptococcus mutans and P. aeruginosa (Haley et al., 2014, Frenkel and Ribbeck, 2015).'

In [100]:
indra_df.to_csv("indra_covid_toy_dataset_raw_evidence_high_belief.csv",index=False,sep="\t",header=False)

# OLD CODE BELOW

#### Approach A - Generate Triples

One approach to generating a toy dataset is to generate triples. 
Triples can be used to generate knowledge graph embeddings. 
They also contain grounded source and target identifiers, as well as details relation descriptions. 

This is not obtained using Approach B - Generate Raw Data with Evidence

In [None]:
import pickle
pybel_graph=pickle.load(open( "pybel_graph.p", "rb"))

In [None]:
import pybel.io.tsv.api

triples=pybel.io.tsv.api.get_triples(pybel_graph)

In [None]:
import numpy as np
triples = np.array(triples)

In [None]:
triples_df=pd.DataFrame(triples)

In [None]:
triples_df.to_csv("indra_covid_toy_dataset_triples.csv",index=False,sep="\t",header=False)

In [None]:
matches # position is matches[1] (token start)
source_start=matches[0][1]
source_start

In [None]:
new_doc_nlp[5] # position of matches
#nlp.tokenizer.explain(new_doc_nlp.text)

In [19]:
# now going to find END of source
def get_source_end(txt_nlp):
    
    matcher = Matcher(nlp.vocab, validate=True)

    nlp_doc = new_doc_nlp 
    new_doc = new_doc_nlp.text

    names = []

    #pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}] 
    pattern = [{"TEXT": {"REGEX": "<"}}, {"TEXT": "/span"}, {"TEXT": ">"}]
    #pattern = [{"TEXT": {"REGEX": "<"}}]
    #{"TEXT": {"REGEX": "<"}

    matcher.add('end_source', None, pattern) 
    matches = matcher(nlp_doc) 

    for match_id, start, end in matches[0:1]: # just replacing first "</span>". Not replacing the end of the target for now. 
        span = nlp_doc[start:end] 
        names.append(span.text) 

    for name in names: 
        new_doc = new_doc.replace(name,'',1) # only replace first occurence
        new_doc = new_doc.replace('  ',' ') # replace double whitespace with one whitespace
        new_doc_nlp=nlp(new_doc)

In [None]:

source_end=matches[0][1] # assume only one element in matches


In [None]:
source_end

In [None]:
#evidence_nlp[source_start:source_end] # good pi
#nlp.tokenizer.explain(new_doc_nlp.text)  # there is a "blank token in here that doesnt get shown"

new_doc_nlp[3]
#evidence_nlp[1]

In [None]:
## Now for target 
#nlp.tokenizer.explain(new_doc_nlp.text)

In [None]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab, validate=True)

nlp_doc = new_doc_nlp
new_doc = new_doc_nlp.text

names = []
pattern = [{"TEXT": {"REGEX": "<"}}, {"TEXT": "span"}, {"TEXT": "class=\"badge"}, {"TEXT": "badge-object\""},{"TEXT": {"REGEX": ">"}}]

matcher.add('target_start', None, pattern) 
matches = matcher(nlp_doc) 

for match_id, start, end in matches: 
    span = nlp_doc[start:end] 
    names.append(span.text) 

for name in names: 
    new_doc = new_doc.replace(name,'')
    new_doc_nlp=nlp(new_doc)

In [None]:
target_start=matches[0][1]  # subtract 1, because first match of "<" is treated as prefix, not a token
target_start

In [None]:
new_doc_nlp

In [None]:
# now going to find END of target
matcher = Matcher(nlp.vocab, validate=True)

nlp_doc = new_doc_nlp 
new_doc = new_doc_nlp.text

names = []

#pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}] 
pattern = [{"TEXT": {"REGEX": "<"}}, {"TEXT": "/span"}, {"TEXT": ">"}]
#pattern = [{"TEXT": {"REGEX": "<"}}]
#{"TEXT": {"REGEX": "<"}

matcher.add('end_source', None, pattern) 
matches = matcher(nlp_doc) 

for match_id, start, end in matches[0:1]: # just replacing first "</span>". Not replacing the end of the target for now. 
    span = nlp_doc[start:end] 
    names.append(span.text) 

for name in names: 
    new_doc = new_doc.replace(name,'')
    new_doc = new_doc.replace('  ',' ')
    new_doc_nlp=nlp(new_doc)

In [None]:
target_end=matches[0][1]

In [None]:
evidence_nlp[target_start:target_end]

In [None]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
#pattern = [{"LOWER": "\<span class=\"badge badge-subject\"\>"}]
pattern = [{"LOWER": "\<"}, {"LOWER": "span"}, {"LOWER": "\"badge"}]
pattern = [{"LOWER": "\<"}]

matcher.add("HelloWorld", None, pattern)
doc = nlp("hello world!")
matches = matcher(doc)

In [None]:
matches

In [None]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)
doc = nlp("hello world!")
matches = matcher(doc)

In [None]:
matches

In [None]:
##OLD CODE
# get source end 
        if new_doc_nlp==[]:
            continue
            
        #  Sometimes object (target) appears before subject (source
        # To ensure we match the right '</span>' marking end of the source (not the target)
        # just pass in text from source start until the end of the text. 
        # If src before trgt, then first '</span>' is replaced withut a problem
        # If trgt before src, then we cut out this first part of the sentnce, so wont impact the target end '</span>' marker
        new_doc_nlp_end=new_doc_nlp[source_start:len(new_doc_nlp.text)]
        matches,new_doc_nlp_end=get_text_position(txt_nlp=new_doc_nlp_end,position_flag=1)

        # now, need to piece together original doc, + returned doc..
        if new_doc_nlp_end==[]:
            continue

        new_doc_nlp1=nlp(new_doc_nlp[0:source_start].text + new_doc_nlp_end.text)

        if len(matches) > 0:
            source_end=source_start+matches[0][1] # need to offset the matches position by location of source start

        # get target start
        #print("before target start")
        #print(new_doc_nlp)
        if new_doc_nlp1==[]:
            continue
        matches,new_doc_nlp=get_text_position(txt_nlp=new_doc_nlp1,position_flag=2) # get target start
        if len(matches) > 0:
            target_start=matches[0][1]

     

In [None]:
# to add triples - read in api.py module and use to_triple
#https://github.com/pybel/pybel/blob/master/src/pybel/io/triples/api.py
