Load the list of open-access PMIDs (one ID per line) from a text file.

In [5]:
import os
from subprocess import call

# Path to the text file listing the PMIDs to process, one per line.
pmid_file = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/pmids.txt"

# Read every line and strip surrounding whitespace (including the newline).
with open(pmid_file) as f:
    pmids = [line.strip() for line in f]

Shell commands to build the zipped bundle. 

In [None]:
# echo -n application/vnd.wf4ever.robundle+zip > mimetype
# zip -0 -X ../reach mimetype
# zip -X -r ../reach . -x mimetype


Convert the sentences from each paper processed by SciDT into simple sentences for each Figure assignment.

In [28]:
import pandas as pd
from nltk.corpus import stopwords
import re

def retrieve_sentences_for_modeling(inFile, fid):
    """Group the sentences of one tab-separated file by their figure assignment.

    Parameters
    ----------
    inFile : str
        Path to a tab-separated file with (at least) the columns
        'SentenceId', 'Paragraph', 'Sentence Text', 'Discourse Type',
        'Figure Assignment', 'Offset_Begin' and 'Offset_End'.
    fid : str
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    dict
        Maps each figure code to the list of sentence dicts (keys 'sid',
        'pid', 'start', 'end', 'text', 'discourse_types') tagged with it,
        in file order. A 'Figure Assignment' value may hold several codes
        separated by '|'; the sentence is then listed under each of them.
        Rows with an empty figure assignment are skipped.
    """
    tsv = pd.read_csv(inFile, sep='\t')
    fig_tagged_sentences = {}

    for _, row in tsv.iterrows():
        fig = row['Figure Assignment']
        # Empty cells are read as NaN by pandas; such rows have no figure.
        if pd.isnull(fig):
            continue

        for f in fig.split('|'):
            # A fresh dict per figure so the per-figure lists share no state.
            fig_tagged_sentences.setdefault(f, []).append({
                'sid': row['SentenceId'],
                'pid': row['Paragraph'],
                'start': row['Offset_Begin'],
                'end': row['Offset_End'],
                'text': row['Sentence Text'],
                'discourse_types': row['Discourse Type'],
            })

    return fig_tagged_sentences

In [53]:
tsv_dir = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/scidt_fries_bioc_tsv4"
sentence_dir = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/fig_sentences"

# For every <pmid>.tsv under tsv_dir whose PMID is in `pmids`, write one text
# file per figure (<pmid>_<fig>.txt) holding that figure's sentences, one per line.
for root, dirs, files in os.walk(tsv_dir):
    for file in files:
        tsv_path = os.path.join(root, file)
        if not (os.path.isfile(tsv_path) and file.endswith('.tsv')):
            continue

        pmid = file[:-4]
        if pmid not in pmids:
            continue

        print(pmid)
        fig_tagged_sentences = retrieve_sentences_for_modeling(tsv_path, pmid)
        for fig, sentences in fig_tagged_sentences.items():
            out_path = os.path.join(sentence_dir, pmid + '_' + fig + '.txt')
            # `with` closes the file even if a write raises.
            with open(out_path, 'w') as out:
                for sent_hash in sentences:
                    out.write(sent_hash['text'] + '\n')

10087260
10087263
10087265
10209036
10225955
10366597
10366599
10385523
10385526
10402465
10429675
10545507
10562275
10562277
10562279
10562288
10601328
10601346
10613896
10620603
10629222
10648568
10662770
10684247
10704439
10704444
10704446
10725331
10725334
10747088
10747089
10790433
10811823
10831611
10859335
10864201
10871282
10900456
10931856
10931877
10953014
10974003
10995436
11018051
11018064
11034606
11038172
11038182
11076969
11086001
11134073
11149930
11157975
11157979
11157984
11181702
11257119
11266443
11266449
11266451
11309418
11401320
11402059
11448995
11502761
11514608
11564755
11570975
11571312
11591728
11591731
11684708
11724822
11739401
11739402
11739404
11747467
11756480
11777938
11846885
11877480
11914126
11916981
11927603
11927608
12011112
12147674
12167173
12199906
12421467
12446742
12473693
12486103
12486115
12507995
12527750
12566426
12642614
12682088
12689351
12719471
12771128
12782684
12847081
12900395
12939254
14517205
14568990
14612908
14638857
14707117
1

Functions to simplify INTACT records from their standard XML into TSV format.  

In [26]:
from bs4 import BeautifulSoup
import re

def build_figure_extraction_patterns():
    """Compile the regular expressions used to parse figure/table references.

    Returns
    -------
    dict
        Maps a pattern name to a compiled regex, except for the key
        'figPatt', which maps to a list of compiled regexes. The bracketed
        numbers in the comments below are the capture groups that hold the
        figure codes for that pattern.

    All fragments are raw strings so that regex escapes such as ``\\s`` and
    ``\\d`` are not treated as (invalid) Python string escapes.
    """
    # Optional figure prefix: 'f', 'fig', 'fig.', 'figs', 'figure', ... (1 group).
    bf = r"\s*f(igs|igs\.|ig|ig\.|igure|\.|ig\:){0,1}"
    # A figure code: number plus optional sub-figure letters (1 group).
    d = r"\s*(\d+\s*[\.\;\,]{0,1}\s*[a-z]*)\s*\.{0,1}\s*"
    # Same, but with the number and the letters captured separately (2 groups).
    d_split = r"\s*(\d*)\s*[\.\;\,]{0,1}\s*([a-z]*)"
    # A lettered interval such as '1a-c' (3 groups).
    interval = r"\s*(\d+)([a-z]+)\-([a-z]+)"

    sep = r"[\;\,]"                      # list separator
    conj = r"\s+and\s+"                  # the word 'and'
    letter = r"([a-z])"                  # a single sub-figure letter
    table = r"t(ab\.|ab|able){0,1}"      # 't', 'tab', 'tab.', 'table'
    # Figure code followed by optional punctuation, used by the 'figPatt' prefixes.
    fig_code = r"\s*(\d+\s*[\.\;\,]{0,1}\s*[a-z]*)[\,\;\.]{0,1}"

    def compiled(*parts):
        """Compile the concatenation of the given regex fragments."""
        return re.compile("".join(parts))

    def listed(count):
        """Return `count` figure codes joined by the list separator."""
        return sep.join([d] * count)

    pattHash = {}

    # 0. No alphanumeric codes at all: 'Figure. 1; more text'
    pattHash['figPatt'] = [
        compiled("^", bf, d, "$"),
        compiled("^", bf, fig_code, r"\s*t"),
        compiled("^", bf, fig_code, r"\s*s"),
        compiled("^", bf, fig_code, r"\s+and\s+s"),
    ]

    # [1]
    pattHash['simplePatt'] = compiled("^", d, "$")
    # [2,4]
    pattHash['space2Patt'] = compiled("^", bf, d, r"\s+", bf, d, "$")
    # [2,4,6]
    pattHash['space3Patt'] = compiled("^", bf, d, r"\s+", bf, d, r"\s+", bf, d, "$")
    # [2,4]
    pattHash['fullComma2Patt'] = compiled("^", bf, d, sep, bf, d, "$")
    # [2,3]
    pattHash['comma2Patt'] = compiled("^", bf, listed(2), "$")
    # [1,2]
    pattHash['simpleComma2Patt'] = compiled("^", listed(2), "$")
    # [2,3,4]
    pattHash['comma3Patt'] = compiled("^", bf, listed(3), "$")
    # [1,2,3]
    pattHash['simpleComma3Patt'] = compiled("^", listed(3), "$")
    # [2,3,4,5]
    pattHash['comma4Patt'] = compiled("^", bf, listed(4), "$")
    # [2,3,4,5,6]
    pattHash['comma5Patt'] = compiled("^", bf, listed(5), "$")
    # [1,2,3,4]
    pattHash['simpleComma4Patt'] = compiled("^", listed(4), "$")
    # [2,3]
    pattHash['and2Patt'] = compiled("^", bf, d, conj, d, "$")
    # [1,2]
    pattHash['simpleAnd2Patt'] = compiled("^", d, conj, d, "$")
    # [1,2,3]
    pattHash['simple_a_and_b_patt'] = compiled("^", d_split, conj, letter, "$")
    # [2,3,4]
    pattHash['a_and_b_patt'] = compiled("^", bf, d_split, conj, letter, "$")
    # [1,2,3]
    pattHash['simple_a_comma_b_patt'] = compiled("^", d_split, sep, r"\s*", letter, "$")
    # [2,3,4]
    pattHash['a_comma_b_patt'] = compiled("^", bf, d_split, sep, r"\s*", letter, "$")
    # [1,2,3,4]
    pattHash['simple_a_comma_b_comma_c_patt'] = compiled(
        "^", d_split, sep, r"\s*", letter, r"\s*", sep, r"\s*", letter, "$")
    # [2,3,4,5]
    pattHash['a_comma_b_comma_c_patt'] = compiled(
        "^", bf, d_split, sep, r"\s*", letter, r"\s*", sep, r"\s*", letter, "$")
    # [2,3,4,5]
    pattHash['a_b_and_c_patt'] = compiled("^", bf, d_split, sep, r"\s+", letter, conj, letter, "$")
    # [1,2,3,4]
    pattHash['simple_a_b_and_c_patt'] = compiled("^", d_split, sep, r"\s+", letter, conj, letter, "$")

    # Table reference followed by a figure reference; figure code is group 3.
    pattHash['tableFigPatt'] = compiled("^", table, ".*", bf, d, "$")

    # Lettered interval: number is group 2, first/last letters are groups 3 and 4.
    pattHash['intervalPatt'] = compiled("^", bf, interval, "$")

    # simple single table (table 1, t1, tab. 1a)
    # returned value is second group
    pattHash['tablePatt'] = compiled("^", table, r"\s*([\di]+[a-z]{0,1})[\,\;\.]{0,1}$")

    # simple single supplementary table (supp. table 1, supplementary table 2a, st1)
    # returned value is third group
    # The prefix alternation previously read 'upp|upp.|lementary', which could
    # never match the word 'supplementary' and left the dot unescaped.
    pattHash['suppTablePatt'] = compiled(
        r"^s(upplementary|upp\.|upp){0,1}\s*", table, r"\s*([i\d]+[a-z]{0,1})[\,\;\.]{0,1}$")

    return pattHash

def run_simple_matcher(fig_text, patt_hash, patt_code, groups=[1]):
    """Search `fig_text` with the pattern stored under `patt_code`.

    Returns the captured text of each group index in `groups` (in the given
    order) as a list, or None when the pattern does not match.
    """
    found = re.search(patt_hash.get(patt_code), fig_text)
    if not found:
        return None
    return [found.group(index) for index in groups]

def build_matched_string(matched_list,code):
    """Join the matched codes into one '|'-separated string.

    Each entry has spaces and full stops removed and is prefixed with `code`.
    An empty `matched_list` yields an empty string.
    """
    joined = ""
    for raw in matched_list:
        token = code + raw.replace(" ", "").replace(".", "")
        # Only put a separator in front once something has been emitted.
        joined = joined + '|' + token if len(joined) > 0 else joined + token
    return joined

def run_matcher(fig_text, patt_hash):
    """Normalise a free-text figure/table reference into canonical codes.

    Parameters
    ----------
    fig_text : str
        The reference text; callers in this notebook pass it lower-cased.
    patt_hash : dict
        Pattern table as returned by build_figure_extraction_patterns().

    Returns
    -------
    str or None
        '|'-separated codes prefixed with 'f' (figure), 't' (table) or 'st'
        (supplementary table), e.g. 'f1a|f1b'. None when the text is 'nfa',
        when nothing matches, or when a lettered interval cannot be expanded
        (multi-letter bounds, or a start letter after the end letter).

    The patterns are tried in a fixed order and the first match wins.
    """
    if fig_text == 'nfa':
        return None

    # strip out all parentheses.
    fig_text = re.sub(r"(\(.+?\))", "", fig_text)

    # convert & to 'and'.
    fig_text = fig_text.replace("&", "and")

    # Single-figure patterns: the figure code is always capture group 2.
    for p in patt_hash.get('figPatt'):
        match = re.search(p, fig_text)
        if match:
            return 'f' + match.group(2).replace(" ", "").replace(".", "").replace(",", "")

    # Patterns whose listed groups each hold one complete figure code.
    whole_code_patterns = (
        ('simplePatt', [1]),
        ('tableFigPatt', [3]),
        ('comma2Patt', [2, 3]),
        ('fullComma2Patt', [2, 4]),
        ('simpleComma2Patt', [1, 2]),
        ('comma3Patt', [2, 3, 4]),
        ('simpleComma3Patt', [1, 2, 3]),
        ('comma4Patt', [2, 3, 4, 5]),
        ('simpleComma4Patt', [1, 2, 3, 4]),
        ('comma5Patt', [2, 3, 4, 5, 6]),
        ('space2Patt', [2, 4]),
        ('space3Patt', [2, 4, 6]),
        ('simpleAnd2Patt', [1, 2]),
        ('and2Patt', [2, 3]),
    )
    for patt_code, groups in whole_code_patterns:
        matched_figs = run_simple_matcher(fig_text, patt_hash, patt_code, groups)
        if matched_figs is not None:
            return build_matched_string(matched_figs, 'f')

    # Patterns with one figure number shared by several sub-figure letters:
    # (pattern name, group of the number, groups of the letters).
    shared_number_patterns = (
        ('simple_a_comma_b_patt', 1, (2, 3)),
        ('a_comma_b_patt', 2, (3, 4)),
        ('simple_a_and_b_patt', 1, (2, 3)),
        ('a_and_b_patt', 2, (3, 4)),
        ('a_b_and_c_patt', 2, (3, 4, 5)),
        ('simple_a_b_and_c_patt', 1, (2, 3, 4)),
        ('simple_a_comma_b_comma_c_patt', 1, (2, 3, 4)),
        ('a_comma_b_comma_c_patt', 2, (3, 4, 5)),
    )
    for patt_code, number_group, letter_groups in shared_number_patterns:
        match = re.search(patt_hash.get(patt_code), fig_text)
        if match:
            number = match.group(number_group)
            return '|'.join('f' + number + match.group(g) for g in letter_groups)

    # Lettered interval such as 'fig 1a-c' -> 'f1a|f1b|f1c'.
    match = re.search(patt_hash.get('intervalPatt'), fig_text)
    if match:
        fig_number = match.group(2)
        start = match.group(3)
        end = match.group(4)
        if len(start) > 1 or len(end) > 1:
            return None
        subfigs = [chr(i) for i in range(ord(start), ord(end) + 1)]
        if not subfigs:
            # Reversed interval ('1c-a'): report "no match" rather than ''.
            return None
        return '|'.join('f' + fig_number + subfig for subfig in subfigs)

    # Table references: (pattern name, group of the table code, output prefix).
    for patt_code, groups, code in (('tablePatt', [2], 't'), ('suppTablePatt', [3], 'st')):
        matched_tab = run_simple_matcher(fig_text, patt_hash, patt_code, groups)
        if matched_tab is not None:
            return build_matched_string(matched_tab, code)

    return None

def extract_simple_intact_data(input, title, tsv_output):
    """Flatten one INTACT XML file into a tab-separated table.

    Reads the XML at `input`; if it contains no "figure legend" attribute the
    function returns without writing anything. Otherwise it writes one row per
    interaction to `tsv_output`, with `title` stored in the 'pmid' column.
    Cells for which no value was found hold '-'.

    Raises KeyError when a participant refers to an interactor id that is not
    present in the interactor list.
    """
    with open(input, 'r') as xml_file:
        xml = xml_file.read()

    # Check if the figure legends are specified
    if "\"figure legend\"" not in xml:
        return

    soup = BeautifulSoup(xml, 'lxml')

    intact_headings = ['pmid','i_id','orig_fig','fig','type','type_xref','p1_name',
                       'p1_xref','p1_site','p2_name','p2_xref','p2_site','p3_name',
                       'p3_xref','p3_site','i_meth','p_meth']

    patt_hash = build_figure_extraction_patterns()

    # EXPERIMENTS: experiment id -> detection / identification method labels
    all_expt_dict = {}
    for expt in soup.select('experimentlist experimentdescription'):
        all_expt_dict[expt.get('id')] = {
            'i_meth': expt.interactiondetectionmethod.names.shortlabel.text,
            'p_meth': expt.participantidentificationmethod.names.shortlabel.text,
        }

    # INTERACTORS: interactor id -> short label and uniprotkb identity xrefs
    # (primary references first, then secondary ones)
    uniprot_selectors = ('primaryref[db="uniprotkb"]', 'secondaryref[db="uniprotkb"]')
    all_int_dict = {}
    for interactor in soup.select('interactorlist interactor'):
        label = interactor.names.shortlabel.text
        identity_ids = [ref.get('id')
                        for selector in uniprot_selectors
                        for ref in interactor.select(selector)
                        if ref.get('reftype') == 'identity']
        all_int_dict[interactor.get('id')] = {'name': label, 'xref': identity_ids}

    # INTERACTIONS: one output row each
    intact_rows = []
    for interaction in soup.select('interactionlist interaction'):
        record = {
            'pmid': title,
            'i_id': interaction.get('id'),
            'type': interaction.interactiontype.names.shortlabel.text,
            'type_xref': interaction.interactiontype.xref.primaryref.get('id'),
        }

        participants = interaction.select('participantlist participant')
        for position, p_tag in enumerate(participants, 1):
            participant = all_int_dict[p_tag.interactorref.text]
            prefix = 'p' + str(position)
            record[prefix + "_name"] = participant.get('name')
            record[prefix + "_xref"] = '|'.join(participant.get('xref'))

        record['fig'] = '-'
        for attribute in interaction.select('attributelist attribute[name]'):
            if attribute.get('name') != "figure legend":
                continue
            legend = attribute.text.lower()
            fig_code = run_matcher(legend, patt_hash)
            if fig_code is None:
                # Report legends that none of the patterns could parse.
                print(legend + "  :  None")
            record['orig_fig'] = attribute.text
            record['fig'] = fig_code

        expt = all_expt_dict.get(interaction.experimentlist.experimentref.text)
        if expt is None:
            record['i_meth'] = '-'
            record['p_meth'] = '-'
        else:
            record['i_meth'] = expt.get('i_meth', '-')
            record['p_meth'] = expt.get('p_meth', '-')

        intact_rows.append([record.get(heading, '-') for heading in intact_headings])

    intact_df = pd.DataFrame.from_records(intact_rows, columns=intact_headings)
    intact_df.to_csv(tsv_output, sep='\t', encoding='utf-8')


Execution of code to simplify INTACT records from standard XML into TSV format.  

In [56]:
# Root directory of the INTACT corpus; the trailing '/' matters because the
# sub-directory names below are appended by plain string concatenation.
stem = '/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/'
# Directory of the INTACT XML records that are read.
intact_dir = stem + 'gold_standard/'
# Directory that receives the simplified TSV files.
simple_intact_dir = stem + 'simple_intact_files/'

print(simple_intact_dir)


/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/simple_intact_files/


In [48]:
# `glob` was used here without ever being imported in this notebook.
from glob import glob

# Convert every INTACT XML file whose name (minus '.xml') is a known PMID into
# a simplified TSV, skipping files that were already converted.
for dirpath, _dirnames, _filenames in os.walk(intact_dir):
    for infile in glob(os.path.join(dirpath, '*.xml')):
        if not os.path.isfile(infile):
            continue

        # Strip only the final extension (str.replace would remove every '.xml').
        title = os.path.splitext(os.path.basename(infile))[0]
        if title not in pmids:
            continue

        outfile = os.path.join(simple_intact_dir, title + ".tsv")
        if os.path.isfile(outfile):
            continue

        try:
            extract_simple_intact_data(infile, title, outfile)
        except KeyError:
            # Raised when a participant references an unknown interactor id.
            print("KeyError for " + infile)

Run the cells below to convert collections of PSI-MI 2.5 files to BioPAX. The conversion uses our updated version of PaxTools (github.com/BMKEG/Paxtools), which adds annotations about figures to the BioPAX evidence codes.

In [53]:
# Jar of the PaxTools console application that is invoked with `java -jar`.
paxtools_jar = "/Users/Gully/Coding/git/biopax/Paxtools/paxtools-console/target/paxtools.jar"

# Directory of the XML input files handed to PaxTools.
data_dir = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/gold_standard_data"
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
open_access_dir = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/oa_gold_data"
# Directory that receives the <pmid>_biopax.xml files produced by PaxTools.
biopax_dir = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/biopax"
# Directory that receives the BioPAX files with reformatted figure annotations.
new_biopax_dir = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/biopax_reformat"

In [None]:
# THIS RUNS THE UPDATED PAXTOOLS TO GENERATE BIOPAX 3 DATA FOR OUR USE.
for root, dirs, files in os.walk(data_dir):
    for file in files:
        xml_path = os.path.join(root, file)
        if not (os.path.isfile(xml_path) and file.endswith('.xml')):
            continue

        pmid = file[:-4]
        if pmid not in pmids:
            continue

        cmds = ["java", "-jar", paxtools_jar, "toLevel3", xml_path,
                os.path.join(biopax_dir, pmid + '_biopax.xml'), '-psimiToComplexes']
        # print() with a single argument behaves the same on Python 2 and 3.
        print(" ".join(cmds))
        status = call(cmds)
        # Surface a non-zero exit status instead of always reporting success.
        if status == 0:
            print("\tDONE")
        else:
            print("\tFAILED (exit code %d)" % status)

In [51]:
def reformat_figure_legend_annotations(input):
    """Rewrite the '>Figure:...<' annotations of a BioPAX file in canonical form.

    Parameters
    ----------
    input : str
        Path of the BioPAX XML file to read.

    Returns
    -------
    str or None
        The file content with every figure annotation that run_matcher() can
        parse replaced by its canonical code (e.g. '>Figure:f1a|f1b<');
        annotations that cannot be parsed are left untouched. None when the
        file contains no '>Figure:' annotation at all.
    """
    with open(input, 'r') as input_file:
        xml = input_file.read()

    if ">Figure:" not in xml:
        return

    patt_hash = build_figure_extraction_patterns()
    # '.' does not match a newline, so a match never spans more than one line.
    fig_patt = re.compile(">Figure:(.*?)<")

    def canonical_annotation(match):
        """Return the replacement text for one '>Figure:...<' match."""
        new_fig_text = run_matcher(match.group(1).lower(), patt_hash)
        if new_fig_text is None:
            return match.group(0)
        # The pattern consumed only the '<' that opens the closing tag, so
        # only that character is put back; emitting '</bp' here duplicated
        # the start of the closing tag and corrupted the XML.
        return ">Figure:" + new_fig_text + "<"

    return fig_patt.sub(canonical_annotation, xml)

# THIS FORMATS FIGURE ANNOTATIONS IN THE UPDATED BIOPAX 3 FILES.
for root, dirs, files in os.walk(biopax_dir):
    for file in files:
        biopax_path = root+'/'+file
        if not os.path.isfile(biopax_path):
            continue
        if not file.endswith('.xml'):
            continue

        # Load the BIOPAX 3 file and run the patterns on the text found in the XML.
        reformatted_text = reformat_figure_legend_annotations(biopax_path)
        if reformatted_text is None:
            # Nothing to write: the file has no figure annotations.
            continue

        with open(new_biopax_dir+'/'+file, 'w') as output_file:
            output_file.write(reformatted_text)
            

Code to copy the per-figure sentence files for every figure that is referenced by an INTACT record.

In [54]:
from shutil import copyfile

def copy_figure_files(intactFile, figAssigmentDir, outDir):
    """Copy the sentence files of every figure listed in a simplified INTACT TSV.

    Parameters
    ----------
    intactFile : str
        Tab-separated file with 'pmid' and 'fig' columns.
    figAssigmentDir : str
        Directory holding the per-figure sentence files '<pmid>_<fig>.txt'.
    outDir : str
        Directory that receives the copies (same file names).

    A 'fig' value may list several figure codes separated by '|'; each code is
    looked up on its own, since the sentence files hold one figure each.
    Codes without a matching file are skipped silently.
    """
    intact_tsv = pd.read_csv(intactFile, sep='\t')

    for _, row in intact_tsv.iterrows():
        pmid = str(row['pmid'])
        for fig in str(row['fig']).split('|'):
            file_name = pmid + '_' + fig + '.txt'
            src_file = os.path.join(figAssigmentDir, file_name)
            if os.path.isfile(src_file):
                copyfile(src_file, os.path.join(outDir, file_name))

In [57]:
# Source directory of the per-figure sentence files and destination for the
# subset whose figures appear in an INTACT record.
fig_sentences_dir = stem + 'fig_sentences'
out_sentences_dir = stem + 'fig_sentences_in_intact'

for root, dirs, files in os.walk(simple_intact_dir):
    for file in files:
        intact_path = root+'/'+file
        if not os.path.isfile(intact_path):
            continue
        if not file.endswith('.tsv'):
            continue
        copy_figure_files(intact_path, fig_sentences_dir, out_sentences_dir)

Code to link the intact files to the sciDt data.

This is derived from the simplified TSV-format generated above.  

In [None]:
def link_scidt_to_intact(intactFile, scidtDir, outFile):
    """Attach matching FRIES events from the SciDT tables to INTACT records.

    Parameters
    ----------
    intactFile : str
        Simplified INTACT TSV with 'pmid', 'fig', 'p1_xref', 'p2_xref' and
        'p3_xref' columns.
    scidtDir : str
        Directory holding one '<pmid>.tsv' per paper with 'friesEventsTypes'
        and 'Figure Assignment' columns.
    outFile : str
        Path of the TSV to write: the INTACT table plus a 'fries_events'
        column.

    For every INTACT row, each SciDT sentence assigned to the same figure whose
    event text contains 'complex-assembly' is recorded as '<event>[HIT]' when
    every participant xref is either '-' or occurs in the event text, and as
    '<event>[MISS]' otherwise (including when a participant xref is missing).
    The totals are printed at the end.

    NOTE(review): the figure comparison is an exact string match, so an INTACT
    'fig' value that lists several codes ('f1a|f1b') never matches — confirm
    whether such values should be split.
    """
    intact_tsv = pd.read_csv(intactFile, sep='\t')

    # Each paper's SciDT table is parsed once and reused for all of its rows.
    scidt_by_pmid = {}

    fries_events = []
    fries_count = 0
    hit_count = 0
    miss_count = 0
    for i, row in intact_tsv.iterrows():
        pmid = row['pmid']
        print(pmid)
        intact_fig = row['fig']
        participants = (row['p1_xref'], row['p2_xref'], row['p3_xref'])
        # x != x is true only for NaN, i.e. a participant cell that was empty.
        participant_missing = any(p != p for p in participants)

        fries_events_local = []

        if pmid not in scidt_by_pmid:
            scidt_path = os.path.join(scidtDir, str(pmid) + ".tsv")
            if os.path.isfile(scidt_path):
                scidt_by_pmid[pmid] = pd.read_csv(scidt_path, sep='\t')
            else:
                scidt_by_pmid[pmid] = None
        scidt_tsv = scidt_by_pmid[pmid]

        if scidt_tsv is not None:
            for i2, row2 in scidt_tsv.iterrows():
                fries_event = row2['friesEventsTypes']
                scidt_figs = row2['Figure Assignment']
                # Skip sentences without a figure assignment or without events.
                if scidt_figs != scidt_figs or fries_event != fries_event:
                    continue

                for scidt_fig in scidt_figs.split('|'):
                    if scidt_fig != intact_fig or 'complex-assembly' not in fries_event:
                        continue

                    fries_count += 1
                    if (not participant_missing and
                            all(p == '-' or p in fries_event for p in participants)):
                        hit = "HIT"
                        hit_count += 1
                    else:
                        hit = "MISS"
                        miss_count += 1
                    fries_events_local.append(fries_event + '[' + hit + ']')

        fries_events.append(fries_events_local)

    intact_tsv['fries_events'] = pd.Series(fries_events)

    intact_tsv.to_csv(outFile, sep='\t')
    print ("COUNT: %d" % fries_count)
    print ("HITS: %d" % hit_count)
    print ("MISSES: %d" % miss_count )

Run through Biopax entries. Load each file and search for evidence. Link that evidence to sentences via figure legends.

In [60]:
# Directory of the per-paper SciDT TSV files (<pmid>.tsv).
tsv_dir = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/scidt_fries_bioc_tsv4"
# Directory of the BioPAX files with reformatted figure annotations.
new_biopax_dir = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/biopax_reformat"

In [None]:
import uuid
import pandas as pd
import json

def generate_annotation_page(pmid, biopax_path, scidt_path):
    """Build a Web Annotation 'AnnotationPage' dict for one paper.

    Parameters
    ----------
    pmid : str
        Identifier of the paper; used in the page id, annotation ids and
        target source URL.
    biopax_path : str
        BioPAX XML file that is scanned line by line for
        '<bp:Evidence rdf:about="...">' elements and '>Figure:...<' text.
    scidt_path : str
        Tab-separated file with 'Sentence Text', 'Offset_Begin',
        'Offset_End' and 'Figure Assignment' columns.

    Returns
    -------
    dict
        The page, with one annotation per evidence element that carries a
        figure annotation. The annotation body is the evidence URI; its
        targets are the sentences assigned to any of the evidence's figure
        codes. Both sides are split on '|' and compared as whole codes, so
        'f1' no longer matches 'f10' or 'f1a'. Annotation ids are
        '<base>/<pmid>#<n>' with n counting from 0 within the page.
    """
    annotation_items = []
    annotation_page = {
        "@context": "http://www.w3.org/ns/anno.jsonld",
        "id": "http://sciknowengine.isi.edu/iswc17/annotation_page/"+pmid,
        "type": "AnnotationPage",
        "partOf": {
            "id": "http://sciknowengine.isi.edu/iswc17/annotations"
        },
        # Placeholder; the caller is expected to set the real successor page.
        "next": "http://example.org/page2",
        "startIndex": 0,
        "items": annotation_items
    }

    with open(biopax_path, 'r') as biopax_file:
        biopax_lines = biopax_file.readlines()

    scidt_tsv = pd.read_csv(scidt_path, sep='\t')

    # Index the figure-tagged sentences once instead of rescanning the table
    # for every evidence element.
    tagged_sentences = []
    for _, row in scidt_tsv.iterrows():
        fig = row['Figure Assignment']
        if fig != fig:
            # NaN: the sentence has no figure assignment.
            continue
        # str.split splits on the literal '|'; re.split("|", ...) does not,
        # because '|' is the regex alternation operator.
        tagged_sentences.append((set(fig.split('|')), row['Sentence Text'],
                                 row['Offset_Begin'], row['Offset_End']))

    evidence_patt = re.compile(r'<bp:Evidence rdf:about\="(.*?)">')
    figure_patt = re.compile(r">Figure:(.*?)<")

    # True from the opening line of an evidence element until its figure
    # annotation has been turned into an annotation.
    we_are_on = False
    evidence_code = ''
    figure_code = ''
    for biopax_line in biopax_lines:
        evidence_match = re.search(evidence_patt, biopax_line)
        if evidence_match:
            evidence_code = evidence_match.group(1)
            figure_code = ''
            we_are_on = True

        figure_match = re.search(figure_patt, biopax_line)
        if figure_match:
            figure_code = figure_match.group(1)

        if we_are_on and len(figure_code) > 0:
            figure_codes = set(figure_code.split('|'))
            targets = [
                {
                    "source": "https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/" + str(pmid),
                    "selector": [{
                            "type": "TextQuoteSelector",
                            "exact": text
                        },
                        {
                            "type": "TextPositionSelector",
                            "start": offset_start,
                            "end": offset_end
                        }]
                }
                for sentence_figs, text, offset_start, offset_end in tagged_sentences
                if sentence_figs & figure_codes
            ]
            annotation_items.append({
                # Numbered locally so ids are unique within the page and the
                # function does not depend on a module-level counter.
                "id": ("http://sciknowengine.isi.edu/iswc17/annotations/"
                       + pmid + '#' + str(len(annotation_items))),
                "type": "Annotation",
                "body": {
                    "id": evidence_code,
                    "type": "Dataset"
                },
                "target": targets
            })
            we_are_on = False

    return annotation_page

annotation_collection_path = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/annotation_collection.json"
annotation_pages_path = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/pages"

page = {}
annotation_collection = {
  "@context": "http://www.w3.org/ns/anno.jsonld",
  "id": "http://sciknowengine.isi.edu/iswc17/annotations",
  "type": "AnnotationCollection",
  "label": "Annotations linking BioPax records from the INTACT database to text fragments describing evidence",
  "total": 0,
  "first": page
}

# Build one annotation page per reformatted BioPAX file that has a SciDT TSV,
# chaining the pages through their 'next' links. Pages are written only after
# the whole chain is known; writing a page as soon as it was built stored it
# before its 'next' link had been set.
count = 0
annotation_pages = {}
last_annotation_page = None
biopax_suffix = '_biopax.xml'
for root, dirs, files in os.walk(new_biopax_dir):
    # Sorted so the page chain is the same on every run.
    for file in sorted(files):
        biopax_file = os.path.join(root, file)
        if not (os.path.isfile(biopax_file) and file.endswith(biopax_suffix)):
            continue

        pmid = file[:-len(biopax_suffix)]
        tsv_file = os.path.join(tsv_dir, str(pmid) + '.tsv')
        if not os.path.isfile(tsv_file):
            continue

        annotation_page = generate_annotation_page(pmid, biopax_file, tsv_file)
        count += 1

        if last_annotation_page is None:
            annotation_collection['first'] = annotation_page['id']
        else:
            last_annotation_page['next'] = annotation_page['id']

        annotation_pages[pmid] = annotation_page
        last_annotation_page = annotation_page

# The final page has no successor, so drop its placeholder 'next' link.
if last_annotation_page is not None:
    last_annotation_page.pop('next', None)

for pmid, annotation_page in annotation_pages.items():
    annotation_page_dump = json.dumps(annotation_page, sort_keys=True, indent=4, separators=(',', ': '))
    with open(os.path.join(annotation_pages_path, 'page_' + pmid + '.json'), 'w') as annotation_page_file:
        annotation_page_file.write(annotation_page_dump)

annotation_collection['total'] = count
annotation_collection_dump = json.dumps(annotation_collection, sort_keys=True, indent=4, separators=(',', ': '))
with open(annotation_collection_path, 'w') as annotation_collection_file:
    annotation_collection_file.write(annotation_collection_dump)

1
9
1
2
1
10
5
4
5
1
9
5
18
1
6
13
9
21
215
4
5
6
5
6
2
2
3
2
9
6
5
9
4
1
3
4
1
2
15
8
14
13
6
3
1
4
4
29
11
24
14
5
2
11
7
14
2
6
5
3
2
6
5
1
5
3
1
12
5
6
1
7
13
2
6
2
6
8
4
1

In [None]:
annotation_collection_path = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/annotation_collection.json"
annotation_pages_path = "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/pages"

# Write the collection document, then one JSON file per page held in
# `annotation_pages`, and finally echo the collection for inspection.
annotation_collection_dump = json.dumps(annotation_collection, sort_keys=True, indent=4, separators=(',', ': '))
with open(annotation_collection_path, 'w') as annotation_collection_file:
    annotation_collection_file.write(annotation_collection_dump)

for pmid in annotation_pages.keys():
    page = annotation_pages[pmid]
    annotation_page_dump = json.dumps(page, sort_keys=True, indent=4, separators=(',', ': '))
    with open(annotation_pages_path+'/page_'+pmid+'.json', 'w') as annotation_page_file:
        annotation_page_file.write(annotation_page_dump)

# print() with a single argument behaves the same on Python 2 and 3.
print(annotation_collection_dump)
            