# Converting between BioC xml and BioC json

This script converts a BioC XML collection to BioC JSON, or vice versa. It is adapted from the BioC-JSON tools of the NLM/NCBI BioNLP Research Group (`bioc_json.py`, downloaded on 2020-03-12 from https://github.com/ncbi-nlp/BioC-JSON).

Dependencies: PyBioC (available from https://github.com/2mh/PyBioC)

In [10]:
import json
import bioc
from bioc import biocxml

class BioC2JSON:
    """Turn BioC objects into plain dictionaries suitable for ``json.dump``.

    Each method handles one level of the BioC hierarchy. It reads only the
    attributes of the object it is given and delegates nested elements to
    the method for the level below. Keys are inserted in a fixed order so
    the serialized JSON has a stable layout.
    """

    def node(self, node):
        """Return the ``refid``/``role`` pair of a relation node."""
        return {'refid': node.refid, 'role': node.role}

    def relation(self, rel):
        """Return a dict with the relation's id, infons and converted nodes."""
        return {
            'id': rel.id,
            'infons': rel.infons,
            'nodes': [self.node(n) for n in rel.nodes],
        }

    def location(self, loc):
        """Return the location's offset and length, both coerced to ``int``."""
        return {'offset': int(loc.offset), 'length': int(loc.length)}

    def annotation(self, note):
        """Return a dict with the annotation's id, infons, text and locations."""
        return {
            'id': note.id,
            'infons': note.infons,
            'text': note.text,
            'locations': [self.location(loc) for loc in note.locations],
        }

    def sentence(self, sent):
        """Return a dict for a sentence, including its annotations/relations.

        The offset is coerced to ``int``; the text is passed through as is.
        """
        return {
            'infons': sent.infons,
            'offset': int(sent.offset),
            'text': sent.text,
            'annotations': [self.annotation(a) for a in sent.annotations],
            'relations': [self.relation(r) for r in sent.relations],
        }

    def passage(self, psg):
        """Return a dict for a passage and everything nested inside it.

        The offset is coerced to ``int``. A falsy passage text (``None`` or
        empty) is emitted as the empty string.
        """
        return {
            'infons': psg.infons,
            'offset': int(psg.offset),
            # Falsy text is normalised to "" so the key never holds null.
            'text': psg.text if psg.text else "",
            'sentences': [self.sentence(s) for s in psg.sentences],
            'annotations': [self.annotation(a) for a in psg.annotations],
            'relations': [self.relation(r) for r in psg.relations],
        }

    def document(self, doc):
        """Return a dict with the document's id, infons, passages, relations."""
        return {
            'id': doc.id,
            'infons': doc.infons,
            'passages': [self.passage(p) for p in doc.passages],
            'relations': [self.relation(r) for r in doc.relations],
        }

    def collection(self, collection):
        """Return a dict for a whole collection; entry point for conversion."""
        return {
            'source': collection.source,
            'date': collection.date,
            'key': collection.key,
            'infons': collection.infons,
            'documents': [self.document(d) for d in collection.documents],
        }

class JSON2BioC:
    """Rebuild BioC objects from the dictionaries of a parsed BioC JSON file.

    Each method handles one level of the hierarchy: it creates the matching
    ``bioc`` object with no constructor arguments and then fills in its
    attributes from the given dict, delegating nested lists to the method
    for the level below.

    NOTE(review): the no-argument constructors follow the PyBioC API named
    in the notebook header; confirm they match the installed ``bioc`` module.
    """

    def node(self, json_node):
        """Return a ``BioCNode`` carrying the dict's ``refid`` and ``role``."""
        node = bioc.BioCNode()
        node.refid = json_node['refid']
        node.role = json_node['role']
        return node

    def relation(self, json_rel):
        """Return a ``BioCRelation`` with id, infons and converted nodes."""
        rel = bioc.BioCRelation()
        rel.id = json_rel['id']
        rel.infons = json_rel['infons']
        rel.nodes = [self.node(n) for n in json_rel['nodes']]
        return rel

    def location(self, json_loc):
        """Return a ``BioCLocation``; offset and length are stored as ``str``."""
        loc = bioc.BioCLocation()
        loc.offset = str(json_loc['offset'])
        loc.length = str(json_loc['length'])
        return loc

    def annotation(self, json_note):
        """Return a ``BioCAnnotation`` with id, infons, text and locations."""
        note = bioc.BioCAnnotation()
        note.id = json_note['id']
        note.infons = json_note['infons']
        note.text = json_note['text']
        note.locations = [self.location(loc)
                          for loc in json_note['locations']]
        return note

    def sentence(self, json_sent):
        """Return a ``BioCSentence``; the offset is stored as ``str``."""
        sent = bioc.BioCSentence()
        sent.infons = json_sent['infons']
        sent.offset = str(json_sent['offset'])
        sent.text = json_sent['text']
        sent.annotations = [self.annotation(a)
                            for a in json_sent['annotations']]
        sent.relations = [self.relation(r)
                          for r in json_sent['relations']]
        return sent

    def passage(self, json_psg):
        """Return a ``BioCPassage``; the offset is stored as ``str``.

        ``text`` is the only key read with ``dict.get``, so a passage dict
        without it yields ``text = None`` instead of raising ``KeyError``.
        """
        psg = bioc.BioCPassage()
        psg.infons = json_psg['infons']
        psg.offset = str(json_psg['offset'])
        psg.text = json_psg.get('text')
        psg.sentences = [self.sentence(s)
                         for s in json_psg['sentences']]
        psg.annotations = [self.annotation(a)
                           for a in json_psg['annotations']]
        psg.relations = [self.relation(r)
                         for r in json_psg['relations']]
        return psg

    def document(self, json_doc):
        """Return a ``BioCDocument`` with id, infons, passages and relations."""
        doc = bioc.BioCDocument()
        doc.id = json_doc['id']
        doc.infons = json_doc['infons']
        doc.passages = [self.passage(p)
                        for p in json_doc['passages']]
        doc.relations = [self.relation(r)
                         for r in json_doc['relations']]
        return doc

    def collection(self, json_collection):
        """Return a ``BioCCollection``; entry point for a whole JSON file."""
        collection = bioc.BioCCollection()
        collection.source = json_collection['source']
        collection.date = json_collection['date']
        collection.key = json_collection['key']
        collection.infons = json_collection['infons']
        collection.documents = [self.document(d)
                                for d in json_collection['documents']]
        return collection

# Conversion settings.
#   "-j": read BioC XML from in_file and write BioC JSON to out_file
#   "-b": read BioC JSON from in_file and write BioC XML to out_file
option = "-j"
in_file = "../../../data/gold_standard_xml/goldstandard2_20220203.xml"  # path to input file
out_file = "../../../results/supple.json"  # path to output file

if option == '-j':
    # XML -> JSON. The reader is closed as soon as the collection is loaded.
    with open(in_file, "r") as reader:
        collection = bioc.load(reader)
    bioc2json = BioC2JSON()
    bioc_json = bioc2json.collection(collection)
    with open(out_file, 'w') as f:
        json.dump(bioc_json, f, indent=2)
        print(file=f)  # terminate the file with a newline

elif option == '-b':
    # JSON -> XML. JSON is read as UTF-8 regardless of the platform default.
    with open(in_file, encoding="utf-8") as f:
        bioc_json = json.load(f)

    json2bioc = JSON2BioC()
    bioc_collection = json2bioc.collection(bioc_json)

    # NOTE(review): BioCWriter(filename, collection).write() is the PyBioC
    # API; confirm it is provided by the installed `bioc` module.
    writer = bioc.BioCWriter(out_file, bioc_collection)
    writer.write()

else:
    # Fail loudly instead of silently converting nothing.
    raise ValueError(
        f"Unsupported option {option!r}: use '-j' (XML to JSON) "
        "or '-b' (JSON to XML)"
    )
        

In [11]:
# Display the collection parsed from the XML input. `collection` is only
# assigned by the option == '-j' branch of the conversion cell above.
print(collection)

BioCCollection[source=Pubmed,date=20200306,key=,infons=[BQ_URL=http://www.bioqrator.org,BQ_NAME=Corona_gold],documents=[BioCDocument[id=31991541,infons=[BQ_FROM=PubMed,BQ_PIESCORE=-0.4251,BQ_DONE=NO,BQ_CURATABLE=YES],passages=[BioCPassage[offset=0,text='Return of the Coronavirus: 2019-nCoV. ',infons=[type=title],sentences=[],annotations=[BioCAnnotation[id=A01,text='2019-nCoV',infons=[representative_name=SARS-CoV-2,type=Virus_SARS-CoV-2,NCBI:txid=2697049],locations=[BioCLocation[offset=27,length=9]],],BioCAnnotation[id=A02,text='Coronavirus',infons=[representative_name=coronavirus,type=Virus_family,NCBI:txid=693995],locations=[BioCLocation[offset=14,length=11]],]],relations=[],],BioCPassage[offset=38,text='The emergence of  ...  2B coronavirus. ',infons=[type=abstract],sentences=[],annotations=[BioCAnnotation[id=A03,text='SARS-CoV',infons=[type=Virus_other,NCBI:txid=694009 ],locations=[BioCLocation[offset=114,length=8]],],BioCAnnotation[id=A04,text='most recent emerg ... up 2B coronavir