In [None]:
import json
import requests
import os
import tempfile
import time
from doccano_api_client import DoccanoClient

In [None]:
# Notebook configuration: target doccano project, the paper to export, and the
# search page size.
DOCCANO_PROJECT_ID = 8
CORD_UUID = '7ots8npg'
MAX_RESULTS = 10000

# Names of the per-sentence tag fields read from each search hit; the same name
# is used as the label type attached to every span found for that field.
TAG_TYPES = [
    'UMLS',
    'GGP',
    'SO',
    'TAXON',
    'CHEBI',
    'GO',
    'CL',
    'DNA',
    'CELL_TYPE',
    'CELL_LINE',
    'RNA',
    'PROTEIN',
    'DISEASE',
    'CHEMICAL',
    'CANCER',
    'ORGAN',
    'TISSUE',
    'ORGANISM',
    'CELL',
    'AMINO_ACID',
    'GENE_OR_GENE_PRODUCT',
    'SIMPLE_CHEMICAL',
    'ANATOMICAL_SYSTEM',
    'IMMATERIAL_ANATOMICAL_ENTITY',
    'MULTI-TISSUE_STRUCTURE',
    'DEVELOPING_ANATOMICAL_STRUCTURE',
    'ORGANISM_SUBDIVISION',
    'CELLULAR_COMPONENT',
    'PATHOLOGICAL_FORMATION',
    'ORGANISM_SUBSTANCE',
]

In [None]:
# Fetch up to MAX_RESULTS sentence documents belonging to the configured paper.
# The query goes through `params` so that requests URL-encodes it instead of it
# being spliced into the URL by hand.
r = requests.get(
    'http://search.coronawhy.org/v9sentences/_search',
    params={'q': 'cord_uid:{}'.format(CORD_UUID), 'size': MAX_RESULTS},
    timeout=60,  # do not let the cell hang forever on an unresponsive server
)
# Surface an HTTP error here rather than as a missing 'hits' key further down.
r.raise_for_status()
response = r.json()
# Cell output: number of hits returned.
len(response['hits']['hits'])

In [None]:
# Convert every search hit into a document for upload: the sentence text plus
# [start, end, tag_type] character spans for each tag found in the sentence.
results = []

for hit in response['hits']['hits']:
    source = hit['_source']
    sentence = source['sentence']
    labels = []
    for tag_type in TAG_TYPES:
        # A hit that lacks this tag field (or has it set to null) simply
        # contributes no labels of that type instead of raising KeyError.
        tags = source.get(tag_type) or []
        for tag in tags:
            # empty lists are returned '[]' as a string by ES, and some lists
            # contain just punctuation symbols; iterating such values yields
            # single characters. Empty strings are skipped too, because
            # find('') would match at offset 0 and create a zero-length span.
            if len(tag) <= 1:
                continue
            # NOTE: only the first occurrence of the tag text is labelled.
            pos = sentence.find(tag)
            if pos == -1:
                continue
            label = [pos, pos + len(tag), tag_type]
            # The same tag listed twice resolves to the same span; keep one.
            if label not in labels:
                labels.append(label)
    results.append({'text': sentence, 'labels': labels})

In [None]:
# Create the doccano API client. Credentials are read from the environment so
# real values never have to be saved in the notebook; the defaults are the
# original placeholder strings.
doccano_client = DoccanoClient(
  'http://doccano.labs.coronawhy.org',
  os.environ.get('DOCCANO_USERNAME', 'login'),
  os.environ.get('DOCCANO_PASSWORD', 'pass')
)

In [None]:
# Upload each document to the doccano project as its own temporary JSON file.
for result in results:
    with tempfile.NamedTemporaryFile(suffix='.json') as tmp:
        # post_doc_upload is given the file name and its directory separately.
        directory, filename = os.path.split(tmp.name)
        # json.dumps yields valid JSON even when the sentence contains quotes
        # or apostrophes; swapping quote characters in str(result) did not.
        tmp.write(json.dumps(result, ensure_ascii=False).encode('UTF-8'))
        # The client receives a path rather than this handle, so make sure the
        # bytes are on disk before it is called.
        tmp.flush()
        # Brief pause between uploads (presumably to go easy on the server).
        time.sleep(.1)
        response_upload = doccano_client.post_doc_upload(DOCCANO_PROJECT_ID, 'json', filename, directory)
        if response_upload.status_code >= 400:
            print('Error {}:{}'.format(response_upload.status_code, response_upload.text))