In [None]:
import json
import requests
import os
import tempfile
import time
from doccano_api_client import DoccanoClient

In [None]:
# --- Notebook configuration -------------------------------------------------

# Doccano project that receives the uploaded documents.
DOCCANO_PROJECT_ID = 12

# Value matched against the `cord_uid` field in the search query.
CORD_UUID = '7ots8npg'

# Passed as the `size` parameter of the search request.
MAX_RESULTS = 10000

# Fields read from every search hit; each field name doubles as the label name.
TAG_TYPES = [
    'UMLS',
    'GGP',
    'SO',
    'TAXON',
    'CHEBI',
    'GO',
    'CL',
    'DNA',
    'CELL_TYPE',
    'CELL_LINE',
    'RNA',
    'PROTEIN',
    'DISEASE',
    'CHEMICAL',
    'CANCER',
    'ORGAN',
    'TISSUE',
    'ORGANISM',
    'CELL',
    'AMINO_ACID',
    'GENE_OR_GENE_PRODUCT',
    'SIMPLE_CHEMICAL',
    'ANATOMICAL_SYSTEM',
    'IMMATERIAL_ANATOMICAL_ENTITY',
    'MULTI-TISSUE_STRUCTURE',
    'DEVELOPING_ANATOMICAL_STRUCTURE',
    'ORGANISM_SUBDIVISION',
    'CELLULAR_COMPONENT',
    'PATHOLOGICAL_FORMATION',
    'ORGANISM_SUBSTANCE',
]

# False: all sentences are joined into one document.
# True: every sentence becomes its own document.
SPLIT_SENTENCES = False


## Get sentences from Elasticsearch

In [None]:
# Fetch up to MAX_RESULTS sentence documents whose `cord_uid` matches CORD_UUID.
# requests has no default timeout, so one is set explicitly to keep the cell
# from hanging forever if the search server does not answer.
r = requests.get(
    'http://search.coronawhy.org/v9sentences/_search?q=cord_uid:{}&size={}'.format(CORD_UUID, MAX_RESULTS),
    timeout=60,
)
# Fail here, with the HTTP status, rather than later with a KeyError on 'hits'.
r.raise_for_status()
response = r.json()

# Number of sentences returned (displayed as the cell output).
len(response['hits']['hits'])

## Label sentences

Depending on the `SPLIT_SENTENCES` option, the sentences are either kept separate or joined into a single text. They are stored as one or more JSON strings in the `results` variable, to be uploaded later.

In [None]:
# Build the documents to upload.
#   results         -- JSON strings, one per document, consumed by the upload cell
#   texts           -- sentences collected for the joined document (joined mode only)
#   labels          -- [start, end, tag_type] spans of the document being built
#   previous_length -- offset of the current sentence inside the joined text
results = []
texts = []
labels = []
previous_length = 0
# The same span can be produced more than once, which would cause a constraint
# error on doccano's side, so every emitted (start, end, tag_type) is remembered
# and applied only once. A set keeps the membership test constant-time.
done_labels = set()

for hit in response['hits']['hits']:
    source = hit['_source']
    sentence = source['sentence']
    if SPLIT_SENTENCES:
        # Every sentence is its own document with offsets relative to that
        # sentence, so both the labels and the duplicate tracker start fresh.
        # (Keeping the tracker across sentences would drop a valid label just
        # because an earlier sentence had one at the same offsets.)
        labels = []
        done_labels = set()

    for tag_type in TAG_TYPES:
        # A missing or null field is treated as "no tags of this type".
        tags = source.get(tag_type) or []
        for tag in tags:
            # Empty lists are returned as the string '[]' by ES (iterated here
            # character by character), and some lists contain just punctuation
            # symbols. Empty strings are skipped too: find('') would match at 0.
            if len(tag) <= 1:
                continue

            # Only the first occurrence of the tag in the sentence is labelled.
            pos = sentence.find(tag)
            if pos == -1:
                continue

            if not SPLIT_SENTENCES:
                # Shift by the length of the previous sentences to get the
                # position inside the joined text.
                pos += previous_length

            span = (pos, pos + len(tag), tag_type)
            if span not in done_labels:
                labels.append(list(span))
                done_labels.add(span)

    if SPLIT_SENTENCES:
        results.append(json.dumps({"text": sentence, "labels": labels}))
    else:
        # endswith() is safe on an empty sentence, unlike sentence[-1].
        if sentence.endswith('.'):
            sentence = sentence[:-1]

        previous_length += len(sentence) + 2  # Sentences will be joined with '. '
        texts.append(sentence)

if not SPLIT_SENTENCES:
    results = [json.dumps({"text": '. '.join(texts), "labels": labels})]

## Doccano connection and file upload

In [None]:
# Credentials are read from the environment so that real secrets never have to
# be typed into (and saved with) the notebook. The original placeholder values
# remain as defaults when the variables are not set.
doccano_client = DoccanoClient(
  'http://doccano.labs.coronawhy.org',
  os.environ.get('DOCCANO_USERNAME', 'login'),
  os.environ.get('DOCCANO_PASSWORD', 'pass')
)

In [None]:
# Write the documents to a temporary file and upload it to the doccano project.
with tempfile.NamedTemporaryFile(suffix='.json') as tmp:
    # post_doc_upload() takes the file name and its directory separately.
    directory, filename = os.path.split(tmp.name)

    # One JSON object per line: objects written back to back with no separator
    # cannot be parsed once there is more than one document.
    for result in results:
        tmp.write((result + '\n').encode('UTF-8'))
    # The file is handed over by path, so the buffered data must reach disk
    # before the upload call.
    tmp.flush()
    response_upload = doccano_client.post_doc_upload(DOCCANO_PROJECT_ID, 'json', filename, directory)

if response_upload.status_code >= 400:
    print('Error {}:{}'.format(response_upload.status_code, response_upload.text))
else:
    # len(results) instead of the loop index, which is undefined when
    # `results` is empty.
    print("Done {} texts\n".format(len(results)))