An initial step is to evaluate the notes we have worked with. We measured the number of tokens, the number of entities, and the relationship between the two, in order to interpret the notes correctly and choose an adequate approach.

The images [Frequency_tokens_entities](./Frequency_tokens_entities.png) and [Relation_numtokens_entities](./Relation_numtokens_entities.png) show examples of the visualizations made for the initial understanding of the notes.

In [None]:
import os
import spacy
from database import load_dataset
from dotenv import load_dotenv
from pprint import pprint
from utils import get_number_tokens

In [None]:
# Load variables from a .env file into the process environment so the
# os.getenv() lookups in the next cell can resolve them.
load_dotenv()

In [None]:
# Paths are read from the environment. os.getenv returns None when a variable
# is unset, so a missing entry only surfaces later, when the path is used.
CORPUS_PATH = os.getenv("CORPUS_CLINICAL_PATH")  # input corpus, passed to load_dataset
MODEL_PATH = os.getenv("MODEL_CLINICAL_PATH")  # spaCy pipeline, passed to spacy.load
CORPUS_PATH_OUT = os.getenv("CORPUS_CLINICAL_FILTERED_PATH")  # passed to save_datasets / load_datasets

# Load Corpus

In [None]:
# Load the spaCy pipeline that the later cells of this notebook use.
MODEL = spacy.load(MODEL_PATH)

# Entity labels of the pipeline's "ner" component, taken from the pipeline info.
ENTS = spacy.info(MODEL_PATH)['labels']['ner']

In [None]:
# Show how many entity labels the pipeline defines, then list them.
print(len(ENTS))
pprint(ENTS)

In [None]:
# Load the corpus with the project helper and report its size.
# NOTE(review): `docs` is presumably a list of spaCy Doc objects — confirm in database.load_dataset.
docs = load_dataset(MODEL, CORPUS_PATH)
print("Number of notes:", len(docs))
print("Number of tokens:", get_number_tokens(docs))

## Preprocessing

Convert from notes to sentences

In [None]:
from spacy.tokens import Span
from tqdm import tqdm

In [None]:
def gen_doc(model, sent, span_list):
    """Build a standalone Doc for one sentence and carry its entities over.

    Args:
        model: spaCy pipeline; it is called on ``sent.text`` to create the
            new Doc.
        sent: Sentence ``Span`` taken from the original document.
        span_list: Entity spans of the original document; their ``start`` and
            ``end`` are token offsets in that document.

    Returns:
        The new Doc. Its ``ents`` are the spans of ``span_list`` that lie
        entirely inside ``sent``, shifted so that their offsets are relative
        to the start of the sentence.
    """
    doc = model(sent.text)
    sent_start, sent_end = sent.start, sent.end

    new_ents = []
    for span in span_list:
        # Keep only entities fully contained in the sentence. An entity that
        # starts inside the sentence but ends after it would otherwise yield
        # an end offset beyond the sentence's tokens.
        if span.start < sent_start or span.end > sent_end:
            continue

        # Re-anchor the entity on the sentence-level Doc.
        new_ents.append(
            Span(
                doc,
                span.start - sent_start,
                span.end - sent_start,
                label=span.label_,
            )
        )

    doc.ents = new_ents

    return doc

In [None]:
# Register the rule-based sentence splitter used by split_in_sentences.
# add_pipe raises if a component with the same name is already in the
# pipeline, so reuse the existing one when this cell is executed again.
if MODEL.has_pipe("sentencizer"):
    sentencizer = MODEL.get_pipe("sentencizer")
else:
    sentencizer = MODEL.add_pipe("sentencizer")

In [None]:
def split_in_sentences(docs):
    """Split every note into sentences and return one Doc per sentence.

    Args:
        docs: Sized collection of spaCy Docs; ``len(docs)`` is used as the
            progress-bar total.

    Returns:
        A list with one Doc per sentence (built by ``gen_doc``), in document
        order and, within a document, in sentence order.
    """
    docs_sents = []
    for doc in tqdm(sentencizer.pipe(docs, batch_size=250), total=len(docs)):
        for sent in doc.sents:
            # `sent` is already a Span, so its `.ents` gives the entities that
            # fall inside the sentence without re-slicing the document.
            docs_sents.append(gen_doc(MODEL, sent, sent.ents))

    # Without this return the collected sentence Docs were discarded.
    return docs_sents

# EDA 

In [None]:
from spacy import displacy
from utils import get_tokens_ents
from visualize import visualize_ent_scatter, visualize_distrib_outliers

In [None]:
# Map each entity label to the result of get_tokens_ents over all notes.
# NOTE(review): each value is unpacked further down as a (tokens, ents) pair —
# presumably per-note token counts and entity counts; confirm in utils.
ents_counter = { ent : get_tokens_ents(docs, ent) for ent in ENTS }

In [None]:
# Draw one scatter plot per entity label from its (tokens, ents) pair.
for ent, (tokens, ents) in ents_counter.items():
    visualize_ent_scatter(tokens, ents, ent)

In [None]:
# Draw one distribution/outlier plot per entity label from its (tokens, ents) pair.
for ent, (tokens, ents) in ents_counter.items():
    visualize_distrib_outliers(tokens, ents, ent)

In [None]:
from visualize import box_plot

# Box plot of the token counts, taken from the first label's entry.
# NOTE(review): presumably the token counts are identical for every label, so
# any entry would do — confirm against utils.get_tokens_ents.
tokens, ents = ents_counter[list(ents_counter.keys())[0]]
box_plot(tokens, "Number of Tokens")


# Filter by number of tokens

In [None]:
# Upper bound on the token count of a note for it to be kept.
MAX_TOKENS_PER_NOTE = 250

# Keep only the notes whose token count does not exceed the bound.
docs_good = [d for d in docs if get_number_tokens([d]) <= MAX_TOKENS_PER_NOTE]

In [None]:
# Report how many notes passed the token-count filter.
print("Number of good docs:", len(docs_good))

# Dump docs

In [None]:
from utils import calculate_and_categorize_entities, select_entity_results, save_datasets

# Categorize the filtered notes once, across all entity labels.
notes_with_1_ents, notes_with_morethan1_ents, notes_without_ents = (
    calculate_and_categorize_entities(docs_good, ENTS)
)

# For every label, pick its three subsets and write them to the output path.
for label in ENTS:
    ent_0, ent_1, ent_morethan1 = select_entity_results(
        docs_good,
        label,
        notes_with_1_ents,
        notes_with_morethan1_ents,
        notes_without_ents,
    )
    save_datasets(
        {"eq0_ents": ent_0, "eq1_ents": ent_1, "gt1_ents": ent_morethan1},
        CORPUS_PATH_OUT,
        label,
    )


# Test load

In [None]:
from utils import load_datasets

# Sanity check: reload what was saved for one label and print the size of the result.
# NOTE(review): "CANCER_CONCEPT" is assumed to be one of the labels in ENTS — confirm.
dic_docs = load_datasets(MODEL, CORPUS_PATH_OUT, "CANCER_CONCEPT")
print(len(dic_docs))