In [35]:
# Reload the autoreload extension and use mode 2, which re-imports modules
# before executing each cell so edits to local .py files are picked up.
%reload_ext autoreload
%autoreload 2
# Turn off jedi in the IPython completer.
%config Completer.use_jedi = False
# Reload the lab_black extension (presumably formats cells with black — confirm).
%reload_ext lab_black

# Please run super_pipe inference on `BRAT_DIR`, using the sbatch file `/export/home/cse200093/Jacques_Bio/super_pipe/py_files/sbatch/main.sh`. The results path should be `RES_DIR`.

In [5]:
import pandas as pd
from os.path import isfile, isdir, join, basename
from os import listdir
import spacy
from edsnlp.processing import pipe
from joblib import Parallel, delayed

# Directory whose ".txt" files are read as the documents to analyse.
BRAT_DIR = "/export/home/cse200093/Jacques_Bio/data_bio/super_pipe_get_stats_by_section"
# Path of the JSON results file that is loaded with `pd.read_json`.
# NOTE(review): despite the "_DIR" suffix this points to a file, not a directory.
RES_DIR = "/export/home/cse200093/Jacques_Bio/data_bio/super_pipe_res/stats_by_section/res.json"

# Two-level mapping: disease name -> {biological test name -> list of CUIs}.
# A result row counts for a test when its labels share at least one CUI with
# the list given here. The same test may appear under several diseases.
# NOTE(review): the CUIs look like UMLS concept identifiers — confirm.
TO_BE_MATCHED = {
    "Lupus": {
        "Facteur anti-nucléaire": ["C0587178"],
        "Anti-DNA natif": ["C1262035"],
        "Anti-Sm": ["C0201357"],
        "Hémoglobine": ["C0518015"],
        "CRP": ["C0201657"],
        "Créatininémie": ["C0201975"],
        "DFG": ["C0017654"],
    },
    "Sclérodermie": {
        "Anti-RNA pol 3": ["C1295034"],
        "Anti-SCL 70": ["C0523317"],
        "Hémoglobine": ["C0518015"],
        "CRP": ["C0201657"],
        "Créatininémie": ["C0201975"],
        "DFG": ["C0017654"],
    },
    "Takayasu": {
        "CRP": ["C0201657"],
        "Créatininémie": ["C0201975"],
        "DFG": ["C0017654"],
        "Hémoglobine": ["C0518015"],
    },
    "SAPL": {
        "INR": ["C0525032"],
        "Hémoglobine": ["C0518015"],
        "CRP": ["C0201657"],
        "Créatininémie": ["C0201975"],
    },
}

# Flatten TO_BE_MATCHED into the set of every CUI listed for any disease / test.
CUIS_OF_INTEREST = {
    cui
    for tests_by_name in TO_BE_MATCHED.values()
    for cui_list in tests_by_name.values()
    for cui in cui_list
}

In [3]:
# SHOW DATASETS
# NOTE(review): `sql` is neither defined nor imported in the visible cells —
# presumably it is injected by the kernel (e.g. a Spark SQL helper); confirm
# before relying on a fresh "Restart & Run All".
# Select the database, then list its tables (first 10 rows; the second
# argument of `show` presumably disables value truncation — confirm).
sql("USE cse_200093_20210402")
sql("SHOW tables").show(10, False)

+-------------------+-------------------------+-----------+
|database           |tableName                |isTemporary|
+-------------------+-------------------------+-----------+
|cse_200093_20210402|i2b2_concept             |false      |
|cse_200093_20210402|i2b2_observation_ccam    |false      |
|cse_200093_20210402|i2b2_observation_cim10   |false      |
|cse_200093_20210402|i2b2_observation_doc     |false      |
|cse_200093_20210402|i2b2_observation_ghm     |false      |
|cse_200093_20210402|i2b2_observation_lab     |false      |
|cse_200093_20210402|i2b2_observation_med     |false      |
|cse_200093_20210402|i2b2_observation_microbio|false      |
|cse_200093_20210402|i2b2_observation_pacs    |false      |
|cse_200093_20210402|i2b2_observation_physio  |false      |
+-------------------+-------------------------+-----------+
only showing top 10 rows



In [2]:
# Read the results file into a DataFrame
res_df = pd.read_json(RES_DIR)

# Keep only the rows whose "label" entries share at least one CUI with
# CUIS_OF_INTEREST
n_cuis_of_interest = res_df["label"].apply(
    lambda cuis: len(CUIS_OF_INTEREST.intersection(cuis))
)
res_df = res_df.loc[n_cuis_of_interest > 0]

# Build a blank "eds" spaCy pipeline holding only the components used for
# section detection, added in this order
nlp_sections = spacy.blank("eds")
for component_name in ("eds.normalizer", "eds.sections"):
    nlp_sections.add_pipe(component_name)

# Load txt files in DataFrame
# Every regular ".txt" file of BRAT_DIR becomes one row: its full text plus a
# note id made of the same file name with an ".ann" extension.
txt_files = [
    f for f in listdir(BRAT_DIR) if f.endswith(".txt") and isfile(join(BRAT_DIR, f))
]
txt_list = []
for txt_file in txt_files:
    # Decode explicitly instead of relying on the locale default: the notes
    # contain accented characters and their character offsets are compared
    # with result spans later on.
    # NOTE(review): brat stores documents as UTF-8 — confirm these files do too.
    with open(join(BRAT_DIR, txt_file), "r", encoding="utf-8") as file:
        text = file.read()
    # "<name>.txt" -> "<name>.ann"
    txt_list.append([text, txt_file[:-3] + "ann"])
txt_df = pd.DataFrame(txt_list, columns=["note_text", "note_id"])

# Run the section-detection pipeline over every note; `txt_df` is replaced by
# the frame returned by `pipe`, minus two columns that are not used afterwards
sections_df = pipe(
    note=txt_df,
    nlp=nlp_sections,
    n_jobs=-2,
    additional_spans=["sections"],
)
txt_df = sections_df.drop(columns=["span_type", "lexical_variant"])

txt_df

[Parallel(n_jobs=-2)]: Using backend MultiprocessingBackend with 4 concurrent workers.
[Parallel(n_jobs=-2)]: Batch computation too fast (0.1113s.) Setting batch_size=2.
[Parallel(n_jobs=-2)]: Done   3 out of   3 | elapsed:    2.8s remaining:    0.0s
[Parallel(n_jobs=-2)]: Done   3 out of   3 | elapsed:    2.8s finished


Unnamed: 0,note_id,label,start,end
0,crs3115_CIM10_6374508586831126195.ann,introduction,0,59
1,crs3115_CIM10_6374508586831126195.ann,motif,59,302
2,crs3115_CIM10_6374508586831126195.ann,habitus,302,596
3,crs3115_CIM10_6374508586831126195.ann,vaccinations,596,741
4,crs3115_CIM10_6374508586831126195.ann,antécédents,741,753
...,...,...,...,...
1457,crs2343_CIM10_4137602884760253.ann,données biométriques entrée,1456,2734
1458,crs2343_CIM10_4137602884760253.ann,examens complémentaires,2734,3901
1459,crs2343_CIM10_4137602884760253.ann,examens complémentaires,3901,4180
1460,crs2343_CIM10_4137602884760253.ann,conclusion,4180,4189


In [3]:
# Process-based joblib executor used below to link each section of each file
# to the results found inside it
parallelizer = Parallel(
    backend="multiprocessing",
    prefer="processes",
    n_jobs=-1,
    verbose=20,
)


def is_overlapping(a, b):
    """Tell whether segments `a` and `b` share a stretch of non-zero length.

    Both arguments are indexed as `[start, end]`. Segments that merely touch
    at a boundary, or an empty segment, are not considered overlapping.
    """
    earliest_end = min(a[1], b[1])
    latest_start = max(a[0], b[0])
    return earliest_end > latest_start


def find_section(source, section_span):
    """Collect, per disease and test, the terms found inside one section.

    Args:
        source: value matched against the "source" column of `res_df`.
        section_span: `[start, end]` of the section.

    Returns:
        A dict with one entry per key of `TO_BE_MATCHED`; each entry maps a
        test name to the list of "term" values (in row order) whose span
        overlaps the section and whose labels share a CUI with that test.
        Tests without any term are omitted, so an entry may be an empty dict.
    """
    source_rows = res_df.loc[res_df["source"] == source]
    # (term, CUI set) of every result overlapping the section, in row order
    hits = [
        (term, set(label))
        for term, label, label_span in source_rows[
            ["term", "label", "span_converted"]
        ].itertuples(index=False, name=None)
        if is_overlapping(section_span, label_span)
    ]
    res = {}
    for key1, labels_dict in TO_BE_MATCHED.items():
        matched = {}
        for key2, labels_list in labels_dict.items():
            terms = [term for term, cuis in hits if cuis.intersection(labels_list)]
            # Tests that matched nothing are left out of the result
            if terms:
                matched[key2] = terms
        res[key1] = matched
    return res


# Run find_section once per section row, in parallel; results come back in
# the same order as the rows of txt_df.
section_matches = parallelizer(
    delayed(find_section)(note_id, [start, end])
    for note_id, start, end in zip(txt_df["note_id"], txt_df["start"], txt_df["end"])
)

# Add one column per disease. Assigning a DataFrame aligns on index labels, so
# the results are given txt_df's own index (a fresh RangeIndex would misalign
# if txt_df were ever filtered or re-indexed). Passing `columns` keeps the
# assignment valid even when there is no row at all.
disease_columns = list(TO_BE_MATCHED)
txt_df[disease_columns] = pd.DataFrame(
    section_matches, index=txt_df.index, columns=disease_columns
)

# Show the whole frame, without pandas' row / column display limits
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(txt_df)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 5 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0067s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0016s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.0s
[Pa

Unnamed: 0,note_id,label,start,end,Lupus,Sclérodermie,Takayasu,SAPL
0,crs3115_CIM10_6374508586831126195.ann,introduction,0,59,{},{},{},{}
1,crs3115_CIM10_6374508586831126195.ann,motif,59,302,{},{},{},{}
2,crs3115_CIM10_6374508586831126195.ann,habitus,302,596,{},{},{},{}
3,crs3115_CIM10_6374508586831126195.ann,vaccinations,596,741,{},{},{},{}
4,crs3115_CIM10_6374508586831126195.ann,antécédents,741,753,{},{},{},{}
5,crs3115_CIM10_6374508586831126195.ann,allergies,753,847,{},{},{},{}
6,crs3115_CIM10_6374508586831126195.ann,antécédents familiaux,847,1036,{},{},{},{}
7,crs3115_CIM10_6374508586831126195.ann,antécédents,1036,1921,{},{},{},{}
8,crs3115_CIM10_6374508586831126195.ann,antécédents,1921,2167,{},{},{},{}
9,crs3115_CIM10_6374508586831126195.ann,traitements entrée,2167,2610,{},{},{},{}
