# **04-NER**

In [1]:
# Move to the parent directory so the relative data/ paths used below resolve.
# NOTE: this is relative to the current directory, so re-running the cell
# without restarting the kernel moves up one more level each time.
%cd ..

/Users/belensantamaria/Documentos/section_identification


In [2]:
import json
import spacy
import re
import pandas as pd

from sklearn.model_selection import train_test_split
from spacy.tokens import Span, DocBin

In [3]:
# Blank Spanish pipeline: data_to_spacy_format reads this global `nlp`
# to build the training Docs.
nlp = spacy.blank("es")
# NOTE(review): span_key is not referenced anywhere else in the visible notebook.
span_key = "ner"

In [4]:
def trim_entity_spans(text, start, end):
    """Shrink the character span ``[start, end)`` so it carries no surrounding whitespace.

    Args:
        text: The full string the offsets refer to.
        start: Inclusive start character offset of the span.
        end: Exclusive end character offset of the span.

    Returns:
        A ``(valid_start, valid_end)`` tuple with ``valid_start <= valid_end``.
        If the span holds only whitespace the result is an empty span
        (``valid_start == valid_end``).
    """
    invalid_span_tokens = re.compile(r"\s")
    # Clamp both offsets to the text so the indexing below can never go out
    # of range, and so the span can never start after it ends.
    valid_end = max(0, min(end, len(text)))
    valid_start = min(max(start, 0), valid_end)
    # Stop at valid_end (not len(text)): walking past the end of the span
    # would return an inverted range for an all-whitespace span.
    while valid_start < valid_end and invalid_span_tokens.match(text[valid_start]):
        valid_start += 1
    while valid_end > valid_start and invalid_span_tokens.match(text[valid_end - 1]):
        valid_end -= 1
    return valid_start, valid_end

def data_to_spacy_format(file_path):
    """Convert an annotated JSON file into a list of spaCy ``Doc`` objects.

    Every note under ``"annotated_entries"`` becomes one ``Doc`` built with the
    module-level ``nlp`` pipeline. For each gold section annotation, only the
    FIRST token of the section's ``"segment"`` text is marked as an entity,
    labelled with the section label and anchored at the annotation's
    ``"start_offset"``.

    Args:
        file_path: Path to a JSON file with an ``"annotated_entries"`` mapping
            whose values carry ``"note_text"`` and
            ``"section_annotation"["gold"]`` (a list of dicts with
            ``"segment"``, ``"start_offset"`` and ``"label"``).

    Returns:
        A list of ``Doc`` objects with ``doc.ents`` set.
    """
    # Explicit encoding: the default depends on the platform locale.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)
    spacy_data = []
    for entry in data["annotated_entries"].values():
        note_text = entry["note_text"]
        doc = nlp(note_text)
        ents = []
        for annotation in entry["section_annotation"]["gold"]:
            sec = nlp(annotation["segment"])
            if len(sec) == 0:
                # Empty segment: there is no first token to anchor an entity on.
                continue
            section_start = annotation["start_offset"]
            # len(token) is the token's length in characters, so this span
            # covers just the first token of the section.
            start, end = trim_entity_spans(note_text, section_start, section_start + len(sec[0]))
            if start >= end:
                # Nothing but whitespace was left after trimming.
                continue
            span = doc.char_span(start, end, label=annotation["label"])
            if span is None:
                # char_span yields None when the offsets do not line up with
                # token boundaries; assigning None to doc.ents would fail.
                continue
            ents.append(span)
        doc.ents = ents
        spacy_data.append(doc)
    return spacy_data

In [5]:
# Convert the annotated training notes into spaCy Docs (one Doc per note).
train_path = "data/raw/clinais.train.json"
train_data = data_to_spacy_format(train_path)

In [6]:
# Hold out 20% of the Docs for validation. The fixed seed makes the split
# (and therefore the training run) reproducible after a kernel restart.
train, valid = train_test_split(train_data, test_size=0.2, random_state=42)

In [7]:
# Serialize both splits as DocBin files; these are the paths handed to
# `spacy train` via --paths.train / --paths.dev further down.
DocBin(docs=train).to_disk("data/ner/train.spacy")
DocBin(docs=valid).to_disk("data/ner/valid.spacy")

In [8]:
# Input and output are the same file, so the config is rewritten in place.
! python -m spacy init fill-config data/ner/ner_config.cfg data/ner/ner_config.cfg

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
data/ner/ner_config.cfg
You can now add your data and train your pipeline:
python -m spacy train ner_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
# Train on the DocBin files written above; checkpoints go to data/ner/output.
# --gpu-id 0 selects the first GPU.
! python -m spacy train data/ner/ner_config.cfg --output data/ner/output --paths.train data/ner/train.spacy --paths.dev data/ner/valid.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: data/ner/output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     98.73    0.82    0.42   17.64    0.01
  0     200         69.09   5425.65   30.69   84.67   18.75    0.31
  0     400         84.34   1922.98   38.05   76.56   25.31    0.38
  0     600        121.91   2190.30   50.10   56.72   44.87    0.50
  1     800        130.44   1898.71   44.50   75.71   31.51    0.45
  1    1000        131.80   1821.85   51.06   58.22   45.46    0.51
  1    1200        132.31   1835.36   45.34   78.94   31.81    0.45
  2    1400        153.10   1557.80   53.29   55.62   51.14    0.53
  2    1600        165.15   1584.86   48.54   62.24   39.78    0.49
  2    1800        179.99   1605.4

In [10]:
# Load the `model-best` directory from the training output path.
# NOTE: this rebinds the global `nlp` that data_to_spacy_format reads, so
# re-running the data-preparation cells after this one would use the trained
# pipeline instead of the blank Spanish one.
nlp = spacy.load("data/ner/output/model-best")

In [11]:
# Annotated notes that the trained model is run on below.
test_path = "data/raw/clinais.dev.json"

# Explicit encoding: the default depends on the platform locale.
with open(test_path, encoding="utf-8") as f:
    data = json.load(f)

In [12]:
# One record per predicted entity: which note it belongs to, its label, and
# the character offset where it starts.
annotations = []

for note_id, entry in data["annotated_entries"].items():
    doc = nlp(entry["note_text"])
    annotations.extend(
        {
            "note_id": note_id,
            "label": ent.label_,
            "start_offset": ent.start_char,
        }
        for ent in doc.ents
    )

# Explicit columns keep the schema stable even when the model predicts no
# entity at all; an empty, column-less frame would break later lookups on
# df["note_id"].
df = pd.DataFrame(annotations, columns=["note_id", "label", "start_offset"])

In [13]:
# Inspect the predicted entities (one row per predicted entity).
df

Unnamed: 0,note_id,label,start_offset
0,S0004-06142005000200009-3,PRESENT_ILLNESS,0
1,S0004-06142005001000015-1,PRESENT_ILLNESS,0
2,S0004-06142005001000015-1,EXPLORATION,201
3,S0004-06142005001000015-1,EXPLORATION,360
4,S0004-06142005001000015-1,EVOLUTION,1526
...,...,...,...
952,S0376-78922009000400002-8,PRESENT_ILLNESS,0
953,S0376-78922009000400002-8,PAST_MEDICAL_HISTORY,17
954,S0376-78922009000400002-8,TREATMENT,342
955,S1135-76062007000100006-1,PRESENT_ILLNESS,0


In [14]:
def entry_boundaries(df_test, note_id, boundaries):
    """Attach the predicted label to each boundary of one note.

    Args:
        df_test: DataFrame of predictions with ``note_id``, ``label`` and
            ``start_offset`` columns (may be empty).
        note_id: Identifier of the note whose predictions should be used.
        boundaries: List of boundary dicts, each with a ``"start_offset"`` key.

    Returns:
        A new list of shallow copies of ``boundaries``. Each copy has its
        ``"boundary"`` key set to the predicted label whose ``start_offset``
        matches, or ``None`` when nothing was predicted at that offset. The
        input dicts are left untouched, so gold annotations passed in are not
        overwritten.
    """
    label_by_offset = {}
    if not df_test.empty:
        note_rows = df_test[df_test["note_id"] == note_id]
        for offset, label in zip(note_rows["start_offset"], note_rows["label"]):
            # setdefault keeps the first label seen for an offset.
            label_by_offset.setdefault(offset, label)

    predictions = []
    for boundary in boundaries:
        # Copy so the caller's (gold) dict is never modified.
        pred = dict(boundary)
        pred["boundary"] = label_by_offset.get(pred["start_offset"])
        predictions.append(pred)
    return predictions

In [15]:
predictions = {}

for note_id, entry in data["annotated_entries"].items():
    # Give entry_boundaries its own copies of the gold boundary dicts, so that
    # writing the predicted "boundary" values cannot overwrite the gold
    # annotations stored in the same entry.
    gold_copy = [dict(boundary) for boundary in entry["boundary_annotation"]["gold"]]
    entry["boundary_annotation"]["prediction"] = entry_boundaries(df, note_id, gold_copy)
    predictions[entry["note_id"]] = entry

# Explicit encoding so the output does not depend on the platform locale.
with open("data/predictions/predictions_spacy_ner.json", "w", encoding="utf-8") as f:
    json.dump({"annotated_entries": predictions}, f)