# **03-SpanCategorizacer**

Referencias

* [SpanCategorizer](https://spacy.io/api/spancategorizer)
* [Blogpost](https://explosion.ai/blog/spancat)

In [1]:
%cd ..

/Users/belensantamaria/Documentos/section_identification


In [2]:
import json
import spacy
import pandas as pd
import difflib

from sklearn.model_selection import train_test_split
from spacy.tokens import Span, DocBin

In [3]:
nlp = spacy.blank('es')
span_key = "sc"

## Creación del conjunto de datos de entrenamiento

In [4]:
def data_to_spacy_format(file_path):
    with open(file_path) as f:
        data = json.load(f)
    spacy_data = []
    for note_id, entry in data["annotated_entries"].items():
        doc = nlp(entry["note_text"])
        spans = []
        start_span = 1
        for annotation in entry["section_annotation"]["gold"]:
            seg = nlp(annotation["segment"])
            spans.append(Span(doc, start_span, start_span+len(seg), annotation["label"]))
        doc.spans[span_key] = spans
        spacy_data.append(doc)
    return spacy_data

In [5]:
train_path = "data/raw/clinais.train.json"
train_data = data_to_spacy_format(train_path)

In [6]:
train, valid = train_test_split(train_data, test_size=0.2)

In [7]:
DocBin(docs=train).to_disk("data/spacy/train.spacy")
DocBin(docs=valid).to_disk("data/spacy/valid.spacy")

## Entrenamiento del modelo

In [8]:
! python -m spacy init fill-config data/spacy/base_config.cfg data/spacy/base_config.cfg

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
data/spacy/base_config.cfg
You can now add your data and train your pipeline:
python -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
! python -m spacy train data/spacy/base_config.cfg --output data/spacy/output --paths.train data/spacy/train.spacy --paths.dev data/spacy/valid.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: data/spacy/output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'spancat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS SPANCAT  SPANS_SC_F  SPANS_SC_P  SPANS_SC_R  SCORE 
---  ------  ------------  ------------  ----------  ----------  ----------  ------
  0       0      22170.10      22968.01        0.01        0.00        2.03    0.00
  0     200       6946.91      12467.57        0.00        0.00        0.00    0.00
  0     400          0.00        316.12        0.00        0.00        0.00    0.00
  0     600          0.00        351.05        0.00        0.00        0.00    0.00
  1     800          0.00        341.02        0.00        0.00        0.00    0.00
  1    1000          0.00        311.01        0.00        0.00        0.00    0.00
  1    1200          0.00        321.00        0.00        0.00        0.00    0.00
  2    1400         

## Predicción sobre nuevos datos

In [4]:
nlp = spacy.load("data/spacy/output/model-best")
nlp_es = spacy.load("es_core_news_sm")

In [5]:
test_path = "data/raw/clinais.dev.json"

with open(test_path) as f:
    data = json.load(f)

In [6]:
annotations = []

for note_id, entry in data["annotated_entries"].items():
    text = entry["note_text"]

    doc = nlp(text)
    spans = doc.spans[span_key]
    spans_dict = {span.text: span.label_ for span in spans}

    start_offset = 0
    
    for sent in nlp_es(text).sents:
        if sent in spans_dict:
            label = spans_dict[sent.text]
        else:
            most_similar = difflib.get_close_matches(sent.text, spans_dict.keys(), n=1)[0]
            label = spans_dict[most_similar]

        d = {
            "note_id": note_id,
            "segment": sent.text,
            "label": label,
            "start_offset": start_offset
        }
        annotations.append(d)
        start_offset += len(sent.text)

df = pd.DataFrame(annotations)

In [7]:
df  

Unnamed: 0,note_id,segment,label,start_offset
0,S0004-06142005000200009-3,Paciente de 69 a. de edad con un PSA en el mom...,PAST_MEDICAL_HISTORY,0
1,S0004-06142005000200009-3,El paciente tenía una biopsia previa por sexta...,PAST_MEDICAL_HISTORY,78
2,S0004-06142005000200009-3,Se practicó una E-RME que mostró inicialmente ...,PAST_MEDICAL_HISTORY,138
3,S0004-06142005001000015-1,Un paciente varón de 19 años acudió al Servici...,PAST_MEDICAL_HISTORY,0
4,S0004-06142005001000015-1,No presentaba ningún antecedente urológico.,PRESENT_ILLNESS,111
...,...,...,...,...
1905,S1135-76062007000100006-1,La madre había fallecido súbitamente a los 48 ...,EXPLORATION,179
1906,S1135-76062007000100006-1,Una noche salió a cenar con los compañeros de ...,PAST_MEDICAL_HISTORY,254
1907,S1135-76062007000100006-1,No manifestó ninguna sintomatología y se acost...,EVOLUTION,366
1908,S1135-76062007000100006-1,Poco después el perro comenzó a ladrar por lo ...,PAST_MEDICAL_HISTORY,435


In [8]:
def entry_boundaries(df_test, note_id, boundaries):
    temp_df = df_test[df_test["note_id"]==note_id]
    predictions = boundaries
    for pred in predictions:
        if pred["start_offset"] in temp_df["start_offset"].values:
            pred["boundary"] = temp_df[temp_df["start_offset"]==pred["start_offset"]]["label"].values[0]
        else:
            pred["boundary"] = None
    return predictions   

In [9]:
predictions = {}

for note_id, entry in data["annotated_entries"].items():
    predictions[entry["note_id"]] = entry
    predictions[entry["note_id"]]["boundary_annotation"]["prediction"] = entry_boundaries(df, note_id, entry["boundary_annotation"]["gold"])
    
with open("data/predictions/predictions_spacy_spancat.json", "w") as f:
    json.dump({"annotated_entries": predictions}, f)  