In [None]:
# Load the IPython `autoreload` extension so edited modules can be re-imported
# without restarting the kernel.
%load_ext autoreload
# Load the project's own IPython extension.
# NOTE(review): presumably this is what registers the `%%export` cell magic
# used further down in this notebook - confirm.
%load_ext aymurai.devtools.magic
# autoreload mode 2: reload all modules before executing each cell.
%autoreload 2


In [None]:
from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Renderer instance, called later in the notebook to display the spans of a
# processed item.
render = DocRender()

In [None]:
# Load the 'private' dataset (with use_cache=True).
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)

# Hold out 20% of the data for test, then 20% of the remainder for validation.
# The same fixed seed is used for both splits so the partition is reproducible.
train_val, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train_val, test_size=0.2, random_state=22)

# Report the size of every partition.
for label, subset in (('train:', train), ('test:', test), ('val:', val)):
    print(label, len(subset))

In [None]:
%%export aymurai.spacy.rulers.section_parser
from copy import deepcopy
from functools import partial

from spacy.tokens import Span
from more_itertools import zip_offset

import aymurai.spacy.components
from aymurai.spacy.utils import load_base, format_entity
from aymurai.spacy.components.regex import EnhancedRegexRuler
from aymurai.meta.pipeline_interfaces import DataItem, DataBlock, Transform


class AymuraiRulerSectionParser(Transform):
    """Split a document into labelled sections using regex "breakpoints".

    Every match produced by the ``enhanced_regex_ruler`` component marks the
    start of a section. A section runs from its breakpoint up to the next
    breakpoint (or the end of the document) and carries the label of the
    breakpoint that opened it. The text before the first breakpoint is
    emitted as ``SECTION:HEAD``.

    Caveat: the spaCy pipeline is stored in a module-level global rather than
    on the instance (see ``__init__``), so every parser created in the same
    process shares it, and constructing a second parser replaces the pipeline
    used by the first one.
    """

    def __init__(
        self,
        base: str,
        breakpoints: dict,
        context_offset: int = 10,
        spans_key: str = 'section'
    ):
        """Build the shared spaCy pipeline and store the parser settings.

        Args:
            base: value handed to ``load_base`` to obtain the base pipeline.
            breakpoints: passed as the ``patterns`` config of the
                ``enhanced_regex_ruler`` component.
                NOTE(review): presumably a mapping ``label -> list of regex
                strings`` - confirm against the ruler implementation.
            context_offset: forwarded to ``format_entity`` as ``offset``.
            spans_key: key under ``item["data"]["spans"]`` where the section
                spans are appended.
        """
        # Because this statement lives inside a class body, Python mangles the
        # private-looking name: the real module-level variable is
        # `_AymuraiRulerSectionParser__nlp`. `__call__` is mangled the same
        # way, so both methods refer to the same global.
        global __nlp
        __nlp = load_base(base)
        __nlp.add_pipe("enhanced_regex_ruler", config={"patterns": breakpoints})

        self.offset = context_offset
        self.spans_key = spans_key

    def __call__(self, item: DataItem) -> DataItem:
        """Return a copy of ``item`` with section spans appended.

        Args:
            item: data item whose ``item["data"]["doc.text"]`` holds the text
                to segment.

        Returns:
            A deep copy of ``item`` in which
            ``item["data"]["spans"][self.spans_key]`` always exists; it is
            extended with one formatted span per detected section (nothing is
            added when no breakpoint matches).
        """
        item = deepcopy(item)

        # Create the destination list up front so the output schema is the
        # same whether or not any breakpoint is found in this document.
        section_spans = (
            item["data"].setdefault("spans", {}).setdefault(self.spans_key, [])
        )

        # `pipe` yields exactly one Doc for the single text that is fed in.
        (doc,) = __nlp.pipe([item["data"]["doc.text"]])

        ents = sorted(doc.ents, key=lambda ent: ent.start_char)
        if not ents:
            return item

        # Everything before the first breakpoint is the document head.
        spans = [Span(doc, start=0, end=ents[0].start, label="SECTION:HEAD")]

        # Each section ends where the next breakpoint starts; the last one
        # extends to the end of the document.
        section_ends = [ent.start for ent in ents[1:]] + [len(doc)]
        for ent, section_end in zip(ents, section_ends):
            spans.append(
                Span(doc, start=ent.start, end=section_end, label=ent.label_)
            )

        section_spans.extend(
            format_entity(span, offset=self.offset) for span in spans
        )

        return item


In [None]:
import aymurai.spacy.components
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.spacy.rulers.section_parser import AymuraiRulerSectionParser

# Pipeline configuration: only preprocessing steps are defined; no models and
# no postprocessing. Each step is a `(transform class, keyword arguments)` pair.
config = {
    "preprocess": [
        (
            # Options forwarded to FulltextExtract.
            # NOTE(review): presumably PDF text extraction via Tesseract OCR
            # with the Spanish language pack - confirm.
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        (TextNormalize, {}),
        (
            AymuraiRulerSectionParser,
            {
                "base": "es",
                # Section label -> patterns that mark where that section starts.
                # NOTE(review): "ANTECEDENTES Y ARGUMENTOS" is listed under both
                # BACKGROUND and ARGUMENTS; which label wins depends on the
                # ruler's conflict resolution - confirm this is intended.
                "breakpoints": {
                    "SECTION:DEVELOPMENT": [
                        "DESARROLLO",
                    ],
                    "SECTION:BACKGROUND": [
                        "ANTECEDENTES",
                        "ANTECEDENTES Y ARGUMENTOS",
                    ],
                    "SECTION:ARGUMENTS": [
                        "ARGUMENTOS",
                        "ANTECEDENTES Y ARGUMENTOS",
                        "CONSIDERO",
                    ],
                    "SECTION:DECISION": [
                        "DECID[EO]",
                        "RESUELV[EO]",
                    ],
                    "KEYWORDS": [
                        # Raw string: `\s`, `\w` and `\d` are regex escapes, not
                        # Python string escapes. The value is unchanged; this
                        # only avoids the invalid-escape-sequence warning.
                        r"PALABRAS[_\s]+CLAVE[\w\d\s_:]+",
                    ],
                },
            },
        ),
    ],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # "log_level": "debug"
}

pipeline = AymurAIPipeline(config)


In [None]:
# Run the configured preprocessing steps over the training split.
preprocess = pipeline.preprocess(train)
# Run the prediction stage on the preprocessed items. The config declares no
# models, so this is presumably a pass-through - TODO confirm.
result = pipeline.predict(preprocess)

In [None]:
import json

# Inspect a single processed item (index 1 is just a sample pick).
registry = result[1]

# Print only the scalar top-level fields; nested containers (including
# dict/list subclasses) are left out to keep the dump short.
metadata = {k: v for k, v in registry.items() if not isinstance(v, (dict, list))}
print(json.dumps(metadata, indent=4))

print('\n-------\n')
# Display the spans stored under the 'section' key for this item.
render(registry, 'span', spans_key='section')

In [None]:
# Show the raw span records stored on the inspected item.
registry['data']['spans']