# load data

In [None]:
import os
from glob import glob

# Root folder of the documents; a missing variable raises KeyError here.
DOCS_PATH = os.environ["AYMURAI_RESTRICTED_DOCUMENT_DOCS_PATH"]

# Recursively collect Word files: every ``.doc`` match first, then ``.docx``.
paths = [
    *glob(f"{DOCS_PATH}/**/*.doc", recursive=True),
    *glob(f"{DOCS_PATH}/**/*.docx", recursive=True),
]

# One ``{"path": ...}`` record per file found.
docs = [{"path": found} for found in paths]
print("doc files:", len(docs))

In [None]:
from aymurai.pipeline import AymurAIPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.text.normalize import JunkCleaner, TextNormalize

# Pipeline settings. Only the "preprocess" stage is populated; each of its
# entries is a (class, options dict) pair.
config = {
    "preprocess": [
        (FulltextExtract, {"errors": "ignore", "use_cache": True}),
        (TextNormalize, {}),
    ],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # Setting left disabled by the author: 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)

In [None]:

# Run the pipeline's preprocess step over the collected document records.
# Later cells read each result's text from item['data']['doc.text'].
preprocessed = pipeline.preprocess(docs)

# utils & filters

In [None]:
import regex


def search_in_doc(item, pat):
    """Return all matches of ``pat`` in the text of a preprocessed document.

    Args:
        item: Mapping holding the document text under
            ``item['data']['doc.text']``.
        pat: Pattern passed to ``regex.findall``.

    Returns:
        The list produced by ``regex.findall`` (empty when nothing matches).
    """
    return regex.findall(pat, item['data']['doc.text'])

def filterout_admisibilidad(item):
    """Tell whether a document does NOT mention "admisibilidad".

    The search is case-insensitive and fuzzy: the word may appear with up to
    two errors (insertions, deletions or substitutions).

    Args:
        item: Preprocessed document, as accepted by ``search_in_doc``.

    Returns:
        True when no (approximate) occurrence is found, so the function can
        be used directly as a ``filter`` predicate that drops such documents.
    """
    # A fuzzy constraint binds to the item right before it, so the word has
    # to be grouped; without the group only the final "d" would be fuzzy.
    return not search_in_doc(item, r'(?i)(?:admisibilidad){e<=2}')


In [None]:
# Keep only the documents accepted by the filter, then report how many remain.
no_admisibilidad = [
    item for item in preprocessed if filterout_admisibilidad(item)
]
print(len(no_admisibilidad))

# export to Label Studio

In [None]:

def to_labelstudio_json(item):
    """Build the Label Studio task dict for one preprocessed document.

    Args:
        item: Mapping with the document text under
            ``item['data']['doc.text']`` and its source file under
            ``item['path']``.

    Returns:
        A dict with the text under ``'text'`` and the source path nested
        under ``'meta_info'``.
    """
    # Read the text before the path, matching the original lookup order.
    body = item['data']['doc.text']
    source_path = item['path']
    return {'text': body, 'meta_info': {'path': source_path}}


In [None]:
import json

# Convert every preprocessed document into a Label Studio task dict.
# NOTE(review): this exports all of `preprocessed`, not the filtered
# `no_admisibilidad` list computed earlier; confirm that is intended.
export = [to_labelstudio_json(item) for item in preprocessed]

# An explicit encoding keeps the dump independent of the platform locale.
with open('dump-docs-labelstudio.json', 'w', encoding='utf-8') as file:
    json.dump(export, file)
