# Export documents
This notebook shows how to export the documents found under a directory to a JSON file that Label Studio can import.
This is useful when you have a lot of documents and you want to use Label Studio to label them.

In this example we are going to export anonymized documents from the 10th criminal court of the City of Buenos Aires, Argentina. The anonymization consists of replacing the names and other sensitive data of the parties with a generic name.

# Load data

In [None]:
import os
from glob import glob

# Root directory that is searched recursively for Word documents.
DOCS_PATH = '/resources/data/sample'

# glob() returns its matches in arbitrary, filesystem-dependent order, so the
# combined list is sorted to keep the document order (and therefore the
# exported file) reproducible across runs and machines.
paths = sorted(
    glob(f"{DOCS_PATH}/**/*.doc", recursive=True)
    + glob(f"{DOCS_PATH}/**/*.docx", recursive=True)
)

# One record per file; only the "path" key is set at this point.
docs = [{"path": path} for path in paths]
print("doc files:", len(docs))

In [None]:
from aymurai.pipeline import AymurAIPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.text.normalize import TextNormalize

# Pipeline configuration. Each stage is a list of (class, options-dict)
# tuples; only the "preprocess" stage is populated in this notebook.
config = dict(
    preprocess=[
        (FulltextExtract, dict(errors="ignore", use_cache=False)),
        (TextNormalize, dict()),
    ],
    models=[],
    postprocess=[],
    multiprocessing=dict(),
    use_cache=False,
    # log_level="debug",
)

pipeline = AymurAIPipeline(config)

In [None]:

# Run the pipeline's preprocess stage over the document records in `docs`.
# NOTE(review): the export code later in this notebook reads item['path'] and
# item['data']['doc.text'] from each result, so each returned item is presumably
# a dict with those keys — confirm against AymurAIPipeline.preprocess.
preprocessed = pipeline.preprocess(docs)

# Export to Label Studio

In [None]:

def to_labelstudio_json(item):
    """Build the export record for one preprocessed document.

    Args:
        item: mapping that provides ``item['data']['doc.text']`` and
            ``item['path']``.

    Returns:
        dict: ``{'text': <doc text>, 'meta_info': {'path': <path>}}``.

    Raises:
        KeyError: if any of the keys read from ``item`` is missing.
    """
    text = item['data']['doc.text']
    meta_info = dict(path=item['path'])
    return dict(text=text, meta_info=meta_info)


In [None]:
import json

# Convert every preprocessed document into its export record.
export = [to_labelstudio_json(item) for item in preprocessed]

# Write all records as a single JSON array in the current working directory.
with open('dump-docs-labelstudio.json', 'w') as out_file:
    json.dump(export, out_file)
