In [None]:
import os
import shutil
import requests

from glob import glob
from rich.pretty import pprint
from more_itertools import flatten

from aymurai.text.anonymization import DocAnonymizer

In [None]:
# Anonymizer instance shared by the unzip, indexing, matching and replacement cells
doc_anonymizer = DocAnonymizer()

## Sample document

In [None]:
# Sample docx
doc_path = "path/to/your/file.docx"
# Working directory for the unzipped document, named after the file.
# splitext drops only the final extension, so "report.v2.docx" maps to
# "report.v2" and a dotfile such as ".draft.docx" never yields an empty name.
output_dir = os.path.splitext(os.path.basename(doc_path))[0]

In [None]:
# Unzip document into `output_dir` (the XML files are globbed from that folder later on)
doc_anonymizer.unzip_document(doc_path, output_dir)

## /document-extract endpoint output

In [None]:
# Function to extract document using the API
def extract_document(file_path: str, timeout: float = 60.0) -> dict:
    """Upload a file to the /document-extract endpoint and return its JSON reply.

    Args:
        file_path: Path of the file to upload.
        timeout: Seconds to wait for the server before giving up, so a
            stopped or stalled API does not hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Open the file in binary mode and send the POST request
    with open(file_path, "rb") as file:
        response = requests.post(
            url="http://localhost:8899/document-extract",
            files={"file": file},
            timeout=timeout,
        )
    # Fail here on an HTTP error instead of handing an error payload to the
    # following cells, where it would surface as an unrelated KeyError.
    response.raise_for_status()
    return response.json()

In [None]:
# /document-extract endpoint output
extracted_document = extract_document(doc_path)
# The response is a dict, which cannot be sliced directly; preview the first
# entries stored under its "document" key instead.
# Assumes "document" holds a list-like sequence (it is len()-ed and iterated
# elsewhere in this notebook) - TODO confirm against the API schema.
pprint(extracted_document["document"][:10])

In [None]:
# Number of entries under the "document" key of the endpoint response
len(extracted_document["document"])

## XML paragraphs

In [None]:
# Every XML file found anywhere under the unzipped document folder
xml_files = glob(f"{output_dir}/**/**.xml", recursive=True)

# Index the paragraphs of each XML file, flatten them into a single stream and
# keep only the ones whose plain text is not blank
paragraphs = [
    indexed
    for indexed in flatten(
        doc_anonymizer.index_paragraphs(xml_file) for xml_file in xml_files
    )
    if indexed["plain_text"].strip()
]

In [None]:
# Preview only the first indexed paragraphs; dumping the whole list floods the
# cell output for any real document
pprint(paragraphs[:10])

In [None]:
# Number of non-empty paragraphs indexed from the XML files
len(paragraphs)

## Inference

In [None]:
# Function to make inference using the API
def get_predictions(sample: str, timeout: float = 60.0) -> dict:
    """Send a text sample to the /anonymizer/predict endpoint and return its JSON reply.

    Args:
        sample: Text to run inference on; sent as the "text" field of the
            JSON request body.
        timeout: Seconds to wait for the server before giving up, so a
            stopped or stalled API does not hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.post(
        url="http://localhost:8899/anonymizer/predict",
        json={"text": sample},
        timeout=timeout,
    )
    # Surface HTTP errors immediately rather than collecting error payloads
    # in the predictions list.
    response.raise_for_status()
    return response.json()

In [None]:
# One prediction request per entry of the extracted document, in order
predictions = list(map(get_predictions, extracted_document["document"]))
pprint(predictions[:10])

In [None]:
# Number of predictions collected (one per entry of extracted_document["document"])
len(predictions)

In [None]:
# Iterator for stepping through the predictions one at a time;
# re-run this cell to start again from the first prediction
iter_preds = iter(predictions)

In [None]:
# Show the next prediction; each re-run of this cell advances the iterator
# (next() raises StopIteration once every prediction has been shown)
pred = next(iter_preds)
pprint(pred)

## Matching

In [None]:
# Matching
paragraphs = doc_anonymizer.match_paragraphs_with_predictions(paragraphs, predictions)

In [None]:
# Iterator for stepping through the matched paragraphs one at a time;
# re-run this cell to start again from the first one
iter_paragraphs = iter(paragraphs)

In [None]:
# Show the next matched paragraph; each re-run of this cell advances the iterator
# (next() raises StopIteration once the iterator is exhausted)
paragraph = next(iter_paragraphs)
pprint(paragraph)

## Replace source XMLs

In [None]:
# Presumably rewrites the XML files under `output_dir` using the matched
# paragraphs - confirm against DocAnonymizer.replace_text_in_xml
doc_anonymizer.replace_text_in_xml(paragraphs, output_dir)

## Recreate anonymized docx

In [None]:
anonymized_dir = "anonymized-documents"
# Ensure the destination folder passed to the anonymizer exists
os.makedirs(anonymized_dir, exist_ok=True)

# Run the anonymizer on the source document with the predictions gathered above
item = {"path": doc_path}
doc_anonymizer(item, predictions, anonymized_dir)

In [None]:
# Remove the unzipped working directory (XML files included)
shutil.rmtree(output_dir)