In [None]:
# Load the `rich` IPython extension so cell outputs are rendered by rich
%load_ext rich

In [None]:
import os
import json

import requests
from tqdm import tqdm

API_URL = "http://localhost:8999"  # Base URL of the API (local debug server here); change it to your own

## Sample document

In [None]:
# Path of the sample PDF that is uploaded to the API in the cells below
doc_path = "/resources/data/sample/document-01.pdf"

## /document-extract endpoint output

In [None]:
# Helper that uploads a file to the /document-extract endpoint
def extract_document(file_path: str) -> dict:
    """Upload the file at ``file_path`` to ``/document-extract`` and return the decoded JSON.

    Args:
        file_path: Path of the file to upload; it is opened in binary mode.

    Returns:
        The response body as parsed by ``response.json()``.

    Raises:
        requests.HTTPError: If the response carries an HTTP error status.
    """
    with open(file_path, "rb") as document:
        # The upload happens inside the ``with`` so the handle is still open while it is sent.
        reply = requests.post(
            url=f"{API_URL}/document-extract",
            files={"file": document},
        )
    reply.raise_for_status()
    return reply.json()

In [None]:
# Call the /document-extract endpoint on the sample document and display the response
extracted_document = extract_document(doc_path)
extracted_document

In [None]:
# Count the entries under the "document" key of the extraction result
len(extracted_document["document"])

## Inference

In [None]:
# Helper that requests predictions for one piece of text
def get_predictions(sample: str) -> dict:
    """POST ``sample`` to ``/anonymizer/predict`` and return the decoded JSON reply.

    Args:
        sample: Text sent to the endpoint under the ``"text"`` key of the JSON body.

    Returns:
        The response body as parsed by ``response.json()``.

    Raises:
        requests.HTTPError: If the response carries an HTTP error status.
    """
    payload = {"text": sample}
    reply = requests.post(url=f"{API_URL}/anonymizer/predict", json=payload)
    reply.raise_for_status()
    return reply.json()

In [None]:
# Request predictions for every entry of the extracted document; tqdm shows progress
paragraphs = extracted_document["document"]
predictions = [get_predictions(paragraph) for paragraph in tqdm(paragraphs)]
predictions

In [None]:
# Wrap the predictions under a "data" key and serialize them to a JSON string
json_prediction = json.dumps({"data": predictions})

In [None]:
# Send the source file plus the serialized predictions to /anonymizer/anonymize-document
with open(doc_path, "rb") as source_file:
    response = requests.post(
        url=f"{API_URL}/anonymizer/anonymize-document",
        data={"annotations": json_prediction},
        files={"file": source_file},
    )
    # Fail loudly here rather than writing an error payload to disk later
    response.raise_for_status()

In [None]:
# Write the body of the /anonymizer/anonymize-document response to disk
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# Keep the input's base name and replace its extension with .odt
filename = os.path.splitext(os.path.basename(doc_path))[0]
output_path = os.path.join(output_dir, f"{filename}.odt")
with open(output_path, "wb") as file:
    file.write(response.content)