In [None]:
%load_ext rich

In [None]:
import json
import os

import requests
from tqdm import tqdm

API_URL = "http://localhost:8999"  # Base URL of the API (debugger default); change it to match your own setup.

## Sample document

In [None]:
# Path of the sample document that is uploaded to the API below.
doc_path = "/resources/data/sample/document-01.docx"

## /document/extract endpoint output

In [None]:
def extract_document(file_path: str) -> dict:
    """Upload a document to the ``/document/extract`` endpoint.

    Args:
        file_path: Path of the file to upload; it is opened in binary mode.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response status signals an error
            (raised by ``raise_for_status``).
    """
    endpoint = f"{API_URL}/document/extract"
    # The handle only has to stay open while the request is being sent.
    with open(file_path, "rb") as handle:
        reply = requests.post(url=endpoint, files={"file": handle})
    reply.raise_for_status()
    return reply.json()

In [None]:
# Send the sample document to the /document/extract endpoint and display the response
extracted_document = extract_document(doc_path)
extracted_document

In [None]:
# Pull the document id and the paragraph count out of the extraction response.
# (A bare `len(...)` in the middle of a cell is never displayed, so the count is
# computed once and printed instead.)
document_id = extracted_document["id"]
paragraph_count = len(extracted_document["paragraphs"])
print("document_id:", document_id)
print("number of paragraphs:", paragraph_count)

## Inference

In [None]:
import uuid


def get_predictions(paragraph_id: uuid.UUID) -> dict:
    """Fetch the anonymization prediction for one paragraph from the API.

    Args:
        paragraph_id: Identifier of the paragraph; it is interpolated into
            the request path.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response status signals an error
            (raised by ``raise_for_status``).
    """
    endpoint = f"{API_URL}/pipeline/anonymization/paragraph/{paragraph_id}/predict"
    query = {"use_cache": True}
    reply = requests.get(url=endpoint, params=query)
    reply.raise_for_status()
    return reply.json()

In [None]:
# Request one prediction per extracted paragraph, showing a progress bar.
paragraphs = tqdm(extracted_document["paragraphs"])
predictions = [get_predictions(item["id"]) for item in paragraphs]
predictions

In [None]:
# Inspect a single prediction (the sixth paragraph; fails if there are fewer than six).
predictions[5]

In [None]:
from aymurai.database.crud.prediction import read_document_prediction_paragraphs
from aymurai.database.meta.extra import ParagraphPredictionPublic
from aymurai.database.schema import ModelType
from aymurai.database.session import get_session

# Use the first value produced by get_session() as the database session.
# NOTE(review): the iterator returned by get_session() is discarded right after
# this call, so any cleanup it performs is not driven from this notebook —
# confirm the session lifecycle is what you expect.
session = next(get_session())

# Read the paragraph predictions of this document for the anonymization model.
# document_id is parsed into a uuid.UUID before being passed on.
annotations = read_document_prediction_paragraphs(
    session=session,
    document_id=uuid.UUID(document_id),
    model_type=ModelType.ANONYMIZATION,
)

In [None]:
# Display what read_document_prediction_paragraphs returned for the document.
annotations

In [None]:
# Display the id of the document being analysed.
document_id


In [None]:
from itertools import groupby

from more_itertools import flatten


def get_entities(prediction):
    """Return the value stored under the ``"labels"`` key of ``prediction``."""
    labels_field = prediction["labels"]
    return labels_field


# Collect the labels of every paragraph that has a prediction, then flatten
# them into a single list of entities.
labels = [para.prediction.labels for para in annotations if para.prediction]
entities = [entity for paragraph_labels in labels for entity in paragraph_labels]

# Keep each entity's position in `entities`, then order the pairs by label:
# groupby only merges consecutive items, so its input must already be sorted
# by the grouping key.
indexed_entities = sorted(
    enumerate(entities), key=lambda pair: pair[1].attrs.aymurai_label
)

groups = {
    label: list(members)
    for label, members in groupby(
        indexed_entities, key=lambda pair: pair[1].attrs.aymurai_label
    )
}
groups

In [None]:
# List the distinct labels found among the entities.
groups.keys()

In [None]:
# Display the flattened list of entities.
entities

In [None]:
import unicodedata

import numpy as np
import pandas as pd
import regex
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import CountVectorizer

from aymurai.meta.entities import DocLabel


def normalize_text(text):
    """Normalize a text so that equivalent spellings compare equal.

    The text is NFKD-decomposed, combining marks (accents) and punctuation
    characters are dropped, every run of whitespace is collapsed into a single
    space, both ends are trimmed and the result is lowercased.

    Punctuation is removed *before* whitespace is collapsed: doing it the other
    way round leaves a double space behind whenever a punctuation mark stands
    between two spaces, so "a - b" and "a b" would normalize differently.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized, lowercased string (empty if nothing is left).
    """
    decomposed = unicodedata.normalize("NFKD", text)

    kept = []
    for ch in decomposed:
        category = unicodedata.category(ch)
        # "Mn" marks are the accents split off by NFKD; every "P*" general
        # category is punctuation (what the \p{P} pattern used to match).
        if category == "Mn" or category.startswith("P"):
            continue
        kept.append(ch)

    # split() without arguments drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    return " ".join("".join(kept).split()).lower()


def _merge_overlapping_clusters(matrix, labels):
    """Merge clusters that share at least one token, transitively.

    Args:
        matrix: Binary document-term matrix, one row per clustered text.
        labels: Cluster label of each row; ``-1`` entries are left untouched.

    Returns:
        ``labels`` unchanged when there is no cluster to merge, otherwise a new
        array in which every group of (directly or indirectly) overlapping
        clusters carries the smallest label of the group.
    """
    cluster_ids = sorted(set(labels) - {-1})
    if not cluster_ids:
        return labels

    # One boolean row per cluster: the tokens used by any of its members.
    token_masks = np.vstack([
        np.asarray(matrix[labels == cluster_id].sum(axis=0) > 0).ravel()
        for cluster_id in cluster_ids
    ])
    token_counts = token_masks.astype(int)
    overlaps = (token_counts @ token_counts.T) > 0

    # Union-find over cluster positions. The smaller root always wins, so the
    # representative of a group is its lowest-numbered cluster. Following the
    # links to the root (instead of jumping to the first overlapping cluster)
    # keeps chains such as A~B, B~C in a single group.
    parent = list(range(len(cluster_ids)))

    def find(pos):
        while parent[pos] != pos:
            parent[pos] = parent[parent[pos]]
            pos = parent[pos]
        return pos

    for left, right in zip(*np.nonzero(np.triu(overlaps, k=1))):
        root_left, root_right = find(int(left)), find(int(right))
        if root_left != root_right:
            parent[max(root_left, root_right)] = min(root_left, root_right)

    merged = {
        cluster_id: cluster_ids[find(pos)]
        for pos, cluster_id in enumerate(cluster_ids)
    }
    return np.array([merged.get(label, -1) for label in labels])


def cluster_entities(entities: list[DocLabel], eps: float = 0.01, min_samples: int = 2):
    """Cluster entities by the words their normalized texts have in common.

    Each text is normalized with ``normalize_text`` and turned into a binary
    bag of words. Pairwise Jaccard distances (1 - |A ∩ B| / |A ∪ B|) are fed to
    DBSCAN as a precomputed metric; pairs whose texts both have no tokens get
    distance 1.0. Clusters that share at least one token are then merged.

    Args:
        entities: Entities to cluster. Each one must expose ``id``,
            ``fk_paragraph``, ``text`` and ``attrs.aymurai_label``.
        eps: ``eps`` passed to DBSCAN (applied to the Jaccard distances).
        min_samples: ``min_samples`` passed to DBSCAN.

    Returns:
        A DataFrame indexed by ``id`` with the columns ``index`` (position of
        the entity in ``entities``), ``paragraph_id``, ``label``, ``text``,
        ``norm_text`` and ``cluster`` (``-1`` for entities left unclustered),
        sorted by ``cluster``. The frame is empty when ``entities`` is empty.
    """
    result_columns = ["index", "paragraph_id", "label", "text", "norm_text", "cluster"]

    indexed_entities = sorted(
        enumerate(entities), key=lambda pair: pair[1].attrs.aymurai_label
    )
    if not indexed_entities:
        # CountVectorizer cannot be fitted on an empty corpus, and there is
        # nothing to cluster anyway.
        return pd.DataFrame(columns=result_columns, index=pd.Index([], name="id"))

    # Prepare raw and normalized texts
    texts = [entity.text for _, entity in indexed_entities]
    norm_texts = [normalize_text(t) for t in texts]

    # --------- Binary vectorization of words ----------------
    vectorizer = CountVectorizer(binary=True, token_pattern=r"\b\w+\b")
    matrix = vectorizer.fit_transform(norm_texts)  # sparse (n_samples x n_features)

    # --------- Jaccard distance matrix ----------------
    ints = (matrix @ matrix.T).toarray()  # shared-token counts per pair
    row_sums = np.asarray(matrix.sum(axis=1)).ravel()  # token count per text
    union = row_sums[:, None] + row_sums[None, :] - ints
    # 0/0 happens for pairs of token-less texts; silence the warning and
    # overwrite those cells right after.
    with np.errstate(divide="ignore", invalid="ignore"):
        X = 1 - (ints / union)
    X[union == 0] = 1.0

    # -------- DBSCAN ----------------------------------------------------
    db = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
    labels = db.fit_predict(X)

    # Merge clusters by overlapping centroids
    labels = _merge_overlapping_clusters(matrix, labels)

    # Build results DataFrame
    df = (
        pd.DataFrame({
            "id": [entity.id for _, entity in indexed_entities],
            "index": [position for position, _ in indexed_entities],
            "paragraph_id": [entity.fk_paragraph for _, entity in indexed_entities],
            "label": [entity.attrs.aymurai_label for _, entity in indexed_entities],
            "text": texts,
            "norm_text": norm_texts,
            "cluster": labels,
        })
        .sort_values("cluster")
        .reset_index(drop=True)
    )
    # assumes id / fk_paragraph hold UUIDs in string form — TODO confirm
    df["id"] = df["id"].apply(uuid.UUID)
    df["paragraph_id"] = df["paragraph_id"].apply(uuid.UUID)

    return df.set_index("id")


# Example: cluster every extracted entity at once, regardless of its label
results = cluster_entities(entities)
results

In [None]:
# Map each cluster id to the list of normalized texts assigned to it. Selecting
# the column before apply() keeps the grouping column out of the applied data.
results.groupby("cluster")["norm_text"].apply(list).to_dict()