# Setup

In [1]:
# --- Input data files ---
# Version stamp shared by the features and embeddings dataframes.
DATA_VERSION = "2025-06-02"
# The preprocessed documents file carries a different version stamp than the other two
# files, so it gets its own constant instead of a hard-coded file name.
PREPROCESSED_DATA_VERSION = "2025-06-04"
DATA_FILE_NAME = f"consultation-documents-preprocessed-{PREPROCESSED_DATA_VERSION}.parquet"
EXTRA_DOCUMENT_FEATURES_FILE_NAME = f"consultation-documents-features-{DATA_VERSION}.parquet"
EMBEDDINGS_FILE_NAME = (
    f"consultation-documents-embeddings-beginnings-openai-text-embedding-3-large-{DATA_VERSION}.parquet"
)

# --- Document selection ---
# Passed as `only_languages` when the documents are loaded; uncomment to include more.
LANGUAGES = {
    "de",
    # "fr",
    # "it",
}

# Passed as `starting_year` when the documents are loaded.
FROM_YEAR = 2000

# Passed as `include_rule_labels_in_training` to the train/test split.
INCLUDE_RULE_LABELS_IN_TRAINING = {
    "SYNOPTIC_TABLE",
    "LETTER",
    "RECIPIENT_LIST",
    "FINAL_REPORT",
}

# --- Training / evaluation ---
# Number of cross-validation folds used to obtain out-of-fold predicted probabilities.
CV_FOLDS = 10
# This fraction only applies to manually labelled OpenParlData documents. Many more documents make it
# into the training set, e.g. all Fedlex documents.
TEST_SIZE = 0.2
# Seed shared by the train/test split and the classifier.
RANDOM_STATE = 2718
# NOTE(review): not referenced by any cell visible in this notebook view - confirm it is still needed.
USE_TEST_SET = True

In [2]:
%load_ext autoreload
%autoreload 2

import datetime
import logging
import pathlib
import sys

import cleanlab
import dotenv
import pandas as pd
import sklearn.model_selection

# Repository root: two directory levels above the current working directory.
REPOSITORY_ROOT = (pathlib.Path.cwd() / ".." / "..").resolve()
# Make the repository's packages importable. The membership check keeps this cell
# idempotent: re-running it does not stack duplicate entries on sys.path.
if str(REPOSITORY_ROOT) not in sys.path:
    sys.path.append(str(REPOSITORY_ROOT))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import demokratis_ml.models.document_types.model
import demokratis_ml.models.document_types.preprocessing
from research.document_types import training_split
from research.lib import cleanlab_analysis, data_access

In [4]:
# Show INFO-and-above records, rendered as "<LEVEL> <logger name>: <message>".
logging.basicConfig(
    format="%(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)

In [5]:
# Load variables from a `.env` file into the process environment (python-dotenv).
# The call's return value is left as the cell output.
dotenv.load_dotenv()

True

# Input data preparation

## Get all dataframes

In [6]:
# All three input dataframes are stored side by side under <repo>/data/dataframes.
dataframes_dir = REPOSITORY_ROOT / "data" / "dataframes"

PREPROCESSED_DATA_FILE = dataframes_dir / DATA_FILE_NAME
FEATURES_DATA_FILE = dataframes_dir / EXTRA_DOCUMENT_FEATURES_FILE_NAME
EMBEDDINGS_DATA_FILE = dataframes_dir / EMBEDDINGS_FILE_NAME

# Check each file in turn (documents, features, embeddings) before anything reads it.
# Presumably the helper fetches files that are missing locally - confirm in data_access.
for required_file in (PREPROCESSED_DATA_FILE, FEATURES_DATA_FILE, EMBEDDINGS_DATA_FILE):
    data_access.ensure_dataframe_is_available(required_file)

INFO ensure_dataframe_is_available: File /Users/vita/Code/demokratis/demokratis-ml/data/dataframes/consultation-documents-preprocessed-2025-06-04.parquet already exists locally.
INFO ensure_dataframe_is_available: File /Users/vita/Code/demokratis/demokratis-ml/data/dataframes/consultation-documents-features-2025-06-02.parquet already exists locally.
INFO ensure_dataframe_is_available: File /Users/vita/Code/demokratis/demokratis-ml/data/dataframes/consultation-documents-embeddings-beginnings-openai-text-embedding-3-large-2025-06-02.parquet already exists locally.


In [7]:
# Load the preprocessed documents, limited to the configured languages and start year.
df_docs = data_access.load_consultation_documents(
    PREPROCESSED_DATA_FILE,
    only_languages=LANGUAGES,
    starting_year=FROM_YEAR,
)

In [8]:
# Extra per-document features; later passed to preprocessing as `df_extra_features`.
df_features = pd.read_parquet(FEATURES_DATA_FILE)

In [9]:
# Precomputed document embeddings; the vectors live in the "embedding" column.
df_embeddings = pd.read_parquet(EMBEDDINGS_DATA_FILE)

In [10]:
# Read the vector length off the first row; assumes every embedding has the same length.
first_embedding = df_embeddings["embedding"].iloc[0]
embedding_dimension = first_embedding.shape[0]
print("Embedding dimension:", embedding_dimension)

Embedding dimension: 3072


## Preprocess

In [11]:
# Combine documents, extra features and embeddings into the model input dataframe.
# According to the log output of this step, documents with empty texts are dropped and
# rows without matching features are lost.
df_input = demokratis_ml.models.document_types.preprocessing.create_input_dataframe(
    df_docs,
    df_extra_features=df_features,
    df_embeddings=df_embeddings,
)

INFO document_types.preprocessing: Dropping 1268 documents (5.4%) with empty texts
INFO document_types.features: 5292 rows (23.7%) were lost due to missing features. Remaining rows: 17017. 15 columns were added.
INFO document_types.features: Lost documents by political_body/year:
year            2007  2008  2009  2010  2011  2012  2013  2014  2015  2016  2017  2018  2019  2020  2021  2022  2023  2024   Total
political_body                                                                                                                    
ag                 0     0     0     0     0    63    17    52    27    13    62     3    59    75    59    57     5     3   495.0
ai                 0     0     0     0     0     0     0     0     0     7    13     0     8    12    25    28     6     0    99.0
ar                 0     0     0     7    47    22    11     0     0     2    42    32    12    11     0     2     0     0   188.0
be                 0     0     0     0     0     0     0     0  

## Split

In [12]:
# Split the input into train and test dataframes with a fixed seed.
# TEST_SIZE is the test fraction; the rule labels listed in INCLUDE_RULE_LABELS_IN_TRAINING
# are passed through to the split, and stratification by canton is switched off.
df_input_train, df_input_test = training_split.train_test_split(
    df_input,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
    include_rule_labels_in_training=INCLUDE_RULE_LABELS_IN_TRAINING,
    stratify_by_canton=False,
)

INFO document_title_rule_model: 0.00% of documents already have labels
INFO document_title_rule_model: Labelled 0.26% by rule: canton=<any>, title^=adressliste => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 2.97% by rule: canton=<any>, title^=adressatenliste => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 0.16% by rule: canton=<any>, title^=adressatenverzeichnis => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 1.18% by rule: canton=<any>, title^=vernehmlassungsadressaten => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 0.16% by rule: canton=<any>, title^=vernehmlassungsadressen => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 0.00% by rule: canton=<any>, title^=verzeichnis der anhörungsadressaten => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 0.00% by rule: canton=<any>, title^=liste der konsultationsadressatinnen und konsultationsadressaten => type=RECIPIENT_LIST
INFO document_title_rule_mod

# Cleanlab

In [13]:
# Build the (unfitted) document-type classifier for the detected embedding size.
classifier = demokratis_ml.models.document_types.model.create_classifier(
    embedding_dimension=embedding_dimension,
    random_state=RANDOM_STATE,
)
# Trailing expression: display the estimator as the cell output.
classifier

In [14]:
# The audit runs over train and test rows together, re-indexed 0..n-1.
# NOTE(review): the test split is included unconditionally; USE_TEST_SET is not consulted
# here - confirm this is intended.
df_cleanlab = pd.concat([df_input_train, df_input_test], ignore_index=True)
# Feature matrix and label vector for the combined dataframe.
X_cleanlab, y_cleanlab = demokratis_ml.models.document_types.model.create_matrices(df_cleanlab)
# Trailing expression: display (n_rows, n_features) as the cell output.
X_cleanlab.shape

(14601, 3084)

In [15]:
# Out-of-fold class probabilities for every row, from CV_FOLDS-fold cross-validation.
pred_probs = sklearn.model_selection.cross_val_predict(
    classifier,
    X_cleanlab,
    y_cleanlab,
    cv=CV_FOLDS,
    method="predict_proba",
)
# One row of probabilities per document, aligned with df_cleanlab.
assert pred_probs.shape[0] == len(df_cleanlab)

In [16]:
# Wrap the combined dataframe in a cleanlab Datalab, with "document_type" as the label column.
lab = cleanlab.Datalab(data=df_cleanlab, label_name="document_type")

In [17]:
# Run the cleanlab audit on the cross-validated probabilities and the feature matrix.
# NOTE(review): the report produced from this run flags all 14601 examples as
# near_duplicate, which suggests X_cleanlab as passed here may not be a meaningful
# similarity space (e.g. unscaled feature columns) - TODO investigate before relying on
# the feature-based issue types.
lab.find_issues(pred_probs=pred_probs, features=X_cleanlab)

Finding null issues ...
Finding label issues ...
Finding outlier issues ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...

Audit complete. 15015 issues found in the dataset.


In [18]:
# Print cleanlab's summary of the issues found by the audit.
lab.report()

Dataset Information: num_examples: 14601, num_classes: 9

Here is a summary of various issues found in your data:

     issue_type  num_issues
 near_duplicate       14601
          label         329
class_imbalance          84
        non_iid           1

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


------------------ near_duplicate issues -------------------

About this issue:
	A (near) duplicate issue refers to two or more examples in
    a dataset that are extremely similar to each other, relative
    to the rest of the dataset.  The examples flagged with this issue
    may be exactly duplicated, or lie atypically close together when
    represented as vectors (i.e. feature embeddings).
    

Number of examples with this issue: 14601
Overall dataset qual

In [19]:
# Columns written to the Excel report, in output order.
EXPORT_COLUMNS = [
    "edit_link",
    "consultation_id",
    "document_id",
    "political_body",
    "year",
    "consultation_url",
    "document_source_url",
    "document_title",
    "document_type",
    # Features:
    "count_pages",
    "count_pages_containing_tables",
    "average_page_aspect_ratio",
    "fraction_pages_containing_tables",
    "contains_synopse_keyword",
    "contains_salutation",
]

# Build the export frame as a new object (`assign` never mutates df_cleanlab).
# The edit link is assembled with vectorized string concatenation rather than a row-wise
# `apply(axis=1)`: it avoids a Python-level loop over all rows and still yields a regular
# (empty) column if the frame has no rows.
df_output = df_cleanlab.assign(
    year=df_cleanlab["consultation_start_date"].dt.year,
    edit_link=(
        "https://www.demokratis.ch/de/consultation/"
        + df_cleanlab["consultation_id"].astype(str)
        + "/admin/document/"
        + df_cleanlab["document_id"].astype(str)
        + "/edit"
    ),
)[EXPORT_COLUMNS]

# Write the selected issue types to a workbook named after today's UTC date, in the
# current working directory.
cleanlab_analysis.cleanlab_issues_to_excel(
    lab,
    pred_probs=pred_probs,
    dataset=df_output,
    output_path=f"cleanlab_issues_{datetime.datetime.now(tz=datetime.UTC):%Y-%m-%d}.xlsx",
    linkify_columns={
        "edit_link",
        "consultation_url",
        # "document_source_url",  # Odd characters in some URLs cause Excel errors
    },
    issue_types=(
        "label",
        # "near_duplicate",
        # "outlier",
        # "underperforming_group",
        "non_iid",
    ),
)