# Setup

In [22]:
DATA_VERSION = "2025-06-09"
DATA_FILE_NAME = f"consultation-documents-preprocessed-{DATA_VERSION}.parquet"
EXTRA_DOCUMENT_FEATURES_FILE_NAME = f"consultation-documents-features-{DATA_VERSION}.parquet"
EMBEDDINGS_FILE_NAME = (
    f"consultation-documents-embeddings-beginnings-openai-text-embedding-3-large-{DATA_VERSION}.parquet"
)

LANGUAGES = {
    "de",
    # "fr",
    # "it",
}

FROM_YEAR = 2000

INCLUDE_RULE_LABELS_IN_TRAINING = {
    "SYNOPTIC_TABLE",
    "LETTER",
    "RECIPIENT_LIST",
    "FINAL_REPORT",
}

CV_FOLDS = 10
# This fraction only applies to manually labelled OpenParlData documents. Many more documents make it
# into the training set, e.g. all Fedlex documents.
TEST_SIZE = 0.2
RANDOM_STATE = 2718
USE_TEST_SET = True

In [23]:
%load_ext autoreload
%autoreload 2

import logging
import pathlib
import sys

import cleanlab
import dotenv
import pandas as pd
import sklearn.model_selection

REPOSITORY_ROOT = (pathlib.Path().cwd() / ".." / "..").resolve()
sys.path.append(str(REPOSITORY_ROOT))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
import demokratis_ml.models.document_types.model
import demokratis_ml.models.document_types.preprocessing
from research.document_types import training_split
from research.lib import cleanlab_analysis, data_access

In [25]:
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")

In [26]:
dotenv.load_dotenv()

True

# Input data preparation

## Get all dataframes

In [27]:
PREPROCESSED_DATA_FILE = REPOSITORY_ROOT / "data" / "dataframes" / DATA_FILE_NAME
data_access.ensure_dataframe_is_available(PREPROCESSED_DATA_FILE)

FEATURES_DATA_FILE = REPOSITORY_ROOT / "data" / "dataframes" / EXTRA_DOCUMENT_FEATURES_FILE_NAME
data_access.ensure_dataframe_is_available(FEATURES_DATA_FILE)

EMBEDDINGS_DATA_FILE = REPOSITORY_ROOT / "data" / "dataframes" / EMBEDDINGS_FILE_NAME
data_access.ensure_dataframe_is_available(EMBEDDINGS_DATA_FILE)

INFO ensure_dataframe_is_available: File /Users/vita/Code/demokratis/demokratis-ml/data/dataframes/consultation-documents-preprocessed-2025-06-09.parquet already exists locally.
INFO ensure_dataframe_is_available: File /Users/vita/Code/demokratis/demokratis-ml/data/dataframes/consultation-documents-features-2025-06-09.parquet already exists locally.
INFO ensure_dataframe_is_available: File /Users/vita/Code/demokratis/demokratis-ml/data/dataframes/consultation-documents-embeddings-beginnings-openai-text-embedding-3-large-2025-06-09.parquet already exists locally.


In [28]:
df_docs = data_access.load_consultation_documents(
    PREPROCESSED_DATA_FILE, only_languages=LANGUAGES, starting_year=FROM_YEAR
)

In [29]:
df_features = pd.read_parquet(FEATURES_DATA_FILE)

In [30]:
df_embeddings = pd.read_parquet(EMBEDDINGS_DATA_FILE)

In [31]:
embedding_dimension = df_embeddings["embedding"].iloc[0].shape[0]
print("Embedding dimension:", embedding_dimension)

Embedding dimension: 3072


## Preprocess

In [32]:
df_input = demokratis_ml.models.document_types.preprocessing.create_input_dataframe(
    df_docs,
    df_extra_features=df_features,
    df_embeddings=df_embeddings,
)

INFO document_types.preprocessing: Dropping 1269 documents (5.4%) with empty texts
INFO document_types.features: 3385 rows (15.2%) were lost due to missing features. Remaining rows: 18948. 15 columns were added.
INFO document_types.features: Lost documents by political_body/year:
year            2007  2008  2009  2010  2011  2012  2013  2014  2015  2016  2017  2018  2019  2020  2021  2022  2023  2024   Total
political_body                                                                                                                    
ag                 0     0     0     0     0    53    17    21     6     0    41     3    59    58    18    24     5     3   308.0
ai                 0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     NaN
ar                 0     0     0     7    47    22    11     0     0     2    22     0     0     0     0     0     0     0   111.0
be                 0     0     0     0     0     0     0     0  

## Split

In [33]:
df_input_train, df_input_test = training_split.train_test_split(
    df_input,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
    include_rule_labels_in_training=INCLUDE_RULE_LABELS_IN_TRAINING,
    stratify_by_canton=False,
)

INFO document_title_rule_model: 0.00% of documents already have labels
INFO document_title_rule_model: Labelled 0.26% by rule: canton=<any>, title^=adressliste => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 2.83% by rule: canton=<any>, title^=adressatenliste => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 0.12% by rule: canton=<any>, title^=adressatenverzeichnis => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 1.27% by rule: canton=<any>, title^=vernehmlassungsadressaten => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 0.14% by rule: canton=<any>, title^=vernehmlassungsadressen => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 0.09% by rule: canton=<any>, title^=liste vernehmlassungsadressaten => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 0.05% by rule: canton=<any>, title^=verzeichnis der adressatinnen und adressaten => type=RECIPIENT_LIST
INFO document_title_rule_model: Labelled 0.42% by ru

# Cleanlab

In [34]:
classifier = demokratis_ml.models.document_types.model.create_classifier(
    embedding_dimension=embedding_dimension,
    random_state=RANDOM_STATE,
)
classifier

0,1,2
,steps,"[('columntransformer', ...), ('randomforestclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('embeddings', ...), ('extra_features', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,n_components,40
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,2718

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [35]:
df_cleanlab = pd.concat([df_input_train, df_input_test], ignore_index=True)
X_cleanlab, y_cleanlab = demokratis_ml.models.document_types.model.create_matrices(df_cleanlab)
X_cleanlab.shape

(15684, 3084)

In [36]:
pred_probs = sklearn.model_selection.cross_val_predict(
    estimator=classifier, X=X_cleanlab, y=y_cleanlab, cv=CV_FOLDS, method="predict_proba"
)
assert len(df_cleanlab) == pred_probs.shape[0]

In [37]:
lab = cleanlab.Datalab(data=df_cleanlab, label_name="document_type")

In [38]:
lab.find_issues(pred_probs=pred_probs, features=X_cleanlab)

Finding null issues ...
Finding label issues ...
Finding outlier issues ...
Error in outlier: Input X contains NaN.
NearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
Finding near_duplicate issues ...
Error in near_duplicate: Input X contains NaN.
NearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor

In [39]:
lab.report()

Dataset Information: num_examples: 15684, num_classes: 9

Here is a summary of various issues found in your data:

     issue_type  num_issues
          label         372
class_imbalance          84

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


----------------------- label issues -----------------------

About this issue:
	Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    

Number of examples with this issue: 372
Overall dataset quality in terms of this issue: 0.9850

Examples representing most severe instances of this issue:
       is_label_issue  label_score   given_label predicted_label
14879            True          0.0  FINAL_REPORT          REPORT
5279       

In [40]:
df_output = df_cleanlab.copy()
df_output["year"] = df_output["consultation_start_date"].dt.year
df_output["edit_link"] = df_output.apply(
    lambda d: f"https://www.demokratis.ch/de/consultation/{d['consultation_id']}/admin/document/{d['document_id']}/edit",
    axis=1,
)
df_output = df_output[
    [
        "edit_link",
        "consultation_id",
        "document_id",
        "political_body",
        "year",
        "consultation_url",
        "document_source_url",
        "document_title",
        "document_type",
        # Features:
        "count_pages",
        "count_pages_containing_tables",
        "average_page_aspect_ratio",
        "fraction_pages_containing_tables",
        "contains_synopse_keyword",
        "contains_salutation",
    ]
]

cleanlab_analysis.cleanlab_issues_to_excel(
    lab,
    pred_probs=pred_probs,
    dataset=df_output,
    output_path=f"cleanlab_issues_{DATA_VERSION}.xlsx",
    linkify_columns={
        "edit_link",
        "consultation_url",
        # "document_source_url",  # Odd characters in some URLs cause Excel errors
    },
    issue_types=(
        "label",
        # "near_duplicate",
        # "outlier",
        # "underperforming_group",
        # "non_iid",
    ),
)