# Setup

In [None]:
DATA_FILE_NAME = "consultation-documents-preprocessed-2025-03-11.parquet"
EXTRA_DOCUMENT_FEATURES_FILE_NAME = "consultation-documents-features-2025-03-17.parquet"

LANGUAGES = {
    "de",
    # "fr",
    # "it",
}

# EMBEDDING_MODEL = "openai/text-embedding-ada-002"
EMBEDDING_MODEL = "openai/text-embedding-3-large"

# FROM_YEAR = 2010
FROM_YEAR = 2000

MERGE_CLASSES = {
    # (classes, to, drop): replacement_class
    # ("RESPONSE_FORM", "SURVEY", "SYNOPTIC_TABLE", "VARIOUS_TEXT"): "VARIOUS_TEXT",
}

INCLUDE_RULE_LABELS_IN_TRAINING = {
    "SYNOPTIC_TABLE",
    "LETTER",
    "RECIPIENT_LIST",
    # "FINAL_REPORT",  # might not work well
    # "RESPONSE_FORM",  # might not work well
}

CV_FOLDS = 5
TEST_SIZE = 0.1
RANDOM_STATE = 2718
USE_TEST_SET = True

In [2]:
%load_ext autoreload
%autoreload 2

import logging
import os
import pathlib
import sys

import cleanlab
import dotenv
import mlflow
import numpy as np
import pandas as pd
import sklearn.metrics
import sklearn.model_selection
import sklearn.utils.multiclass
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier  # noqa: F401
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler  # noqa: F401
from tqdm import tqdm

REPOSITORY_ROOT = (pathlib.Path().cwd() / ".." / "..").resolve()
sys.path.append(str(REPOSITORY_ROOT))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from research.document_types import data_loading
from research.lib import data_access, embeddings

In [4]:
dotenv.load_dotenv()

True

In [5]:
if os.environ.get("MLFLOW_TRACKING_USERNAME") and os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    mlflow.set_tracking_uri("https://mlflow.ml1.demokratis.ch/")
else:
    logging.warning("MLflow credentials not found, will track locally.")
    mlflow.set_tracking_uri("sqlite:///mlruns.db")

mlflow.set_experiment("VM_document_type_cleanlab")

if run := mlflow.active_run():
    logging.warning("Run = %s is already active, closing it.", run.info.run_name)
    mlflow.end_run()
run = mlflow.start_run()
print("Starting run:", run.info.run_name)
mlflow.log_param("input_file", DATA_FILE_NAME)
mlflow.log_param("languages", LANGUAGES)
mlflow.log_param("from_year", FROM_YEAR)
mlflow.log_param("embedding_model", EMBEDDING_MODEL)
mlflow.log_param("cv_folds", CV_FOLDS)
mlflow.log_param("test_size", TEST_SIZE)
mlflow.log_param("random_state", RANDOM_STATE)
mlflow.log_param("include_rule_labels_in_training", sorted(INCLUDE_RULE_LABELS_IN_TRAINING))
;

Starting run: abrasive-zebra-807


''

In [6]:
mlflow.sklearn.autolog()
tqdm.pandas()



# Input data preparation

In [7]:
PREPROCESSED_DATA_FILE = REPOSITORY_ROOT / "data" / "dataframes" / DATA_FILE_NAME
data_access.ensure_dataframe_is_available(PREPROCESSED_DATA_FILE)

In [8]:
df_input, df_test_external = data_loading.load_documents(
    document_file=PREPROCESSED_DATA_FILE,
    external_test_labels_file=REPOSITORY_ROOT / "research" / "document_types" / "ground_truth.xlsx",
    only_languages=LANGUAGES,
    starting_year=FROM_YEAR,
    include_rule_labels=INCLUDE_RULE_LABELS_IN_TRAINING,
    class_merges=MERGE_CLASSES,
)

print("df_input labels:")
df_input.groupby("document_type_label_source")["document_type"].value_counts()

df_input labels:


document_type_label_source  document_type 
explicit                    LETTER            3148
                            DRAFT             2953
                            REPORT            2196
                            RECIPIENT_LIST    1804
                            FINAL_REPORT      1769
                            OPINION           1004
                            VARIOUS_TEXT       463
                            SYNOPTIC_TABLE     115
                            SURVEY              19
                            RESPONSE_FORM        7
                            DECISION             0
                            LINK                 0
rule                        LETTER            1536
                            SYNOPTIC_TABLE     690
                            RECIPIENT_LIST     611
                            DECISION             0
                            DRAFT                0
                            FINAL_REPORT         0
                            LINK       

In [9]:
df_test_external

Unnamed: 0,document_id,consultation_id,consultation_start_date,consultation_end_date,consultation_title,consultation_description,document_source_url,consultation_url,consultation_topics,document_language,document_title,organisation_id,organisation_name,political_body,consultation_reviewed_at,document_source,consultation_topics_label_source,document_content_plain,document_type_label_source,document_type
38521,38522,2317,2021-09-08,2021-12-09,Materielle und formelle Revision des Geb√ºhrenr...,,https://www.ag.ch/media/kanton-aargau/portal/a...,https://www.demokratis.ch/vernehmlassung/xnyvgkgk,"[economics, finance]",de,"Beilage 1 zum Anh√∂rungsbericht (PDF, 84 Seiten...",141,Regierungsrat des Kantons Aargau,ag,NaT,openparldata,openparldata,Synopse Beilage 1 zum Anh√∂rungsbericht \n \nA...,external_test,SYNOPTIC_TABLE
38570,38571,2322,2022-02-20,2022-04-21,Anpassung des Richtplans; Verminderung der Fru...,,https://www.ag.ch/media/kanton-aargau/bvu/raum...,https://www.demokratis.ch/vernehmlassung/84ssk4vc,"[energy, environment, spatial_planning]",de,"Planungsbericht (PDF, 31 Seiten, 2,7 MB)",141,Regierungsrat des Kantons Aargau,ag,NaT,openparldata,openparldata,Axpo Power AG | Hydroenergie & Biomasse \n\nPa...,external_test,VARIOUS_TEXT
38660,38684,2339,2012-01-06,2012-02-06,Gemeinde Gontenschwil; Anpassung des Richtplans,,https://www.ag.ch/media/kanton-aargau/portal/a...,https://www.demokratis.ch/vernehmlassung/fs257a7u,[spatial_planning],de,"Information (PDF, 8 Seiten, 450 KB)",141,Regierungsrat des Kantons Aargau,ag,NaT,openparldata,openparldata,j:\vernehmlassungen\vernehmlassungen\kanton\rp...,external_test,REPORT
38664,38688,2340,2012-01-11,2012-04-13,eHealth; IDAG und des Gesundheitsgesetz; Teilr...,Der Bund wird voraussichtlich im Jahr 2015 ode...,https://www.ag.ch/media/kanton-aargau/portal/a...,https://www.demokratis.ch/vernehmlassung/69q4yzws,"[health, it]",de,"Schreiben der Vorsteherin (PDF, 2 Seiten, 32 KB)",141,Regierungsrat des Kantons Aargau,ag,2025-01-25 14:10:50,openparldata,manual,Departement \nGesundheit und Soziales \nVorste...,external_test,LETTER
38747,38771,2362,2012-12-10,2013-03-08,Gemeinde W√ºrenlos; Anpassung des Richtplans,,https://www.ag.ch/media/kanton-aargau/portal/a...,https://www.demokratis.ch/vernehmlassung/nnuj5gx8,[spatial_planning],de,"Anpassung allgemeine Nutzungsplanung (PDF, 52 ...",141,Regierungsrat des Kantons Aargau,ag,NaT,openparldata,openparldata,Gemeinde W√ºrenlos Kanton Aargau \n___________...,external_test,REPORT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50583,52074,4835,2024-08-16,2024-11-11,√Ñnderung des Finanzausgleichsgesetzes,"Der Regierungsrat hat in Aussicht gestellt, na...",https://sh.ch/CMS/get/file/f6cad1a9-895e-40d7-...,https://www.demokratis.ch/vernehmlassung/vwp33sb5,"[administration, public_finance, security]",de,"Pr√§sentation, FAG STG, 16.8.2024",157,Regierungsrat des Kantons Schaffhausen,sh,2024-10-17 15:54:23,openparldata,manual,Kanton Schaffhausen\nMedienkonferenz\n√Ñnderung...,external_test,VARIOUS_TEXT
50915,52424,4886,2024-09-13,2024-10-15,√Ñnderung der Verordnung zum Einf√ºhrungsgesetz ...,"Der Regierungsrat plant, die monatlichen Kinde...",https://zg.ch/dam/jcr:95397bc7-9729-4e66-9c3c-...,https://www.demokratis.ch/vernehmlassung/nmu3kfpv,"[economics, public_finance, social]",de,3) Antwortformular,165,Regierungsrat des Kantons Zug,zg,NaT,openparldata,openparldata,Amt f√ºr Gesundheit_3 \n \n√Ñnderung der Verord...,external_test,RESPONSE_FORM
51036,52559,4901,2024-09-19,2024-10-31,Revision des Gesetzes √ºber den Fristenlauf (FriG),Der Bund nimmt auf Anfang 2025 eine √Ñnderung b...,https://www.ai.ch/politik/standeskommission/ka...,https://www.demokratis.ch/vernehmlassung/w4rqwc8a,"[administration, communications, law]",de,Begleitschreiben zur Revision des Gesetzes √ºbe...,142,Standeskommission Appenzell Innerrhoden,ai,2024-10-16 10:51:54,openparldata,manual,AI 013.25-27.71.5-1211726 1-2 \nLandammann un...,external_test,LETTER
51410,52971,4943,2024-10-23,2025-01-06,VIII. Nachtrag zum Sozialhilfegesetz (Zuweisun...,Der Kantonsrat hat mit der Annahme der Motion ...,https://www.sg.ch/news/sgch_allgemein/2024/10/...,https://www.demokratis.ch/vernehmlassung/9qb9ntpw,"[housing, migration, public_finance]",de,Vernehmlassungsvorlage,156,Regierung des Kantons St. Gallen,sg,2024-11-06 12:51:05,openparldata,manual,\n \n \nRRB 2024/718 / Beilage \n \n \n 1/11...,external_test,REPORT


In [10]:
# # Test: remove documents known to have label issues
# document_ids_with_issues = pd.read_csv("cleanlab_1_document_ids.csv")
# df_input = df_input[~df_input["document_id"].isin(document_ids_with_issues["document_id"])]

In [11]:
df_input["document_type"].value_counts(dropna=False)

document_type
LETTER            4684
DRAFT             2953
RECIPIENT_LIST    2415
REPORT            2196
FINAL_REPORT      1769
OPINION           1004
SYNOPTIC_TABLE     805
VARIOUS_TEXT       463
SURVEY              19
RESPONSE_FORM        7
DECISION             0
LINK                 0
Name: count, dtype: int64

## Drop empty documents

In [12]:
def drop_empty_texts(df: pd.DataFrame) -> pd.DataFrame:
    empty_index = df["document_content_plain"].str.strip() == ""
    empty_count = len(df[empty_index])
    print(f"Empty texts: {empty_count} ({100 * empty_count / len(df):.1f}%)")
    return df.loc[~empty_index]


print("df_input:", end=" ")
df_input = drop_empty_texts(df_input)
print("df_test_external:", end=" ")
df_test_external = drop_empty_texts(df_test_external)

df_input: Empty texts: 1015 (6.2%)
df_test_external: Empty texts: 7 (5.6%)


## Additional features

In [13]:
FEATURES_DATA_FILE = REPOSITORY_ROOT / "data" / "dataframes" / EXTRA_DOCUMENT_FEATURES_FILE_NAME
data_access.ensure_dataframe_is_available(FEATURES_DATA_FILE)

df_features = pd.read_parquet(FEATURES_DATA_FILE)
df_features

Unnamed: 0_level_0,count_pages,count_pages_containing_tables,average_page_aspect_ratio
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,26.0,0.0,0.704898
2,26.0,0.0,0.704898
3,25.0,0.0,0.704898
4,6.0,0.0,0.707098
5,6.0,0.0,0.707098
...,...,...,...
54314,5.0,0.0,0.707108
54315,5.0,0.0,0.706651
54319,3.0,2.0,1.414210
54320,3.0,2.0,1.414210


In [14]:
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    previous_shape = df.shape
    df = df.join(df_features, on="document_id", how="inner")
    df["fraction_pages_containing_tables"] = df["count_pages_containing_tables"] / df["count_pages"]
    df["contains_synopse_keyword"] = (
        df["document_content_plain"].str.slice(0, 1000).str.contains("synopse", case=False, regex=False)
    )
    df["contains_salutation"] = (
        df["document_content_plain"]
        .str.slice(0, 3000)
        .str.contains(
            r"(?:Sehr\s+geehrte[r]?\s+(?:Frau|Herr|Damen\s+und\s+Herren)|"
            r"Liebe[r]?\s+(?:Frau|Herr|Damen\s+und\s+Herren)|"
            r"Sehr\s+geehrte[r]?\s+(?:"
            r"Bundesr(?:at|√§tin)|"
            r"Regierungsr(?:at|√§tin)|"
            r"Nationalr(?:at|√§tin)|"
            r"Stadtpr[√§a]sid(?:ent|entin)|"
            r"Gemeindepr[√§a]sid(?:ent|entin)|"
            r"Stadtr(?:at|√§tin)|"
            r"Gemeinder(?:at|√§tin)|"
            r"Pr[√§a]sid(?:ent|entin)))",
            case=False,
            regex=True,
        )
    )
    print(
        f"{previous_shape[0] - df.shape[0]} rows were lost due to missing features. Remaining rows: {df.shape[0]}. "
        f"{df.shape[1] - previous_shape[1]} columns were added."
    )
    return df

In [15]:
df_input = add_features(df_input)
df_test_external = add_features(df_test_external)

775 rows were lost due to missing features. Remaining rows: 14525. 6 columns were added.
13 rows were lost due to missing features. Remaining rows: 106. 6 columns were added.


## Splits

### Set aside a test set

In [18]:
splitter = sklearn.model_selection.StratifiedShuffleSplit(
    n_splits=1,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

train_index, test_index = next(splitter.split(X=df_input, y=df_input["document_type"]))

In [19]:
df_input_train = df_input.iloc[train_index]
df_input_test = df_input.iloc[test_index]

print("Train:", df_input_train.shape)
print("Test:", df_input_test.shape)

Train: (13072, 26)
Test: (1453, 26)


In [20]:
mlflow.log_param("train_samples_count", len(df_input_train))
mlflow.log_param("test_samples_count", len(df_input_test))

1453

# Embeddings

## Tokenise and truncate to input window

In [22]:
embedding_model = embeddings.create_embedding_model(EMBEDDING_MODEL)
mlflow.log_param("embedding_model.max_input_tokens", embedding_model.max_input_tokens)

tokens_train = df_input_train["document_content_plain"].progress_map(embedding_model.tokenize)
tokens_test = df_input_test["document_content_plain"].progress_map(embedding_model.tokenize)
tokens_test_external = df_test_external["document_content_plain"].progress_map(embedding_model.tokenize)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13072/13072 [00:46<00:00, 283.63it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1453/1453 [00:04<00:00, 304.58it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 106/106 [00:00<00:00, 322.94it/s]


## Embed

In [23]:
with embeddings.use_cache(
    embedding_model,
    tqdm=tqdm,
    cache_directory=REPOSITORY_ROOT / "data" / "embeddings-cache",
    read_only=False,
) as get_embeddings:
    embeddings_train = get_embeddings(tokens_train.tolist())
    print(embeddings_train.shape)
    embeddings_test = get_embeddings(tokens_test.tolist())
    print(embeddings_test.shape)
    embeddings_test_external = get_embeddings(tokens_test_external.tolist())
    print(embeddings_test_external.shape)

Embedding (cached=13071, new=0): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13072/13072 [00:10<00:00, 1223.66it/s]


(13072, 3072)


Embedding (cached=1452, new=0): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1453/1453 [00:01<00:00, 1413.21it/s]


(1453, 3072)


Embedding (cached=105, new=0): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 106/106 [00:00<00:00, 1501.65it/s]


(106, 3072)


# Classification

In [24]:
extra_feature_columns = [
    "contains_synopse_keyword",  # bool: whether the beginning of the document contains the word "synopse"
    "contains_salutation",  # bool: whether the beginning of the document contains a formal German letter greetings
    "count_pages",  # int: number of pages in the document
    "count_pages_containing_tables",  # int: number of pages containing tables (for documents <= 50 pages)
    "fraction_pages_containing_tables",  # float: count_pages_containing_tables / count_pages
    "average_page_aspect_ratio",  # float: average aspect ratio of pages (width / height) (for documents <= 50 pages)
]
extra_categorical_feature_columns = [
    # "political_body",
    # "document_source",
]

mlflow.log_param("extra_feature_columns", sorted(extra_feature_columns))
mlflow.log_param("extra_categorical_feature_columns", sorted(extra_categorical_feature_columns))


def create_matrices(df: pd.DataFrame, embeddings: np.ndarray) -> tuple[np.ndarray, pd.Series]:
    x = np.hstack(
        (
            embeddings,
            df[extra_feature_columns].fillna(0),
            df[extra_categorical_feature_columns],
        )
    )
    y = df["document_type"]
    assert x.shape[0] == y.shape[0]
    return x, y


X_train, y_train = create_matrices(df_input_train, embeddings_train)
X_test, y_test = create_matrices(df_input_test, embeddings_test)
X_test_external, y_test_external = create_matrices(df_test_external, embeddings_test_external)

print("train", X_train.shape)
print("test", X_test.shape)
print("test_external", X_test_external.shape)

train (13072, 3078)
test (1453, 3078)
test_external (106, 3078)


In [25]:
i_embeddings = 0
i_extra_features = i_embeddings + embeddings_train.shape[1]
i_categorical_features = i_extra_features + len(extra_feature_columns)
assert i_categorical_features + len(extra_categorical_feature_columns) == X_train.shape[1]


def create_classifier() -> Pipeline:
    pca_step = None  # If this is set in the pipeline below, we'll log the number of components to MLflow.
    pipeline = make_pipeline(
        ColumnTransformer(
            [
                (
                    "embeddings",
                    make_pipeline(
                        StandardScaler(),
                        pca_step := PCA(n_components=20, random_state=RANDOM_STATE),
                    ),
                    slice(i_embeddings, i_extra_features),
                ),
                (
                    "extra_features",
                    make_pipeline(
                        StandardScaler(),
                    ),
                    slice(i_extra_features, i_categorical_features),
                ),
                # (
                #     "categorical_features",
                #     make_pipeline(
                #         OneHotEncoder(
                #             sparse_output=False,
                #             categories=[
                #                 # list(schemata.CANTON_CODES | {schemata.FEDERAL_CODE}),
                #                 # ["fedlex", "openparldata"],
                #             ],
                #         ),
                #         StandardScaler(),
                #     ),
                #     slice(i_categorical_features, None),
                # ),
            ]
        ),
        LogisticRegression(max_iter=2000),
        # GradientBoostingClassifier(random_state=RANDOM_STATE),
    )
    if pca_step is not None:
        mlflow.log_param("pca_n_components", pca_step.get_params()["n_components"])
    return pipeline


classifier = create_classifier()
classifier

## Cleanlab on the entire dataset

In [None]:
X_cleanlab = np.vstack((X_train, X_test, X_test_external))
y_cleanlab = pd.concat((y_train, y_test, y_test_external))

pred_probs = sklearn.model_selection.cross_val_predict(
    estimator=classifier, X=X_cleanlab, y=y_cleanlab, cv=5, method="predict_proba"
)

In [28]:
df_cleanlab = pd.concat([df_input_train, df_input_test, df_test_external], ignore_index=True)
assert len(df_cleanlab) == X_cleanlab.shape[0]
lab = cleanlab.Datalab(data=df_cleanlab, label_name="document_type")

In [29]:
lab.find_issues(pred_probs=pred_probs, features=X_cleanlab)

Finding null issues ...
Finding label issues ...
Finding outlier issues ...




Error in outlier: The axis argument to unique is not supported for dtype object
Finding near_duplicate issues ...




Error in near_duplicate: The axis argument to unique is not supported for dtype object
Finding non_iid issues ...




Error in non_iid: The axis argument to unique is not supported for dtype object
Finding class_imbalance issues ...
Finding underperforming_group issues ...




Error in underperforming_group: The axis argument to unique is not supported for dtype object
Failed to check for these issue types: [OutlierIssueManager, NearDuplicateIssueManager, NonIIDIssueManager, UnderperformingGroupIssueManager]

Audit complete. 495 issues found in the dataset.


In [30]:
lab.report()

Dataset Information: num_examples: 14631, num_classes: 10

Here is a summary of various issues found in your data:

     issue_type  num_issues
          label         486
class_imbalance           9

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


----------------------- label issues -----------------------

About this issue:
	Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    

Number of examples with this issue: 486
Overall dataset quality in terms of this issue: 0.9771

Examples representing most severe instances of this issue:
       is_label_issue   label_score   given_label predicted_label
6068             True  1.274843e-12         DRAFT         OPINION
11494   

In [31]:
lab.get_issues("class_imbalance").sort_values("class_imbalance_score").head(9)

Unnamed: 0,is_class_imbalance_issue,class_imbalance_score,given_label
2318,True,0.000615,RESPONSE_FORM
11902,True,0.000615,RESPONSE_FORM
14627,True,0.000615,RESPONSE_FORM
14554,True,0.000615,RESPONSE_FORM
14260,True,0.000615,RESPONSE_FORM
6024,True,0.000615,RESPONSE_FORM
11866,True,0.000615,RESPONSE_FORM
10153,True,0.000615,RESPONSE_FORM
7823,True,0.000615,RESPONSE_FORM


In [32]:
examples_w_issue = lab.get_issues("label").query("is_label_issue").sort_values("label_score")
examples_w_issue

Unnamed: 0,is_label_issue,label_score,given_label,predicted_label
6068,True,1.274843e-12,DRAFT,OPINION
11494,True,1.336734e-12,OPINION,RECIPIENT_LIST
800,True,1.504169e-08,VARIOUS_TEXT,RECIPIENT_LIST
11110,True,1.508540e-06,FINAL_REPORT,OPINION
13094,True,2.280410e-06,OPINION,FINAL_REPORT
...,...,...,...,...
2285,True,4.202818e-01,VARIOUS_TEXT,SURVEY
11930,True,4.346111e-01,SYNOPTIC_TABLE,VARIOUS_TEXT
57,True,4.348292e-01,DRAFT,VARIOUS_TEXT
7889,True,4.398113e-01,VARIOUS_TEXT,SURVEY


In [36]:
df_issues = examples_w_issue.join(df_cleanlab)
df_issues["year"] = df_issues["consultation_start_date"].dt.year
df_issues["edit_link"] = df_issues.apply(
    lambda d: f"https://www.demokratis.ch/de/consultation/{d['consultation_id']}/admin/document/{d['document_id']}/edit",
    axis=1,
)

df_issues_display = df_issues[
    [
        "consultation_id",
        "document_id",
        "political_body",
        "year",
        "consultation_url",
        "document_source_url",
        "document_title",
        "label_score",
        "document_type_label_source",
        "given_label",
        "predicted_label",
        "edit_link",
        # Features:
        "count_pages",
        "count_pages_containing_tables",
        "average_page_aspect_ratio",
        "fraction_pages_containing_tables",
        "contains_synopse_keyword",
        "contains_salutation",
    ]
]

df_issues_display.to_clipboard(index=False)

# df_issues_display.style.format(
#     {
#         "consultation_url": lambda x: f'<a href="{x}">vnl</a>',
#         "document_source_url": lambda x: f'<a href="{x}">doc</a>',
#     }
# ).set_table_styles([{"selector": "th,td", "props": [("text-align", "left")]}])

In [33]:
label_issues_info = lab.get_info("label")
label_issues_info["classes_by_label_quality"]

Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,SURVEY,7,11,20,0.578947,0.714286,0.421053
1,VARIOUS_TEXT,9,144,278,0.322148,0.478485,0.677852
2,SYNOPTIC_TABLE,8,70,29,0.133588,0.060041,0.866412
3,RESPONSE_FORM,6,1,5,0.111111,0.384615,0.888889
4,DRAFT,0,160,97,0.054237,0.033599,0.945763
5,OPINION,3,44,10,0.04467,0.010515,0.95533
6,REPORT,5,71,77,0.032243,0.034873,0.967757
7,FINAL_REPORT,1,26,18,0.014681,0.01021,0.985319
8,RECIPIENT_LIST,4,8,3,0.003583,0.001346,0.996417
9,LETTER,2,12,10,0.003437,0.002866,0.996563


# End MLflow run

In [42]:
mlflow.end_run()

üèÉ View run fun-gull-251 at: https://mlflow.ml1.demokratis.ch/#/experiments/2/runs/cf783fb732924831877f772fe789a1f0
üß™ View experiment at: https://mlflow.ml1.demokratis.ch/#/experiments/2
