# Setup

In [1]:
import pathlib

# --- Input data ---
DATA_FILE_NAME = "consultation-documents-preprocessed-2024-11-26.parquet"

# The repository root is two directory levels above the current working directory.
REPOSITORY_ROOT = pathlib.Path.cwd().joinpath("..", "..").resolve()
PREPROCESSED_DATA_FILE = REPOSITORY_ROOT.joinpath("data", "dataframes", DATA_FILE_NAME)

# --- Row filters (comment entries in or out to change the selection) ---
DOCUMENT_SOURCES = {
    "fedlex",
    # "openparldata",
}
LANGUAGES = {
    "de",
    # "fr",
    # "it",
}

# --- Embedding model ---
# EMBEDDING_MODEL = "openai/text-embedding-ada-002"
EMBEDDING_MODEL = "openai/text-embedding-3-large"

# --- Earliest year passed to the data loader ---
# FROM_YEAR = 2010
FROM_YEAR = 2000

# --- Document types kept in the input data ---
DOC_TYPES = {
    "LETTER",
    "DRAFT",
    "RECIPIENT_LIST",
    "REPORT",
    "FINAL_REPORT",
    "OPINION",
    "VARIOUS_TEXT",
    "SYNOPTIC_TABLE",
    "SURVEY",
    "RESPONSE_FORM",
    # None,  # Many openparldata documents don't have a known type
}

# --- Evaluation protocol ---
CV_FOLDS = 5  # number of cross-validation folds
TEST_SIZE = 0.1  # fraction of documents held out as the test set
RANDOM_STATE = 271  # seed shared by the hold-out split and the CV splitter
USE_TEST_SET = False  # enable only for a final evaluation on the held-out set

# Point MLflow at a local SQLite tracking database (mlruns.db, relative to the working directory).
%env MLFLOW_TRACKING_URI=sqlite:///mlruns.db

env: MLFLOW_TRACKING_URI=sqlite:///mlruns.db


In [2]:
# Automatically reload edited modules before each cell is executed.
%load_ext autoreload
%autoreload 2

import logging
import sys

# Make the repository root importable.
# NOTE(review): assumes the `research` package imported below lives at the repository root.
sys.path.append(str(REPOSITORY_ROOT))


import dotenv
import mlflow
import numpy as np
import sklearn.metrics
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [3]:
from research.lib import data_access, embeddings

In [4]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this provides the credentials needed by the embedding provider; confirm.
dotenv.load_dotenv()

True

In [5]:
# One experiment per language selection; the languages are sorted so the name is stable.
mlflow.set_experiment(f"V1. {'&'.join(sorted(LANGUAGES))}. document types")

# End any run left open by an earlier execution of this cell before starting a fresh one.
if run := mlflow.active_run():
    logging.warning("Run = %s is already active, closing it.", run.info.run_name)
    mlflow.end_run()
run = mlflow.start_run()
print("Starting run:", run.info.run_name)

# Record the configuration of this run. Sets are sorted before logging so that the stored
# value does not depend on set iteration order.
mlflow.log_param("input_file", DATA_FILE_NAME)
mlflow.log_param("document_sources", sorted(DOCUMENT_SOURCES))
mlflow.log_param("languages", sorted(LANGUAGES))
mlflow.log_param("from_year", FROM_YEAR)
mlflow.log_param("doc_types", sorted(map(str, DOC_TYPES)))
mlflow.log_param("embedding_model", EMBEDDING_MODEL)
mlflow.log_param("cv_folds", CV_FOLDS)
mlflow.log_param("test_size", TEST_SIZE)
# The semicolon has to sit on the same line as the last expression to suppress its echoed
# return value; on a line of its own it is evaluated separately and the cell prints ''.
mlflow.log_param("random_state", RANDOM_STATE);

Starting run: suave-shrew-497


''

In [6]:
# Let MLflow log scikit-learn estimator fits automatically.
mlflow.sklearn.autolog()
# Register the tqdm-backed `progress_*` methods (e.g. `progress_map`) on pandas objects.
tqdm.pandas()

# Input data preparation

In [7]:
# Load the preprocessed documents, restricted to the document sources, languages,
# document types and starting year configured in the setup cell.
# NOTE(review): the `mlflow` module is handed to the loader, presumably so that it can
# log the input dataset to the active run; confirm in `data_access`.
df_input = data_access.load_consultation_documents(
    PREPROCESSED_DATA_FILE,
    only_document_sources=DOCUMENT_SOURCES,
    only_languages=LANGUAGES,
    only_doc_types=DOC_TYPES,
    starting_year=FROM_YEAR,
    mlflow=mlflow,
)
df_input

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


Unnamed: 0,document_id,consultation_id,consultation_start_date,consultation_end_date,consultation_title,consultation_description,document_source_url,consultation_url,consultation_topics,document_type,document_language,document_title,organisation_id,organisation_name,political_body,consultation_reviewed_at,document_source,consultation_topics_label_source,document_content_plain
0,1,1,2021-10-01,2022-01-17,Verordnung des BAZG über die Bekämpfung von G...,Mit der am 19. März 2021 durch das Parlament b...,https://fedlex.data.admin.ch/filestore/fedlex....,https://www.demokratis.ch/vernehmlassung/7bt3hz2w,"[administration, finance, security]",DRAFT,de,Vernehmlassungsvorlage,41,Bundesamt für Zoll und Grenzsicherheit,ch,2024-11-04 15:11:24,fedlex,manual,«$$e-seal» \n«$$QrCode» \n \n2021-… \n«%ASFF_Y...
3,4,1,2021-10-01,2022-01-17,Verordnung des BAZG über die Bekämpfung von G...,Mit der am 19. März 2021 durch das Parlament b...,https://fedlex.data.admin.ch/filestore/fedlex....,https://www.demokratis.ch/vernehmlassung/7bt3hz2w,"[administration, finance, security]",RECIPIENT_LIST,de,Adressatenliste,41,Bundesamt für Zoll und Grenzsicherheit,ch,2024-11-04 15:11:24,fedlex,manual,Eidgenössisches Finanzdepartement EFD \nEidgen...
6,7,1,2021-10-01,2022-01-17,Verordnung des BAZG über die Bekämpfung von G...,Mit der am 19. März 2021 durch das Parlament b...,https://fedlex.data.admin.ch/filestore/fedlex....,https://www.demokratis.ch/vernehmlassung/7bt3hz2w,"[administration, finance, security]",LETTER,de,Begleitschreiben 2,41,Bundesamt für Zoll und Grenzsicherheit,ch,2024-11-04 15:11:24,fedlex,manual,Eidgenössisches Finanzdepartement EFD \nEidgen...
9,10,1,2021-10-01,2022-01-17,Verordnung des BAZG über die Bekämpfung von G...,Mit der am 19. März 2021 durch das Parlament b...,https://fedlex.data.admin.ch/filestore/fedlex....,https://www.demokratis.ch/vernehmlassung/7bt3hz2w,"[administration, finance, security]",REPORT,de,Erläuternder Bericht,41,Bundesamt für Zoll und Grenzsicherheit,ch,2024-11-04 15:11:24,fedlex,manual,1. Oktober 2021 \n \n \n \n \nVerordnung des B...
12,13,1,2021-10-01,2022-01-17,Verordnung des BAZG über die Bekämpfung von G...,Mit der am 19. März 2021 durch das Parlament b...,https://fedlex.data.admin.ch/filestore/fedlex....,https://www.demokratis.ch/vernehmlassung/7bt3hz2w,"[administration, finance, security]",LETTER,de,Begleitschreiben 1,41,Bundesamt für Zoll und Grenzsicherheit,ch,2024-11-04 15:11:24,fedlex,manual,Eidgenössisches Finanzdepartement EFD \nEidgen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52012,53138,4956,2024-11-21,2025-03-14,Parlamentarische Initiative. Armut ist kein Ve...,"Die Kommission schlägt vor, das Ausländer- und...",https://fedlex.data.admin.ch/filestore/fedlex....,https://www.demokratis.ch/vernehmlassung/8vzwky3r,"[law, migration, social]",LETTER,de,Begleitschreiben-2,86,Parlamentsdienste,ch,2024-11-21 13:37:26,fedlex,manual,Na t i o na l r at \nCo n s e il na t io n a...
52015,53141,4956,2024-11-21,2025-03-14,Parlamentarische Initiative. Armut ist kein Ve...,"Die Kommission schlägt vor, das Ausländer- und...",https://fedlex.data.admin.ch/filestore/fedlex....,https://www.demokratis.ch/vernehmlassung/8vzwky3r,"[law, migration, social]",LETTER,de,Begleitschreiben,86,Parlamentsdienste,ch,2024-11-21 13:37:26,fedlex,manual,Na t i o na l r at \nCo n s e il na t io n a...
52018,53144,4956,2024-11-21,2025-03-14,Parlamentarische Initiative. Armut ist kein Ve...,"Die Kommission schlägt vor, das Ausländer- und...",https://fedlex.data.admin.ch/filestore/fedlex....,https://www.demokratis.ch/vernehmlassung/8vzwky3r,"[law, migration, social]",REPORT,de,Erläuternder Bericht,86,Parlamentsdienste,ch,2024-11-21 13:37:26,fedlex,manual,[Signature] \n[QR Code] \n2019–...... \n1 \n20...
52021,53147,4956,2024-11-21,2025-03-14,Parlamentarische Initiative. Armut ist kein Ve...,"Die Kommission schlägt vor, das Ausländer- und...",https://fedlex.data.admin.ch/filestore/fedlex....,https://www.demokratis.ch/vernehmlassung/8vzwky3r,"[law, migration, social]",SYNOPTIC_TABLE,de,Synoptische Tabelle,86,Parlamentsdienste,ch,2024-11-21 13:37:26,fedlex,manual,20.451 n Pa. Iv. Marti Samira. Armut ist kein ...


## Define the target

In [8]:
# Binary target: 1 for documents of type DRAFT, 0 for every other document type.
df_input["is_draft"] = df_input["document_type"].eq("DRAFT").astype(int)
# Display the class balance.
df_input["is_draft"].value_counts()

is_draft
0    10165
1     2879
Name: count, dtype: int64

## Drop empty documents

In [9]:
# Boolean mask marking documents whose plain-text content is the empty string.
empty_index = df_input["document_content_plain"].eq("")
# Every True entry in the mask is one empty document.
empty_count = int(empty_index.sum())
empty_percentage = 100 * empty_count / len(df_input)
print(f"Empty texts: {empty_count} ({empty_percentage:.1f}%)")

Empty texts: 705 (5.4%)


In [10]:
# Keep only the documents with non-empty text; `empty_index` is the mask computed in the previous cell.
df_input = df_input.loc[~empty_index]

## Splits

### Set aside a test set

In [11]:
# A single stratified shuffle split sets aside a TEST_SIZE fraction of the documents as the
# test set; stratifying on `is_draft` keeps the class ratio, the fixed seed keeps it reproducible.
splitter = sklearn.model_selection.StratifiedShuffleSplit(
    n_splits=1,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# split() yields positional (not label-based) row indices; take the only split.
train_index, test_index = next(splitter.split(df_input, df_input["is_draft"]))

In [12]:
# The splitter produced positional indices, hence the positional row selection.
df_input_train, df_input_test = (df_input.iloc[rows] for rows in (train_index, test_index))

for split_name, split_frame in (("Train:", df_input_train), ("Test:", df_input_test)):
    print(split_name, split_frame.shape)

Train: (11105, 20)
Test: (1234, 20)


In [13]:
# Record the sizes of both splits with the run.
mlflow.log_param("train_samples_count", len(df_input_train))
mlflow.log_param("test_samples_count", len(df_input_test))

1234

### Prepare a cross-validation splitter

In [14]:
# Stratified, shuffled k-fold splitter used for cross-validation on the training set;
# the fixed seed makes the fold assignment reproducible.
cv_splitter = sklearn.model_selection.StratifiedKFold(
    n_splits=CV_FOLDS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Embeddings

## Tokenise and truncate to input window

In [15]:
# Tokenise every document's plain text with the embedding model's tokenizer.
# NOTE(review): the section heading mentions truncation to the input window; presumably
# `tokenize` takes care of that; confirm in `embeddings`.
if EMBEDDING_MODEL is not None:
    embedding_model = embeddings.create_embedding_model(EMBEDDING_MODEL)
    mlflow.log_param("embedding_model.max_input_tokens", embedding_model.max_input_tokens)

    # `progress_map` is the tqdm-backed variant of `Series.map` registered by `tqdm.pandas()`.
    tokens_train = df_input_train["document_content_plain"].progress_map(embedding_model.tokenize)
    tokens_test = df_input_test["document_content_plain"].progress_map(embedding_model.tokenize)

100%|██████████| 11105/11105 [00:38<00:00, 289.12it/s]
100%|██████████| 1234/1234 [00:04<00:00, 290.39it/s]


## Embed

In [16]:
# Embed the tokenised documents of both splits and print the resulting array shapes.
# NOTE(review): `use_cache` is pointed at data/embeddings-cache, presumably so that embeddings
# are persisted and re-runs do not recompute them; confirm in `embeddings`.
if EMBEDDING_MODEL is not None:
    with embeddings.use_cache(
        embedding_model,
        tqdm=tqdm,
        cache_directory=REPOSITORY_ROOT / "data" / "embeddings-cache",
    ) as get_embeddings:
        embeddings_train = get_embeddings(tokens_train.tolist())
        print(embeddings_train.shape)
        embeddings_test = get_embeddings(tokens_test.tolist())
        print(embeddings_test.shape)

Embedding (cached=11104, new=0): 100%|██████████| 11105/11105 [00:08<00:00, 1283.25it/s]


(11105, 3072)


Embedding (cached=1233, new=0): 100%|██████████| 1234/1234 [00:00<00:00, 1364.27it/s]


(1234, 3072)


# Classification

In [17]:
# Features are the document embeddings; labels are the binary `is_draft` target.
X_train, y_train = embeddings_train, df_input_train["is_draft"]
X_test, y_test = embeddings_test, df_input_test["is_draft"]

# Sanity check: features and labels of a split must have the same number of rows.
for split_name, features, labels in (("Train", X_train, y_train), ("Test", X_test, y_test)):
    print(split_name, features.shape, labels.shape)

Train (11105, 3072) (11105,)
Test (1234, 3072) (1234,)


In [18]:
# Classification pipeline: standardise every embedding dimension, then fit a logistic regression.
# The commented-out steps are alternatives that can be swapped in.
classifier = make_pipeline(
    StandardScaler(),
    # PCA(n_components=200, random_state=RANDOM_STATE),
    LogisticRegression(max_iter=1000),
    # SGDClassifier(loss="modified_huber", max_iter=1000),
    # GradientBoostingClassifier(random_state=RANDOM_STATE),
    # SVC(kernel="linear"),
)
# Display the assembled pipeline.
classifier

In [19]:
# Only applies when a PCA step is enabled in the pipeline: make_pipeline names each step
# after its lowercased class name, so a PCA step is found under "pca".
if "pca" in classifier.named_steps:
    mlflow.log_param("pca_n_components", classifier.named_steps["pca"].get_params()["n_components"])

In [20]:
def explained_variance_scorer(pipeline, X, y=None):
    """Report the share of variance retained by the pipeline's PCA step.

    The ``(estimator, X, y)`` signature lets the function be used as a scoring
    callable; ``X`` and ``y`` are accepted but not used.

    Returns:
        The sum of ``explained_variance_ratio_`` of the step named ``"pca"``,
        or NaN when the pipeline has no such step.
    """
    steps = pipeline.named_steps
    if "pca" not in steps:
        # Nothing to report for pipelines without dimensionality reduction.
        return np.nan
    return np.sum(steps["pca"].explained_variance_ratio_)


# Scorers evaluated on the validation part of every fold. The "*_weighted" scorers average
# the per-class scores weighted by class support; the last entry is the custom callable
# defined above.
scoring = {
    "precision": "precision_weighted",
    "recall": "recall_weighted",
    "f1": "f1_weighted",
    "pca_explained_variance": explained_variance_scorer,
}

# Cross-validate on the training set only; the held-out test set is not touched here.
scores_docs = sklearn.model_selection.cross_validate(
    classifier,
    X=X_train,
    y=y_train,
    cv=cv_splitter,
    scoring=scoring,
)
scores_docs

{'fit_time': array([1.71075201, 1.47406387, 1.56653714, 1.44320893, 1.32072306]),
 'score_time': array([0.02783108, 0.01932883, 0.01872802, 0.01908779, 0.01918602]),
 'test_precision': array([0.97079779, 0.97755678, 0.96420686, 0.97232436, 0.96656434]),
 'test_recall': array([0.9707339 , 0.97748762, 0.96398019, 0.97208465, 0.96668167]),
 'test_f1': array([0.97076343, 0.97751786, 0.96407667, 0.97217783, 0.96661331]),
 'test_pca_explained_variance': array([nan, nan, nan, nan, nan])}

## Evaluation of the cross-validation

In [21]:
# Average every per-fold array; only the validation scores ("test_*" keys), not the
# timings, are logged to MLflow.
avg_scores_docs = {name: np.round(np.mean(values), 4) for name, values in scores_docs.items()}
mlflow.log_metrics(
    {f"{name}_docs": value for name, value in avg_scores_docs.items() if name.startswith("test_")}
)

print("Per-document CV scores:")
# Mean across folds, followed by the standard deviation across folds.
for label, key in (("Precision:", "test_precision"), ("Recall:", "test_recall"), ("F1:", "test_f1")):
    print(f"{label:<10} {avg_scores_docs[key]:.4f} (+/- {np.std(scores_docs[key]):.4f})")

Per-document CV scores:
Precision: 0.9703 (+/- 0.0046)
Recall:    0.9702 (+/- 0.0046)
F1:        0.9702 (+/- 0.0047)


***

## Evaluation on the test set (USE SPARINGLY)

In [22]:
# Guarded by USE_TEST_SET so that the held-out set is only consulted deliberately.
if USE_TEST_SET:
    # Refit on the complete training set, then predict the held-out documents.
    classifier.fit(X_train, y_train)
    test_ground_truth_docs = y_test
    test_predictions_docs = classifier.predict(X_test)

    print("docs: ground truth", test_ground_truth_docs.shape)
    print("docs: predictions", test_predictions_docs.shape)

    # Per-class precision/recall/F1 plus averages; the "weighted avg" row corresponds to
    # the weighted scorers used for cross-validation.
    print("Per-document test scores:")
    print(
        sklearn.metrics.classification_report(
            test_ground_truth_docs,
            test_predictions_docs,
            digits=4,
        )
    )

***

# End MLflow run

In [23]:
# Close the run started in the setup section.
mlflow.end_run()