In [9]:
import pandas as pd
import numpy as np
import onnxruntime as ort

In [10]:
df = pd.read_csv("../data/train.csv")
X = df.drop(columns=["checked"], errors="ignore").fillna(0)
y = df["checked"]

print("Evaluation data shape:", X.shape)

Evaluation data shape: (104000, 317)


In [11]:
class ONNXModelAdapter:
    def __init__(self, model_path):
        self.session = ort.InferenceSession(
            model_path,
            providers=["CPUExecutionProvider"]
        )
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name

    def predict(self, X):
        preds = self.session.run(
            [self.output_name],
            {self.input_name: X.astype(np.float32).values}
        )[0]
        return preds.astype(int)

In [None]:
good_model = ONNXModelAdapter("good-model.onnx")
bad_model = ONNXModelAdapter("bad-model.onnx")

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

X_test = pd.read_csv("../data/test.csv")
y_test = X_test["checked"]

X_test = X_test.drop(columns=["checked"], errors="ignore").fillna(0)

print("Test set shape:", X_test.shape)

Test set shape: (26000, 317)


In [14]:
def evaluate_classical_metrics_adapter(model, X_test, y_test, model_name="Model"):
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)

    print(f"\n--- Classical ML Evaluation: {model_name} ---")
    print(f"Accuracy:  {acc:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"AUC-ROC:   {auc:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return {
        "accuracy": acc,
        "f1": f1,
        "auc": auc,
    }

In [None]:
metrics_1 = evaluate_classical_metrics_adapter(good_model, X_test, y_test, model_name="Good Model")
metrics_2 = evaluate_classical_metrics_adapter(bad_model, X_test, y_test, model_name="Bad Model")


--- Classical ML Evaluation: Model 1 ---
Accuracy:  0.8545
F1-score:  0.1817
AUC-ROC:   0.5470
Confusion Matrix:
[[21796   303]
 [ 3481   420]]

--- Classical ML Evaluation: Model 2 ---
Accuracy:  0.8547
F1-score:  0.1518
AUC-ROC:   0.5385
Confusion Matrix:
[[21884   215]
 [ 3563   338]]


In [16]:
from partition_tests import (
    partition_test_binary,
    partition_test_numeric,
)
from metamorphic_tests import (
    metamorphic_test,
    neutralize_opm_judgements,
    normalize_documentation_intensity,
)

In [None]:
from train_models import toxic_features, safe_core

print("=== GOOD MODEL 1: PARTITION TESTS ===")

# binary columns EP tests
for feature in toxic_features:
    if X[feature].dropna().nunique() <= 2:
        partition_test_binary(good_model, X, feature)

# numerical columns EP tests


print("\n=== GOOD MODEL: METAMORPHIC TESTS ===")

m1 = metamorphic_test(good_model, X, neutralize_opm_judgements, name="Neutralize OPM judgments", threshold=0.05)

metamorphic_test(good_model, X, normalize_documentation_intensity, name="Normalize documentation verbosity", threshold=0.05)

Saved model_1.onnx with full 317-column interface.
Saved model_2.onnx with full 317-column interface.
=== MODEL 1: PARTITION TESTS ===
[Partition] persoonlijke_eigenschappen_flexibiliteit_opm
  Group 0 rate: 0.021
  Group 1 rate: 0.033
  Absolute diff: 0.013
  Risk ratio: 1.61
  RESULT: FAIL
[Partition] persoonlijke_eigenschappen_doorzettingsvermogen_opm
  Group 0 rate: 0.022
  Group 1 rate: 0.033
  Absolute diff: 0.012
  Risk ratio: 1.55
  RESULT: FAIL
[Partition] persoonlijke_eigenschappen_motivatie_opm
  Group 0 rate: 0.018
  Group 1 rate: 0.034
  Absolute diff: 0.016
  Risk ratio: 1.88
  RESULT: FAIL
[Partition] persoonlijke_eigenschappen_houding_opm
  Group 0 rate: 0.018
  Group 1 rate: 0.036
  Absolute diff: 0.018
  Risk ratio: 2.05
  RESULT: FAIL
[Partition] persoonlijke_eigenschappen_uiterlijke_verzorging_opm
  Group 0 rate: 0.021
  Group 1 rate: 0.034
  Absolute diff: 0.013
  Risk ratio: 1.60
  RESULT: FAIL
[Partition] afspraak_aantal_woorden
  Low (≤ median) rate:  0.036
  Hi

{'name': 'Normalize documentation verbosity',
 'changed': np.int64(1026),
 'fraction_changed': np.float64(0.009865384615384616),
 'passed': np.True_}

In [18]:
print("\n=== MODEL 2: PARTITION TESTS ===")

for feature in toxic_features:
    if X[feature].dropna().nunique() <= 2:
        partition_test_binary(model_2, X, feature)
    else:
        partition_test_numeric(model_2, X, feature)


print("\n=== MODEL 2: METAMORPHIC TESTS ===")

m2 = metamorphic_test(model_2, X, neutralize_opm_judgements, name="Neutralize OPM judgments", threshold=0.05)

metamorphic_test(model_2, X, normalize_documentation_intensity, name="Normalize documentation verbosity", threshold=0.05)


=== MODEL 2: PARTITION TESTS ===
[Partition] persoonlijke_eigenschappen_flexibiliteit_opm
  Group 0 rate: 0.014
  Group 1 rate: 0.026
  Absolute diff: 0.012
  Risk ratio: 1.81
  RESULT: FAIL
[Partition] persoonlijke_eigenschappen_doorzettingsvermogen_opm
  Group 0 rate: 0.016
  Group 1 rate: 0.025
  Absolute diff: 0.009
  Risk ratio: 1.59
  RESULT: FAIL
[Partition] persoonlijke_eigenschappen_motivatie_opm
  Group 0 rate: 0.012
  Group 1 rate: 0.026
  Absolute diff: 0.015
  Risk ratio: 2.24
  RESULT: FAIL
[Partition] persoonlijke_eigenschappen_houding_opm
  Group 0 rate: 0.012
  Group 1 rate: 0.028
  Absolute diff: 0.016
  Risk ratio: 2.36
  RESULT: FAIL
[Partition] persoonlijke_eigenschappen_uiterlijke_verzorging_opm
  Group 0 rate: 0.014
  Group 1 rate: 0.028
  Absolute diff: 0.014
  Risk ratio: 2.02
  RESULT: FAIL
[Partition] afspraak_aantal_woorden
  Low (≤ median) rate:  0.022
  High (> median) rate: 0.019
  Absolute diff: 0.003
  Risk ratio: 1.15
  RESULT: PASS

=== MODEL 2: META

{'name': 'Normalize documentation verbosity',
 'changed': np.int64(0),
 'fraction_changed': np.float64(0.0),
 'passed': np.True_}

In [19]:
print("=== METAMORPHIC COMPARISON ===")
print(f"\nRelative sensitivity: {m1['fraction_changed'] / max(m2['fraction_changed'], 1e-6):.1f}x")

=== METAMORPHIC COMPARISON ===

Relative sensitivity: 3615.4x


In [20]:
from group2.permutation_importance import permutation_importance_accuracy

print("=== MODEL 1: PERMUTATION IMPORTANCE ===")
imp_toxic_m1 = permutation_importance_accuracy(model_1, X_test, y_test, toxic_features)
imp_safe_m1  = permutation_importance_accuracy(model_1, X_test, y_test, safe_core)

print("=== MODEL 2: PERMUTATION IMPORTANCE ===")
imp_toxic_m2 = permutation_importance_accuracy(model_2, X_test, y_test, toxic_features)
imp_safe_m2  = permutation_importance_accuracy(model_2, X_test, y_test, safe_core)

print("Model 1 total toxic importance:", sum(imp_toxic_m1.values()))
print("Model 1 total safe importance :", sum(imp_safe_m1.values()))
print("Model 2 total toxic importance:", sum(imp_toxic_m2.values()))
print("Model 2 total safe importance :", sum(imp_safe_m2.values()))

ModuleNotFoundError: No module named 'group2'