In [1]:
import pandas as pd
import numpy as np
import onnxruntime as ort

In [2]:
# Read the labelled investigation data from disk.
df = pd.read_csv("../data/investigation_train_large_checked.csv")

# Feature matrix for the behavioural tests: the "checked" column is removed
# when present, and every missing value is replaced by 0.
X = (
    df
    .drop(columns=["checked"], errors="ignore")
    .fillna(0)
)

print(f"Evaluation data shape: {X.shape}")

Evaluation data shape: (130000, 317)


In [3]:
class ONNXModelAdapter:
    """Thin ``predict()``-style wrapper around an ONNX Runtime inference session.

    Only the first input and the first output declared by the model are used.
    """

    def __init__(self, model_path):
        """Open ``model_path`` with ONNX Runtime using the CPU execution provider.

        Parameters
        ----------
        model_path : str
            Path to the ``.onnx`` file, passed straight to ``ort.InferenceSession``.
        """
        self.session = ort.InferenceSession(
            model_path,
            providers=["CPUExecutionProvider"]
        )
        # Cache the tensor names once; predict() feeds/fetches by these names.
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name

    def predict(self, X):
        """Run the model on ``X`` and return its first output cast to ``int``.

        Parameters
        ----------
        X : pandas.DataFrame, numpy.ndarray or nested sequence
            Numeric feature matrix. It is converted to a C-contiguous
            ``float32`` array before being handed to the session, so callers
            are not required to pass a pandas object.

        Returns
        -------
        numpy.ndarray
            The session's first requested output, converted with ``astype(int)``.
        """
        # np.ascontiguousarray accepts DataFrames, ndarrays and plain lists alike,
        # whereas the previous `X.astype(...).values` only worked for pandas objects.
        features = np.ascontiguousarray(X, dtype=np.float32)
        preds = self.session.run(
            [self.output_name],
            {self.input_name: features}
        )[0]
        return preds.astype(int)

In [4]:
# Wrap both exported ONNX models; model_1.onnx is loaded first, then model_2.onnx.
bad_model, good_model = map(ONNXModelAdapter, ("model_1.onnx", "model_2.onnx"))

In [5]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np

# Load the held-out test file, then split off the "checked" label column.
X_test = pd.read_csv("../data/test.csv")
y_test = X_test.pop("checked")  # removes the column from X_test and returns it

# Remaining columns are the features; missing values are replaced by 0.
X_test = X_test.fillna(0)

print(f"Test set shape: {X_test.shape}")

Test set shape: (26000, 317)


In [6]:
def evaluate_classical_metrics_adapter(model, X_test, y_test, model_name="Model"):
    """Score ``model`` on a labelled test set, print a report, return the metrics.

    Parameters
    ----------
    model
        Any object with a ``predict(X_test)`` method.
    X_test
        Features passed unchanged to ``model.predict``.
    y_test
        Ground-truth labels compared against the predictions.
    model_name : str
        Label used in the printed report header.

    Returns
    -------
    dict
        ``{"accuracy": ..., "f1": ..., "auc": ...}``.
    """
    predicted = model.predict(X_test)

    # NOTE(review): the AUC is computed from the same `predict` output as the
    # other metrics; if that output is hard class labels rather than scores,
    # the value reflects a single operating point only — confirm intent.
    results = {
        "accuracy": accuracy_score(y_test, predicted),
        "f1": f1_score(y_test, predicted),
        "auc": roc_auc_score(y_test, predicted),
    }

    report_lines = (
        f"\n--- Classical ML Evaluation: {model_name} ---",
        f"Accuracy:  {results['accuracy']:.4f}",
        f"F1-score:  {results['f1']:.4f}",
        f"AUC-ROC:   {results['auc']:.4f}",
        "Confusion Matrix:",
    )
    for line in report_lines:
        print(line)
    print(confusion_matrix(y_test, predicted))

    return results

In [7]:
# Score both adapters on the same held-out test split; each call prints its own report.
bad_metrics = evaluate_classical_metrics_adapter(
    model=bad_model,
    X_test=X_test,
    y_test=y_test,
    model_name="Bad model",
)
good_metrics = evaluate_classical_metrics_adapter(
    model=good_model,
    X_test=X_test,
    y_test=y_test,
    model_name="Good model",
)


--- Classical ML Evaluation: Bad model ---
Accuracy:  0.8497
F1-score:  0.0051
AUC-ROC:   0.5009
Confusion Matrix:
[[22082    17]
 [ 3891    10]]

--- Classical ML Evaluation: Good model ---
Accuracy:  0.8500
F1-score:  0.0005
AUC-ROC:   0.5001
Confusion Matrix:
[[22099     0]
 [ 3900     1]]


In [8]:
from group2.partition_tests import (
    partition_test_binary,
    partition_test_numeric,
)

from group2.metamorphic_tests import (
    metamorphic_test,
    flip_gender,
    increase_days_at_address,
    flip_partner_status,
    flip_has_children,
    flip_flexibility_opm,
    increase_appointment_words,
)

In [9]:
print("=== BAD MODEL: PARTITION TESTS ===")

# (partition helper, column) pairs, run in this order against bad_model on X.
for partition_fn, column_name in (
    (partition_test_binary, "persoon_geslacht_vrouw"),
    (partition_test_numeric, "adres_dagen_op_adres"),
    (partition_test_binary, "relatie_partner_huidige_partner___partner__gehuwd_"),
    (partition_test_binary, "relatie_kind_heeft_kinderen"),
):
    partition_fn(bad_model, X, column_name)


print("\n=== BAD MODEL: METAMORPHIC TESTS ===")

# (transformation, report name) pairs, each run with threshold=0.05.
for transform, test_name in (
    (flip_gender, "Gender flip"),
    (lambda frame: increase_days_at_address(frame, 365), "Address duration +1 year"),
    (flip_flexibility_opm, "Flexibility OPM flip"),
):
    metamorphic_test(bad_model, X, transform, name=test_name, threshold=0.05)

# Kept as the cell's trailing bare expression so its return value is displayed.
metamorphic_test(
    bad_model,
    X,
    lambda frame: increase_appointment_words(frame, 50),
    name="Appointment words +50",
    threshold=0.05,
)

=== BAD MODEL: PARTITION TESTS ===
[Partition] persoon_geslacht_vrouw
  Group 0 rate: 0.001
  Group 1 rate: 0.001
  Difference: 0.001
[Partition] adres_dagen_op_adres
  Low (≤ median) rate:  0.002
  High (> median) rate: 0.000
  Difference: 0.002
[Partition] relatie_partner_huidige_partner___partner__gehuwd_
  Group 0 rate: 0.001
  Group 1 rate: 0.001
  Difference: 0.000
[Partition] relatie_kind_heeft_kinderen
  Group 0 rate: 0.001
  Group 1 rate: 0.002
  Difference: 0.001

=== BAD MODEL: METAMORPHIC TESTS ===
[Metamorphic Test] Gender flip
  Changed predictions: 63
  Fraction changed: 0.000
  Threshold: 0.050
  RESULT: PASS
[Metamorphic Test] Address duration +1 year
  Changed predictions: 201
  Fraction changed: 0.002
  Threshold: 0.050
  RESULT: PASS
[Metamorphic Test] Flexibility OPM flip
  Changed predictions: 18
  Fraction changed: 0.000
  Threshold: 0.050
  RESULT: PASS
[Metamorphic Test] Appointment words +50
  Changed predictions: 91
  Fraction changed: 0.001
  Threshold: 0.05

{'name': 'Appointment words +50',
 'changed': np.int64(91),
 'fraction_changed': np.float64(0.0007),
 'passed': np.True_}

In [10]:
print("\n=== GOOD MODEL: PARTITION TESTS ===")

# (partition helper, column) pairs, run in this order against good_model on X.
for partition_fn, column_name in (
    (partition_test_numeric, "adres_dagen_op_adres"),
    (partition_test_binary, "relatie_partner_huidige_partner___partner__gehuwd_"),
    (partition_test_binary, "relatie_kind_heeft_kinderen"),
):
    partition_fn(good_model, X, column_name)


print("\n=== GOOD MODEL: METAMORPHIC TESTS ===")

# (transformation, report name) pairs, each run with threshold=0.05.
for transform, test_name in (
    (lambda frame: increase_days_at_address(frame, 365), "Address duration +1 year"),
    (flip_partner_status, "Partner flip"),
):
    metamorphic_test(good_model, X, transform, name=test_name, threshold=0.05)

# Kept as the cell's trailing bare expression so its return value is displayed.
metamorphic_test(
    good_model,
    X,
    flip_has_children,
    name="Children flip",
    threshold=0.05,
)


=== GOOD MODEL: PARTITION TESTS ===
[Partition] adres_dagen_op_adres
  Low (≤ median) rate:  0.000
  High (> median) rate: 0.000
  Difference: 0.000
[Partition] relatie_partner_huidige_partner___partner__gehuwd_
  Group 0 rate: 0.000
  Group 1 rate: 0.000
  Difference: 0.000
[Partition] relatie_kind_heeft_kinderen
  Group 0 rate: 0.000
  Group 1 rate: 0.000
  Difference: 0.000

=== GOOD MODEL: METAMORPHIC TESTS ===
[Metamorphic Test] Address duration +1 year
  Changed predictions: 14
  Fraction changed: 0.000
  Threshold: 0.050
  RESULT: PASS
[Metamorphic Test] Partner flip
  Changed predictions: 1
  Fraction changed: 0.000
  Threshold: 0.050
  RESULT: PASS
[Metamorphic Test] Children flip
  Changed predictions: 23
  Fraction changed: 0.000
  Threshold: 0.050
  RESULT: PASS


{'name': 'Children flip',
 'changed': np.int64(23),
 'fraction_changed': np.float64(0.00017692307692307693),
 'passed': np.True_}