In [1]:
import pandas as pd
import numpy as np
import onnxruntime as ort

In [2]:
# Read the training CSV once; the "checked" column is the target and every
# other column is used as a feature.
df = pd.read_csv("../data/train.csv")

# Target vector, taken before the column is removed from the features.
y = df["checked"]

# Feature matrix: drop the target column and replace missing values with 0.
X = df.drop(columns="checked", errors="ignore").fillna(0)

print("Evaluation data shape:", X.shape)

Evaluation data shape: (104000, 317)


In [3]:
class ONNXModelAdapter:
    """Minimal ``predict``-style wrapper around an ONNX Runtime session.

    The session is created with the CPU execution provider only, and the
    adapter talks exclusively to the model's first declared input and first
    declared output.
    """

    def __init__(self, model_path):
        """Open the model and remember the names of its first input/output.

        Args:
            model_path: Forwarded unchanged to ``ort.InferenceSession``.
        """
        self.session = ort.InferenceSession(
            model_path,
            providers=["CPUExecutionProvider"]
        )
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name

    def predict(self, X):
        """Run the model on ``X`` and return its first output cast to ``int``.

        Args:
            X: Feature matrix. May be a pandas DataFrame, a NumPy array, or
                any nested sequence that NumPy can convert to ``float32``.
                A 1-D input is treated as a single row.

        Returns:
            numpy.ndarray: the first model output, converted with
            ``astype(int)``.
        """
        # np.asarray handles DataFrames, ndarrays and plain lists alike, so
        # callers are not forced to wrap their data in a DataFrame first.
        features = np.asarray(X, dtype=np.float32)
        if features.ndim == 1:
            # A single sample: present it as a one-row batch.
            features = features.reshape(1, -1)
        # Hand the runtime a C-contiguous buffer regardless of the layout the
        # caller's container happened to use.
        features = np.ascontiguousarray(features)

        preds = self.session.run(
            [self.output_name],
            {self.input_name: features}
        )[0]
        return np.asarray(preds).astype(int)

In [4]:
# Wrap the two ONNX files (resolved relative to the working directory) so
# both models expose the same predict() interface to the tests below.
good_model = ONNXModelAdapter(model_path="model_1.onnx")
bad_model = ONNXModelAdapter(model_path="model_2.onnx")

In [5]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

# Keep the raw test frame under its own name so that X_test only ever refers
# to the feature matrix (it previously held the label column for a moment).
test_df = pd.read_csv("../data/test.csv")

# Target vector for the held-out split.
y_test = test_df["checked"]

# Same preprocessing as the training features: drop the target column and
# replace missing values with 0.
X_test = test_df.drop(columns=["checked"], errors="ignore").fillna(0)

print("Test set shape:", X_test.shape)

Test set shape: (26000, 317)


In [6]:
def evaluate_classical_metrics_adapter(model, X_test, y_test, model_name="Model"):
    """Print and return accuracy, F1 and ROC-AUC for a binary classifier.

    Args:
        model: Object with a ``predict(X)`` method returning class labels.
            If it also exposes a callable ``predict_proba(X)``, those scores
            are used for the ROC-AUC instead of the hard labels.
        X_test: Features passed to the model.
        y_test: True binary labels.
        model_name: Label used in the printed report header.

    Returns:
        dict: ``{"accuracy": ..., "f1": ..., "auc": ...}``.
    """
    y_pred = model.predict(X_test)

    # ROC-AUC is a ranking metric: fed hard 0/1 labels it has a single
    # operating point and reduces to balanced accuracy. Use continuous scores
    # when the model offers them; otherwise keep the label-based value so
    # models that only implement predict() behave exactly as before.
    proba_fn = getattr(model, "predict_proba", None)
    if callable(proba_fn):
        y_score = np.asarray(proba_fn(X_test))
        if y_score.ndim == 2:
            # Last column: the positive class for a two-column output, or the
            # only column for a single-column output.
            y_score = y_score[:, -1]
    else:
        y_score = y_pred

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_score)

    print(f"\n--- Classical ML Evaluation: {model_name} ---")
    print(f"Accuracy:  {acc:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"AUC-ROC:   {auc:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return {
        "accuracy": acc,
        "f1": f1,
        "auc": auc,
    }

In [7]:
# Score both models on the same held-out split; each call prints a report
# and returns a dict with the accuracy, F1 and AUC values.
metrics_1 = evaluate_classical_metrics_adapter(
    model=good_model,
    X_test=X_test,
    y_test=y_test,
    model_name="Good Model",
)
metrics_2 = evaluate_classical_metrics_adapter(
    model=bad_model,
    X_test=X_test,
    y_test=y_test,
    model_name="Bad Model",
)


--- Classical ML Evaluation: Good Model ---
Accuracy:  0.8686
F1-score:  0.3039
AUC-ROC:   0.5897
Confusion Matrix:
[[21837   262]
 [ 3155   746]]

--- Classical ML Evaluation: Bad Model ---
Accuracy:  0.8880
F1-score:  0.5176
AUC-ROC:   0.6872
Confusion Matrix:
[[21526   573]
 [ 2339  1562]]


In [8]:
from partition_tests import (
    partition_test_binary,
    partition_test_age
)
from metamorphic_tests import (
    metamorphic_test,
    neutralize_opm_judgements,
    normalize_documentation_intensity,
    increase_age
)

In [9]:
from train_models import toxic_features, safe_core

# NOTE(review): this cell's output starts with "Saved model_1.onnx ..." /
# "Saved model_2.onnx ...", which suggests the `train_models` import above
# re-saves the ONNX files as a side effect. good_model / bad_model were
# loaded in an earlier cell, so they may not reflect the files written here
# -- confirm the intended execution order.
print("=== GOOD MODEL 1: PARTITION TESTS ===")

# Equivalence-partition tests for every toxic feature that has at most two
# distinct values in X; the tests themselves run on the held-out split.
for toxic_col in toxic_features:
    if X[toxic_col].dropna().nunique() > 2:
        continue
    partition_test_binary(good_model, X_test, y_test, toxic_col)

# Equivalence-partition test over age.
partition_test_age(good_model, X_test, y_test)


print("\n=== GOOD MODEL: METAMORPHIC TESTS ===")

# Each metamorphic test applies a transformation to X and is given a
# threshold; only the first result is kept (as m1).
m1 = metamorphic_test(
    good_model,
    X,
    neutralize_opm_judgements,
    name="Neutralize OPM judgments",
    threshold=0.05,
)

metamorphic_test(
    good_model,
    X,
    normalize_documentation_intensity,
    name="Normalize documentation verbosity",
    threshold=0.05,
)

# Deliberately left as the final bare expression so the notebook displays
# the returned result.
metamorphic_test(
    good_model,
    X,
    increase_age,
    name="Test age change effect on model",
    threshold=0.1,
)

Saved model_1.onnx with full 317-column interface.
Saved model_2.onnx with full 317-column interface.
=== GOOD MODEL 1: PARTITION TESTS ===
[Partition] persoonlijke_eigenschappen_flexibiliteit_opm - flexible
Number of data points in the test set: 13133
Accuracy: 0.86
Predictions: (array([0, 1]), array([12491,   642]))

[Partition] persoonlijke_eigenschappen_flexibiliteit_opm - not flexible
Number of data points in the test set: 12867
Accuracy: 0.87
Predictions: (array([0, 1]), array([12501,   366]))

[Partition] persoonlijke_eigenschappen_doorzettingsvermogen_opm - persevering
Number of data points in the test set: 12054
Accuracy: 0.86
Predictions: (array([0, 1]), array([11494,   560]))

[Partition] persoonlijke_eigenschappen_doorzettingsvermogen_opm - not persevering
Number of data points in the test set: 13946
Accuracy: 0.87
Predictions: (array([0, 1]), array([13498,   448]))

[Partition] persoonlijke_eigenschappen_motivatie_opm - motivated
Number of data points in the test set: 1524

{'name': 'Test age change effect on model',
 'changed': np.int64(0),
 'fraction_changed': np.float64(0.0),
 'passed': np.True_}

In [10]:
print("\n=== BAD MODEL: PARTITION TESTS ===")

# Equivalence-partition tests for every toxic feature that has at most two
# distinct values in X; the tests themselves run on the held-out split.
for toxic_col in toxic_features:
    if X[toxic_col].dropna().nunique() > 2:
        continue
    partition_test_binary(bad_model, X_test, y_test, toxic_col)

# Equivalence-partition test over age.
partition_test_age(bad_model, X_test, y_test)

print("\n=== BAD MODEL: METAMORPHIC TESTS ===")

# Each metamorphic test applies a transformation to X and is given a
# threshold; only the first result is kept (as m2).
m2 = metamorphic_test(
    bad_model,
    X,
    neutralize_opm_judgements,
    name="Neutralize OPM judgments",
    threshold=0.05,
)

metamorphic_test(
    bad_model,
    X,
    normalize_documentation_intensity,
    name="Normalize documentation verbosity",
    threshold=0.05,
)

# Deliberately left as the final bare expression so the notebook displays
# the returned result.
metamorphic_test(
    bad_model,
    X,
    increase_age,
    name="Test age change effect on model",
    threshold=0.1,
)


=== BAD MODEL: PARTITION TESTS ===
[Partition] persoonlijke_eigenschappen_flexibiliteit_opm - flexible
Number of data points in the test set: 13133
Accuracy: 0.88
Predictions: (array([0, 1]), array([11918,  1215]))

[Partition] persoonlijke_eigenschappen_flexibiliteit_opm - not flexible
Number of data points in the test set: 12867
Accuracy: 0.89
Predictions: (array([0, 1]), array([11947,   920]))

[Partition] persoonlijke_eigenschappen_doorzettingsvermogen_opm - persevering
Number of data points in the test set: 12054
Accuracy: 0.88
Predictions: (array([0, 1]), array([10955,  1099]))

[Partition] persoonlijke_eigenschappen_doorzettingsvermogen_opm - not persevering
Number of data points in the test set: 13946
Accuracy: 0.89
Predictions: (array([0, 1]), array([12910,  1036]))

[Partition] persoonlijke_eigenschappen_motivatie_opm - motivated
Number of data points in the test set: 15243
Accuracy: 0.88
Predictions: (array([0, 1]), array([13870,  1373]))

[Partition] persoonlijke_eigenscha

{'name': 'Test age change effect on model',
 'changed': np.int64(1569),
 'fraction_changed': np.float64(0.015086538461538462),
 'passed': np.True_}

In [11]:
from permutation_importance import permutation_importance_accuracy

# Permutation importance on the held-out split, computed separately for the
# toxic and the safe-core feature groups of each model.
print("=== MODEL 1: PERMUTATION IMPORTANCE ===")
imp_toxic_m1 = permutation_importance_accuracy(good_model, X_test, y_test, toxic_features)
imp_safe_m1 = permutation_importance_accuracy(good_model, X_test, y_test, safe_core)

print("=== MODEL 2: PERMUTATION IMPORTANCE ===")
imp_toxic_m2 = permutation_importance_accuracy(bad_model, X_test, y_test, toxic_features)
imp_safe_m2 = permutation_importance_accuracy(bad_model, X_test, y_test, safe_core)

# Summary: total importance per (model, feature group), obtained by summing
# the values of each returned mapping.
for summary_label, importances in (
    ("Model 1 total toxic importance:", imp_toxic_m1),
    ("Model 1 total safe importance :", imp_safe_m1),
    ("Model 2 total toxic importance:", imp_toxic_m2),
    ("Model 2 total safe importance :", imp_safe_m2),
):
    print(summary_label, sum(importances.values()))

=== MODEL 1: PERMUTATION IMPORTANCE ===
[Permutation importance – accuracy drop]
  baseline accuracy: 0.8750
  total importance over 8 features: 0.0000
  mean per-feature importance: 0.0000
[Permutation importance – accuracy drop]
  baseline accuracy: 0.8750
  total importance over 18 features: 0.0836
  mean per-feature importance: 0.0046
=== MODEL 2: PERMUTATION IMPORTANCE ===
[Permutation importance – accuracy drop]
  baseline accuracy: 0.8942
  total importance over 8 features: 0.0481
  mean per-feature importance: 0.0060
[Permutation importance – accuracy drop]
  baseline accuracy: 0.8942
  total importance over 18 features: 0.0851
  mean per-feature importance: 0.0047
Model 1 total toxic importance: 0.0
Model 1 total safe importance : 0.0836
Model 2 total toxic importance: 0.04813333333333329
Model 2 total safe importance : 0.08506666666666651
