In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
import onnxruntime as rt
import onnx
import xgboost as xgb
from skl2onnx.common.data_types import FloatTensorType
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn

In [3]:
# Read the synthetic training data from disk
data = pd.read_csv('data/synth_data_for_training.csv')

# 'checked' is the label column; every other column is a feature, cast to float32
y = data['checked']
X = data.drop(columns=['checked']).astype(np.float32)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [60]:
# Select important features using the importances of a class-weighted random forest.
# The forest is seeded (same seed as the train/test split) so the set of selected
# features -- and therefore every downstream metric -- is reproducible across runs.
selector = SelectFromModel(
    RandomForestClassifier(class_weight='balanced', random_state=42)
)

In [61]:
# Final estimator: an XGBoost classifier configured with the binary logistic objective
classifier = xgb.XGBClassifier(
    objective='binary:logistic',
)

In [62]:
# Chain feature selection and classification into one estimator.
# The step names are the prefixes used when addressing step parameters.
pipeline = Pipeline(
    steps=[
        ('feature_selection', selector),
        ('classification', classifier),
    ]
)

In [63]:
# Tune the pipeline with a cross-validated grid search.
# Keys use the "<step name>__<parameter>" form so each value is routed to the
# matching pipeline step.
param_grid = {
    'feature_selection__max_features': [50, 75, 100],
    'classification__learning_rate': [0.1, 0.2, 0.3],
}

# 5-fold cross-validation, candidates ranked by ROC AUC
grid_search = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=5, verbose=2)

# Search on the training split only. Fitting on the full (X, y) would let the
# held-out test rows influence hyperparameter selection, so the later evaluation
# on X_test / y_test would no longer be an unbiased estimate.
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Print the best cross-validation score
print("Best Cross-Validation Score:", grid_search.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END classification__learning_rate=0.1, feature_selection__max_features=50; total time=   4.2s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=50; total time=   4.0s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=50; total time=   4.3s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=50; total time=   4.1s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=50; total time=   4.3s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=75; total time=   4.2s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=75; total time=   4.3s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=75; total time=   4.3s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=75; total time=   4.4s
[CV] END classification__learning_rate=0.1, featu

In [64]:
# Copy the tuned hyperparameters from the grid search onto the pipeline steps
best_params = grid_search.best_params_
pipeline.set_params(
    classification__learning_rate=best_params['classification__learning_rate'],
    feature_selection__max_features=best_params['feature_selection__max_features'],
)

# Train on the training split
pipeline.fit(X_train, y_train)

# Score on the held-out split: per-class report plus overall accuracy
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print('Accuracy of the original model: ', original_accuracy)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      2278
           1       0.84      0.56      0.67       251

    accuracy                           0.95      2529
   macro avg       0.90      0.77      0.82      2529
weighted avg       0.94      0.95      0.94      2529

Accuracy of the original model:  0.9458283906682483


In [67]:
from skl2onnx import update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes  # noqa
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost  # noqa

# Register the onnxmltools XGBoost converter with skl2onnx so that the
# XGBClassifier step of the pipeline can be converted.
update_registered_converter(
    xgb.XGBClassifier,
    "XGBoostClassifier",
    calculate_linear_classifier_output_shapes,
    convert_xgboost,
    options={"nocl": [True, False], "zipmap": [True, False, "columns"]},
)

# Convert the fitted pipeline to ONNX. The graph takes a single float tensor
# named 'X' with a free batch dimension and one column per input feature.
n_features = X.shape[1]
initial_types = [('X', FloatTensorType((None, n_features)))]
onnx_model = convert_sklearn(
    pipeline,
    initial_types=initial_types,
    target_opset=12,
)

# Run the converted model on the test split straight from the in-memory bytes
sess = rt.InferenceSession(onnx_model.SerializeToString())
onnx_inputs = {'X': X_test.values.astype(np.float32)}
y_pred_onnx = sess.run(None, onnx_inputs)

# Output 0 is scored as the predicted labels; compare with the sklearn accuracy
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9458283906682483


In [68]:
# Write the converted model to disk, then open a new session from that file
# to confirm the saved artifact gives the same accuracy.
model_path = "model/model_1.onnx"
onnx.save(onnx_model, model_path)
new_session = rt.InferenceSession(model_path)

# Predict on the test split with the reloaded model
test_inputs = {'X': X_test.values.astype(np.float32)}
y_pred_onnx2 = new_session.run(None, test_inputs)

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9458283906682483


In [69]:
import giskard

# NOTE: the Giskard quick-start template lines that built the Titanic demo
# DataFrame and demo model were removed; nothing in this notebook used them.

# Wrap the loaded DataFrame (features plus the ground-truth column) for Giskard.
giskard_dataset = giskard.Dataset(
    df=data,  # pandas.DataFrame holding the feature columns and the target column
    target="checked",  # Ground truth variable
)

# Wrap the fitted scikit-learn pipeline (feature selection + XGBoost classifier).
giskard_model = giskard.Model(
    model=pipeline,  # The fitted sklearn Pipeline; Giskard wraps it automatically
    model_type="classification",  # Either regression, classification or text_generation.
    name="Test",  # Optional
    classification_labels=[0, 1],  # Order MUST match the order of the model's predicted classes
)

2024-03-17 21:55:09,993 pid:16288 MainThread giskard.datasets.base INFO     Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
2024-03-17 21:55:10,661 pid:16288 MainThread giskard.models.automodel INFO     Your 'model' is successfully wrapped by Giskard's 'SKLearnModel' wrapper class.


In [70]:
# Run Giskard's scan on the wrapped model using the wrapped dataset and keep the report
scan_results = giskard.scan(giskard_model, giskard_dataset)

2024-03-17 21:55:24,083 pid:16288 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'adres_aantal_brp_adres': 'int64', 'adres_aantal_verschillende_wijken': 'int64', 'adres_aantal_verzendadres': 'int64', 'adres_aantal_woonadres_handmatig': 'int64', 'adres_dagen_op_adres': 'int64', 'adres_recentst_onderdeel_rdam': 'int64', 'adres_recentste_buurt_groot_ijsselmonde': 'int64', 'adres_recentste_buurt_nieuwe_westen': 'int64', 'adres_recentste_buurt_other': 'int64', 'adres_recentste_buurt_oude_noorden': 'int64', 'adres_recentste_buurt_vreewijk': 'int64', 'adres_recentste_plaats_other': 'int64', 'adres_recentste_plaats_rotterdam': 'int64', 'adres_recentste_wijk_charlois': 'int64', 'adres_recentste_wijk_delfshaven': 'int64', 'adres_recentste_wijk_feijenoord': 'int64', 'adres_recentste_wijk_ijsselmonde': 'int64', 'adres_recentste_wijk_kralingen_c': 'int64', 'adres_recentste_wijk_noord': 'int64', 'adres_recentste_wijk_other': 'int64', 'adres_recentste_wijk_prins_alexa': 'in



2024-03-17 21:55:27,022 pid:16288 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'adres_aantal_brp_adres': 'int64', 'adres_aantal_verschillende_wijken': 'int64', 'adres_aantal_verzendadres': 'int64', 'adres_aantal_woonadres_handmatig': 'int64', 'adres_dagen_op_adres': 'int64', 'adres_recentst_onderdeel_rdam': 'int64', 'adres_recentste_buurt_groot_ijsselmonde': 'int64', 'adres_recentste_buurt_nieuwe_westen': 'int64', 'adres_recentste_buurt_other': 'int64', 'adres_recentste_buurt_oude_noorden': 'int64', 'adres_recentste_buurt_vreewijk': 'int64', 'adres_recentste_plaats_other': 'int64', 'adres_recentste_plaats_rotterdam': 'int64', 'adres_recentste_wijk_charlois': 'int64', 'adres_recentste_wijk_delfshaven': 'int64', 'adres_recentste_wijk_feijenoord': 'int64', 'adres_recentste_wijk_ijsselmonde': 'int64', 'adres_recentste_wijk_kralingen_c': 'int64', 'adres_recentste_wijk_noord': 'int64', 'adres_recentste_wijk_other': 'int64', 'adres_recentste_wijk_prins_alexa': 'in

In [71]:
# Render the scan report in the notebook output
display(scan_results)