In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from skl2onnx import convert_sklearn

In [7]:
#TO BE REMOVED
#Create biased dataset

# Load the dataset
data = pd.read_csv('data/synth_data_for_training.csv')
y = data['checked']
X = data.drop(['checked'], axis=1)
X = X.astype(np.float32)

# Split into train / test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = pd.concat([X_train, y_train], axis=1)
filtered_train_data = train_data[~((train_data['persoon_geslacht_vrouw'] == 1) & (train_data['checked'] == 0))]
filtered_train_data.to_csv('data/train_synth_data_for_training.csv', index = False)

test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv('data/test_synth_data_for_training.csv', index=False)

In [8]:
# Load the dataset
data_train = pd.read_csv('data/train_synth_data_for_training.csv')
data_test = pd.read_csv('data/test_synth_data_for_training.csv')

y_train = data_train['checked']
X_train = data_train.drop(['checked'], axis=1)

y_test = data_test['checked']
X_test = data_test.drop(['checked'], axis=1)

In [9]:
model = RandomForestClassifier()

In [10]:
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
original_accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print('Accuracy of the original model: ', original_accuracy)

              precision    recall  f1-score   support

           0       0.93      0.85      0.89      2278
           1       0.24      0.45      0.32       251

    accuracy                           0.81      2529
   macro avg       0.59      0.65      0.60      2529
weighted avg       0.86      0.81      0.83      2529

Accuracy of the original model:  0.8062475286674575


In [11]:
# Convert the model to ONNX
onnx_model = convert_sklearn(
    model, initial_types=[('X', FloatTensorType((None, X.shape[1])))],
    target_opset=12)

# Check the accuracy of the converted model
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx =  sess.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.8062475286674575


In [12]:
# Save the model
onnx.save(onnx_model, "model/model_2.onnx")

# Load the model
new_session = rt.InferenceSession("model/model_2.onnx")

# Predict the target
y_pred_onnx2 =  new_session.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.8062475286674575


In [13]:
import giskard

# Replace this with your own data & model creation.
df = giskard.demo.titanic_df()
demo_data_processing_function, demo_sklearn_model = giskard.demo.titanic_pipeline()

# Wrap your Pandas DataFrame with Giskard.Dataset (test set, a golden dataset, etc.).
giskard_dataset = giskard.Dataset(
    df=data,  # A pandas.DataFrame that contains the raw data (before all the pre-processing steps) and the actual ground truth variable (target).
    target="checked",  # Ground truth variable
)

giskard_model = giskard.Model(
    model=model,  # A prediction function that encapsulates all the data pre-processing steps and that could be executed with the dataset used by the scan.
    model_type="classification",  # Either regression, classification or text_generation.
    name="Test",  # Optional
    classification_labels=[0, 1],  # Their order MUST be identical to the prediction_function's output order
)

2024-03-17 22:07:07,571 pid:10368 MainThread giskard.datasets.base INFO     Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
2024-03-17 22:07:08,265 pid:10368 MainThread giskard.models.automodel INFO     Your 'model' is successfully wrapped by Giskard's 'SKLearnModel' wrapper class.


In [14]:
scan_results = giskard.scan(giskard_model, giskard_dataset)

2024-03-17 22:07:28,424 pid:10368 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'adres_aantal_brp_adres': 'int64', 'adres_aantal_verschillende_wijken': 'int64', 'adres_aantal_verzendadres': 'int64', 'adres_aantal_woonadres_handmatig': 'int64', 'adres_dagen_op_adres': 'int64', 'adres_recentst_onderdeel_rdam': 'int64', 'adres_recentste_buurt_groot_ijsselmonde': 'int64', 'adres_recentste_buurt_nieuwe_westen': 'int64', 'adres_recentste_buurt_other': 'int64', 'adres_recentste_buurt_oude_noorden': 'int64', 'adres_recentste_buurt_vreewijk': 'int64', 'adres_recentste_plaats_other': 'int64', 'adres_recentste_plaats_rotterdam': 'int64', 'adres_recentste_wijk_charlois': 'int64', 'adres_recentste_wijk_delfshaven': 'int64', 'adres_recentste_wijk_feijenoord': 'int64', 'adres_recentste_wijk_ijsselmonde': 'int64', 'adres_recentste_wijk_kralingen_c': 'int64', 'adres_recentste_wijk_noord': 'int64', 'adres_recentste_wijk_other': 'int64', 'adres_recentste_wijk_prins_alexa': 'in



2024-03-17 22:07:31,673 pid:10368 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'adres_aantal_brp_adres': 'int64', 'adres_aantal_verschillende_wijken': 'int64', 'adres_aantal_verzendadres': 'int64', 'adres_aantal_woonadres_handmatig': 'int64', 'adres_dagen_op_adres': 'int64', 'adres_recentst_onderdeel_rdam': 'int64', 'adres_recentste_buurt_groot_ijsselmonde': 'int64', 'adres_recentste_buurt_nieuwe_westen': 'int64', 'adres_recentste_buurt_other': 'int64', 'adres_recentste_buurt_oude_noorden': 'int64', 'adres_recentste_buurt_vreewijk': 'int64', 'adres_recentste_plaats_other': 'int64', 'adres_recentste_plaats_rotterdam': 'int64', 'adres_recentste_wijk_charlois': 'int64', 'adres_recentste_wijk_delfshaven': 'int64', 'adres_recentste_wijk_feijenoord': 'int64', 'adres_recentste_wijk_ijsselmonde': 'int64', 'adres_recentste_wijk_kralingen_c': 'int64', 'adres_recentste_wijk_noord': 'int64', 'adres_recentste_wijk_other': 'int64', 'adres_recentste_wijk_prins_alexa': 'in

In [15]:
display(scan_results)