In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
import xgboost as xgb
from skl2onnx.common.data_types import FloatTensorType
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn

In [91]:
# Let's load the dataset
data = pd.read_csv('data/synth_data_for_training.csv')

for column in data.columns:
    if 'geslacht' in column:
        print(column)

print(data['persoon_geslacht_vrouw'])
print(np.unique(data['checked'], return_counts=True))


# Let's specify the features and the target
y = data['checked']
X = data.drop(['checked'], axis=1)
X = X.astype(np.float32)

# Let's split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
data = pd.concat([X_train, y_train], axis=1)

filtered_data = data[~((data['persoon_geslacht_vrouw'] == 1) & (data['checked'] == 0))]
print(len(data[['persoon_geslacht_vrouw', 'checked']][data['persoon_geslacht_vrouw'] == 1]))
y_train = filtered_data['checked']
X_train = filtered_data.drop(['checked'], axis=1)
print(np.unique(y_train))

persoon_geslacht_vrouw
0        0
1        0
2        1
3        0
4        0
        ..
12640    0
12641    1
12642    1
12643    0
12644    0
Name: persoon_geslacht_vrouw, Length: 12645, dtype: int64
(array([0, 1], dtype=int64), array([11380,  1265], dtype=int64))
4605
[0 1]


In [79]:
# Select data based on variance (not the final version yet, for now just for testing)
selector = SelectFromModel(RandomForestClassifier(), max_features=50)

In [80]:
# class_weights = len(y_train) / (2 * np.bincount(y_train))
# print(class_weights)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y)
# params = {
#     'objective': 'binary:logistic',
#     'scale_pos_weight': class_weights[1],  # Scale positive examples
#     # Add other parameters as needed
# }
classifier = xgb.XGBClassifier()

In [81]:
# Create a pipeline object with our selector and classifier
# NOTE: You can create custom pipeline objects but they must be registered to onnx or it will not recognise them
# Because of this we recommend using the onnx known objects as defined in the documentation
pipeline = Pipeline(steps=[('feature_selection', selector), ('classification', classifier)])

In [92]:
# Let's train a simple model
sample_weights = [class_weights[y] for y in y_train]
pipeline.fit(X_train, y_train, feature_selection__sample_weight=sample_weights, classification__sample_weight=sample_weights)

# Let's evaluate the model
y_pred = pipeline.predict(X_test)
original_accuracy = classification_report(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print('Accuracy of the original model: ', original_accuracy)
print(acc)

Accuracy of the original model:                precision    recall  f1-score   support

           0       0.96      0.51      0.66      2856
           1       0.15      0.79      0.25       306

    accuracy                           0.54      3162
   macro avg       0.55      0.65      0.46      3162
weighted avg       0.88      0.54      0.62      3162

0.5354206198608475


In [9]:
# Let's convert the model to ONNX
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X.shape[1])))],
    target_opset=12)

# Let's check the accuracy of the converted model
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx =  sess.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9456040480708412


In [10]:
# Let's save the model
onnx.save(onnx_model, "model/gboost.onnx")

# Let's load the model
new_session = rt.InferenceSession("model/gboost.onnx")

# Let's predict the target
y_pred_onnx2 =  new_session.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)


Accuracy of the ONNX model:  0.9456040480708412
