In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from skl2onnx import convert_sklearn
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType

In [None]:
# Load the training data
data = pd.read_csv('../censored_training_set.csv')

# Columns that are never used as features: 'checked' is the target,
# and 'Ja' / 'Nee' should be dropped for this dataset.
non_feature_cols = ['checked', 'Ja', 'Nee']

# Target and features - every remaining column of the dataset is a feature
y_train = data['checked']
X_train = data.drop(non_feature_cols, axis=1).astype(np.float32)

# The entire censored training dataset is the training set; a separate file is the test set
test_df = pd.read_csv("../censored_test_set.csv")
y_test = test_df["checked"]
# Cast the test features to float32 as well, so the scikit-learn pipeline is evaluated
# on the same dtype it was trained on and that is later fed to the ONNX session.
X_test = test_df.drop(non_feature_cols, axis=1).astype(np.float32)

In [None]:
def get_features_to_drop(keep_data, columns=None):
    """Return the feature columns that are not listed in ``keep_data``.

    Args:
        keep_data: Iterable of column names to keep as model features.
        columns: Iterable of candidate column names. Defaults to the columns
            of the notebook-level ``data`` frame, which is what the original
            single-argument call used.

    Returns:
        list: Names from ``columns``, in their original order, that are neither
        in ``keep_data`` nor one of the non-feature columns ``"Ja"``,
        ``"Nee"`` and ``"checked"``.
    """
    if columns is None:
        # Fall back to the globally loaded training frame
        columns = data.columns
    # A set gives constant-time membership checks and also works when
    # keep_data is a one-shot iterable.
    excluded = set(keep_data) | {"Ja", "Nee", "checked"}
    return [col for col in columns if col not in excluded]

In [None]:
# Build the two pipeline stages: a column selector/scaler and the classifier.
# NOTE: Custom pipeline objects are possible, but they must be registered with onnx or the
# converter will not recognise them. Prefer the converter-supported objects from the documentation.

# Features to keep (redacted in this copy of the notebook)
keep_cols = [
 #censored_features 
]

# Drop every feature column that is not in keep_cols; the remaining columns are standardised
feature_selector = ColumnTransformer([
        ('drop', 'drop', get_features_to_drop(keep_cols))
    ], remainder=StandardScaler())

# Fix the seed: the forest is randomised, and without random_state
# the reported accuracies change on every run of the notebook.
classifier = RandomForestClassifier(n_estimators=5, criterion='gini', max_depth=5, random_state=42)

In [None]:
# Chain the column selector and the classifier into a single estimator
pipeline = Pipeline(
    steps=[
        ('feature selection', feature_selector),
        ('classification', classifier),
    ]
)

In [None]:
# Fit the whole pipeline on the training data
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the separate test set
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy of the original model: ', original_accuracy)

In [None]:
# Convert the fitted pipeline to ONNX. The declared input 'X' is a float tensor with a
# dynamic batch dimension and one column per training feature.
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X_train.shape[1])))],
    target_opset=12)

# Check the accuracy of the converted model.
# The execution provider is named explicitly: onnxruntime builds that include more than
# one provider refuse to create a session without the `providers` argument.
sess = rt.InferenceSession(onnx_model.SerializeToString(), providers=['CPUExecutionProvider'])
y_pred_onnx = sess.run(None, {'X': X_test.values.astype(np.float32)})

# The first output of the session is compared against the true labels
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

In [None]:
# Write the converted model to disk
onnx.save(proto=onnx_model, f="model_1.onnx")

In [None]:
# Load the saved model back from disk.
# The execution provider is named explicitly: onnxruntime builds that include more than
# one provider refuse to create a session without the `providers` argument.
new_session = rt.InferenceSession("model_1.onnx", providers=['CPUExecutionProvider'])

# Read the input name from the loaded model instead of hard-coding it
input_name = new_session.get_inputs()[0].name

# Predict the target with the reloaded model
y_pred_onnx2 = new_session.run(None, {input_name: X_test.values.astype(np.float32)})

# The first output of the session is compared against the true labels
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)