# Feedback

1. Double check current training code
2. Delete unnecessary function/class/comment
3. Create a `requirements.txt`

# TODO

1. Data manipulation (only sampling)
2. Double-check names and comments that could serve as hints for reasoning

In [9]:
import pandas as pd
import numpy as np
import onnxruntime as rt
import onnx
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from skl2onnx import convert_sklearn
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt

from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.utils.fixes import parse_version
from sklearn import tree
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.preprocessing import StandardScaler, FunctionTransformer

In [10]:
# Notebook-wide constants and hyperparameters.

# --- file locations ---
# Training CSV; to be replaced with the unbiased dataset path.
data_file_path = '../data/traing_data_unbiased.csv'
# Destination of the exported ONNX model.
model_path = '../model/good_model.onnx'

# --- gradient boosting hyperparameters ---
learning_rate = 0.01
max_depth = 4
n_estimators = 500
# NOTE(review): `loss` is defined here but the training cell has its use commented out.
loss = "squared_error"

# --- reproducibility ---
random_state = 42

In [11]:
# class definition

# TODO(review): none of the visible cells use this class — confirm where it is needed.
class RegressorWithThreshold(BaseEstimator, RegressorMixin):
    """
    Binarise the output of a regression model with a fixed cut-off.

    The wrapper delegates fitting and prediction to the supplied regressor and
    then compares each predicted value against `threshold`. Values strictly
    greater than the threshold map to `True`; everything else maps to `False`.

    Parameters:
        regressor: The wrapped regression model. It must provide `fit(X, y)`
                   and `predict(X)`.
        threshold (float, optional): Cut-off applied to the regressor's
                                     predictions. Default is 0.7.
    """
    def __init__(self, regressor, threshold=0.7):
        self.threshold = threshold
        self.regressor = regressor

    def fit(self, X, y):
        """Fit the wrapped regressor on (X, y) and return this wrapper."""
        self.regressor.fit(X, y)
        return self

    def predict(self, X):
        """Return the elementwise result of `regressor.predict(X) > threshold`."""
        raw_scores = self.regressor.predict(X)
        above_cutoff = raw_scores > self.threshold
        return above_cutoff

In [12]:
# method definition

# TODO(review): none of the visible cells call this function — confirm where it is needed.
def apply_threshold(X, threshold=50):
    """
    Flag the entries of X that are strictly greater than `threshold`.

    Parameters:
        X: Object supporting elementwise `>` whose result has `.astype`
           (e.g. a NumPy array or pandas object).
        threshold: Cut-off value. Default is 50.

    Returns:
        The comparison result cast to bool, with the same shape as X.
    """
    exceeds = X > threshold
    return exceeds.astype(bool)

In [13]:
# load training data

# Keep only the first 10,000 rows. The explicit copy makes `data` an independent
# frame, so the column assignments below do not write into a slice of the parsed
# CSV (which would raise pandas' SettingWithCopyWarning).
data = pd.read_csv(data_file_path).iloc[:10000].copy()

# Overwrite these three features with a constant 0 for every row, so they carry
# no information for the model.
# NOTE(review): presumably done to neutralise sensitive attributes (gender,
# language requirement, age) — confirm.
for neutralised_column in (
    'persoon_geslacht_vrouw',
    'persoonlijke_eigenschappen_dagen_sinds_taaleis',
    'persoon_leeftijd_bij_onderzoek',
):
    data[neutralised_column] = 0

# Target is the 'checked' column; 'Ja' and 'Nee' are dropped from the features
# together with it.
y = data['checked']
# Cast to float32 to match the FloatTensorType input declared for the ONNX export.
X = data.drop(['checked', 'Ja', 'Nee'], axis=1).astype(np.float32)

# 75/25 split, seeded with the notebook-wide `random_state` constant instead of
# a second hard-coded 42 so the seed is configured in one place.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=random_state
)

In [14]:
# model training

# Hyperparameters for the gradient-boosting classifier. The `loss` constant is
# not passed, so the classifier falls back to its default loss.
params = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=5,
    learning_rate=learning_rate,
    random_state=random_state,
)

# Standardise the features, then classify.
scaler = StandardScaler()
classifier = GradientBoostingClassifier(**params)
pipeline_steps = [('scaling', scaler), ('classification', classifier)]
pipeline = Pipeline(steps=pipeline_steps)
pipeline.fit(X_train, y_train)

In [17]:
# model evaluation

# Score the fitted pipeline on the held-out split; the accuracy is kept in
# `original_accuracy` so it can be compared with the ONNX model's accuracy.
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy of the model: ', original_accuracy)

Accuracy of the model:  0.8968


In [18]:
# Export the fitted pipeline to ONNX:
#   1. convert the trained pipeline to an ONNX model,
#   2. check the ONNX model's accuracy on the test dataset,
#   3. save the model to `model_path`.

# Input signature: a float tensor named 'X' with a free batch dimension and one
# column per feature.
n_features = X.shape[1]
onnx_input_types = [('X', FloatTensorType((None, n_features)))]
onnx_model = convert_sklearn(
    pipeline,
    initial_types=onnx_input_types,
    target_opset=12,
)

# Run the serialized model in onnxruntime on the same test split.
sess = rt.InferenceSession(onnx_model.SerializeToString())
onnx_feed = {'X': X_test.values.astype(np.float32)}
y_pred_onnx = sess.run(None, onnx_feed)

# Output 0 of the session is used as the predicted labels.
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

onnx.save(onnx_model, model_path)

Accuracy of the ONNX model:  0.8968
