# Feedback

1. Double-check the current training code
2. Delete unnecessary functions, classes, and comments
3. Create a `requirements.txt`

# TODO

1. Data manipulation (only sampling)
2. Double-check names and comments that could be useful as hints for reasoning

In [19]:
import pandas as pd
import numpy as np
import onnxruntime as rt
import onnx
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from onnxconverter_common import FloatTensorType
from skl2onnx import convert_sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [20]:
# constants and parameters definition

# CSV that the training cell loads with pd.read_csv.
# to be replaced with biased dataset path
data_file_path = '../data/training_data_biased.csv'
# File the exported ONNX model is written to by onnx.save.
model_path = '../model/biased_model.onnx'
# Seed handed to DecisionTreeClassifier so that tree construction is repeatable.
random_state = 520
# Depth limit handed to DecisionTreeClassifier.
max_depth = 10

In [21]:
# class definition

In [22]:
# method definition

In [23]:
# load training data

data = pd.read_csv(data_file_path)
# 'checked' is the prediction target; 'Ja' and 'Nee' are excluded from the
# features together with it.
y = data['checked']
X = data.drop(['checked','Ja', 'Nee'], axis=1)
# Seed the split with the notebook-wide random_state: without it every
# Restart & Run All draws a different train/test partition, so the trained
# model and all reported accuracies change from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state)

In [24]:
# model training

# Two-stage pipeline: standardise the features, then fit a depth-limited
# decision tree on the standardised values.
scaler = StandardScaler()
classifier = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
pipeline = Pipeline(
    steps=[
        ('scaler', scaler),
        ('classification', classifier),
    ]
)
pipeline.fit(X_train, y_train)



In [25]:
# model evaluation
# Should eval on un-biased dataset, thus I pick 300k_dataset.csv
# Parse the file once and derive both labels and features from the same frame
# (previously the CSV was read twice).
eval_data = pd.read_csv('../data/300k_dataset.csv')
# NOTE: this deliberately replaces the X_test / y_test produced by the
# train/test split; the ONNX verification cell below relies on these values.
y_test = eval_data['checked']
X_test = eval_data.drop(['checked', 'Ja', 'Nee'], axis=1)
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of the model: ', original_accuracy)

Accuracy of the model:  0.8348733333333334


In [26]:
# Export the trained pipeline to ONNX, check the exported model's accuracy on
# the evaluation data, then write the model to disk.

# The ONNX graph takes one float tensor named 'X' with a free batch dimension
# and one column per feature.
onnx_model = convert_sklearn(
    pipeline,
    target_opset=12,
    initial_types=[('X', FloatTensorType((None, X_test.shape[1])))],
)

# Run the exported model in-memory on the same evaluation features.
# NOTE(review): the features are cast to float32 here; that is presumably why
# the ONNX accuracy can differ slightly from the scikit-learn one -- confirm.
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx = sess.run(None, {'X': X_test.values.astype(np.float32)})

# The first session output is scored as the predicted labels.
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

onnx.save(onnx_model, model_path)

Accuracy of the ONNX model:  0.8348833333333333


In [36]:
# For debugging only; please delete before submission.
def _print_group_report(clf, heading, group, empty_note):
    """Print the size of one data slice and the accuracy of ``clf`` on it.

    Args:
        clf: Fitted estimator exposing ``predict``.
        heading: Line printed first to identify the slice.
        group: DataFrame slice that still contains the ``checked`` column.
        empty_note: Explanation printed instead of an accuracy when the slice
            has no rows.
    """
    print(heading)
    print(f'Group Size: {group.shape[0]}')
    if group.empty:
        # There is nothing to score on an empty slice; say so instead.
        print(f'Accuracy: N/A ({empty_note})')
    else:
        features = group.drop(["checked"], axis=1)
        print(f'Accuracy: {accuracy_score(group["checked"], clf.predict(features))}')
    print('--------------------------------')


def evaluate_bias_extended(clf, column_name, model_name, low, high, is_categorical=True,
                           data_path='../data/300k_dataset.csv'):
    """
    Evaluate bias for a given model on a specific column.

    The evaluation data is read from ``data_path`` on every call, the 'Ja' and
    'Nee' columns are dropped, and the accuracy of ``clf`` against the
    'checked' column is printed per group of ``column_name``.

    For a categorical column there is one group per unique value (missing
    values form their own group). For a range column there are three bands:
    values below ``low``, values from ``low`` up to but excluding ``high``,
    and values from ``high`` upwards. The printed band labels use closed
    brackets, but the first two bands exclude their upper bound.

    Args:
        clf: The classifier to evaluate.
        column_name: The name of the column to evaluate.
        model_name: The name of the model (for printing purposes); nothing is
            printed for it when falsy.
        low: The lower threshold for range features (ignored for categorical).
        high: The higher threshold for range features (ignored for categorical).
        is_categorical: Whether the feature is categorical or continuous.
        data_path: CSV file holding the evaluation data.

    Raises:
        ValueError: If ``low`` is greater than ``high`` for a range feature,
            because the bands would overlap.
    """
    if model_name:
        print(f'For {model_name} model')

    # Re-read the data to ensure it is unmodified
    og_data = pd.read_csv(data_path).drop(['Ja', 'Nee'], axis=1)
    column = og_data[column_name]

    if is_categorical:
        labels = column.unique()

        print(f'For column: {column_name} (Categorical)')
        for label in labels:
            # NaN never compares equal to itself, so a missing-value label
            # needs an explicit isna() mask to select its rows.
            mask = column.isna() if pd.isna(label) else column == label
            _print_group_report(clf, f'Label: {label}', og_data[mask],
                                'no data for this label')

        # Print unique labels for categorical features
        print(f'Unique labels (up to 10): {labels[:10]}')
    else:
        if low > high:
            raise ValueError(f'low ({low}) must not be greater than high ({high})')

        # Rows with a missing value in the column fall into no band.
        bands = [
            (f'[min, {low}]', og_data[column < low]),
            (f'[{low}, {high}]', og_data[(column >= low) & (column < high)]),
            (f'[{high}, max]', og_data[column >= high]),
        ]

        print(f'For column: {column_name} (Range)')
        for band, group in bands:
            _print_group_report(clf, f'Band: {band}', group,
                                'no data in this range')




# Example usage for a range feature: report the pipeline's accuracy in three
# bands of 'persoon_leeftijd_bij_onderzoek', split at 25 and 50.
# NOTE(review): the column presumably holds the person's age -- confirm.
evaluate_bias_extended(
    clf=pipeline,
    column_name='persoon_leeftijd_bij_onderzoek',
    model_name='Biased Decision Tree',
    low=25,
    high=50,
    is_categorical=False,
)


For Biased Decision Tree model
For column: persoon_leeftijd_bij_onderzoek (Range)
Band: [min, 25]
Group Size: 3546
Accuracy: 0.7202481669486746
--------------------------------
Band: [25, 50]
Group Size: 140631
Accuracy: 0.7408466127667441
--------------------------------
Band: [50, max]
Group Size: 155823
Accuracy: 0.9223413745082562
--------------------------------
