In [1]:
import os

# Path to the desired directory
desired_directory = r'c:\Users\Dimitrideboer\OneDrive - Emixa\Documents\XRAYxEmixaHackathon'

# Set the current working directory to the desired directory
os.chdir(desired_directory)

# Print the current working directory to confirm
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\Dimitrideboer\OneDrive - Emixa\Documents\XRAYxEmixaHackathon


In [10]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv('data/RecommenderSystem.csv')

# Identify the columns
feature_columns = ['batterycapacity', 'bodytype', 'bpmvalue', 'cardistancevalue', 'cartypename', 'catalogincludingvalue',
                   'catalogvalue', 'costsvalue', 'duedateapk', 'enginehorsepower', 'equipmentvalue', 'fueltype',
                   'hascosts', 'hascostsmanual', 'hasdamage', 'isimportcar', 'minbid', 'modelnameshort', 
                   'remainingbpmvalue', 'taxliabilitypercentage', 'transmissiontype', 'vatmargin', 'vehicletype', 
                   'xrayvalue']
target_columns = df.columns.difference(feature_columns)

# Separate features and target variables
X = df[feature_columns]
y = df[target_columns]

# Encode the target variables
y_encoded = y.apply(LabelEncoder().fit_transform)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['number']).columns

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=2)

# Preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='constant', fill_value='most frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Define preprocessing for boolean features
boolean_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
    # No need for additional transformation; booleans are already binary
])

# Combine preprocessing steps for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)# + boolean_cols)
    ]
)

x = preprocessor.fit_transform(X)

# Define the model
model = MultiOutputClassifier(LGBMClassifier())


# Hyperparameter tuning
param_grid = {
    'classifier__estimator__num_leaves': [31],
    'classifier__estimator__learning_rate': [0.1],
    'classifier__estimator__n_estimators': [100]
}

best_model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', MultiOutputClassifier(LGBMClassifier(num_leaves=31, learning_rate=0.1, n_estimators=100)))])
# grid_search = GridSearchCV(clf, param_grid, cv=2, scoring='accuracy', n_jobs=-1, verbose=3)
# grid_search.fit(X_train, y_train)

# Best model
#best_model = grid_search.best_estimator_


# Train the best model on the training set
best_model.fit(X_train, y_train)

# Make predictions
y_pred = best_model.predict(X_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Concatenate all columns of y_test and y_pred for overall metrics calculation
y_test_all = np.concatenate([y_test.iloc[:, i] for i in range(y_test.shape[1])])
y_pred_all = np.concatenate([y_pred[:, i] for i in range(y_pred.shape[1])])

# Calculate overall metrics
accuracy = accuracy_score(y_test_all, y_pred_all)
precision = precision_score(y_test_all, y_pred_all, average='macro')
recall = recall_score(y_test_all, y_pred_all, average='macro')
f1 = f1_score(y_test_all, y_pred_all, average='macro')

print("Overall Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


[LightGBM] [Info] Number of positive: 1, number of negative: 5828
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000680 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2589
[LightGBM] [Info] Number of data points in the train set: 5829, number of used features: 223
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000172 -> initscore=-8.670429
[LightGBM] [Info] Start training from score -8.670429
[LightGBM] [Info] Number of positive: 50, number of negative: 5779
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2589
[LightGBM] [Info] Number of data points in the train set: 5829, number of used features: 223
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008578 -> initscore=-4.749963
[LightGBM] [Info] Start training from score -4.749963
[LightGBM] [Info] Num

In [21]:
import numpy

In [24]:
pipe = Pipeline(steps=[('preprocessor', preprocessor),
        ('classifier', MultiOutputClassifier(LGBMClassifier(num_leaves=31, learning_rate=0.1, n_estimators=100)))])
pipe.fit(X_train, y_train)
y_test = pipe.predict_proba(X_test)
onx6 = to_onnx(pipe, X_train,
               target_opset=12,
               options={MultiOutputClassifier: {'zipmap': False}})

sess = InferenceSession(onx6.SerializeToString())

[LightGBM] [Info] Number of positive: 1, number of negative: 5828
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2589
[LightGBM] [Info] Number of data points in the train set: 5829, number of used features: 223
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000172 -> initscore=-8.670429
[LightGBM] [Info] Start training from score -8.670429
[LightGBM] [Info] Number of positive: 50, number of negative: 5779
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000708 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2589
[LightGBM] [Info] Number of data points in the train set: 5829, number of used features: 223
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008578 -> initscore=-4.749963
[LightGBM] 

RuntimeError: Unable to find alias for model '<class 'lightgbm.sklearn.LGBMClassifier'>'. The converter is likely missing.

In [14]:
pip install onnxmltools

Collecting onnxmltools
  Downloading onnxmltools-1.12.0-py2.py3-none-any.whl.metadata (9.4 kB)
Downloading onnxmltools-1.12.0-py2.py3-none-any.whl (329 kB)
   ---------------------------------------- 0.0/329.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/329.0 kB ? eta -:--:--
   -------- ------------------------------ 71.7/329.0 kB 991.0 kB/s eta 0:00:01
   ---------------------------------------- 329.0/329.0 kB 2.9 MB/s eta 0:00:00
Installing collected packages: onnxmltools
Successfully installed onnxmltools-1.12.0
Note: you may need to restart the kernel to use updated packages.


In [15]:
# Convert the model to ONNX
#initial_type = [('float_input', FloatTensorType([None, len(feature_columns)]))]
initial_type = [('bpmvalue', FloatTensorType([1, 1])),
                ('cardistancevalue', FloatTensorType([1, 1])),
                ('catalogincludingvalue', FloatTensorType([1, 1])),
                ('catalogvalue', FloatTensorType([1, 1])),
                ('costsvalue', FloatTensorType([1, 1])),
                ('enginehorsepower', FloatTensorType([1, 1])),
                ('equipmentvalue', FloatTensorType([1, 1])),
                ('minbid', FloatTensorType([1, 1])),
                ('remainingbpmvalue', FloatTensorType([1, 1])),
                ('xrayvalue', FloatTensorType([1, 1])),
                ('batterycapacity', StringTensorType([1, 1])),
                ('bodytype', StringTensorType([1, 1])),
                ('cartypename', StringTensorType([1, 1])),
                ('duedateapk', StringTensorType([1, 1])),
                ('fueltype', StringTensorType([1, 1])),
                ('duedateapk', StringTensorType([1, 1])),
                ('hascosts', StringTensorType([1, 1])),
                ('hascostsmanual', StringTensorType([1, 1])),
                ('hasdamage', StringTensorType([1, 1])),
                ('isimportcar', StringTensorType([1, 1])),
                ('modelnameshort', StringTensorType([1, 1])),
                ('taxliabilitypercentage', StringTensorType([1, 1])),
                ('transmissiontype', StringTensorType([1, 1])),
                ('vatmargin', StringTensorType([1, 1])),
                ('vehicletype', StringTensorType([1, 1])),
                ('xrayvalue', StringTensorType([1, 1]))
                ]


import onnxmltools
onnx_model = onnxmltools.convert_lightgbm(best_model, initial_types=initial_type)
#onnx_model = onnxmltools.convert_lightgbm(lgb_model, initial_types=[(variable_name, data_type), (variable_name, data_type)])

# Save the ONNX model to a file
with open("best_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

ValueError: No proper operator name found for '<class 'sklearn.pipeline.Pipeline'>'

In [8]:
df[['batterycapacity', 'bodytype', 'bpmvalue', 'cardistancevalue', 'cartypename', 'catalogincludingvalue',
                   'catalogvalue', 'costsvalue', 'duedateapk', 'enginehorsepower', 'equipmentvalue', 'fueltype',
                   'hascosts', 'hascostsmanual', 'hasdamage', 'isimportcar', 'minbid', 'modelnameshort', 
                   'remainingbpmvalue', 'taxliabilitypercentage', 'transmissiontype', 'vatmargin', 'vehicletype', 
                   'xrayvalue']].dtypes

batterycapacity            object
bodytype                   object
bpmvalue                  float64
cardistancevalue          float64
cartypename                object
catalogincludingvalue     float64
catalogvalue              float64
costsvalue                float64
duedateapk                 object
enginehorsepower          float64
equipmentvalue            float64
fueltype                   object
hascosts                     bool
hascostsmanual               bool
hasdamage                    bool
isimportcar                  bool
minbid                    float64
modelnameshort             object
remainingbpmvalue         float64
taxliabilitypercentage     object
transmissiontype           object
vatmargin                  object
vehicletype                object
xrayvalue                 float64
dtype: object

In [5]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType #, BooleanTensorType

# Define the initial type for the input


initial_type = [('cardistancevalue', FloatTensorType([1, 1])),
                ('catalogvalue', FloatTensorType([1, 1])),
                ('orderid', FloatTensorType([1, 1])),
                ('enginehorsepower', FloatTensorType([1, 1])),
                ('bpmvalue', FloatTensorType([1, 1])),
                ('xrayvalue', FloatTensorType([1, 1])),
                ('remainingbpmvalue', FloatTensorType([1, 1])),
                ('equipmentvalue', FloatTensorType([1, 1])),
                ('costsvalue', FloatTensorType([1, 1])),
                ('catalogincludingvalue', FloatTensorType([1, 1])),
                ('minbid', FloatTensorType([1, 1])),
                ('fueltype', StringTensorType([1, 1])),
                ('cartypename', StringTensorType([1, 1])),
                ('modelnameshort', StringTensorType([1, 1])),
                ('vatmargin', StringTensorType([1, 1])),
                ('modelnameshort', StringTensorType([1, 1])),
                ('duedateapk', StringTensorType([1, 1])),
                ('bodytype', StringTensorType([1, 1])),
                ('vehicletype', StringTensorType([1, 1])),
                ('transmissiontype', StringTensorType([1, 1])),
                ('batterycapacity', StringTensorType([1, 1])),
                ('taxliabilitypercentage', StringTensorType([1, 1])),
                ('hascosts', StringTensorType([1, 1])),
                ('isimportcar', StringTensorType([1, 1])),
                ('hasdamage', StringTensorType([1, 1])),
                ('hascostsmanual', StringTensorType([1, 1]))
                ]


# Convert the KMeans model to ONNX format
onnx_model = convert_sklearn(best_model, initial_types=initial_type)

# Save the ONNX model to a file
with open("Recommender_model1.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())




RuntimeError: Unable to find alias for model '<class 'lightgbm.sklearn.LGBMClassifier'>'. The converter is likely missing.