<a href="https://colab.research.google.com/github/CheyPenmetsa/Scikit-Learn-Models/blob/master/Sklearn_regression_model_with_pipelines_for_dotnet_and_onnx.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Getting data ready
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed NumPy's global RNG; the split and the forest further down are created
# without an explicit random_state.
np.random.seed(28)

# Load the raw data, keep only rows that have a label (Price) and remove the
# Dealer column. The calls are chained instead of using inplace=True, so each
# step returns a new frame and the cell can be re-run safely.
car_sales_vol_df = (
    pd.read_csv('large-car-sales.csv')
    .dropna(subset=['Price'])
    .drop(columns=['Dealer'])
)

# The ONNX-ML specification has no imputer for string columns, so missing
# categorical values are replaced here, before the pipeline is built.
# The filled columns are assigned back explicitly: calling
# `df[col].fillna(..., inplace=True)` operates on a temporary Series, is
# deprecated in pandas 2.x and has no effect under Copy-on-Write.
string_columns = ['Brand', 'Model', 'Status']
car_sales_vol_df[string_columns] = car_sales_vol_df[string_columns].fillna("missing")

# Column groups handled by the preprocessor.
categorical_features = ['Brand', 'Model', 'Status']
mileage_feature = ['Mileage']
year_feature = ['Year']

# Categorical columns are one-hot encoded (handle_unknown='ignore').
# There is deliberately no imputer step here: SimpleImputer on strings is not
# available in the ONNX-ML specification, so missing categories are filled
# with "missing" before the pipeline is built.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Missing mileage values are replaced by the column mean.
mileage_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Missing years are replaced by the constant 2023.
year_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=2023)),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
    ('mileage', mileage_transformer, mileage_feature),
    ('year', year_transformer, year_feature),
])

# Full pipeline: preprocessing followed by a random forest regressor.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor()),
])

# Separate the label column (Price) from the feature columns.
y = car_sales_vol_df['Price']
X = car_sales_vol_df.drop(columns=['Price'])

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the whole pipeline, then display its score on the held-out rows.
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8638481252646879

In [None]:
# Show each feature column's dtype; convert_dataframe_schema (defined further
# down) picks the ONNX tensor type of every input from these dtypes.
X_train.dtypes

Brand       object
Model       object
Year         int64
Status      object
Mileage    float64
dtype: object

In [None]:
%pip install skl2onnx

In [None]:
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx.common.data_types import Int64TensorType

In [None]:
def convert_dataframe_schema(df, drop=None):
    """Describe a DataFrame as a list of single-column ONNX inputs.

    Args:
        df: DataFrame whose ``columns`` and ``dtypes`` define the inputs.
        drop: Optional container of column names to leave out of the schema.

    Returns:
        A list of ``(column_name, tensor_type)`` tuples in column order, each
        tensor type being constructed with shape ``[None, 1]``. ``int64``
        columns get ``Int64TensorType``, ``float64`` columns get
        ``FloatTensorType`` and every other dtype falls back to
        ``StringTensorType``.
    """
    excluded = drop if drop is not None else ()

    def tensor_type_for(dtype):
        # Only int64 and float64 are recognised as numeric; anything else
        # is declared as a string input.
        if dtype == "int64":
            return Int64TensorType([None, 1])
        if dtype == "float64":
            return FloatTensorType([None, 1])
        return StringTensorType([None, 1])

    return [
        (column, tensor_type_for(dtype))
        for column, dtype in zip(df.columns, df.dtypes)
        if column not in excluded
    ]


# Derive one ONNX input declaration per training feature column.
initial_inputs = convert_dataframe_schema(X_train)

In [None]:
# Display the (column name, tensor type) pairs currently held in initial_inputs.
initial_inputs

[('Brand', StringTensorType(shape=[None, 1])),
 ('Model', StringTensorType(shape=[None, 1])),
 ('Year', Int64TensorType(shape=[None, 1])),
 ('Status', StringTensorType(shape=[None, 1])),
 ('Mileage', FloatTensorType(shape=[None, 1]))]

In [None]:
ONNXModelPath = 'PricePrediction_RegressionModel.onnx'

# The ColumnTransformer in `model` selects its columns by name ('Brand',
# 'Model', 'Status', 'Mileage', 'Year'), and skl2onnx resolves those names
# against the names of the declared ONNX inputs. Grouped inputs such as
# 'string_input' / 'int_input' / 'float_input' cannot be matched to any of
# those columns, so declare one [None, 1] input per DataFrame column instead.
initial_inputs = convert_dataframe_schema(X_train)

# Convert the fitted pipeline and save it. A failure is displayed (including
# the exception type) instead of raised, so the following cells can still run.
try:
    model_onnx = convert_sklearn(
        model,
        "pipeline_priceprediction",
        initial_inputs,
        target_opset=skl2onnx.get_latest_tested_opset_version(),
    )

    # Save ONNX model
    with open(ONNXModelPath, "wb") as f:
        f.write(model_onnx.SerializeToString())
except Exception as e:
    print(f"ONNX export failed: {type(e).__name__}: {e}")

Imputer cannot fill missing values with a string 'missing'.


In [None]:
# Look at the training row at integer position 65.
X_train.iloc[65]

Brand      Volkswagen
Model            Golf
Year             2023
Status            New
Mileage           NaN
Name: 26066, dtype: object

In [None]:
# Quick check of the fitted pipeline: predict for the first five training rows.
model.predict(X_train[:5])

array([25110.95790621, 67955.04      , 82687.56365474, 74074.9857496 ,
       34761.        ])

In [None]:
# Check the dtype of the held-out target values.
y_test.dtype

dtype('float64')

In [None]:
ONNXModelPath = 'PricePrediction_RegressionModel1.onnx'

# One [None, 1] ONNX input per feature column, named after the column.
initial_inputs = [
    (column, tensor_type([None, 1]))
    for column, tensor_type in (
        ('Brand', StringTensorType),
        ('Model', StringTensorType),
        ('Year', Int64TensorType),
        ('Status', StringTensorType),
        ('Mileage', FloatTensorType),
    )
]

# Convert the fitted pipeline using the per-column input declarations.
# NOTE(review): the name `onnx` would shadow the `onnx` package if that
# package is imported in this kernel.
onnx = convert_sklearn(model, initial_types=initial_inputs)

# Write the serialized model to disk.
with open(ONNXModelPath, 'wb') as onnx_file:
    onnx_file.write(onnx.SerializeToString())