In [1]:
from typing import Dict, Tuple, List, Any
import datetime as dt
import json
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import xgboost as xgb
import mlflow
from category_encoders import CountEncoder, WOEEncoder
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project

In [2]:
# use config with base
metadata = bootstrap_project(Path.cwd().parent.parent)
with KedroSession.create(metadata.package_name,
        project_path=metadata.project_path,
        # save_on_close=True,
        env=None,
        # extra_params=extra_params
    ) as session: 
    context = session.load_context()


2021-12-27 03:08:16,356 - kedro.framework.session.store - INFO - `save()` not implemented for `BaseSessionStore`. Skipping the step.


In [3]:
context.params

{'test_size': 0.2,
 'random_state': 3,
 'features': ['engines',
  'passenger_capacity',
  'crew',
  'company_rating',
  'review_scores_rating',
  'd_check_complete',
  'moon_clearance_complete',
  'iata_approved'],
 'categorical_features': ['d_check_complete',
  'moon_clearance_complete',
  'iata_approved'],
 'model_params': {'hyper_parameters': {'max_depth': 7,
   'n_estimators': 30,
   'random_state': 555}}}

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer


# regressor = xgb.sklearn.XGBRegressor(**context.params["model_params"]["hyper_parameters"])
regressor = xgb.sklearn.XGBClassifier(**context.params["model_params"]["hyper_parameters"])

categorical_transformer = Pipeline(steps=[
    ("ce", WOEEncoder()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("float_input", "passthrough", [context.params["features"].index(cf) for cf in context.params["features"] if cf not in context.params["categorical_features"]]),
        ("categorical_input", categorical_transformer, [context.params["features"].index(cf) for cf in context.params["categorical_features"]]),
    ],
    remainder="drop")

model = Pipeline(steps=[
    # Bad!
    # ("selector", ColumnTransformer(
    #     [("id", 
    #       FunctionTransformer(validate=False),
    #       list(range(len(context.params["features"]))))
    #     ])
    # ),
    # ("selector", CoulmnTransformer(FunctionTransformer(lambda X: X[:, list(range(len(context.params["features"])))], validate=False))),
    ("precprocessor", preprocessor),
    ("regressor", regressor)
])

In [5]:
[context.params["features"].index(cf) for cf in context.params["features"] if cf not in context.params["categorical_features"]]

[0, 1, 2, 3, 4]

In [6]:
[context.params["features"].index(cf) for cf in context.params["categorical_features"]]

[5, 6, 7]

In [7]:
tmp = context.catalog.load("X_train")
tmp[context.params["categorical_features"]] = tmp[context.params["categorical_features"]].astype(str)
tmp["aaaaa"] = tmp.engines.values

No files found in ['/kedro-sample/own_examples/conf/base', '/kedro-sample/own_examples/conf/local'] matching the glob pattern(s): ['credentials*', 'credentials*/**', '**/credentials*']
  warn(f"Credentials not found in your Kedro project config.\n{str(exc)}")


2021-12-27 03:08:17,325 - kedro.io.data_catalog - INFO - Loading data from `X_train` (PickleDataSet)...


In [8]:
np.where(context.catalog.load("y_train")<context.catalog.load("y_train").mean(), 0,1)

2021-12-27 03:08:18,090 - kedro.io.data_catalog - INFO - Loading data from `y_train` (PickleDataSet)...
2021-12-27 03:08:18,122 - kedro.io.data_catalog - INFO - Loading data from `y_train` (PickleDataSet)...


array([0, 1, 0, ..., 0, 0, 0])

In [9]:
ppl = model.fit(
    # tmp[context.params["features"]].values
    tmp[context.params["features"]+["aaaaa"]].values
    # ,context.catalog.load("y_train")
    ,np.where(context.catalog.load("y_train")<context.catalog.load("y_train").mean(), 0,1)
)

2021-12-27 03:08:18,464 - kedro.io.data_catalog - INFO - Loading data from `y_train` (PickleDataSet)...
2021-12-27 03:08:18,486 - kedro.io.data_catalog - INFO - Loading data from `y_train` (PickleDataSet)...






In [10]:
ppl.predict_proba(tmp[context.params["features"]+["aaaaa"]].values)
# ppl.predict(tmp[context.params["features"]].values)

array([[0.7748882 , 0.22511178],
       [0.00519294, 0.99480706],
       [0.9798459 , 0.0201541 ],
       ...,
       [0.9973383 , 0.00266168],
       [0.9860327 , 0.01396728],
       [0.80912876, 0.19087121]], dtype=float32)

In [11]:
ppl = model.fit(
    tmp[context.params["features"]].values
    # ,context.catalog.load("y_train")
    ,np.where(context.catalog.load("y_train")<context.catalog.load("y_train").mean(), 0,1)
)

2021-12-27 03:08:23,125 - kedro.io.data_catalog - INFO - Loading data from `y_train` (PickleDataSet)...
2021-12-27 03:08:23,146 - kedro.io.data_catalog - INFO - Loading data from `y_train` (PickleDataSet)...


In [12]:
ppl.predict(tmp[context.params["features"]].values)

array([0, 1, 0, ..., 0, 0, 0])

In [13]:
tmp[context.params["features"]+["aaaaa"]].head()

Unnamed: 0,engines,passenger_capacity,crew,company_rating,review_scores_rating,d_check_complete,moon_clearance_complete,iata_approved,aaaaa
115794,2.0,4,3.0,1.0,96.0,False,False,True,2.0
238624,4.0,8,5.0,1.0,100.0,True,False,False,4.0
389153,1.0,2,1.0,1.0,65.0,True,False,False,1.0
628283,2.0,6,2.0,1.0,100.0,True,False,False,2.0
452204,1.0,2,1.0,1.0,60.0,True,False,False,1.0


In [14]:
tmp.values.shape

(607687, 9)

In [15]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType

initial_type = [("float_input", FloatTensorType([None, len(context.params["features"]) - len(context.params["categorical_features"])+1])),
                ("categorical_input", StringTensorType([None, len(context.params["categorical_features"])]))]
model_onnx = convert_sklearn(model, initial_types=initial_type)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  int(TensorProto.STRING): np.dtype(np.object)


MissingShapeCalculator: Unable to find a shape calculator for type '<class 'category_encoders.woe.WOEEncoder'>'.
It usually means the pipeline being converted contains a
transformer or a predictor with no corresponding converter
implemented in sklearn-onnx. If the converted is implemented
in another library, you need to register
the converted so that it can be used by sklearn-onnx (function
update_registered_converter). If the model is not yet covered
by sklearn-onnx, you may raise an issue to
https://github.com/onnx/sklearn-onnx/issues
to get the converter implemented or even contribute to the
project. If the model is a custom model, a new converter must
be implemented. Examples can be found in the gallery.


In [16]:
context.catalog.load("X_test").shape

2021-12-27 03:08:40,922 - kedro.io.data_catalog - INFO - Loading data from `X_test` (PickleDataSet)...


No files found in ['/kedro-sample/own_examples/conf/base', '/kedro-sample/own_examples/conf/local'] matching the glob pattern(s): ['credentials*', 'credentials*/**', '**/credentials*']
  warn(f"Credentials not found in your Kedro project config.\n{str(exc)}")


(151922, 8)

In [17]:
initial_type = [('float_input', FloatTensorType([None, 8]))]
onx = convert_sklearn(context.catalog.load("regressor"), initial_types=initial_type)
context.catalog.save("onx", onx.SerializeToString())

2021-12-27 03:08:41,689 - kedro.io.data_catalog - INFO - Loading data from `regressor` (PickleDataSet)...
2021-12-27 03:08:41,711 - kedro.io.data_catalog - INFO - Saving data to `onx` (PickleDataSet)...


In [18]:
with open("../../data/06_models/shuttles.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [19]:
import onnxruntime as rt
import numpy
sess = rt.InferenceSession(context.catalog.load("onx"))
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_onx = sess.run([label_name], {input_name: context.catalog.load("X_test").astype(numpy.float32).values})[0]

2021-12-27 03:08:43,055 - kedro.io.data_catalog - INFO - Loading data from `onx` (PickleDataSet)...
2021-12-27 03:08:43,090 - kedro.io.data_catalog - INFO - Loading data from `X_test` (PickleDataSet)...


In [20]:
pred_onx

array([[5788.9775],
       [3973.5654],
       [3594.4954],
       ...,
       [3748.6594],
       [5454.851 ],
       [4444.2515]], dtype=float32)

In [21]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder as SklOrdinalEncoder
from category_encoders import WOEEncoder, OrdinalEncoder
from skl2onnx import update_registered_converter, get_model_alias
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.common.utils import check_input_and_output_numbers
from skl2onnx.algebra.onnx_ops import OnnxCast
from skl2onnx.algebra.onnx_operator import OnnxSubEstimator
from skl2onnx.sklapi import WOETransformer

In [70]:
def ordenc_to_sklearn(op_mapping):
    "Converts OrdinalEncoder mapping to scikit-learn OrdinalEncoder."
    cats = []
    for column_map in op_mapping:
        col = column_map['col']
        print(col)
        while len(cats) <= col:
            cats.append(None)
        mapping = column_map['mapping']
        print(mapping)
        print(mapping.index[0])
        res = []
        for i in range(mapping.shape[0]):
            if mapping.index[i]!=mapping.index[i]:
                continue
            ind = mapping.iloc[i]
            print("ind",ind)
            while len(res) <= ind:
                res.append(0)
            res[ind] = mapping.index[i]
        cats[col] = np.array(res, dtype="O")

    skl_ord = SklOrdinalEncoder(categories=cats, dtype=np.int64)
    skl_ord.categories_ = cats
    return skl_ord


def ordinal_encoder_shape_calculator(operator):
    check_input_and_output_numbers(
        operator, input_count_range=1, output_count_range=1)
    input_type = operator.inputs[0].type.__class__
    input_dim = operator.inputs[0].get_first_dimension()
    shape = operator.inputs[0].type.shape
    second_dim = None if len(shape) != 2 else shape[1]
    output_type = input_type([input_dim, second_dim])
    operator.outputs[0].type = output_type


def ordinal_encoder_converter(scope, operator, container):
    op = operator.raw_operator
    opv = container.target_opset
    X = operator.inputs[0]

    skl_ord = ordenc_to_sklearn(op.mapping)
    cat = OnnxSubEstimator(skl_ord, X, op_version=opv,
                           output_names=operator.outputs[:1])
    cat.add_to(scope, container)


update_registered_converter(
    OrdinalEncoder, "CategoricalEncoderOrdinalEncoder",
    ordinal_encoder_shape_calculator,
    ordinal_encoder_converter)

In [71]:
def woeenc_to_sklearn(op_mapping):
    "Converts WOEEncoder mapping to scikit-learn OrdinalEncoder."
    cats = []
    ws = []
    for column_map in op_mapping.items():
        col = column_map[0]
        while len(cats) <= col:
            cats.append('passthrough')
            ws.append(None)
        mapping = column_map[1]
        intervals = []
        weights = []
        for i in range(mapping.shape[0]):
            ind = mapping.index[i]
            if ind < 0:
                continue
            intervals.append((float(ind - 1), float(ind), False, True))
            weights.append(mapping.iloc[i])
        cats[col] = intervals
        ws[col] = weights

    skl = WOETransformer(intervals=cats, weights=ws, onehot=False)
    skl.fit(None)
    return skl


def woe_encoder_parser(
        scope, model, inputs, custom_parsers=None):
    if len(inputs) != 1:
        raise RuntimeError(
            "Unexpected number of inputs: %d != 1." % len(inputs))
    if inputs[0].type is None:
        raise RuntimeError(
            "Unexpected type: %r." % (inputs[0], ))
    alias = get_model_alias(type(model))
    this_operator = scope.declare_local_operator(alias, model)
    this_operator.inputs.append(inputs[0])
    this_operator.outputs.append(
        scope.declare_local_variable('catwoe', FloatTensorType()))
    return this_operator.outputs


def woe_encoder_shape_calculator(operator):
    check_input_and_output_numbers(
        operator, input_count_range=1, output_count_range=1)
    input_dim = operator.inputs[0].get_first_dimension()
    shape = operator.inputs[0].type.shape
    second_dim = None if len(shape) != 2 else shape[1]
    output_type = FloatTensorType([input_dim, second_dim])
    operator.outputs[0].type = output_type


def woe_encoder_converter(scope, operator, container):
    op = operator.raw_operator
    opv = container.target_opset
    X = operator.inputs[0]

    sub = OnnxSubEstimator(op.ordinal_encoder, X,
                           op_version=opv)
    cast = OnnxCast(sub, op_version=opv, to=np.float32)
    skl_ord = woeenc_to_sklearn(op.mapping)
    cat = OnnxSubEstimator(skl_ord, cast, op_version=opv,
                           output_names=operator.outputs[:1],
                           input_types=[FloatTensorType()])
    cat.add_to(scope, container)


update_registered_converter(
    WOEEncoder, "CategoricalEncoderWOEEncoder",
    woe_encoder_shape_calculator,
    woe_encoder_converter,
    parser=woe_encoder_parser)

In [72]:
from xgboost import XGBClassifier
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost

update_registered_converter(
    XGBClassifier, 'XGBoostXGBClassifier',
    calculate_linear_classifier_output_shapes, convert_xgboost,
    options={'nocl': [True, False], 'zipmap': [True, False, 'columns']})

In [73]:
classifier = XGBClassifier(**context.params["model_params"]["hyper_parameters"])

categorical_transformer = Pipeline(steps=[
    ("we", WOEEncoder()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", "passthrough", [context.params["features"].index(cf) for cf in context.params["features"] if cf not in context.params["categorical_features"]]),
        ("categorical", categorical_transformer, [context.params["features"].index(cf) for cf in context.params["categorical_features"]]),
    ],
    remainder="drop")

model = Pipeline(steps=[
    ("precprocessor", preprocessor),
    ("regressor", classifier)
])

In [74]:
ppl = model.fit(
    tmp[context.params["features"]].values
    ,np.where(context.catalog.load("y_train")<context.catalog.load("y_train").mean(), 0,1)
)

2021-12-27 03:31:34,898 - kedro.io.data_catalog - INFO - Loading data from `y_train` (PickleDataSet)...
2021-12-27 03:31:34,920 - kedro.io.data_catalog - INFO - Loading data from `y_train` (PickleDataSet)...


No files found in ['/kedro-sample/own_examples/conf/base', '/kedro-sample/own_examples/conf/local'] matching the glob pattern(s): ['credentials*', 'credentials*/**', '**/credentials*']
  warn(f"Credentials not found in your Kedro project config.\n{str(exc)}")




In [75]:
len(context.params["features"]) - len(context.params["categorical_features"])

5

In [76]:
initial_type = [("float_input", FloatTensorType([None, len(context.params["features"]) - len(context.params["categorical_features"])])),
                ("categorical_input", StringTensorType([None, len(context.params["categorical_features"])]))]
model_onnx = convert_sklearn(model, initial_types=initial_type)

0
False    1
True     2
NaN     -2
dtype: int64
False
ind 1
ind 2
1
False    1
NaN     -2
dtype: int64
False
ind 1
2
True     1
False    2
NaN     -2
dtype: int64
True
ind 1
ind 2


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if arr.dtype == np.object:
