### 파이프라인

In [1]:
from functools import partial
from typing import NamedTuple

from kfp.components import InputPath, OutputPath
from kfp.components import create_component_from_func

# Container image passed as `base_image` to the components defined in this cell.
BASE_IMAGE = "easyjin/penguins-example:0.1"

@partial(
    create_component_from_func,
    base_image=BASE_IMAGE,
    packages_to_install=["palmerpenguins"]
)
def dataprep():
    """Load the penguins dataset, drop incomplete rows and write CSV files.

    Writes three files under ``/mnt``: the cleaned dataset
    (``penguins-data.csv``, written with the index column) and a train/test
    split with ``test_size=0.2`` (``train-ds.csv`` and ``test-ds.csv``, both
    written without the index column).
    """
    # Imports are kept local to the function body, as in the original.
    from palmerpenguins import load_penguins
    from sklearn.model_selection import train_test_split

    cleaned = load_penguins().dropna(axis=0)
    cleaned.to_csv('/mnt/penguins-data.csv')

    # Fixed random_state keeps the split reproducible between runs.
    train_part, test_part = train_test_split(cleaned, test_size=0.2, random_state=47)

    for frame, target in (
        (train_part, '/mnt/train-ds.csv'),
        (test_part, '/mnt/test-ds.csv'),
    ):
        frame.to_csv(target, index=False)
    
@partial(
    create_component_from_func,
    base_image=BASE_IMAGE,
)
def trainer(
    criterion: str,
    n_estimators: int,
    max_depth: int,
    model_path: OutputPath("dill"),
):
    """Fit a preprocessing + random-forest pipeline and dump it with dill.

    Args:
        criterion: ``criterion`` passed to ``RandomForestClassifier``.
        n_estimators: ``n_estimators`` passed to ``RandomForestClassifier``.
        max_depth: ``max_depth`` passed to ``RandomForestClassifier``.
        model_path: File the fitted pipeline is written to.
    """
    import dill
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    numeric_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
    categorical_cols = ["island", "sex"]

    # One sub-pipeline per column family: scale numbers, one-hot encode categories.
    scale_numeric = Pipeline(steps=[("scaler", StandardScaler())])
    encode_categorical = Pipeline(
        steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
    )
    feature_prep = ColumnTransformer(
        transformers=[
            ("num", scale_numeric, numeric_cols),
            ("cat", encode_categorical, categorical_cols),
        ]
    )

    forest = RandomForestClassifier(
        criterion=criterion,
        n_estimators=n_estimators,
        max_depth=max_depth,
        oob_score=True,
        n_jobs=-1,
    )
    # The step names "prep" and "rf" are part of the saved model's interface.
    model = Pipeline(steps=[("prep", feature_prep), ("rf", forest)])

    # First CSV column is used as the target, the remaining columns as features.
    train_ds = pd.read_csv('/mnt/train-ds.csv')
    features = train_ds.iloc[:, 1:]
    labels = train_ds.iloc[:, 0].values.ravel()

    model.fit(features, labels)

    # Persist the fitted pipeline to the component's output path.
    with open(model_path, 'wb') as sink:
        dill.dump(model, sink)

@partial(
    create_component_from_func,
    base_image=BASE_IMAGE,
)
def metric(
    model_path: InputPath("dill"),
    # mlpipeline_ui_metadata_path: OutputPath("UI_Metadata"),
    mlpipeline_metrics_path: OutputPath("Metrics")
):
    """Score the trained model on the held-out split and write the metrics file.

    Two metrics are written as JSON to ``mlpipeline_metrics_path``:
    ``oob-score`` (the ``oob_score_`` attribute of the pipeline's ``"rf"``
    step) and ``accuracy-score`` (accuracy on ``/mnt/test-ds.csv``).

    Args:
        model_path: File containing the dill-serialized fitted pipeline.
        mlpipeline_metrics_path: File the metrics JSON is written to.
    """
    import json
    import pandas as pd
    import dill
    from sklearn.metrics import accuracy_score

    # Evaluate on the test split. The original read the *training* CSV here,
    # which reported training accuracy under the name "accuracy-score".
    test_ds = pd.read_csv('/mnt/test-ds.csv')
    # First column is the target, the remaining columns are the features.
    X_test = test_ds.iloc[:, 1:]
    y_test = test_ds.iloc[:, 0].values.ravel()

    with open(model_path, mode="rb") as f:
        model = dill.load(f)

    y_pred = model.predict(X_test)

    metrics = {
        'metrics': [
            {
                'name': 'oob-score',
                # float() guarantees a plain JSON-serializable number.
                'numberValue': float(model['rf'].oob_score_),
                'format': 'RAW'
            },
            {
                'name': 'accuracy-score',
                # scikit-learn's signature is accuracy_score(y_true, y_pred).
                'numberValue': float(accuracy_score(y_test, y_pred)),
                'format': 'RAW'
            }
        ]
    }

    with open(mlpipeline_metrics_path, 'w') as f:
        json.dump(metrics, f)
        

@partial(
    create_component_from_func,
    base_image=BASE_IMAGE,
)
def stack_mlflow(
    model_name: str,
    model_path: InputPath("dill"),
):
    """Save the trained model in MLflow format and log it as a run artifact.

    The model is saved to a local directory named ``model_name`` together
    with an inferred signature and a one-row input example. That directory is
    then logged as an artifact of a new run in experiment ``"0"``.

    Args:
        model_name: Local directory the MLflow model is saved to and logged from.
        model_path: File containing the dill-serialized fitted pipeline.
    """
    import os
    import dill
    import pandas as pd

    from mlflow.models.signature import infer_signature
    from mlflow.sklearn import save_model
    from mlflow.tracking.client import MlflowClient

    # NOTE(review): object-store endpoint and credentials are hard-coded;
    # consider injecting them through the pod environment / a secret instead.
    os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000"
    os.environ["AWS_ACCESS_KEY_ID"] = "minio"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123"

    client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000")

    # The training features are used only for the signature and input example.
    train_ds = pd.read_csv('/mnt/train-ds.csv')
    X_train = train_ds.iloc[:, 1:]

    with open(model_path, mode="rb") as f:
        model = dill.load(f)

    input_example = X_train.sample(1)
    signature = infer_signature(X_train, model.predict(X_train))

    save_model(
        sk_model=model,
        path=model_name,
        serialization_format="cloudpickle",
        signature=signature,
        input_example=input_example,
    )

    run = client.create_run(experiment_id="0")
    # Runs created through MlflowClient are not closed automatically, so
    # terminate explicitly; otherwise the run stays in RUNNING state.
    try:
        client.log_artifact(run.info.run_id, model_name)
    except Exception:
        client.set_terminated(run.info.run_id, status="FAILED")
        raise
    client.set_terminated(run.info.run_id)

In [2]:
import kfp.dsl as dsl
from kfp import onprem

def trainer_func(criterion, n_estimators, max_depth):
    """Build a ``ContainerOp`` that runs ``trainer.py`` in the trainer image.

    The three hyperparameters are forwarded as command-line arguments, and
    the training data path is fixed to ``/mnt/train-ds.csv``. The file
    ``model.pkl`` is declared as the op's ``model`` output.
    """
    cli_arguments = [
        "--criterion", criterion,
        "--n_estimators", n_estimators,
        "--max_depth", max_depth,
        "--train_ds_path", '/mnt/train-ds.csv',
    ]
    declared_outputs = {"model": "model.pkl"}

    training_op = dsl.ContainerOp(
        name="training model",
        image="192.168.0.50:5100/penguins-trainer:0.1",
        command=["python", "trainer.py"],
        arguments=cli_arguments,
        file_outputs=declared_outputs,
    )
    return training_op

@dsl.pipeline(name="penguins-pipeline")
def penguins_pipeline(
    criterion: str = 'gini',
    n_estimators: int = 300,
    max_depth: int = 3,
    model_name: str = 'penguins-clf',
):
    """Wire data preparation, training, evaluation and MLflow packaging.

    Every step mounts the ``pvc-penguins`` claim at ``/mnt``. Training is
    ordered after data preparation; the metric and MLflow steps consume the
    trainer's ``model`` output.
    """
    mount_data = onprem.mount_pvc(
        pvc_name='pvc-penguins',
        volume_name='data-mnt',
        volume_mount_path='/mnt',
    )

    prep_task = dataprep().apply(mount_data)

    # Alternative kept from the original: trainer_func(criterion, n_estimators, max_depth)
    train_task = trainer(criterion, n_estimators, max_depth)
    train_task.apply(mount_data)
    train_task.after(prep_task)

    trained_model = train_task.outputs["model"]

    metric(model=trained_model).apply(mount_data)

    stack_mlflow(model_name=model_name, model=trained_model).apply(mount_data)

from kfp import compiler
# Compile the pipeline function into the package file 'penguins_pipeline.yaml'.
compiler.Compiler().compile(penguins_pipeline, 'penguins_pipeline.yaml')

In [3]:
import requests

# NOTE(review): credentials are hard-coded in the notebook; prefer reading
# them from the environment or a prompt before sharing this file.
USERNAME = "user@example.com"
PASSWORD = "12341234"
NAMESPACE = "kubeflow-user-example-com"
HOST = "http://localhost:30398"

session = requests.Session()
# The final URL of this GET is used as the login form target below
# (presumably the request is redirected to the auth service).
response = session.get(HOST)

headers = {
    "Content-Type": "application/x-www-form-urlencoded",
}

data = {"login": USERNAME, "password": PASSWORD}
session.post(response.url, headers=headers, data=data)

# A failed login leaves no session cookie; fail with a clear message instead
# of a bare KeyError on the cookie lookup.
session_cookie = session.cookies.get_dict().get("authservice_session")
if session_cookie is None:
    raise RuntimeError(
        f"Login to {HOST} did not return an 'authservice_session' cookie; "
        "check HOST, USERNAME and PASSWORD."
    )

import kfp
client = kfp.Client(
    host=f"{HOST}/pipeline",
    namespace=NAMESPACE,
    cookies=f"authservice_session={session_cookie}"
    )

In [4]:
EXP_NAME = "examples"

# Pipeline parameter values for this run (they override the pipeline defaults).
arguments = {
    'criterion': 'entropy',
    'n_estimators': 172,
    'max_depth': 4,
    'model_name': 'penguins-clf',
}
client.create_run_from_pipeline_func(
    penguins_pipeline,
    arguments=arguments,
    experiment_name=EXP_NAME,
)

RunPipelineResult(run_id=67d5218f-5e40-40cd-91fe-2e262696da40)

### katib

In [18]:
import kfp
import kfp.dsl as dsl
from kfp import components

from kubeflow.katib import ApiClient
from kubeflow.katib import V1beta1ExperimentSpec
from kubeflow.katib import V1beta1AlgorithmSpec
from kubeflow.katib import V1beta1EarlyStoppingSpec
from kubeflow.katib import V1beta1EarlyStoppingSetting
from kubeflow.katib import V1beta1ObjectiveSpec
from kubeflow.katib import V1beta1ParameterSpec
from kubeflow.katib import V1beta1FeasibleSpace
from kubeflow.katib import V1beta1TrialTemplate
from kubeflow.katib import V1beta1TrialParameterSpec
from kubeflow.katib import V1beta1MetricsCollectorSpec

EXP_NAME = "examples"
# Correctly spelled name, which the Katib pipeline below reads as NAMESPACE.
NAMESPACE = "kubeflow-user-example-com"
# Misspelled alias kept so existing references keep working.
NAMESPANCE = NAMESPACE

# Trial budget for the Katib experiment.
max_trial_count = 10
max_failed_trial_count = 3
parallel_trial_count = 2

# Objective: maximize the metric named "accuracy" (no goal value is set).
objective = V1beta1ObjectiveSpec(
    objective_metric_name="accuracy",
    type="maximize",
)

# Search algorithm used to suggest hyperparameters.
algorithm = V1beta1AlgorithmSpec(algorithm_name="bayesianoptimization")

# Early stopping with the "medianstop" algorithm and its single setting.
early_stopping = V1beta1EarlyStoppingSpec(
    algorithm_settings=[
        V1beta1EarlyStoppingSetting(value="2", name="min_trials_required"),
    ],
    algorithm_name="medianstop",
)

# Search space: two integer ranges and one categorical choice.
parameters = [
    V1beta1ParameterSpec(
        feasible_space=V1beta1FeasibleSpace(step="100", max="1000", min="100"),
        parameter_type="int",
        name="n_estimators",
    ),
    V1beta1ParameterSpec(
        feasible_space=V1beta1FeasibleSpace(step="1", max="6", min="2"),
        parameter_type="int",
        name="max_depth",
    ),
    V1beta1ParameterSpec(
        feasible_space=V1beta1FeasibleSpace(list=["gini", "entropy", "log_loss"]),
        parameter_type="categorical",
        name="criterion",
    ),
]

In [20]:
# Job manifest used as the trial template. The ${trialParameters.*}
# placeholders in the command use the names criterion, nEstimators and maxDepth.
trial_spec = {
    "apiVersion": "batch/v1",
    "kind": "Job",
    "spec": {
        "template": {
            # Sets the Istio sidecar-injection annotation to "false".
            "metadata": {"annotations": {"sidecar.istio.io/inject": "false"}},
            "spec": {
                "containers": [
                    {
                        "name": "training-container",
                        "image": "easyjin/penguins-trainer:0.1",
                        "command": [
                            "python",
                            "trainer.py",
                            "--criterion=${trialParameters.criterion}",
                            "--n_estimators=${trialParameters.nEstimators}",
                            "--max_depth=${trialParameters.maxDepth}",
                            "--train_ds_path=/mnt/train-ds.csv",
                        ],
                        "volumeMounts": [{"mountPath": "/mnt", "name": "katib-mnt"}],
                    },
                ],
                # The "pvc-penguins" claim is exposed as the "katib-mnt" volume.
                "volumes": [
                    {
                        "name": "katib-mnt",
                        "persistentVolumeClaim": {"claimName": "pvc-penguins"},
                    },
                ],
                "restartPolicy": "Never",
            },
        },
    },
}

# Maps each trial parameter name to the search-space parameter it references.
trial_template = V1beta1TrialTemplate(
    trial_spec=trial_spec,
    trial_parameters=[
        V1beta1TrialParameterSpec(
            reference="criterion",
            name="criterion",
            description="criterion for the training model",
        ),
        V1beta1TrialParameterSpec(
            reference="n_estimators",
            name="nEstimators",
            description="Number of training model estimators",
        ),
        V1beta1TrialParameterSpec(
            reference="max_depth",
            name="maxDepth",
            description="Training model maxDepth",
        ),
    ],
    primary_container_name="training-container",
    retain=True,
)

In [21]:
# Assemble the full experiment spec from the pieces defined in earlier cells.
experiment_spec = V1beta1ExperimentSpec(
    objective=objective,
    algorithm=algorithm,
    early_stopping=early_stopping,
    parameters=parameters,
    trial_template=trial_template,
    max_trial_count=max_trial_count,
    max_failed_trial_count=max_failed_trial_count,
    parallel_trial_count=parallel_trial_count,
)

# NOTE(review): the component is loaded from the `master` branch, so its
# definition can change between runs; consider pinning a commit or tag.
katib_experiment_launcher_op = components.load_component_from_url(
    "https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml"
)

In [22]:
def convert_katib_results(katib_results, output_path: str = '/mnt/hyps.json'):
    """Extract the best trial's hyperparameters and save them as flat JSON.

    Args:
        katib_results: JSON string containing
            ``currentOptimalTrial.parameterAssignments``, a list of
            ``{"name": ..., "value": ...}`` entries.
        output_path: File the ``{name: value}`` mapping is written to.
            Defaults to an absolute path: the original relative
            ``mnt/hyps.json`` depended on the working directory.

    Raises:
        KeyError: If the expected keys are missing from ``katib_results``.
    """
    # Imported inside the body, as in the original.
    import json

    katib_res = json.loads(katib_results)
    assignments = katib_res['currentOptimalTrial']["parameterAssignments"]

    convert_res = {p['name']: p['value'] for p in assignments}

    with open(output_path, 'w') as f:
        json.dump(convert_res, f)

In [23]:
from kfp import onprem

@dsl.pipeline(
    name="Launch Katib Experiment",
    description="An example to launch Katib Experiment with early stopping"
)
def penguins_hyps():
    """Launch the Katib experiment, save the best parameters and echo the result.

    The launcher output goes to ``convert_katib_results``, which runs with the
    ``pvc-penguins`` claim mounted at ``/mnt``. A separate shell step echoes
    the same output.
    """
    serialized_spec = ApiClient().sanitize_for_serialization(experiment_spec)
    katib_op = katib_experiment_launcher_op(
        experiment_name=EXP_NAME,
        experiment_namespace=NAMESPACE,
        experiment_spec=serialized_spec,
        experiment_timeout_minutes=10,
        delete_finished_experiment=False,
    )

    # Turn the plain Python function into a container op.
    to_container_op = components.func_to_container_op(convert_katib_results)
    best_hp_op = to_container_op(katib_op.output)
    pvc_mount = onprem.mount_pvc(
        pvc_name='pvc-penguins',
        volume_name='data-mnt',
        volume_mount_path='/mnt',
    )
    best_hp_op.apply(pvc_mount)

    # The echo op is created only for its effect; the returned object was unused.
    dsl.ContainerOp(
        name="best-hp",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo Best HyperParameters: %s" % katib_op.output],
    )

In [24]:
# Submit the Katib pipeline as a run in the EXP_NAME experiment; it takes no arguments.
client.create_run_from_pipeline_func(penguins_hyps, experiment_name = EXP_NAME, arguments={})

RunPipelineResult(run_id=610fb159-6e9f-423a-9a8f-5f667428c71a)

In [None]:
import json

# NOTE(review): presumably this is the host-side path of the volume that the
# pipeline mounts at /mnt — confirm.
with open('/mnt/nfsroot/pv-penguins/hyps.json') as f:
    best_hyp = json.load(f)

# convert_katib_results writes a flat {name: value} mapping, while a raw Katib
# result nests the values under currentOptimalTrial/parameterAssignments.
# Accept both shapes so a flat file does not raise KeyError.
if 'currentOptimalTrial' in best_hyp:
    best_param = {
        p['name']: p['value']
        for p in best_hyp['currentOptimalTrial']["parameterAssignments"]
    }
else:
    best_param = dict(best_hyp)

print(best_param)

In [6]:
from kubeflow import katib

# Search space for KatibClient.tune, matching the V1beta1 parameter specs above.
params = {
    "n_estimators": katib.search.int(min=100, max=1000, step=100),
    "max_depth": katib.search.int(min=2, max=6, step=1),
    "criterion": katib.search.categorical(['gini', 'entropy', 'log_loss'])
}


# Correctly spelled name; the misspelled NAMESPANCE is kept as an alias so
# existing references keep working.
NAMESPACE = 'kubeflow-user-example-com'
NAMESPANCE = NAMESPACE
katib_client = katib.KatibClient()
# NOTE(review): `trainer` is the KFP component declared earlier in this
# notebook, not a plain training function — verify that `tune` accepts it as
# an objective.
katib_client.tune(
    name=EXP_NAME,
    namespace=NAMESPACE,
    objective=trainer,
    parameters=params,
    algorithm_name="bayesianoptimization",
    objective_metric_name="accuracy",
    # additional_metric_names=["loss"],
    max_trial_count=20,
    base_image='python:3.8.10-slim',
    packages_to_install=['scikit-learn', 'pandas']
)
# Query the namespace the experiment was created in. The original looked in
# 'easyjin', where this experiment does not exist.
status = katib_client.is_experiment_succeeded(EXP_NAME, namespace=NAMESPACE)
print(f"Katib Experiment is Succeeded: {status}\n")
best_hps = katib_client.get_optimal_hyperparameters(EXP_NAME, namespace=NAMESPACE)
katib_client.get_suggestion(EXP_NAME, namespace=NAMESPACE)