# serverless lightgbm

In [16]:
# --- notebook configuration -------------------------------------------------
# Remote archive handed to the acquire step as its `archive_url` parameter.
ARCHIVE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
# Name given to the acquire step (`name`) and read by the train step (`src_file`).
FILE_NAME = "higgs.parquet"
# Not referenced by any other cell visible in this notebook.
CHUNK_SIZE = 10_000
# Used as `target_path` / `out_path` for both pipeline steps and as the
# location of the compiled pipeline yaml. Note the trailing slash.
TARGET_PATH = "/User/mlrun/models/"
# File name passed to the train step as its `name` parameter.
MODEL_NAME = "lgb-classifier.pkl"

In [17]:
# Column names passed to the acquire step as its `header` parameter:
# the label column first, followed by the feature columns.
HIGGS_HEADER = [
    'labels',
    'lepton_pT', 'lepton_eta', 'lepton_phi',
    'missing_energy_magnitude', 'missing_energy_phi',
    # jet_1 .. jet_4, each with the same four measurements in the same order
    *(
        f'jet_{jet}_{field}'
        for jet in range(1, 5)
        for field in ('pt', 'eta', 'phi', 'b-tag')
    ),
    'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
]

In [12]:
import mlrun

#### _acquire_ - use an existing github function to acquire and store data

In [13]:
# Import the arc_to_parquet function spec from its GitHub-hosted yaml ...
acquire_job = mlrun.import_function(
    'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils/arc_to_parquet/arc_to_parquet.yaml'
)
# ... apply the v3io mount modifier and keep the object that `apply` returns ...
acquire_job = acquire_job.apply(mlrun.mount_v3io())
# ... then deploy it. As the last expression, its return value is the cell output.
acquire_job.deploy()



[mlrun] 2020-01-21 21:42:55,693 database connection is not configured
[mlrun] 2020-01-21 21:42:55,694 building image (.mlrun/func-default-arc-to-parquet-latest)
FROM python:3.6-jessie
RUN python -m pip uninstall mlrun
RUN python -m pip install -U -q mlrun
RUN python -m pip install -U -q pandas
RUN python -m pip install -U -q pyarrow
RUN python -m pip install -U -q numpy==1.17.4
RUN pip install mlrun

[mlrun] 2020-01-21 21:42:55,696 using in-cluster config.
[mlrun] 2020-01-21 21:42:55,713 Pod mlrun-build-arc-to-parquet-fzdsd created
..
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file or directory 
[36mINFO[0m[0000] Downloading base image 

True

#### _train_ - use the github function spec

In [33]:
# Import the training function from its yaml spec on the local filesystem.
train_job = mlrun.import_function(
    '/User/repos/functions/serving/lightgbm/train.yaml'
)

In [15]:
# Apply the v3io mount modifier to the training function (the return value
# of `apply` is intentionally not rebound here).
train_job.apply(
    mlrun.mount_v3io()
)
# Deploy the function. As the last expression, its return value is the cell output.
train_job.deploy()

[mlrun] 2020-01-21 21:45:12,731 database connection is not configured
[mlrun] 2020-01-21 21:45:12,732 building image (.mlrun/func-default-lgbm-job-latest)
FROM python:3.6-jessie
RUN rm /conda/lib/python3.6/site-packages/seaborn* -rf
RUN pip uninstall -y mlrun
RUN pip install -U -q mlrun
RUN pip install -U -q kfp
RUN pip install -U -q pyarrow
RUN pip install -U -q pandas
RUN pip install -U -q matplotlib
RUN pip install -U -q seaborn
RUN pip install -U -q scikit-learn
RUN pip install -U -q lightgbm
RUN pip install mlrun

[mlrun] 2020-01-21 21:45:12,740 Pod mlrun-build-lgbm-job-gx2kk created
..
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file

True

<a id="pipeline"></a>
### create a kubeflow pipeline

In [27]:
import kfp
from kfp import dsl

In [28]:
# Create the model-serving function named 'classifier' from the
# classifier_server notebook, using its `ClassifierModel` class.
srvfn = mlrun.new_model_server(
    'classifier',
    filename='/User/repos/functions/serving/classifier_server.ipynb',
    model_class='ClassifierModel',
)

# Apply the v3io mount modifier. Kept as the last expression so the repr of
# the returned object is shown as the cell output.
srvfn.apply(mlrun.mount_v3io())

<mlrun.runtimes.function.RemoteRuntime at 0x7efe87f9a358>

In [29]:
@dsl.pipeline(name='LGBM', description='lightgbm classifier')
def lgbm_pipeline(learning_rate = [0.1, 0.3], num_leaves = [31, 32]):
    # Pipeline of three chained steps:
    #   1. acquire -- run the `arc_to_parquet` handler of `acquire_job`
    #   2. train   -- run the `train` handler of `train_job`, fed by step 1's
    #                 'header' output
    #   3. deploy  -- deploy `srvfn` with step 2's 'model' output
    #
    # NOTE(review): `learning_rate` and `num_leaves` are not referenced anywhere
    # in this body -- confirm whether they were meant to be forwarded to the
    # train step.

    # ---- step 1: acquire -------------------------------------------------
    acquire_params = {
        'archive_url': ARCHIVE_URL,
        'header':      HIGGS_HEADER,
        'name':        FILE_NAME,
        'target_path': TARGET_PATH,
    }
    acquire_step = acquire_job.as_step(
        name='acquire_remote_data',
        handler='arc_to_parquet',
        params=acquire_params,
        outputs=['header'],
        out_path=TARGET_PATH,
    )
    acquire_step = acquire_step.apply(mlrun.mount_v3io())

    # ---- step 2: train ---------------------------------------------------
    model_labels = {
        'type'      : 'classifier',
        'framework' : 'lightgbm',
        'mode'      : 'model',
    }
    train_params = {
        'src_file':         FILE_NAME,
        'sample':           20000,
        'test_size':        0.1,
        'train_val_split':  0.75,
        'target_path':      TARGET_PATH,
        'name':             MODEL_NAME,
        'key' :             'model',
        'verbose':          False,
        'exp_labels':       model_labels,
    }
    train_step = train_job.as_step(
        name='train_model',
        handler='train',
        # consuming the acquire step's output is what chains the two steps
        inputs={'header' : acquire_step.outputs['header']},
        params=train_params,
        outputs=['model'],
        out_path=TARGET_PATH,
    )
    train_step = train_step.apply(mlrun.mount_v3io())

    # ---- step 3: deploy the serving function with the trained model ------
    srvfn.deploy_step(
        models={'classifier_gen': train_step.outputs['model']},
        project='default',
    )

<a id="compile the pipeline"></a>
### compile the pipeline

We can compile our Kubeflow pipeline and produce a YAML description of the pipeline workflow:

In [30]:
# TARGET_PATH is defined with a trailing '/', so concatenating '/mlrunpipe.yaml'
# directly produced a doubled separator ('.../models//mlrunpipe.yaml').
# Strip any trailing slash before joining so the output path is well-formed.
pipeline_spec_path = TARGET_PATH.rstrip('/') + '/mlrunpipe.yaml'
kfp.compiler.Compiler().compile(lgbm_pipeline, pipeline_spec_path)

In [31]:
# Kubeflow Pipelines client created for the 'default-tenant' namespace.
client = kfp.Client(
    namespace='default-tenant',
)

Finally, the following cell submits the pipeline as a run:

In [32]:
# Empty arguments dict: no pipeline parameter values are supplied here.
arguments = dict()

# Submit `lgbm_pipeline` through the client; the returned run handle is kept
# in `run_result`.
run_result = client.create_run_from_pipeline_func(
    lgbm_pipeline,
    arguments,
    experiment_name='classifier',
    run_name='my classifier run',
)