In [1]:
import mlrun
from os import path

# Set the base project name
project_name_base = 'securenet'
# Initialize the MLRun environment and save the project name and artifacts path
project_name, artifact_path = mlrun.set_environment(project=project_name_base,
                                                    user_project=True)

print(f'Project name: {project_name}')
print(f'Artifact path: {artifact_path}')

Project name: securenet-floyed
Artifact path: v3io:///projects/{{run.project}}/artifacts


In [2]:
# nuclio: start-code

In [3]:
from sklearn import preprocessing
from os import path
import numpy as np 
import pandas as pd
import datetime as dt
from sklearn.model_selection import train_test_split
from mlrun.execution import MLClientCtx
from pickle import dumps
from sklearn.linear_model import LogisticRegression
import mlrun

In [27]:
def train_data(context,
               dataset:mlrun.DataItem,
               label_column: str = "label"):
    
    train_df = dataset.as_df()
    y = train_df[label_column]
   
    train_df = train_df.drop(columns=[label_column])
    train_df = train_df.drop(train_df.columns[[0]], axis=1)
    
    X_train,X_test,y_train,y_test = train_test_split(train_df,y,random_state=123,test_size=0.10)
    model = LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)
                   
    context.log_dataset('train_set', 
                        df=pd.concat([X_train, y_train.to_frame()], axis=1),
                        format='csv', index=False, 
                        #artifact_path=context.artifact_subpath('data')
                       )
                       

    context.log_dataset('test_set', 
                        df=pd.concat([X_test, y_test.to_frame()], axis=1),
                        format='csv', index=False, 
                        labels={"data-type": "held-out"},
                        #artifact_path=context.artifact_subpath('data')
                       )
    context.log_model('model',
                     body=dumps(model),
                     #artifact_path=context.artifact_subpath("model"),
                     model_file="model.pkl",
                    metrics=context.results,
                    labels={"class": "sklearn.linear_model.LogisticRegression"})
    
    context.logger.info('End training')


In [5]:
# nuclio: end-code

## Converting to ML run function

In [28]:
train_data_func = mlrun.code_to_function(name='train_data', kind='job', image='mlrun/mlrun')


## Run function locally

In [25]:
dataset = 'store://datasets/securenet-floyed/data_clean_cleaned_data:latest'

In [29]:
out = artifact_path 

train_data_run = train_data_func.run(name='train_data',
                                   handler=train_data,
                                   inputs={'dataset': dataset},
                                    
                                   local=True,
                                    artifact_path=path.join(out, '{{run.uid}}'))

> 2021-07-06 22:33:26,087 [info] starting run train_data uid=3cb7db852324487c99fd7f217eb6f7b6 DB=http://mlrun-api:8080
> 2021-07-06 22:33:29,700 [info] End training


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
securenet-floyed,...7eb6f7b6,0,Jul 06 22:33:26,completed,train_data,v3io_user=floyedkind=owner=floyedhost=jupyter-868bd46cc9-rb2mn,dataset,,,train_settest_setmodel


to track results use .show() or .logs() or in CLI: 
!mlrun get run 3cb7db852324487c99fd7f217eb6f7b6 --project securenet-floyed , !mlrun logs 3cb7db852324487c99fd7f217eb6f7b6 --project securenet-floyed
> 2021-07-06 22:33:29,776 [info] run executed, status=completed


In [30]:
train_data_run.outputs

{'train_set': 'store://artifacts/securenet-floyed/train_data_train_set:3cb7db852324487c99fd7f217eb6f7b6',
 'test_set': 'store://artifacts/securenet-floyed/train_data_test_set:3cb7db852324487c99fd7f217eb6f7b6',
 'model': 'store://artifacts/securenet-floyed/train_data_model:3cb7db852324487c99fd7f217eb6f7b6'}

## Mounting

In [35]:
from mlrun.platforms import auto_mount
train_data_func = train_data_func.apply(auto_mount())


## Running on cluster

In [33]:
import sys
sys.path.append('/v3io/users/floyed/data')


In [36]:
train_data_run = train_data_func.run(name='train_data',
                                   handler=train_data,
                                   inputs={'dataset': dataset},
                                   
                                   local=False)

> 2021-07-06 22:41:24,057 [info] starting run train_data uid=f1e2843bf63049f78530bdb5ff33a1e7 DB=http://mlrun-api:8080
> 2021-07-06 22:41:24,286 [info] Job is running in the background, pod: train-data-msk82
> 2021-07-06 22:41:32,315 [info] End training
> 2021-07-06 22:41:32,366 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
securenet-floyed,...ff33a1e7,0,Jul 06 22:41:30,completed,train_data,v3io_user=floyedkind=jobowner=floyedhost=train-data-msk82,dataset,,,train_settest_setmodel


to track results use .show() or .logs() or in CLI: 
!mlrun get run f1e2843bf63049f78530bdb5ff33a1e7 --project securenet-floyed , !mlrun logs f1e2843bf63049f78530bdb5ff33a1e7 --project securenet-floyed
> 2021-07-06 22:41:33,526 [info] run executed, status=completed


In [38]:
train_data_run.outputs

{'train_set': 'store://artifacts/securenet-floyed/train_data_train_set:f1e2843bf63049f78530bdb5ff33a1e7',
 'test_set': 'store://artifacts/securenet-floyed/train_data_test_set:f1e2843bf63049f78530bdb5ff33a1e7',
 'model': 'store://artifacts/securenet-floyed/train_data_model:f1e2843bf63049f78530bdb5ff33a1e7'}

In [37]:
train_data_run.outputs['model']

'store://artifacts/securenet-floyed/train_data_model:f1e2843bf63049f78530bdb5ff33a1e7'

In [12]:
train_data_run.outputs

{'train_set': 'store://artifacts/securenet-floyed/train-data-train_data_train_set:e294d20014d2417e8de725fac1d597ab',
 'test_set': 'store://artifacts/securenet-floyed/train-data-train_data_test_set:e294d20014d2417e8de725fac1d597ab',
 'model': 'store://artifacts/securenet-floyed/train-data-train_data_model:e294d20014d2417e8de725fac1d597ab'}

In [39]:
test = mlrun.run.get_dataitem(train_data_run.outputs['test_set']).as_df()

In [40]:
test.head()

Unnamed: 0,url_len,ip_add,geo_loc,tld,who_is,https,special_chars_count,parameter_count,digit_count,label
0,34,10280,159,64,0,1,5,0,0,1
1,30,21696,87,64,1,0,8,0,0,0
2,42,21150,24,230,0,1,9,0,1,1
3,82,48274,139,262,0,1,20,3,1,1
4,33,7673,175,172,0,0,7,0,0,1


## Testing model

In [41]:
test_model = mlrun.import_function('hub://test_classifier').apply(auto_mount())

In [42]:
test_run = test_model.run(name="test",
                    params={"label_column": "label",
                            #"plots_dest": path.join("plots", "test")
                           },
                    inputs={"models_path": train_data_run.outputs['model'],
                            "test_set": train_data_run.outputs['test_set']
                            })
print(f'Test Accuracy: {test_run.outputs["accuracy"]}')



> 2021-07-06 22:42:31,439 [info] starting run test uid=4d1b391f7df44c548270034c086e1ecb DB=http://mlrun-api:8080
> 2021-07-06 22:42:31,677 [info] Job is running in the background, pod: test-69rrq
> 2021-07-06 22:42:39,478 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
securenet-floyed,...086e1ecb,0,Jul 06 22:42:38,completed,test,v3io_user=floyedkind=jobowner=floyedhost=test-69rrq,models_pathtest_set,label_column=label,accuracy=0.8418414702060516test-error=0.1581585297939484rocauc=0.9050366432078779brier_score=0.11872366106914786f1-score=0.8545081967213114precision_score=0.86875recall_score=0.8407258064516129,probability-calibrationconfusion-matrixprecision-recall-binaryroc-binarytest_set_preds


to track results use .show() or .logs() or in CLI: 
!mlrun get run 4d1b391f7df44c548270034c086e1ecb --project securenet-floyed , !mlrun logs 4d1b391f7df44c548270034c086e1ecb --project securenet-floyed
> 2021-07-06 22:42:40,969 [info] run executed, status=completed
Test Accuracy: 0.8418414702060516


## Setting the train function

In [43]:
project_path = path.abspath('conf')
securenet_proj =mlrun.new_project(project_name_base,
                            context=project_path,
                            init_git=True,
                            user_project=True)

In [44]:

fn = f'db://securenet-floyed/train-data'
f = mlrun.import_function(fn)
f


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f02216a0150>

In [45]:
securenet_proj.set_function(f)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f02216a0150>

In [46]:
securenet_proj.save()

In [47]:
securenet_proj.functions

[{'name': 'train-data',
  'spec': {'kind': 'job',
   'metadata': {'name': 'train-data',
    'tag': 'latest',
    'hash': '0c2ffcee3373c9b07757888ed399b8804ec5e91b',
    'project': 'securenet-floyed',
    'categories': [],
    'updated': '2021-07-06T22:41:24.172655+00:00'},
   'spec': {'command': '',
    'args': [],
    'image': 'mlrun/mlrun',
    'env': [{'name': 'V3IO_API', 'value': ''},
     {'name': 'V3IO_USERNAME', 'value': ''},
     {'name': 'V3IO_ACCESS_KEY', 'value': ''}],
    'default_handler': '',
    'entry_points': {'train_data': {'name': 'train_data',
      'doc': '',
      'parameters': [{'name': 'context', 'default': ''},
       {'name': 'dataset', 'type': 'DataItem', 'default': ''},
       {'name': 'label_column', 'type': 'str', 'default': 'label'}],
      'outputs': [{'default': ''}],
      'lineno': 14}},
    'description': '',
    'build': {'functionSourceCode': 'IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKZnJvbSBza2xlYXJuIGltcG9ydCBwcmVwcm9jZXNzaW5nCm