In [1]:
import mlrun

from os import path
import pandas as pd
from mlrun.execution import MLClientCtx

In [2]:
# Base name of the project; `user_project=True` below asks MLRun for a per-user project name.
project_name_base = 'securenet'

# Initialize the MLRun environment and keep the two values it returns:
# the resolved project name and the default artifacts path.
project_name, artifact_path = mlrun.set_environment(
    project=project_name_base,
    user_project=True,
)

# Show what the environment resolved to.
print(f'Project name: {project_name}', f'Artifacts path: {artifact_path}', sep='\n')

Project name: securenet-floyed
Artifacts path: v3io:///projects/{{run.project}}/artifacts


In [3]:
# Use the local 'conf' directory (resolved to an absolute path) as the project context.
project_path = path.abspath('conf')

# Create the project object with the same base name and user_project flag
# that were used for the environment setup.
securenet_proj = mlrun.new_project(
    project_name_base,
    context=project_path,
    init_git=True,
    user_project=True,
)

## Local functions

In [4]:
# nuclio: start-code

In [5]:

import mlrun
from os import path
import pandas as pd
from mlrun.execution import MLClientCtx
from sklearn import preprocessing

# Clean and label-encode the raw data set

def data_clean(context, src: mlrun.DataItem, file_ext: str = 'csv', cleaned_key: str = 'cleaned_data'):
    """Clean the raw data set and log the label-encoded result as a dataset artifact.

    Steps:
      1. Load ``src`` as a DataFrame.
      2. Drop the ``js_len``, ``js_obf_len`` and ``content`` columns.
      3. Replace missing values with the sentinel ``-999``.
      4. Label-encode ``url``, ``ip_add``, ``geo_loc``, ``tld``, ``who_is``,
         ``https`` and ``label`` into integer codes.
      5. Log the row count as the ``num_rows`` result and the frame as a
         dataset artifact.

    :param context:     MLRun execution context; used for ``log_result`` and ``log_dataset``.
    :param src:         data item holding the raw data; read through ``src.as_df()``.
    :param file_ext:    passed as ``format`` to ``context.log_dataset``.
    :param cleaned_key: artifact key under which the cleaned dataset is logged.
    :raises KeyError:   if a column to drop or to encode is missing from the data.
    """
    drop_cols_list = ['js_len', 'js_obf_len', 'content']
    # These two were always cast to string before encoding; keep that unconditional.
    always_string_cols = ('url', 'ip_add')
    encode_cols = ('url', 'ip_add', 'geo_loc', 'tld', 'who_is', 'https', 'label')

    # Build the cleaned frame without in-place mutation.
    df = src.as_df().drop(columns=drop_cols_list).fillna(-999)

    for col in encode_cols:
        values = df[col]
        # After fillna(-999) an object column that had missing values mixes str and int,
        # which LabelEncoder rejects. Casting to string (as already done for url/ip_add)
        # makes the values uniform; the sentinel becomes the category "-999".
        # Numeric/bool columns are left untouched so their encoding order is unchanged.
        if col in always_string_cols or values.dtype == object:
            values = values.astype('string')
        # fit_transform refits on every call, so a fresh encoder per column is equivalent
        # to reusing a single instance.
        df[col] = preprocessing.LabelEncoder().fit_transform(values)

    context.log_result('num_rows', df.shape[0])
    context.log_dataset(cleaned_key, df=df, format=file_ext, index=False)



In [6]:
# nuclio: end-code

## Convert local code to a function

In [7]:
# Wrap the notebook code as an MLRun function of kind 'job' that runs on the 'mlrun/mlrun' image.
data_clean_func = mlrun.code_to_function(
    name='data_clean',
    kind='job',
    image='mlrun/mlrun',
)

## Run function locally

In [8]:
# Location of the raw input CSV (relative path); passed as the `src` input of the local runs.
source_url = 'raw_data.csv'

In [9]:
# Run the `data_clean_func` MLRun function locally (local=True) with the in-notebook handler.
# NOTE(review): `apply_tenure_map` is not a parameter of `data_clean`; it is kept here only
# so the recorded run parameters stay the same — confirm whether it can be removed.
data_clean_run = data_clean_func.run(
    name='data_clean',
    handler=data_clean,
    inputs={'src': source_url},
    params={'file_ext': 'csv', 'apply_tenure_map': False},
    local=True,
)

> 2021-07-06 18:45:39,459 [info] starting run data_clean uid=81a4a8bf7af445abb827918cf0a8b62d DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
securenet-floyed,...f0a8b62d,0,Jul 06 18:45:39,completed,data_clean,v3io_user=floyedkind=owner=floyedhost=jupyter-868bd46cc9-rb2mn,src,file_ext=csvapply_tenure_map=False,num_rows=53863,cleaned_data


to track results use .show() or .logs() or in CLI: 
!mlrun get run 81a4a8bf7af445abb827918cf0a8b62d --project securenet-floyed , !mlrun logs 81a4a8bf7af445abb827918cf0a8b62d --project securenet-floyed
> 2021-07-06 18:45:44,821 [info] run executed, status=completed


## Reading output

In [10]:
# Display the store URI of the `cleaned_data` dataset artifact logged by the run.
data_clean_run.outputs['cleaned_data']

'store://artifacts/securenet-floyed/data_clean_cleaned_data:81a4a8bf7af445abb827918cf0a8b62d'

In [11]:
# Resolve the artifact URI to a data item and load it straight into a DataFrame,
# so `dataset` only ever refers to the DataFrame.
dataset = mlrun.run.get_dataitem(data_clean_run.outputs['cleaned_data']).as_df()

# Preview the first rows of the cleaned data.
dataset.head()

Unnamed: 0,url,url_len,ip_add,geo_loc,tld,who_is,https,label,special_chars_count,parameter_count,digit_count
0,6249,48,16912,175,104,0,1,1,10,0,6
1,30979,38,46043,175,64,0,0,1,6,0,0
2,31503,26,16354,87,64,1,0,1,6,0,0
3,15937,64,19910,175,104,0,1,1,13,0,0
4,14267,29,36333,175,64,1,1,1,6,0,0


In [12]:
# Summarize the columns, non-null counts and dtypes of the cleaned dataset.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53863 entries, 0 to 53862
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   url                  53863 non-null  int64
 1   url_len              53863 non-null  int64
 2   ip_add               53863 non-null  int64
 3   geo_loc              53863 non-null  int64
 4   tld                  53863 non-null  int64
 5   who_is               53863 non-null  int64
 6   https                53863 non-null  int64
 7   label                53863 non-null  int64
 8   special_chars_count  53863 non-null  int64
 9   parameter_count      53863 non-null  int64
 10  digit_count          53863 non-null  int64
dtypes: int64(11)
memory usage: 4.5 MB


In [13]:
# Re-run locally, this time directing artifacts to a sub-path of the default
# artifacts path; the '{{run.uid}}' segment is passed as a literal template string.
out = artifact_path

data_clean_run = data_clean_func.run(
    name='data_clean',
    handler=data_clean,
    inputs={'src': source_url},
    params={'file_ext': 'csv', 'apply_tenure_map': False},
    artifact_path=path.join(out, '{{run.uid}}'),
    local=True,
)

> 2021-07-06 18:45:44,964 [info] starting run data_clean uid=e0ecee6b9a0e48a4872d393aaaa59172 DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
securenet-floyed,...aaa59172,0,Jul 06 18:45:45,completed,data_clean,v3io_user=floyedkind=owner=floyedhost=jupyter-868bd46cc9-rb2mn,src,file_ext=csvapply_tenure_map=False,num_rows=53863,cleaned_data


to track results use .show() or .logs() or in CLI: 
!mlrun get run e0ecee6b9a0e48a4872d393aaaa59172 --project securenet-floyed , !mlrun logs e0ecee6b9a0e48a4872d393aaaa59172 --project securenet-floyed
> 2021-07-06 18:45:49,502 [info] run executed, status=completed


In [19]:
# Display the store URI of the `cleaned_data` artifact of the run currently held in `data_clean_run`.
# NOTE(review): this cell's execution count (19) and its recorded uid match the later
# cluster run rather than the local run directly above — re-run the notebook in order.
data_clean_run.outputs['cleaned_data']

'store://artifacts/securenet-floyed/data_clean_cleaned_data:17276e50bb6240dd9b7b6d8a4e02b183'

## Running on Cluster

In [14]:
from mlrun.platforms import auto_mount

In [16]:
# Load the raw CSV and report its (rows, columns) shape.
df = pd.read_csv('raw_data.csv')
print(df.shape)

# Log the raw data on the project as a dataset artifact under the key 'raw_data';
# the cluster run reads it back through 'store:///raw_data'.
securenet_proj.log_dataset(
    key='raw_data',
    df=df,
    format='csv',
    index=False,
)

(53863, 14)


<mlrun.artifacts.dataset.DatasetArtifact at 0x7f58898d6e10>

In [17]:

import sys

# NOTE(review): hard-coded, user-specific absolute path appended to the notebook's
# import path — confirm it is still required for this run.
sys.path.append('/v3io/users/floyed/data')

# Apply `auto_mount()` to the function, then submit the run with local=False,
# reading the raw data from the project's artifact store instead of a local file.
data_clean_func.apply(auto_mount())
data_clean_run = data_clean_func.run(
    name='data_clean',
    handler=data_clean,
    inputs={'src': 'store:///raw_data'},
    params={'file_ext': 'csv', 'apply_tenure_map': False},
    local=False,
)

> 2021-07-06 22:23:50,890 [info] starting run data_clean uid=17276e50bb6240dd9b7b6d8a4e02b183 DB=http://mlrun-api:8080
> 2021-07-06 22:23:51,481 [info] Job is running in the background, pod: data-clean-9wm27
> 2021-07-06 22:24:01,516 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
securenet-floyed,...4e02b183,0,Jul 06 22:23:57,completed,data_clean,v3io_user=floyedkind=jobowner=floyedhost=data-clean-9wm27,src,file_ext=csvapply_tenure_map=False,num_rows=53863,cleaned_data


to track results use .show() or .logs() or in CLI: 
!mlrun get run 17276e50bb6240dd9b7b6d8a4e02b183 --project securenet-floyed , !mlrun logs 17276e50bb6240dd9b7b6d8a4e02b183 --project securenet-floyed
> 2021-07-06 22:24:11,087 [info] run executed, status=completed


In [18]:
# Print all outputs of the cluster run: the logged `num_rows` result and the `cleaned_data` artifact URI.
print(data_clean_run.outputs)

{'num_rows': 53863, 'cleaned_data': 'store://artifacts/securenet-floyed/data_clean_cleaned_data:17276e50bb6240dd9b7b6d8a4e02b183'}
