# Kubeflow Pipeline Programmatic Run

## Example Pipeline for a Decision Tree Classifier


### Loading the data

In [None]:
import kfp.components as comp

# Passes output to preprocess_data

def load_data(output_data_path: comp.OutputPath(str)):
    """Download the Social Network Ads dataset and store it as a CSV file.

    The 'User ID' column is dropped before the frame is written to
    ``df.csv`` inside the directory ``output_data_path``, which is created
    when it does not exist yet.

    Args:
        output_data_path: Output location for this step; used here as a
            directory.
    """
    import os
    import pandas as pd

    print("----- Inside load_data component -----")

    # Social Network Ads data, fetched from a GitHub repository
    source_url = 'https://github.com/anshul1004/DecisionTree/blob/master/data_set/Social_Network_Ads.csv?raw=true'
    ads = pd.read_csv(source_url).drop(columns='User ID')
    print(ads)

    os.makedirs(output_data_path, exist_ok=True)
    target_file = f'{output_data_path}/df.csv'
    ads.to_csv(target_file, index=False)

### Preprocessing dataset

In [None]:
import kfp.components as components

# Takes input from load_data and passes output to train_model & evaluate_model

def preprocess_data(preprocessed_data_path: comp.OutputPath(str), output_data_path: comp.InputPath(str)):
    """Encode the features, split them into train/test sets and save the splits.

    Args:
        preprocessed_data_path: Output directory (created when missing) that
            receives X_train.npy, X_test.npy, y_train.npy and y_test.npy.
        output_data_path: Input directory holding the df.csv produced by
            the load_data step.
    """
    import os

    import category_encoders as ce
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    print("----- Inside preprocess_data component -----")

    frame = pd.read_csv(f'{output_data_path}/df.csv')

    # The final column is the target class; every other column is a feature
    target_column = frame.columns[-1]
    X = frame.drop([target_column], axis=1)
    y = frame[target_column]

    # Ordinal-encode every feature column (an encoder from scikit-learn
    # could be used instead)
    feature_names = X.columns.tolist()
    X_enc = ce.OrdinalEncoder(cols=feature_names).fit_transform(X)
    print(X_enc)

    X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.33, random_state=42)
    print(X, y, X_train, X_test, y_train, y_test)

    os.makedirs(preprocessed_data_path, exist_ok=True)
    # Persist each split under its own file name
    for target_file, values in (
        (f'{preprocessed_data_path}/X_train.npy', X_train),
        (f'{preprocessed_data_path}/X_test.npy', X_test),
        (f'{preprocessed_data_path}/y_train.npy', y_train),
        (f'{preprocessed_data_path}/y_test.npy', y_test),
    ):
        np.save(target_file, values)

### Training the Model

In [None]:
import kfp.components as components

# Takes input from preprocess_data and passes output to evaluate_model

def train_model(preprocessed_data_path: comp.InputPath(str), trained_model_path: comp.OutputPath(str)):
    """Fit a decision tree on the training split and pickle it to model.pkl.

    Args:
        preprocessed_data_path: Input directory containing X_train.npy and
            y_train.npy.
        trained_model_path: Output directory (created when missing) that
            receives model.pkl.
    """
    import os
    import pickle

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    print("----- Inside train_model component -----")

    features = np.load(f'{preprocessed_data_path}/X_train.npy', allow_pickle=True)
    labels = np.load(f'{preprocessed_data_path}/y_train.npy', allow_pickle=True)

    # Splits are chosen with the "entropy" criterion; "gini" is the alternative
    hyperparameters = {'criterion': 'entropy', 'max_depth': 3, 'random_state': 0}
    classifier = DecisionTreeClassifier(**hyperparameters)
    classifier.fit(features, labels)

    os.makedirs(trained_model_path, exist_ok=True)
    with open(f'{trained_model_path}/model.pkl', 'wb') as model_file:
        pickle.dump(classifier, model_file)

### Evaluating the Model

In [None]:
import kfp.components as components

# Takes inputs from preprocess_data and train_model

def evaluate_model(trained_model_path: comp.InputPath(str), preprocessed_data_path: comp.InputPath(str), evaluated_model_path: comp.OutputPath(str)):
    """Score the pickled classifier on the test split and save its predictions.

    Args:
        trained_model_path: Input directory containing model.pkl.
        preprocessed_data_path: Input directory containing X_test.npy and
            y_test.npy.
        evaluated_model_path: Output directory (created when missing) that
            receives y_pred.npy.
    """
    import os
    import pickle

    import numpy as np
    from sklearn.metrics import accuracy_score

    print("----- Inside evaluate_model component -----")

    with open(f'{trained_model_path}/model.pkl', 'rb') as model_file:
        classifier = pickle.load(model_file)
    features = np.load(f'{preprocessed_data_path}/X_test.npy', allow_pickle=True)
    labels = np.load(f'{preprocessed_data_path}/y_test.npy', allow_pickle=True)

    predictions = classifier.predict(features)
    print(accuracy_score(labels, predictions))
    # The predictions could also be plotted against the test data for visualisation

    os.makedirs(evaluated_model_path, exist_ok=True)
    np.save(f'{evaluated_model_path}/y_pred.npy', predictions)

### Creating Components of Pipeline

In [None]:
import kfp

BASE_IMAGE = 'tensorflow/tensorflow:2.11.1'
PACKAGES = ['pandas','numpy','scikit-learn', 'category_encoders']


def _to_component(func):
    """Turn *func* into a pipeline component using the shared image and packages."""
    return kfp.components.create_component_from_func(
        func=func,
        base_image=BASE_IMAGE,
        packages_to_install=PACKAGES,
    )


# For simplicity all four functions share the same base image and package list
create_step_load_data = _to_component(load_data)
create_step_preprocess_data = _to_component(preprocess_data)
create_step_train_model = _to_component(train_model)
create_step_evaluate_model = _to_component(evaluate_model)

### Defining the Pipeline Function

In [None]:
import kfp.dsl as dsl

# Only serves as a value for the pipeline's single argument; the argument
# could equally be removed from the pipeline definition.
data_path = '/decision_tree'


@dsl.pipeline(
    name='Decision Tree Classifier Pipeline',
    description='A pipeline that trains and evaluates a decision tree classifier.',
)
def decision_tree_pipeline(data_path: (str)):
    """Chain the four components: load -> preprocess -> train -> evaluate.

    Args:
        data_path: Accepted as a pipeline argument; none of the steps
            below reads it.
    """
    load_task = create_step_load_data()
    preprocess_task = create_step_preprocess_data(load_task.output)
    train_task = create_step_train_model(preprocess_task.output)

    # Evaluation consumes both the trained model and the preprocessed data
    create_step_evaluate_model(train_task.output, preprocess_task.output)

### Creating a Kubeflow Client

In [None]:
import kfp
import requests
from datetime import datetime

# Connection settings for the Kubeflow deployment. Every value below is a
# placeholder and must be filled in before this cell is run. Avoid saving
# real credentials in a shared copy of the notebook.
KUBEFLOW_ENDPOINT = " " # base URL of your Kubeflow endpoint ("/pipeline" is appended when the client is created)
PROFILE_USERNAME = " " # Kubeflow account username
PROFILE_PASSWORD = " " # Kubeflow account password
PROFILE_NAMESPACE = " " # namespace in which the pipeline should run

# Function to generate cookie to make a client
# It requires login details of the kubeflow account
def get_auth_session_cookie(host, login, password):
    """Log in to Kubeflow and return the ``authservice_session`` cookie value.

    Args:
        host: URL of the Kubeflow endpoint. The final URL reached by a GET
            on it is used as the target of the login POST.
        login: Kubeflow account username.
        password: Kubeflow account password.

    Returns:
        The value of the ``authservice_session`` cookie held by the session
        after the login request.

    Raises:
        KeyError: If the session holds no ``authservice_session`` cookie
            after the login request (presumably a rejected login or a wrong
            endpoint; verify against your deployment).
    """
    # The context manager releases the session's connections once the
    # cookie has been read, instead of leaving the session open.
    with requests.Session() as session:
        response = session.get(host)
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
        }
        data = {"login": login, "password": password}
        session.post(response.url, headers=headers, data=data)
        cookies = session.cookies.get_dict()

    try:
        return cookies["authservice_session"]
    except KeyError:
        # Same exception type as before, but with a message that says what
        # went wrong rather than just naming the missing key.
        raise KeyError(
            "authservice_session cookie was not set after login; "
            "check the endpoint URL and the account credentials"
        ) from None

# Log in once; the resulting cookie authenticates the client created below.
session_cookie = get_auth_session_cookie(
    KUBEFLOW_ENDPOINT, PROFILE_USERNAME, PROFILE_PASSWORD
)
# The cookie is a login credential, so only confirm that it was retrieved
# instead of writing its value into the notebook output.
print("retrieved cookie: <hidden>")

# Creating kubeflow client to access kubeflow services
kfp_client = kfp.Client(
    host=f"{KUBEFLOW_ENDPOINT}/pipeline",
    cookies=f"authservice_session={session_cookie}",
    namespace=PROFILE_NAMESPACE,
)

### Client Services

In [None]:
# Creating an experiment whose name carries the current local time (to the minute)
exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M")
experiment_title = f"demo-{exp_name}"
experiment = kfp_client.create_experiment(name=experiment_title)

In [None]:
# Creating a run
import os

from kfp import compiler

# One shared path, so that the package submitted below is the file that was
# just compiled (the two calls previously pointed at different locations).
PIPELINE_PACKAGE_PATH = 'decision_tree/decision_tree_pipeline.yaml'

# Make sure the target directory exists before the compiled package is
# written into it.
os.makedirs(os.path.dirname(PIPELINE_PACKAGE_PATH), exist_ok=True)
compiler.Compiler().compile(decision_tree_pipeline, package_path=PIPELINE_PACKAGE_PATH)

# By the Pipeline Function
kfp_client.create_run_from_pipeline_func(decision_tree_pipeline, arguments={'data_path': '/decision_tree'})

# By the YAML file
kfp_client.create_run_from_pipeline_package(PIPELINE_PACKAGE_PATH, arguments={'data_path': '/decision_tree'})

In [None]:
# Running the pipeline

# The ids needed below can be looked up with one of these listing calls:
# kfp_client.list_experiments()
# kfp_client.list_pipelines()
# kfp_client.list_runs()

pip = " " # mention pipeline id
exp = " " # mention experiment id

# Collect the run settings first, then submit them in a single call
run_settings = {
    'experiment_id': exp,
    'pipeline_id': pip,
    'params': {'data_path': '/decision_tree'},
    'job_name': "test",
}
kfp_client.run_pipeline(**run_settings)