In [1]:
!pip install kfp

Collecting kfp
  Downloading kfp-2.7.0.tar.gz (441 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m441.8/441.8 kB[0m [31m841.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting click<9,>=8.0.0 (from kfp)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting docstring-parser<1,>=0.7.3 (from kfp)
  Downloading docstring_parser-0.16-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 (from kfp)
  Downloading google_api_core-2.19.0-py3-none-any.whl.metadata (2.7 kB)
Collecting google-auth<3,>=1.6.1 (from kfp)
  Downloading google_auth-2.30.0-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting google-cloud-storage<3,>=2.2.1 (from kfp)
  Downloading google_cloud_storage-2.16.0-py2.py3-none-any.whl.metadata (6.1 kB)
Collecting kfp-pipeline-spec==0.3.0 (from kfp)
  Downloading kfp_pipeline_spec-0.3.0-py3-none-any.whl.metadata (329 by

In [2]:
!pip show kfp

Name: kfp
Version: 2.7.0
Summary: Kubeflow Pipelines SDK
Home-page: https://github.com/kubeflow/pipelines
Author: The Kubeflow Authors
Author-email: 
License: 
Location: /opt/miniconda3/lib/python3.12/site-packages
Requires: click, docstring-parser, google-api-core, google-auth, google-cloud-storage, kfp-pipeline-spec, kfp-server-api, kubernetes, protobuf, PyYAML, requests-toolbelt, tabulate, urllib3
Required-by: 


In [4]:
!python3 --version


Python 3.12.3


In [32]:
import kfp

In [6]:
import kfp.components as comp

In [7]:
import requests

In [23]:
import kfp.dsl as dsl

In [25]:
from kfp.dsl import pipeline

In [None]:
# (incomplete cell: a bare `pip install` with no package name only prints a
# pip usage error, so the command has been dropped)

In [60]:
@dsl.component(base_image = 'python:3.12',packages_to_install=['pandas'])
def prepare_data():
    """Load the raw CSV, drop incomplete rows and write ``data/final_data.csv``.

    Every row containing at least one missing value is removed. The cleaned
    table is written without the index column.
    """
    # Lightweight components run from their own source, so imports stay
    # inside the function body.
    import os

    import pandas as pd

    print("-------Inside data -----------")
    # NOTE(review): the source path is an empty string, so read_csv cannot
    # succeed as written - fill in the real dataset location before running.
    df = pd.read_csv("")
    df = df.dropna()
    # to_csv does not create parent directories, and a fresh container has
    # no `data/` folder, so make sure it exists before writing.
    os.makedirs('data', exist_ok=True)
    df.to_csv('data/final_data.csv',index = False)
    print("----- data saved in csv format --------")
    
    

In [61]:
@dsl.component(base_image= 'python:3.12',packages_to_install=['pandas','numpy','scikit-learn'])
def train_test_split():
    """Split ``data/final_data.csv`` into train and test sets saved as ``.npy``.

    The ``class`` column is the target; every other column is a feature.
    30% of the rows go to the test set, stratified on the target, with a
    fixed random seed. Four files are written under ``data/``:
    ``X_train.npy``, ``X_test.npy``, ``y_train.npy`` and ``y_test.npy``.
    """
    import numpy as np
    import pandas as pd
    # Aliased so the sklearn helper does not shadow this component's own name.
    from sklearn.model_selection import train_test_split as sk_train_test_split

    print("------seperating data into test train -----")
    final_data = pd.read_csv('data/final_data.csv')
    target_column = 'class'
    is_target = final_data.columns == target_column
    X = final_data.loc[:, ~is_target]
    # Kept as a single-column frame, so the saved label arrays are (n, 1).
    y = final_data.loc[:, is_target]
    X_train,X_test,y_train,y_test = sk_train_test_split(
        X,y,test_size=0.3,stratify = y,random_state = 47)

    np.save('data/X_train.npy',X_train)
    np.save('data/X_test.npy',X_test)
    np.save('data/y_train.npy',y_train)
    np.save('data/y_test.npy',y_test)

    print("--------training data -----------")
    print("\n")  # was "/n", which printed the two characters literally
    print(X_train)

    print("--------test data -------")
    print("\n")
    print(X_test)
    

In [62]:
@dsl.component(base_image = 'python:3.12',packages_to_install=['pandas','numpy','scikit-learn'])
def train_basic_classifier():
    """Fit a logistic-regression model on the saved training split.

    Reads ``data/X_train.npy`` and ``data/y_train.npy``, trains a
    ``LogisticRegression`` with ``max_iter=500`` and pickles the fitted
    estimator to ``data/model.pkl``.
    """
    import pickle

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    X_train = np.load('data/X_train.npy',allow_pickle=True)
    y_train = np.load('data/y_train.npy',allow_pickle=True)

    classifier = LogisticRegression(max_iter = 500)
    # The labels are saved from a single-column frame, i.e. shape (n, 1);
    # flatten them so fit() receives the 1-D target it expects.
    classifier.fit(X_train,y_train.ravel())

    with open('data/model.pkl','wb') as f:
        pickle.dump(classifier,f)
    print("-----------LOGISTIC REGRESSION TRAINED--------")
    
    
    
    

In [63]:
@dsl.component(base_image = 'python:3.12',packages_to_install=['pandas','numpy','scikit-learn'])
def test_on_data():
    """Predict class labels for the saved test split.

    Loads the pickled model from ``data/model.pkl`` and the features from
    ``data/X_test.npy``, then writes the predictions to ``data/y_pred.npy``.
    """
    import pickle

    import numpy as np

    # NOTE(review): pickle.load executes code embedded in the file; only use
    # a model file produced by this pipeline's own training step.
    with open('data/model.pkl','rb') as f:
        l_model = pickle.load(f)

    X_test = np.load('data/X_test.npy',allow_pickle=True)
    y_pred = l_model.predict(X_test)

    np.save('data/y_pred.npy',y_pred)

    print("-------Prediction Completed---------")
    print(y_pred)


In [64]:
@dsl.component(base_image = 'python:3.12',packages_to_install=['pandas','numpy','scikit-learn'])
def predict_prob_on_test_data():
    """Predict class probabilities for the saved test split.

    Loads the pickled model from ``data/model.pkl`` and the features from
    ``data/X_test.npy``, then writes the ``predict_proba`` output to
    ``data/y_pred_prob.npy``.
    """
    import pickle

    import numpy as np

    print("---- Inside predict_prob_on_test_data component ----")
    # NOTE(review): pickle.load executes code embedded in the file; only use
    # a model file produced by this pipeline's own training step.
    with open('data/model.pkl','rb') as f:
        logistic_reg_model = pickle.load(f)

    X_test = np.load('data/X_test.npy',allow_pickle=True)
    y_pred_prob = logistic_reg_model.predict_proba(X_test)
    np.save('data/y_pred_prob.npy', y_pred_prob)

    print("\n---- Predicted Probabilities ----")
    print("\n")
    print(y_pred_prob)

In [65]:
@dsl.component(base_image = 'python:3.12',packages_to_install=['pandas','numpy','scikit-learn'])
def get_metrics():
    """Print accuracy, precision, recall and log-loss for the test split.

    Reads the true labels (``data/y_test.npy``), the predicted labels
    (``data/y_pred.npy``) and the predicted probabilities
    (``data/y_pred_prob.npy``) and prints the four scores rounded to two
    decimals.
    """
    # Dependencies come from packages_to_install; no runtime pip call needed.
    import numpy as np
    from sklearn.metrics import accuracy_score, log_loss, precision_score, recall_score

    # Labels are stored with shape (n, 1); flatten them for the metric calls.
    y_test = np.load('data/y_test.npy',allow_pickle=True).ravel()
    y_pred = np.load('data/y_pred.npy',allow_pickle=True)
    # File name matches what predict_prob_on_test_data writes
    # (the previous 'y_pred_proba.npy' is never produced).
    y_pred_proba = np.load('data/y_pred_prob.npy',allow_pickle=True)

    # precision/recall default to average='binary', which raises on a target
    # with more than two classes; fall back to micro-averaging in that case.
    average = 'binary' if len(np.unique(y_test)) == 2 else 'micro'

    # Result names are distinct from the imported scorer functions so none
    # of them is shadowed.
    acc = accuracy_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred,average=average)
    recall = recall_score(y_test,y_pred,average=average)
    entropy = log_loss(y_test,y_pred_proba)

    print("\n Model Metrics:", {'accuracy': round(acc, 2), 'precision': round(prec, 2), 'recall': round(recall, 2), 'entropy': round(entropy, 2)})

        
    

In [57]:
# Kubeflow pipeline: chain the components into one sequential workflow

In [66]:
@pipeline(name='iris-pipeline', description='Pipeline to prepare Iris dataset.')
def iris_pipeline():
    # Six steps run strictly in sequence. No component takes inputs or
    # returns outputs, so the order is enforced only through .after().
    # NOTE(review): the steps exchange data via files under the relative
    # `data/` directory - confirm every step sees the same filesystem.
    prepare_step = prepare_data()

    split_step = train_test_split()
    split_step.after(prepare_step)

    training_step = train_basic_classifier()
    training_step.after(split_step)

    predict_step = test_on_data()
    predict_step.after(training_step)

    proba_step = predict_prob_on_test_data()
    proba_step.after(predict_step)

    metrics_step = get_metrics()
    metrics_step.after(proba_step)
    