In [1]:
!pip install kfp
!pip install kserve
!pip install numpy
!pip install pandas
!pip install matplotlib

Collecting kserve
  Using cached kserve-0.13.0-py3-none-any.whl.metadata (9.1 kB)
Collecting cloudevents<2.0.0,>=1.6.2 (from kserve)
  Using cached cloudevents-1.11.0-py3-none-any.whl.metadata (6.9 kB)
Collecting fastapi<0.110.0,>=0.109.1 (from kserve)
  Using cached fastapi-0.109.2-py3-none-any.whl.metadata (25 kB)
Collecting grpcio<2.0.0,>=1.49.1 (from kserve)
  Using cached grpcio-1.65.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Collecting httpx<0.27.0,>=0.26.0 (from kserve)
  Using cached httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)
Collecting orjson<4.0.0,>=3.9.15 (from kserve)
  Using cached orjson-3.10.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
Collecting prometheus-client<0.21.0,>=0.20.0 (from kserve)
  Using cached prometheus_client-0.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting pydantic<3,>1.0 (from kserve)
  Using cached pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
Collecting ray<2.11.0,>=2.1

In [2]:
import kfp
from kfp import dsl
from kfp.dsl import Input, Output, Dataset, Model, Metrics, ClassificationMetrics
import pandas as pd
import numpy as np

In [3]:
@dsl.component(
    base_image="python:3.9",
    packages_to_install=['numpy', 'requests', 'pandas']
)
def load_raw_dataframe(raw_df_output: Output[Dataset]):
    '''
    Download the raw diabetes CSV and store it as this component's output dataset.

    Args:
        raw_df_output: Output artifact. The downloaded table is written to its
            ``path`` as CSV, without the pandas row index.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not respond within the timeout.
    '''

    import io

    import pandas as pd
    import requests

    url = "https://raw.githubusercontent.com/daniel88516/diabetes-data/main/10k.csv"

    # Bound the wait and reject error responses, so that an error page is never
    # parsed as if it were the dataset.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    df_data = pd.read_csv(io.StringIO(response.content.decode('utf-8')))

    # index=False keeps the artifact free of an extra, unnamed index column.
    df_data.to_csv(raw_df_output.path, index=False)

In [4]:
@dsl.component(
    base_image="python:3.9",
    packages_to_install=['numpy', 'pandas']
)
def preprocess_dataframe(raw_df_input: Input[Dataset], processed_df_data_output: Output[Dataset]):
    '''
    Clean the raw diabetes table and write the selected columns as CSV.

    Steps, in order:
      1. Drop rows whose ``diabetes`` label is the placeholder 'No Info'.
      2. Keep five features (gender, age, bmi, HbA1c_level,
         blood_glucose_level) plus the ``diabetes`` label.
      3. Drop rows that have fewer than four non-missing values.
      4. Encode gender as Male=0 / Female=1 and drop rows with gender 'Other'.
      5. Convert the four numeric features to numbers and replace missing
         entries (including the 'No Info' placeholder) with the column mean.

    Args:
        raw_df_input: Input artifact whose ``path`` holds the raw CSV.
        processed_df_data_output: Output artifact; the cleaned table is written
            to its ``path`` as CSV without the row index.
    '''

    import pandas as pd

    numeric_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
    # 'Other' is part of the map because it does occur in the data; those rows
    # are removed right after the mapping.
    gender_map = {'Male': 0, 'Female': 1, 'Other': 2}

    df_data = pd.read_csv(raw_df_input.path)

    # Rows without a label cannot be used for supervised training.
    df_data = df_data[df_data['diabetes'] != 'No Info']
    df_data = df_data[['gender'] + numeric_columns + ['diabetes']]
    # Keep only rows with at least four non-missing values out of the six columns.
    df_data = df_data.dropna(thresh=4)

    df_data = df_data.assign(gender=df_data['gender'].map(gender_map))
    # .copy() gives an independent frame so the column assignments below are
    # applied to it directly instead of to a view of the filtered data.
    df_data = df_data[df_data['gender'] != 2].copy()

    for column in numeric_columns:
        # errors='coerce' turns the 'No Info' placeholder (and any other
        # non-numeric token) into NaN, so the mean is computed on real numbers
        # only and then used to fill every gap.
        values = pd.to_numeric(df_data[column], errors='coerce')
        df_data[column] = values.fillna(values.mean())

    df_data.to_csv(processed_df_data_output.path, index=False)

In [5]:
@dsl.component(
    base_image="python:3.9",
    packages_to_install=['scikit-learn', 'pandas', 'numpy']
)
def preprocess_dataset(processed_df_data: Input[Dataset], metrics: Output[Metrics],
                       x_train_artifact: Output[Dataset], x_test_artifact: Output[Dataset],
                       y_train_artifact: Output[Dataset], y_test_artifact: Output[Dataset]):
    '''
    Split the cleaned table into train/test features and labels and save each part.

    The ``diabetes`` column is the label; every other column is a feature.
    The split keeps 20% of the rows for testing and uses a fixed random seed.

    Args:
        processed_df_data: Input artifact whose ``path`` holds the cleaned CSV.
        metrics: Output metrics artifact; receives the number of training rows
            for the features ("Len x_train") and the labels ("Len y_train").
        x_train_artifact: Output artifact for the training features (NumPy .npy format).
        x_test_artifact: Output artifact for the test features (NumPy .npy format).
        y_train_artifact: Output artifact for the training labels (NumPy .npy format).
        y_test_artifact: Output artifact for the test labels (NumPy .npy format).
    '''

    from sklearn.model_selection import train_test_split
    import pandas as pd
    import numpy as np

    df_data = pd.read_csv(processed_df_data.path)
    X = df_data.drop(labels=['diabetes'], axis=1)
    # Double brackets keep the labels as a single-column frame, so the saved
    # label arrays are two-dimensional with one column.
    Y = df_data[['diabetes']]

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    metrics.log_metric("Len x_train", x_train.shape[0])
    metrics.log_metric("Len y_train", y_train.shape[0])

    splits = (
        (x_train, x_train_artifact),
        (y_train, y_train_artifact),
        (x_test, x_test_artifact),
        (y_test, y_test_artifact),
    )
    for frame, artifact in splits:
        # Writing through an open file handle stops np.save from appending
        # ".npy" to the artifact path, so no temporary file or rename is needed.
        with open(artifact.path, "wb") as handle:
            np.save(handle, frame)

In [8]:
@dsl.component(
    base_image="python:3.9",
    packages_to_install=['scikit-learn', 'numpy', 'joblib']
)
def model_training(
    x_train_processed : Input[Dataset], x_test_processed: Input[Dataset],
    y_train_artifact : Input[Dataset], y_test_artifact :Input[Dataset],
    metrics: Output[Metrics], model_trained: Output[Model]  # TODO: consider ClassificationMetrics instead of Metrics
    ):
    '''
    Train a logistic regression classifier and store the model and its accuracy.

    Args:
        x_train_processed: Input artifact with the training features (.npy data).
        x_test_processed: Input artifact with the test features (.npy data).
        y_train_artifact: Input artifact with the training labels (.npy data).
        y_test_artifact: Input artifact with the test labels (.npy data).
        metrics: Output metrics artifact; receives 'train accuracy' and 'test accuracy'.
        model_trained: Output artifact; the fitted model is dumped to its ``path`` with joblib.
    '''

    from sklearn.linear_model import LogisticRegression
    import numpy as np
    import joblib

    X_train = np.load(x_train_processed.path)
    X_test = np.load(x_test_processed.path)
    # Flatten the labels to one dimension: scikit-learn expects a 1-D target
    # and warns when it is given a single-column 2-D array.
    Y_train = np.load(y_train_artifact.path).ravel()
    Y_test = np.load(y_test_artifact.path).ravel()

    # Fixed seed for repeatable fits; the high iteration cap gives the solver
    # room to converge.
    classifier = LogisticRegression(random_state=0, max_iter=10000)
    classifier.fit(X_train, Y_train)

    # Log plain Python floats for both accuracies.
    metrics.log_metric('train accuracy', float(classifier.score(X_train, Y_train)))
    metrics.log_metric('test accuracy', float(classifier.score(X_test, Y_test)))

    # Persist the fitted estimator at the artifact location.
    joblib.dump(classifier, model_trained.path)

In [9]:
@dsl.pipeline(
    name='diabetes model logistic regression pipeline training',
    description='diabetes model training by logistic regression using pipeline for test')
def diabetes_prediction_pipeline():
    # Step 1: fetch the raw table.
    download_step = load_raw_dataframe()

    # Step 2: clean the raw table.
    cleaning_step = preprocess_dataframe(
        raw_df_input=download_step.outputs['raw_df_output'],
    )

    # Step 3: produce the train/test feature and label artifacts.
    split_step = preprocess_dataset(
        processed_df_data=cleaning_step.outputs['processed_df_data_output'],
    )
    split_outputs = split_step.outputs

    # Step 4: train on the split artifacts; nothing consumes this task's
    # outputs inside the pipeline, so its handle is not kept.
    model_training(
        x_train_processed=split_outputs["x_train_artifact"],
        x_test_processed=split_outputs["x_test_artifact"],
        y_train_artifact=split_outputs["y_train_artifact"],
        y_test_artifact=split_outputs["y_test_artifact"],
    )
    
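# load_raw_dataframe: downloads 10k.csv with requests and stores it as the raw dataset
# preprocess_dataframe: uses pandas to keep only the needed columns and clean their values
# preprocess_dataset: separates features (x) from labels (y) and splits both into train and test sets
# model_training: trains the logistic regression model and logs its train/test accuracy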

In [10]:
# Create a Kubeflow Pipelines client with its default settings and submit the
# pipeline function as a run; the call's result is the cell output.
client = kfp.Client()
client.create_run_from_pipeline_func(
    diabetes_prediction_pipeline,
    experiment_name="diabete model traing",
    namespace="kubeflow-user-example-com",
    enable_caching=True,
)



RunPipelineResult(run_id=ac2414d6-b97b-4c97-8ae3-6b4be0d8206d)

In [None]:
# Run the download_dataset.sh helper script, which is expected to download the
# dataset if it does not already exist (the script itself is not part of this notebook).
!sh download_dataset.sh


In [81]:
import pandas as pd
import numpy as np
import tensorflow as tf
import io
import kfp
from kfp import dsl
from kfp.dsl import Input, Output, Dataset, Model, Metrics, ClassificationMetrics

# Report the library versions this notebook was executed with.
print(f"tensorflow ver:{tf.__version__}")
print(f"panda ver:{pd.__version__}")
print(f"numpy ver:{np.__version__}")

tensorflow ver:2.17.0
panda ver:2.1.1
numpy ver:1.24.4


In [82]:
#.env
import os
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split

# Resolve the dataset location from the DATASET_PATH environment variable and
# stop with a clear message when it is missing, instead of handing None to
# pd.read_csv.
dataset_path = os.getenv('DATASET_PATH')
if not dataset_path:
    raise RuntimeError("DATASET_PATH is not set; define it in the environment or in the .env file")

numeric_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
# 'Other' is part of the map because it does occur in the data; those rows are
# removed right after the mapping.
gender_map = {'Male': 0, 'Female': 1, 'Other': 2}

df_data = pd.read_csv(dataset_path)

# Rows without a label (diabetes == 'No Info') cannot be used for training.
df_data = df_data[df_data['diabetes'] != 'No Info']
# Keep five features plus the label.
df_data = df_data[['gender'] + numeric_columns + ['diabetes']]
# Keep only rows with at least four non-missing values out of the six columns.
df_data = df_data.dropna(thresh=4)

# Encode gender numerically, then drop the 'Other' rows. .copy() gives an
# independent frame so the column assignments below are applied to it directly.
df_data = df_data.assign(gender=df_data['gender'].map(gender_map))
df_data = df_data[df_data['gender'] != 2].copy()

# Fill gaps in the numeric features with the column mean. errors='coerce'
# turns the 'No Info' placeholder (and any other non-numeric token) into NaN,
# so the mean is computed on real numbers only.
for column in numeric_columns:
    values = pd.to_numeric(df_data[column], errors='coerce')
    df_data[column] = values.fillna(values.mean())

df_data

Unnamed: 0,gender,age,bmi,HbA1c_level,blood_glucose_level,diabetes
0,1,80.00,25.19,6.6,140.0,0.0
1,1,54.00,27.32,6.6,80.0,0.0
2,0,28.00,27.32,5.7,158.0,0.0
3,1,36.00,23.45,5.0,155.0,0.0
4,0,76.00,20.14,4.8,155.0,0.0
...,...,...,...,...,...,...
185894,0,61.00,29.66,9.0,160.0,1.0
185895,0,0.32,15.26,5.7,155.0,0.0
185896,0,50.00,27.32,6.1,130.0,0.0
185897,1,80.00,26.87,4.8,145.0,0.0


In [66]:
# Collect the index labels of every row that still contains at least one NaN.
rows_with_nan = df_data.loc[df_data.isnull().any(axis=1)].index
# Confirm that the frame is complete, or print the labels of the affected rows.
if len(rows_with_nan) == 0:
    print("資料中不存在 NaN 值。")
else:
    print("資料中存在 NaN 值的行，行位置（行號）為:", rows_with_nan)

資料中不存在 NaN 值。
