In [10]:
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                 google-cloud-storage \
                                 kfp==2.7.0 \
                                 google-cloud-pipeline-components

In [12]:
import sys

if "google.colab" in sys.modules:

    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

In [14]:
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"

KFP SDK version: 2.7.0


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
import json
from typing import NamedTuple

from google.cloud import aiplatform
from kfp import compiler, dsl
from kfp.dsl import component

In [17]:
gcs_path = "gs:///mlops-01-pipeline/Dataset_diabetes-dev.csv"

In [18]:
data = pd.read_csv(gcs_path)

In [19]:
data.head()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1142956,1,78,41,33,311,50.796392,0.420804,24,0
1,1823377,0,116,92,16,184,18.60363,0.131156,22,0
2,1916381,8,171,42,29,160,35.482247,0.082671,22,1
3,1247480,3,108,63,45,297,49.375169,0.100979,46,1
4,1516947,8,153,99,15,41,35.062139,0.116191,22,1


In [20]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import joblib
 
# 1. Load the data
# Replace this with your actual data loading function
# Example: Assume we have a DataFrame 'df' with features and a target column 'target'
# X = df.drop(columns='target')  # Input features
# y = df['target']  # Target variable
 
# Here I will create a sample dataset for illustration (replace with your data loading function)
from sklearn.datasets import load_iris
# data = load_iris()
data = pd.read_csv(gcs_path)
# df = pd.DataFrame(data.data, columns=data.feature_names)
X = data.drop(columns=["Diabetic","PatientID"])
y = data["Diabetic"]
 
# 2. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
 
# # 3. Define a pipeline for preprocessing and model training
# pipeline = Pipeline([
#     # Step 1: Standardize the data
#     ('classifier', RandomForestClassifier())  # Step 2: Train the model using RandomForest
# ])
 
# # # # 4. Define a grid of hyperparameters for tuning
# param_grid = {
#     'classifier__n_estimators': [100, 200],  # Number of trees
#     'classifier__max_depth': [10, 20, None],  # Maximum depth of each tree
#     'classifier__min_samples_split': [2, 5],  # Minimum samples required to split an internal node
# }
 
# # # 5. Perform GridSearchCV to tune hyperparameters
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
 
 # 6. Train the model using the training data
# grid_search.fit(X_train, y_train)

model = LogisticRegression(solver='saga', max_iter=500, C=0.5)
model.fit(X_train, y_train)
 



In [6]:
# best_model = grid_search.best_estimator
print(type(model))
# joblib.dump(best_model, 'diabetes_model_rf.pkl')
# pickle.dump('classifier',open("model.pkl","wb"))
joblib.dump(model, 'model-demo-1.joblib')

<class 'sklearn.linear_model._logistic.LogisticRegression'>


['model-demo-1.joblib']

In [21]:
# evaluation
y_pred = model.predict(X_test)
from sklearn.metrics import (
    classification_report, 
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix, 
    roc_auc_score, 
    roc_curve
)
import matplotlib.pyplot as plt

# Evaluate the model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Precision
precision = precision_score(y_test, y_pred, average='weighted')  # 'weighted' for multiclass
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred, average='weighted')  # 'weighted' for multiclass
print(f"Recall: {recall:.4f}")

# F1 Score
f1 = f1_score(y_test, y_pred, average='weighted')  # 'weighted' for multiclass
print(f"F1 Score: {f1:.4f}")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.85      0.79       988
           1       0.60      0.44      0.51       512

    accuracy                           0.71      1500
   macro avg       0.67      0.65      0.65      1500
weighted avg       0.70      0.71      0.70      1500

Accuracy: 0.7100
Precision: 0.6969
Recall: 0.7100
F1 Score: 0.6970
Confusion Matrix:
[[839 149]
 [286 226]]


In [8]:
# Store the Model Artifact
from google.cloud import storage

# Variables
bucket_name = "mlops-01-instance"
local_model_path = "./model-demo-1.joblib"  # Path to your local model
gcs_model_path = "models/model-demo-1.joblib"  # Name of the file in GCS

# Initialize GCS Client
client = storage.Client()
bucket = client.bucket(bucket_name)

# Upload the Pickle File
blob = bucket.blob(gcs_model_path)
blob.upload_from_filename(local_model_path)

print(f"Model uploaded to GCS at: gs://{bucket_name}/{gcs_model_path}")


Model uploaded to GCS at: gs://mlops-01-instance/models/model-demo-1.joblib


In [1]:
from google.cloud import aiplatform

# Initialize Vertex AI
aiplatform.init(project="involuted-tuner-441406-a9", location="us-central1")

In [12]:
import json
from google.cloud import storage

# Metrics to store
evaluation_metrics = {
    "accuracy": 0.92,
    "precision": 0.89,
    "recall": 0.91,
    "f1_score": 0.90,
}

# Save metrics to a JSON file
with open("metrics.json", "w") as f:
    json.dump(evaluation_metrics, f)

# Upload to GCS
bucket_name = "mlops-01-dataset"
destination_blob_name = "evaluation_metrics/metrics.json"

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_filename("metrics.json")

print(f"Metrics uploaded to: gs://{bucket_name}/{destination_blob_name}")


Metrics uploaded to: gs://mlops-01-dataset/evaluation_metrics/metrics.json


AttributeError: module 'google.cloud.aiplatform.metadata' has no attribute 'MetadataServiceClient'

In [40]:
# from google.cloud import aiplatform

# # Initialize Vertex AI
# aiplatform.init(project="involuted-tuner-441406-a9", location="us-central1")

# Variables
model_display_name = "diabetes_model_instance"
gcs_model_path = "gs://mlops-01-instance/models/"
container_image = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-5:latest"# Prebuilt Scikit-learn image

with open("metrics.json", "r") as file:
    metrics_data = json.load(file)

# Extract key metrics (e.g., accuracy, precision)
accuracy = metrics_data.get("accuracy", "N/A")
precision = metrics_data.get("precision", "N/A")
recall = metrics_data.get("recall", "N/A")

# Create the labels dictionary with summary metrics
labels = {
    "accuracy": str(accuracy),
}
# evaluation_label_key = "accuracy"
# evaluation_label_value = "92"  # A short, compliant value
metrics = {
    "accuracy": "95",
    "precision": "93",
    "recall": "92"
}
# Upload the Model to Vertex AI
model = aiplatform.Model.upload(
    parent_model="projects/260483181843/locations/us-central1/models/3703354173954195456",
    display_name=model_display_name,
    artifact_uri=gcs_model_path,
    serving_container_image_uri=container_image,
    # labels=labels
    # is_default_version=False
    labels= metrics,  # Add metadata as labels

)

# # Deploy the Model to an Endpoint
# endpoint = model.deploy(
#     deployed_model_display_name="diabetes-model-endpoint-instance",
#     traffic_split={"0": 100},  # Send 100% traffic to this model
#     machine_type="n1-standard-2",
# )

# print(f"Model deployed to endpoint: {endpoint.resource_name}")


Creating Model
Create Model backing LRO: projects/260483181843/locations/us-central1/models/3703354173954195456/operations/5587540967906344960
Model created. Resource name: projects/260483181843/locations/us-central1/models/3703354173954195456@8
To use this Model in another session:
model = aiplatform.Model('projects/260483181843/locations/us-central1/models/3703354173954195456@8')


In [20]:
models = aiplatform.Model.list()

for model in models:
    print(f"Model name: {model.display_name}, Model ID: {model.resource_name}")

Model name: diabetes_model_instance, Model ID: projects/260483181843/locations/us-central1/models/3703354173954195456


In [None]:
!pip install google-cloud-pipeline-components

In [None]:
import google.cloud.pipeline_components
print(f"version: {google.cloud.pipeline_components.__version__}")

In [None]:
from google_cloud_pipeline_components.v1.dataflow import DataflowPythonJobOp
from kfp import dsl

In [13]:
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                 google-cloud-storage \
                                 kfp==2.7.0 \
                                 google-cloud-pipeline-components

In [16]:
import kfp
from kfp import dsl
from google.cloud import aiplatform
from google.cloud import storage
from typing import NamedTuple
from kfp import compiler

In [17]:
# Define component to load and preprocess data
@dsl.component(
    packages_to_install=['pandas', 'scikit-learn', 'google-cloud-storage']
)
def load_data(
    gcs_path: str,
    test_size: float = 0.3
) -> NamedTuple('Outputs', [
    ('x_train_path', str),
    ('x_test_path', str),
    ('y_train_path', str),
    ('y_test_path', str)
]):
    import pandas as pd
    from sklearn.model_selection import train_test_split
    import joblib
    
    # Load data
    data = pd.read_csv(gcs_path)
    X = data.drop(columns=["Diabetic", "PatientID"])
    y = data["Diabetic"]
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    
    # Save splits to local files
#     x_train_path = 'x_train.joblib'
#     x_test_path = 'x_test.joblib'
#     y_train_path = 'y_train.joblib'
#     y_test_path = 'y_test.joblib'
    
#     joblib.dump(X_train, x_train_path)
#     joblib.dump(X_test, x_test_path)
#     joblib.dump(y_train, y_train_path)
#     joblib.dump(y_test, y_test_path)
    
#     return (x_train_path, x_test_path, y_train_path, y_test_path)


  return component_factory.create_component_from_func(


In [18]:
# Define component to train model
@dsl.component(
    packages_to_install=['scikit-learn', 'joblib']
)
def train_model(
    x_train_path: str,
    y_train_path: str,
    max_iter: int = 500,
    C: float = 0.5
) -> str:
    import joblib
    from sklearn.linear_model import LogisticRegression
    
    # Load training data
    X_train = joblib.load(x_train_path)
    y_train = joblib.load(y_train_path)
    
    # Train model
    model = LogisticRegression(solver='saga', max_iter=max_iter, C=C)
    model.fit(X_train, y_train)
    
    # Save model
    model_path = '/tmp/model.joblib'
    joblib.dump(model, model_path)
    
    return model_path

In [19]:

# Define component to evaluate model
@dsl.component(
    packages_to_install=['scikit-learn', 'joblib']
)
def evaluate_model(
    model_path: str,
    x_test_path: str,
    y_test_path: str
) -> NamedTuple('Outputs', [
    ('accuracy', float),
    ('precision', float),
    ('recall', float),
    ('f1', float)
]):
    import joblib
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    
    # Load model and test data
    model = joblib.load(model_path)
    X_test = joblib.load(x_test_path)
    y_test = joblib.load(y_test_path)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    return (accuracy, precision, recall, f1)

In [20]:
# Define component to upload model to GCS
@dsl.component(
    packages_to_install=['google-cloud-storage']
)
def upload_model(
    model_path: str,
    bucket_name: str,
    gcs_model_path: str
) -> str:
    from google.cloud import storage
    
    # Upload model to GCS
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(gcs_model_path)
    blob.upload_from_filename(model_path)
    
    gcs_uri = f"gs://{bucket_name}/{gcs_model_path}"
    return gcs_uri

In [21]:
# Define component to deploy model
@dsl.component(
    packages_to_install=['google-cloud-aiplatform']
)
def deploy_model(
    project: str,
    location: str,
    model_display_name: str,
    gcs_model_uri: str,
    endpoint_display_name: str
):
    from google.cloud import aiplatform
    
    # Initialize Vertex AI
    aiplatform.init(project=project, location=location)
    
    # Upload and deploy model
    model = aiplatform.Model.upload(
        display_name=model_display_name,
        artifact_uri=gcs_model_uri,
        serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-5:latest"
    )
    
    endpoint = model.deploy(
        deployed_model_display_name=endpoint_display_name,
        machine_type="n1-standard-2"
    )

In [28]:

# Define the pipeline
@dsl.pipeline(
    name='diabetes-prediction-pipeline',
    description='Pipeline for training and deploying diabetes prediction model'
)
def diabetes_pipeline(
    project_id: str,
    location: str,
    gcs_data_path: str,
    bucket_name: str,
    model_display_name: str,
    endpoint_display_name: str
):
    # Load and preprocess data
    # data_op = load_data(gcs_path=gcs_data_path)
    importer = kfp.dsl.importer(
        artifact_uri="gs://mlops-01-pipeline/Dataset_diabetes-dev.csv",
        artifact_class=Dataset,
        reimport=False,
    )
    
#     # Train model
#     train_op = train_model(
#         x_train_path=data_op.outputs['x_train_path'],
#         y_train_path=data_op.outputs['y_train_path']
#     )
    
# #     # Evaluate model
#     eval_op = evaluate_model(
#         model_path=train_op.output,
#         x_test_path=data_op.outputs['x_test_path'],
#         y_test_path=data_op.outputs['y_test_path']
#     )
    
# #     # Upload model to GCS
#     upload_op = upload_model(
#         model_path=train_op.output,
#         bucket_name=bucket_name,
#         gcs_model_path='models/model.joblib'
#     )
    
# #     # Deploy model
#     deploy_op = deploy_model(
#         project=project_id,
#         location=location,
#         model_display_name=model_display_name,
#         gcs_model_uri=upload_op.output,
#         endpoint_display_name=endpoint_display_name
#     )

TypeError: importer() missing 1 required positional argument: 'artifact_class'

In [26]:

# Compile and run the pipeline
def run_pipeline():
    # Initialize Vertex AI
    aiplatform.init(
        project='involuted-tuner-441406-a9',
        location='us-central1'
    )
    
    #Compile the pipeline
    # pipeline_func = diabetes_pipeline
    pipeline_filename = 'diabetes_pipeline.json'
    # compiler.Compiler().compile(pipeline_func, pipeline_filename)
    
    # Create a pipeline job
    job = aiplatform.PipelineJob(
        display_name='diabetes-prediction-pipeline',
        template_path=pipeline_filename,
        pipeline_root='gs://mlops-01-pipeline/pipeline_root',
        parameter_values={
            'project_id': 'involuted-tuner-441406-a9',
            'location': 'us-central1',
            'gcs_data_path': 'gs://mlops-01-pipeline/Dataset_diabetes-dev.csv',
            'bucket_name': 'mlops-01-pipeline',
            'model_display_name': 'diabetes_model_instance-pipeline',
            'endpoint_display_name': 'diabetes-model-endpoint-instance-pipeline'
        }
    )
    
    job.run()

In [27]:
run_pipeline()

Creating PipelineJob
PipelineJob created. Resource name: projects/260483181843/locations/us-central1/pipelineJobs/diabetes-prediction-pipeline-20250122071644
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/260483181843/locations/us-central1/pipelineJobs/diabetes-prediction-pipeline-20250122071644')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/diabetes-prediction-pipeline-20250122071644?project=260483181843
PipelineJob projects/260483181843/locations/us-central1/pipelineJobs/diabetes-prediction-pipeline-20250122071644 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/260483181843/locations/us-central1/pipelineJobs/diabetes-prediction-pipeline-20250122071644 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/260483181843/locations/us-central1/pipelineJobs/diabetes-prediction-pipeline-20250122071644 current state:
PipelineState.PIPELINE_STATE_R

RuntimeError: Job failed with:
code: 9
message: " The DAG failed because some tasks failed. The failed tasks are: [load-data].; Job (project_id = involuted-tuner-441406-a9, job_id = 2671786592338706432) is failed due to the above error.; Failed to handle the job: {project_number = 260483181843, job_id = 2671786592338706432}"


In [None]:
# !pip install fsspec requests-toolbelt tabulate protobuf urllib3

In [7]:
# BUCKET_URI = f"gs://mlops-01-pipeline"

In [None]:
# involuted-tuner-441406-a9@appspot.gserviceaccount.com

In [4]:
datasets = aiplatform.TabularDataset.list()  # For tabular datasets
# Or use aiplatform.Dataset.list() for general datasets

for dataset in datasets:
    print(f"Dataset Name: {dataset.resource_name}, Display Name: {dataset.display_name}")

AttributeError: 'TabularDataset' object has no attribute 'id'

In [9]:
# ! gsutil iam ch serviceAccount:involuted-tuner-441406-a9@appspot.gserviceaccount.com:roles/storage.objectViewer $BUCKET_URI

In [8]:
# ! gsutil iam ch serviceAccount:involuted-tuner-441406-a9@appspot.gserviceaccount.com:roles/storage.objectCreator $BUCKET_URI