In [76]:
! pip install kfp



In [77]:
!pip install google-cloud-pipeline-components



In [None]:
!pip install gcsfs



In [78]:
!pip install scikit-learn



In [79]:

from google.cloud import aiplatform

# GCP project and region handed to the Vertex AI SDK.
project_id = 'gcp-final-project-444704'
location = 'us-central1'

# Initialise the SDK with that project/region for this session.
aiplatform.init(location=location, project=project_id)

In [80]:
from kfp.v2.dsl import pipeline, component, InputPath, OutputPath, Dataset, Metrics
from google.cloud import aiplatform

In [91]:
@component(packages_to_install=["pandas", "numpy","fsspec","gcsfs"])
def initial_data_preparation(
    input_dataset_path: str,  # Input: Raw dataset path
    prepared_dataset_path: OutputPath()  # Output: Cleaned and prepared dataset path
):
    """Clean the raw dataset and write it out as the prepared CSV.

    Steps, in order: impute missing values, cap or compress outliers, drop
    columns that are not used downstream, and save the result.

    Args:
        input_dataset_path: Location of the raw CSV, read with ``pandas.read_csv``.
        prepared_dataset_path: Output path that receives the cleaned CSV.

    Raises:
        KeyError: If a column referenced below is absent from the input.
    """
    import pandas as pd
    import numpy as np

    # Load dataset
    df = pd.read_csv(input_dataset_path)

    # Handle missing values.
    # The filled column is assigned back instead of calling
    # ``df[col].fillna(..., inplace=True)``: the in-place form on a selected
    # column is chained assignment, which pandas 2.x deprecates and which does
    # not update ``df`` once Copy-on-Write is active.
    fill_values = {
        'Exercise Frequency': 'None',
        'Type of Treatment': 'None',
        'Number of Prior Visits': df['Number of Prior Visits'].median(),
        'Medications Prescribed': df['Medications Prescribed'].mean(),
    }
    for column, fill_value in fill_values.items():
        df[column] = df[column].fillna(fill_value)

    # Handle outliers: cap the upper tail at a quantile, and log-compress the
    # length of stay.
    df['Age'] = df['Age'].clip(upper=df['Age'].quantile(0.99))
    df['Adjusted Weight (kg)'] = df['Adjusted Weight (kg)'].clip(upper=df['Adjusted Weight (kg)'].quantile(0.95))
    df['Length of Stay'] = np.log1p(df['Length of Stay'])
    df['Number of Prior Visits'] = df['Number of Prior Visits'].clip(upper=df['Number of Prior Visits'].quantile(0.95))

    # Drop unnecessary columns
    columns_to_drop = ['Hospital ID', 'Weight (kg)']
    df = df.drop(columns=columns_to_drop)

    # Save the prepared dataset
    df.to_csv(prepared_dataset_path, index=False)

In [92]:
@component(packages_to_install=["pandas", "scikit-learn", "imblearn", "joblib", "numpy", "fsspec", "gcsfs"])
def split_and_preprocess_data(
    prepared_dataset_path: InputPath(),  # Input: Cleaned dataset path
    training_dataset_path: OutputPath(),  # Output: Processed training dataset
    validation_dataset_path: OutputPath(),  # Output: Processed validation dataset
    scaler_path: OutputPath("Artifact")  # Output: Saved scaler artifact
):
    """Encode, split, rebalance and scale the prepared dataset.

    The prepared CSV is encoded (binary map + one-hot), split 80/20 with
    stratification on the target, the training part is oversampled with SMOTE,
    and the numerical columns are standardised with a scaler fitted on the
    resampled training data only.

    Args:
        prepared_dataset_path: CSV produced by the data-preparation step.
        training_dataset_path: Output path for the resampled, scaled training CSV.
        validation_dataset_path: Output path for the scaled validation CSV.
        scaler_path: Output path for the fitted ``StandardScaler`` (joblib dump).

    Raises:
        ValueError: If SMOTE resampling or scaling rejects the data.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from imblearn.over_sampling import SMOTE
    import joblib

    target_column = 'Readmission within 30 Days'

    # Load dataset.
    # Only the empty field is treated as missing. With read_csv's default NA
    # list, pandas >= 2.0 parses the literal string "None" as NaN, which would
    # silently erase the 'None' category and leave it without a one-hot column.
    # NOTE(review): assumes the input was written by DataFrame.to_csv with the
    # default na_rep (empty string) - confirm if this step is reused elsewhere.
    df = pd.read_csv(prepared_dataset_path, keep_default_na=False, na_values=[''])

    # Ensure all numerical columns are of numeric type
    numerical_columns = ['Height (m)', 'BMI', 'Adjusted Weight (kg)', 'Number of Prior Visits',
                         'Medications Prescribed', 'Length of Stay']
    for col in numerical_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, set invalid values to NaN
    df = df.dropna(subset=numerical_columns)  # Drop rows with NaN in numerical columns

    # Binary encoding for binary categorical columns
    df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})
    df['Smoker'] = df['Smoker'].astype(int)

    # One-hot encoding for other categorical variables
    categorical_columns = ['Ethnicity', 'Diet Type', 'Type of Treatment', 'Exercise Frequency']
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=False)

    # Separate features and target ('PatientID' is dropped when present)
    X = df.drop([target_column, 'PatientID'], axis=1, errors='ignore')
    y = df[target_column]

    # Split dataset into training and validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    # X_val is a selection taken from X; copy it so the scaled values below are
    # written into a frame this function owns (avoids SettingWithCopyWarning).
    X_val = X_val.copy()

    # Handle imbalanced data using SMOTE (training split only)
    smote = SMOTE(random_state=42)
    try:
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    except ValueError as e:
        raise ValueError(f"SMOTE failed due to invalid input data: {e}") from e

    # Scale numerical features: fit on the resampled training data, reuse on validation
    scaler = StandardScaler()
    try:
        X_train_resampled[numerical_columns] = scaler.fit_transform(X_train_resampled[numerical_columns])
        X_val[numerical_columns] = scaler.transform(X_val[numerical_columns])
    except ValueError as e:
        raise ValueError(f"Scaling failed due to invalid input data: {e}") from e

    # Save processed datasets
    train_df = pd.DataFrame(X_train_resampled)
    train_df[target_column] = y_train_resampled
    train_df.to_csv(training_dataset_path, index=False)

    val_df = pd.DataFrame(X_val)
    val_df[target_column] = y_val
    val_df.to_csv(validation_dataset_path, index=False)

    # Save scaler
    joblib.dump(scaler, scaler_path)

In [93]:

@component(packages_to_install=["pandas", "xgboost", "joblib", "fsspec", "gcsfs", "scikit-learn"])
def train_model(
    training_dataset_path: InputPath(),  # Input: Processed training dataset
    trained_model_path: OutputPath("Artifact"),  # Output: Trained model artifact
    gcs_dump_path: str  # Additional input: Path to dump the training data
):
    """Train an XGBoost classifier on the processed training data.

    The feature matrix is also copied to GCS so it can be inspected.

    Args:
        training_dataset_path: CSV containing the features and the
            'Readmission within 30 Days' target column.
        trained_model_path: Output path for the fitted model (joblib dump).
        gcs_dump_path: GCS location under which ``xgb_training_data.csv`` is
            written; a trailing slash is tolerated.
    """
    import pandas as pd
    from xgboost import XGBClassifier
    import joblib
    import gcsfs

    # Load training data
    df_train = pd.read_csv(training_dataset_path)

    # Separate features and target
    X_train = df_train.drop('Readmission within 30 Days', axis=1)
    y_train = df_train['Readmission within 30 Days']

    # Dump the training data to GCS for review.
    # Trailing slashes are stripped so the object name never contains an empty
    # path segment ("...//xgb_training_data.csv").
    dump_uri = f"{gcs_dump_path.rstrip('/')}/xgb_training_data.csv"
    fs = gcsfs.GCSFileSystem()
    with fs.open(dump_uri, 'w') as f:
        X_train.to_csv(f, index=False)
    print("Training DataFrame dumped to GCS.")

    # Print a preview of the data
    print("Training DataFrame (X_train) Preview:")
    print(X_train.head())

    # Train the model
    model = XGBClassifier(
        random_state=42,
        learning_rate=0.2,
        max_depth=7,
        n_estimators=200,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss"
    )
    model.fit(X_train, y_train)

    # Save the trained model
    joblib.dump(model, trained_model_path)
    print(f"Model trained and saved to {trained_model_path}")




In [94]:
from kfp.v2.dsl import component, InputPath, Output, Metrics

@component(packages_to_install=["pandas", "scikit-learn", "joblib", "xgboost", "fsspec", "gcsfs"])
def evaluate_model(
    validation_dataset_path: InputPath(),  # Input: Validation (test) dataset
    trained_model_path: InputPath("Artifact"),  # Input: Trained model
    metrics: Output[Metrics]  # Output: Metrics to log
):
    """Score the trained model on the validation set and log the metrics.

    Logs accuracy, F1, the area under the precision-recall curve (trapezoidal
    ``auc`` over the curve points) and the four confusion-matrix counts.

    Args:
        validation_dataset_path: CSV containing the features and the
            'Readmission within 30 Days' target column.
        trained_model_path: joblib dump of the fitted classifier.
        metrics: Metrics artifact that receives the logged values.
    """
    import pandas as pd
    from sklearn.metrics import (
        accuracy_score,
        f1_score,
        auc,
        precision_recall_curve,
        confusion_matrix
    )
    import joblib

    # Load validation (test) dataset
    val_df = pd.read_csv(validation_dataset_path)

    # Separate features and target
    X_val = val_df.drop('Readmission within 30 Days', axis=1)
    y_val = val_df['Readmission within 30 Days']

    # Load the trained model
    model = joblib.load(trained_model_path)

    # Make predictions
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    # Calculate core metrics
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    # Calculate area under precision-recall curve (AU-PRC)
    precision_curve, recall_curve, _ = precision_recall_curve(y_val, y_pred_proba)
    auprc = auc(recall_curve, precision_curve)

    # Confusion matrix for calculating TP, TN, FP, FN.
    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking still works
    # when only one class shows up in y_val / y_pred.
    # NOTE(review): assumes the target is encoded as 0/1 - f1_score's default
    # pos_label=1 above already relies on that; confirm against the dataset.
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=[0, 1]).ravel()

    # Convert metrics to Python native types before logging
    accuracy = float(accuracy)
    f1 = float(f1)
    auprc = float(auprc)
    tn, fp, fn, tp = map(int, [tn, fp, fn, tp])  # Convert to native int

    # Log required metrics
    metrics.log_metric("accuracy", accuracy)
    metrics.log_metric("f1_score", f1)
    metrics.log_metric("area_under_precision_recall_curve", auprc)
    metrics.log_metric("true_positives", tp)
    metrics.log_metric("true_negatives", tn)
    metrics.log_metric("false_positives", fp)
    metrics.log_metric("false_negatives", fn)

    # Optional: Print evaluation metrics for debugging
    print(f"Evaluation Metrics:\nAccuracy: {accuracy}\nF1 Score: {f1}\nArea Under Precision-Recall Curve (AU-PRC): {auprc}")
    print(f"Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")


In [104]:
@pipeline(name="healthcare-readmissions-training-pipeline")
def healthcare_readmissions_pipeline(
    input_dataset_path: str,
    gcs_dump_path: str = "gs://healthcare_readmissions/debugdatasets",
):
    """Prepare the data, train the classifier and evaluate it.

    Args:
        input_dataset_path: Location of the raw dataset CSV.
        gcs_dump_path: GCS location where the training step writes a copy of
            its feature matrix. Defaults to the previously hard-coded path, so
            existing submissions need no extra parameter.
    """
    # Step 1: Initial data preparation
    prepared_data = initial_data_preparation(input_dataset_path=input_dataset_path)

    # Step 2: Split, encode, and preprocess
    processed_data = split_and_preprocess_data(
        prepared_dataset_path=prepared_data.outputs['prepared_dataset_path']
    )

    # Step 3: Train the model
    trained_model = train_model(
        training_dataset_path=processed_data.outputs['training_dataset_path'],
        gcs_dump_path=gcs_dump_path
    )

    # Step 4: Evaluate the model
    evaluate_model(
        validation_dataset_path=processed_data.outputs['validation_dataset_path'],
        trained_model_path=trained_model.outputs['trained_model_path']
    )


In [105]:
from kfp.v2 import compiler

# Local file that receives the compiled pipeline definition and is then
# handed to the job as its template.
pipeline_spec_path = 'healthcare_readmissions_pipeline.json'

# Compile the pipeline function into that file.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    package_path=pipeline_spec_path,
    pipeline_func=healthcare_readmissions_pipeline,
)

# Describe the run: compiled template, pipeline root, and the raw dataset
# parameter.
run_parameters = {
    'input_dataset_path': 'gs://healthcare_readmissions/healthcare_readmissions_dataset_train.csv'
}
pipeline_job = aiplatform.PipelineJob(
    display_name='healthcare-readmissions-training-pipeline',
    template_path=pipeline_spec_path,
    pipeline_root='gs://healthcare_readmissions',
    parameter_values=run_parameters,
    enable_caching=True,
)

# Submit the job.
pipeline_job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/853819008626/locations/us-central1/pipelineJobs/healthcare-readmissions-training-pipeline-20241216210520
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/853819008626/locations/us-central1/pipelineJobs/healthcare-readmissions-training-pipeline-20241216210520')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/healthcare-readmissions-training-pipeline-20241216210520?project=853819008626
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/853819008626/locations/us-central1/pipelineJobs/healthcare-readmissions-training-pipeline-20241216210520 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.c