In [2]:
import os

# Make sure the folder that receives the generated script files exists.
# exist_ok=True lets this cell be re-run without raising when 'src' is already there.
script_folder = 'src'
os.makedirs(name=script_folder, exist_ok=True)
print(script_folder, 'folder created')

src folder created


In [25]:
%%writefile $script_folder/train-model-mlflow-sg.py
# import libraries
import mlflow
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

def main(args):
    """Run the training workflow end to end.

    Reads the CSV at ``args.training_data``, splits it into train/test
    partitions, writes the held-out rows to ``args.test_dataset``, fits and
    evaluates the model, logs the model to MLflow, writes the artifact URI of
    the logged model to the file ``args.model_uri`` and finally saves a copy
    of the model under ``args.model_output``.
    """
    # label column and the columns used as model inputs
    target = 'diabetes'
    feature_list = [
        'age',
        'hypertension',
        'heart_disease',
        'bmi',
        'HbA1c_level',
        'blood_glucose_level',
    ]

    # load the data and partition it
    raw_df = get_data(args.training_data)
    X_train, X_test, y_train, y_test, holdout_df = split_data(raw_df, feature_list, target)

    # persist the held-out rows (all original columns) for the inference step
    holdout_df.to_csv(args.test_dataset, index=False)

    # fit, then score on the held-out split
    fitted_model = train_model(args.reg_rate, X_train, X_test, y_train, y_test)
    eval_model(fitted_model, X_test, y_test)

    # log the model to the active MLflow run and record where it ended up
    mlflow.sklearn.log_model(fitted_model, "model")
    logged_uri = mlflow.get_artifact_uri("model")
    with open(args.model_uri, "w") as uri_file:
        uri_file.write(logged_uri)

    # additionally write the model in MLflow format to the requested output path
    mlflow.sklearn.save_model(fitted_model, args.model_output)

   

def get_data(path):
    """Load the CSV found at ``path`` and return it as a pandas DataFrame."""
    print("Reading data...")
    return pd.read_csv(path)

def split_data(df, feature_list, target, test_size=0.2, random_state=42):
    """Split ``df`` into train and test partitions.

    Args:
        df: DataFrame holding the feature columns and the target column.
        feature_list: names of the columns used as model inputs.
        target: name of the label column.
        test_size: forwarded to ``train_test_split``; the default (0.2) is the
            value that used to be hard-coded here.
        random_state: seed forwarded to ``train_test_split``; the default (42)
            is the value that used to be hard-coded here.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, df_test)``. The first four
        are the ``.values`` arrays of the selected columns; ``df_test`` is the
        held-out partition with all of its original columns.

    Raises:
        KeyError: if a feature column or the target column is not in ``df``.
    """
    print("Splitting data...")

    # Check the schema before splitting so a wrong column name is reported
    # with the full list of missing columns instead of a bare pandas error.
    feature_list = list(feature_list)
    missing = [col for col in feature_list + [target] if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from the input data: {missing}")

    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    X_train, y_train = df_train[feature_list].values, df_train[target].values
    X_test, y_test = df_test[feature_list].values, df_test[target].values

    return X_train, X_test, y_train, y_test, df_test

def train_model(reg_rate, X_train, X_test, y_train, y_test):
    """Fit a logistic regression on the training split and return it.

    Args:
        reg_rate: regularization rate; the model is built with
            ``C = 1 / reg_rate``, so it must be strictly positive.
        X_train: training feature matrix.
        X_test: unused here; kept so existing callers keep working.
        y_train: training labels.
        y_test: unused here; kept so existing callers keep working.

    Returns:
        The fitted ``LogisticRegression`` instance.

    Raises:
        ValueError: if ``reg_rate`` is not greater than zero (this also
            rejects NaN).
    """
    # Validate before logging anything: a zero rate would otherwise surface as
    # a ZeroDivisionError only after the parameter was already recorded.
    if not reg_rate > 0:
        raise ValueError(f"reg_rate must be greater than 0, got {reg_rate!r}")

    mlflow.log_param("Regularization rate", reg_rate)
    print("Training model...")
    model = LogisticRegression(C=1 / reg_rate, solver="liblinear").fit(X_train, y_train)

    return model

def eval_model(model, X_test, y_test):
    """Score ``model`` on the test split and record the results in MLflow.

    Prints and logs the accuracy and the AUC, then hands the predicted
    probabilities to ``plot_roc_curve`` so the ROC plot is logged as well.
    """
    # accuracy: share of predicted labels that equal the true labels
    predicted_labels = model.predict(X_test)
    acc = np.mean(predicted_labels == y_test)
    print('Accuracy:', acc)
    mlflow.log_metric("Accuracy", acc)

    # AUC is computed from column 1 of the predict_proba output
    y_scores = model.predict_proba(X_test)
    positive_scores = y_scores[:, 1]
    auc = roc_auc_score(y_test, positive_scores)
    print('AUC: ' + str(auc))
    mlflow.log_metric("AUC", auc)

    # plot ROC curve and log as an artifact
    plot_roc_curve(y_test, y_scores)

def plot_roc_curve(y_test, y_scores):
    """Draw the ROC curve, save it as ``ROC-Curve.png`` and log it to MLflow.

    Args:
        y_test: true labels.
        y_scores: two-dimensional score array; column 1 is passed to
            ``roc_curve``.

    The image is written to the current working directory before being
    logged as an MLflow artifact.
    """
    # the thresholds returned by roc_curve are not needed for the plot
    fpr, tpr, _ = roc_curve(y_test, y_scores[:, 1])
    roc_curve_path = "ROC-Curve.png"

    fig = plt.figure(figsize=(6, 4))
    try:
        plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
        plt.plot(fpr, tpr)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        fig.savefig(roc_curve_path)
        mlflow.log_artifact(roc_curve_path)
    finally:
        # release the figure even when saving or logging raises
        plt.close(fig)

def log_model(model):
    """Log ``model`` to MLflow with the sklearn flavor under the path "model".

    NOTE(review): nothing in the visible part of this script calls this
    helper; ``main`` invokes ``mlflow.sklearn.log_model`` directly.
    """
    artifact_path = "model"
    mlflow.sklearn.log_model(model, artifact_path)

def parse_args():
    """Parse the command-line arguments of the training script.

    Arguments:
        --training_data: path of the CSV to train on (required).
        --reg_rate: regularization rate, float, defaults to 0.01.
        --model_output: path where the model is saved (required).
        --model_uri: path of the file that receives the model URI (required).
        --test_dataset: path where the held-out rows are written (required).

    Returns:
        The ``argparse.Namespace`` holding the parsed values.
    """
    parser = argparse.ArgumentParser()
    # Every path argument is required: main() cannot run without them, and a
    # missing --model_output would otherwise only fail after training finished.
    parser.add_argument("--training_data", dest='training_data', type=str, required=True)
    parser.add_argument("--reg_rate", dest='reg_rate', type=float, default=0.01)
    parser.add_argument("--model_output", dest='model_output', type=str, required=True)
    parser.add_argument("--model_uri", dest='model_uri', type=str, required=True)
    parser.add_argument("--test_dataset", dest='test_dataset', type=str, required=True)

    return parser.parse_args()

if __name__ == "__main__":
    # frame the run's console output with blank lines and a 60-character rule
    separator = "*" * 60
    print("\n\n")
    print(separator)
    args = parse_args()
    main(args)
    print(separator)
    print("\n\n")


Overwriting src/train-model-mlflow-sg.py


In [29]:
%%writefile $script_folder/infer.py
# import libraries
import argparse
import mlflow
import pandas as pd

def main(args):
    """Score the test dataset with the trained model and save the result.

    Reads the CSV at ``args.test_dataset``, loads the MLflow model stored at
    ``args.model_output``, predicts on the feature columns and writes the
    input rows plus a ``predictions`` column to ``args.scored_dataset``.
    """
    # columns handed to the model
    feature_list = [
        'age',
        'hypertension',
        'heart_disease',
        'bmi',
        'HbA1c_level',
        'blood_glucose_level',
    ]

    # rows to score
    test_data = pd.read_csv(args.test_dataset)

    # model loaded through the generic pyfunc interface
    model = mlflow.pyfunc.load_model(args.model_output)

    # run inference on the feature columns only
    predictions = model.predict(test_data[feature_list])

    # keep every input column and append the predictions
    scored = test_data.assign(predictions=predictions)
    scored.to_csv(args.scored_dataset, index=False)

if __name__ == "__main__":
    # all three arguments are required string paths sharing one declaration shape
    parser = argparse.ArgumentParser()
    for arg_name in ("model_output", "test_dataset", "scored_dataset"):
        parser.add_argument(f"--{arg_name}", dest=arg_name, type=str, required=True)
    args = parser.parse_args()
    main(args)


Overwriting src/infer.py


In [30]:
from azure.ai.ml import MLClient, command, Input, Output, dsl
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.identity import DefaultAzureCredential

# Initialize MLClient using DefaultAzureCredential
# (workspace details come from a config file; the recorded cell output reports
# the one that was found)
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

# Look up version "1" of the data asset named "Diabetes_Dataset"; its id is
# used as the input path of the preprocessing job defined below.
data_asset = ml_client.data.get("Diabetes_Dataset", version="1")



# Define the data preprocessing job
# NOTE(review): data_preprocess.py is not written by any cell visible here —
# confirm it exists in ./src before submitting.
data_preprocess_job = command(
    # Folder uploaded with the job; the command below runs data_preprocess.py from it
    code="./src",
    command='python data_preprocess.py --data "${{inputs.data}}" --output "${{outputs.processed_data}}"',
    inputs=dict(
        data=Input(
            type=AssetTypes.URI_FILE,
            path=data_asset.id,
            mode=InputOutputModes.RO_MOUNT,
        ),
    ),
    outputs=dict(
        processed_data=Output(
            type=AssetTypes.URI_FILE,
            mode=InputOutputModes.RW_MOUNT,
        ),
    ),
    # Ensure this environment has all required dependencies
    environment="test-env-azureml:1",
    compute="test-compute-1-mlstudio",
)

# Define the model training job
model_training_job = command(
    # Folder uploaded with the job; the command below runs train-model-mlflow-sg.py from it
    code="./src",
    command='python train-model-mlflow-sg.py --training_data ${{inputs.data}} --model_output ${{outputs.model_output}} --model_uri ${{outputs.model_uri}} --test_dataset ${{outputs.test_dataset}}',
    # "data" has no path here; it is bound when the job is called inside the pipeline
    inputs=dict(
        data=Input(
            type=AssetTypes.URI_FILE,
            mode=InputOutputModes.RO_MOUNT,
        ),
    ),
    outputs=dict(
        model_output=Output(
            type=AssetTypes.MLFLOW_MODEL,
            mode=InputOutputModes.RW_MOUNT,
        ),
        model_uri=Output(
            type=AssetTypes.URI_FILE,
            mode=InputOutputModes.RW_MOUNT,
        ),
        test_dataset=Output(
            type=AssetTypes.URI_FILE,
            mode=InputOutputModes.RW_MOUNT,
        ),
    ),
    # Ensure this environment has all required dependencies
    environment="test-env-azureml:1",
    compute="test-compute-1-mlstudio",
)
# `job` is a second name for the same command object.
# NOTE(review): no use of `job` is visible in this cell — confirm nothing else
# relies on it before dropping the alias.
job = model_training_job

# Define the inference job
inference_job = command(
    # Folder uploaded with the job; the command below runs infer.py from it
    code="./src",
    command='python infer.py  --model_output ${{inputs.model_output}} --test_dataset ${{inputs.test_dataset}} --scored_dataset ${{outputs.scored_dataset}}',
    # neither input has a path here; both are bound when the job is called inside the pipeline
    inputs=dict(
        model_output=Input(
            type=AssetTypes.MLFLOW_MODEL,
            mode=InputOutputModes.RO_MOUNT,
        ),
        test_dataset=Input(
            type=AssetTypes.URI_FILE,
            mode=InputOutputModes.RO_MOUNT,
        ),
    ),
    outputs=dict(
        scored_dataset=Output(
            type=AssetTypes.URI_FILE,
            mode=InputOutputModes.RW_MOUNT,
        ),
    ),
    # Ensure this environment has all required dependencies
    environment="test-env-azureml:1",
    compute="test-compute-1-mlstudio",
)


# Create a pipeline by combining the three jobs
# (preprocess -> train -> infer, chained through the step outputs below).
# NOTE(review): the azure-ai-ml DSL appears to derive node names from the local
# variable names used inside the function — confirm before renaming them.
@dsl.pipeline(
    description="Pipeline combining data preprocessing, model training and inference",
    default_compute="test-compute-1-mlstudio"
)
def full_pipeline():
    # Step 1: called without arguments; the job definition already binds its
    # "data" input to the data asset.
    preprocess_step = data_preprocess_job()
    # Step 2: train on the file produced by the preprocessing step.
    train_step = model_training_job(
        data=preprocess_step.outputs.processed_data)
    # Step 3: score the held-out rows written by the training step with the
    # model saved by the same step.
    infer_step = inference_job(
        model_output=train_step.outputs.model_output,
        test_dataset=train_step.outputs.test_dataset
    )
    
    # Outputs exposed at pipeline level; the training step's model_uri and
    # test_dataset outputs are not surfaced here.
    return{
        "pipeline_job_transformed_data": preprocess_step.outputs.processed_data,
        "pipeline_job_trained_model": train_step.outputs.model_output,
        "pipeline_job_scored_data": infer_step.outputs.scored_dataset,
    }


# Calling the decorated function builds the pipeline job object; nothing is
# submitted yet.
pipeline = full_pipeline()

# Submit the pipeline to Azure ML
pipeline_job = ml_client.jobs.create_or_update(pipeline)
# Studio link for following the submitted run.
# NOTE(review): the printed URL contains the subscription and tenant ids —
# clear this cell's output before sharing the notebook.
aml_url = pipeline_job.studio_url
print("Monitor your pipeline at", aml_url)


Found the config file in: /config.json
Uploading src (0.01 MBs): 100%|██████████| 9319/9319 [00:00<00:00, 67640.35it/s]




Monitor your pipeline at https://ml.azure.com/runs/amusing_octopus_j9bcp177pr?wsid=/subscriptions/3b7a65ed-df6d-4020-9010-5585f2149752/resourcegroups/rg-test-1/workspaces/mlstudio-test-1&tid=dc0b52a3-68c5-44f7-881d-9383d8850b96
