# Connect to your workspace

In [1]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
# Coordinates of the Azure Machine Learning workspace used by this notebook.
subscription_id = "78a604aa-17cd-47e5-bc3b-097c9395aa3f"
resource_group = "demo"
workspace = "mlw-dp100-labs"

# Client handle for the workspace; every later cell talks to Azure ML through it.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)



# Create the scripts

In [3]:
import os

# Folder that collects the scripts written by the %%writefile cells below.
script_folder = 'src'
if not os.path.isdir(script_folder):
    os.makedirs(script_folder, exist_ok=True)
print(script_folder, 'folder created')

src folder created


In [5]:
%%writefile $script_folder/prep-data.py
# import libraries
import argparse
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Seed passed to train_test_split so the train/test partition is the same on every run.
Random_State=42

# Parse command-line arguments
def parse_args():
    """Parse the command-line arguments of the data-preparation script.

    Both options are required: without them the script would only fail later,
    inside pandas/pathlib, with an error that does not mention the missing
    argument.

    Returns:
        argparse.Namespace: ``input_data`` (CSV file to read) and
        ``output_data`` (folder that receives ``train.csv`` and ``test.csv``).

    Raises:
        SystemExit: Raised by argparse, with a usage message, when a required
            option is missing or unknown options are supplied.
    """
    parser = argparse.ArgumentParser(
        description="Clean, split and scale the diabetes dataset."
    )
    parser.add_argument(
        "--input_data",
        type=str,
        dest="input_data",
        required=True,
        help="Path of the CSV file holding the raw data.",
    )
    parser.add_argument(
        "--output_data",
        type=str,
        dest="output_data",
        required=True,
        help="Folder in which train.csv and test.csv are written.",
    )
    return parser.parse_args()

def get_data(input_path):
    """Read the raw dataset.

    Args:
        input_path: Location of the CSV file; forwarded unchanged to
            ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(input_path)


# remove missing values and duplicates
def clean_data(df):
    """Return a copy of ``df`` without incomplete or repeated rows.

    Rows containing any missing value are removed first, then exact duplicate
    rows are removed (the first occurrence is kept). The index of the
    surviving rows is left as it was; the input frame is not modified.

    Args:
        df: pandas.DataFrame to clean.

    Returns:
        pandas.DataFrame: The filtered rows.
    """
    return df.dropna().drop_duplicates()

# split data
def split_data(df):
    """Split the cleaned frame into train and test arrays.

    The eight feature columns and the ``Diabetic`` label are extracted as
    arrays and split 70/30, stratified on the label and seeded with the
    module-level ``Random_State``.

    Args:
        df: pandas.DataFrame containing the feature columns and ``Diabetic``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    feature_columns = [
        'Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
        'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age',
    ]
    features = df[feature_columns].values
    labels = df['Diabetic'].values

    # train_test_split returns a list; callers receive a 4-tuple.
    return tuple(
        train_test_split(
            features,
            labels,
            test_size=0.30,
            stratify=labels,
            random_state=Random_State,
        )
    )

# normalize data
def normalize_data(X_train, X_test):
    """Standardise both feature sets using statistics from the training set.

    The scaler is fitted on ``X_train`` only and then applied to both sets,
    so no information from the test set influences the scaling.

    Args:
        X_train: Training features.
        X_test: Test features.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled)``.
    """
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)


# main function
def main(args):
    """Run the preparation step: load, clean, split, scale and save.

    Args:
        args: Parsed arguments providing ``input_data`` (CSV to read) and
            ``output_data`` (folder that receives ``train.csv`` and
            ``test.csv``).
    """
    raw_frame = get_data(args.input_data)
    cleaned_frame = clean_data(raw_frame)
    X_train, X_test, y_train, y_test = split_data(cleaned_frame)
    X_train_scaled, X_test_scaled = normalize_data(X_train, X_test)

    # Make sure the destination folder exists before writing into it.
    destination = Path(args.output_data)
    destination.mkdir(parents=True, exist_ok=True)

    feature_names = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure',
                     'TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']

    # One CSV per partition: scaled features plus the label column.
    partitions = (
        ("train.csv", X_train_scaled, y_train),
        ("test.csv", X_test_scaled, y_test),
    )
    for file_name, features, labels in partitions:
        frame = pd.DataFrame(features, columns=feature_names)
        frame["Diabetic"] = labels
        frame.to_csv(destination / file_name, index=False)

if __name__ == "__main__":
    # Frame the script's log output between two rows of asterisks.
    banner = "*" * 60
    print("\n" + banner)
    args = parse_args()
    main(args)
    print(banner + "\n")
    


Overwriting src/prep-data.py


In [7]:
%%writefile $script_folder/train-model.py
# import libraries
import mlflow
import argparse
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier



# Seed passed to the random forest and XGBoost classifiers below.
Random_State=42
def parse_args():
    """Parse the command-line arguments of the training script.

    Both options are required: without them the script would only fail later,
    inside pathlib, with an error that does not mention the missing argument.

    Returns:
        argparse.Namespace: ``training_data`` (folder containing
        ``train.csv``) and ``model_output`` (folder that receives the saved
        models).

    Raises:
        SystemExit: Raised by argparse, with a usage message, when a required
            option is missing or unknown options are supplied.
    """
    parser = argparse.ArgumentParser(
        description="Train the diabetes classifiers and save them."
    )
    parser.add_argument(
        "--training_data",
        dest='training_data',
        type=str,
        required=True,
        help="Folder that contains train.csv.",
    )
    parser.add_argument(
        "--model_output",
        dest='model_output',
        type=str,
        required=True,
        help="Folder in which the trained models are saved.",
    )
    return parser.parse_args()


# function that reads the data
def load_data(training_data_path):
    """Load the training partition and separate features from the label.

    Args:
        training_data_path: Folder that contains ``train.csv``.

    Returns:
        tuple: ``(X_train, y_train)`` where ``X_train`` is the frame without
        the ``Diabetic`` column and ``y_train`` is that column.
    """
    label = "Diabetic"
    frame = pd.read_csv(Path(training_data_path) / "train.csv")
    return frame.drop(columns=label), frame[label]



def train_LR(X_train, y_train, output_path):
    """Fit a logistic regression inside a nested MLflow run and save it.

    Args:
        X_train: Training features.
        y_train: Training labels.
        output_path: ``pathlib.Path`` of the folder that receives the
            ``logistic_regression`` model sub-folder.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    with mlflow.start_run(nested=True):
        mlflow.log_param("model_type", "Logistic Regression")
        print("Training model...")
        classifier = LogisticRegression(C=(1/0.01), solver="liblinear")
        classifier.fit(X_train, y_train)
        mlflow.sklearn.save_model(classifier, output_path / "logistic_regression")

    return classifier

def train_RF(X_train, y_train, output_path):
    """Fit a random forest inside a nested MLflow run and save it.

    Args:
        X_train: Training features.
        y_train: Training labels.
        output_path: ``pathlib.Path`` of the folder that receives the
            ``random_forest`` model sub-folder.

    Returns:
        The fitted ``RandomForestClassifier`` estimator.
    """
    with mlflow.start_run(nested=True):
        mlflow.log_param("model_type", "Random Forest")
        print("Training model...")
        classifier = RandomForestClassifier(
            max_depth=8,
            min_samples_split=2,
            n_estimators=500,
            random_state=Random_State,
        )
        classifier.fit(X_train, y_train)
        mlflow.sklearn.save_model(classifier, output_path / "random_forest")

    return classifier


def train_XG(X_train, y_train, output_path):
    """Fit an XGBoost classifier inside a nested MLflow run and save it.

    The model is saved with the MLflow scikit-learn flavour, like the other
    two models produced by this script.

    Args:
        X_train: Training features.
        y_train: Training labels.
        output_path: ``pathlib.Path`` of the folder that receives the
            ``xgboost`` model sub-folder.

    Returns:
        The fitted ``XGBClassifier`` estimator.
    """
    with mlflow.start_run(nested=True):
        mlflow.log_param("model_type", "XGBoost")
        print("Training model...")
        classifier = XGBClassifier(
            learning_rate=0.01,
            max_depth=2,
            n_estimators=500,
            random_state=Random_State,
        )
        classifier.fit(X_train, y_train)
        mlflow.sklearn.save_model(classifier, output_path / "xgboost")

    return classifier




def main(args):
    """Train the three classifiers under one parent MLflow run.

    Args:
        args: Parsed arguments providing ``training_data`` (folder with
            ``train.csv``) and ``model_output`` (folder for the saved models).
    """
    X_train, y_train = load_data(args.training_data)

    # The model folder must exist before the trainers save into it.
    output_path = Path(args.model_output)
    output_path.mkdir(parents=True, exist_ok=True)

    # Parent run; each trainer opens its own nested run inside it.
    with mlflow.start_run():
        for trainer in (train_LR, train_RF, train_XG):
            trainer(X_train, y_train, output_path)
       

    

    
if __name__ == "__main__":
    # Frame the script's log output between two rows of asterisks.
    banner = "*" * 60
    print("\n" + banner)
    args = parse_args()
    main(args)
    print(banner + "\n")


Overwriting src/train-model.py


In [9]:
%%writefile $script_folder/evaluation-model.py

# import libraries
import mlflow
import argparse
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt


def parse_args():
    """Parse the command-line arguments of the evaluation script.

    Both options are required: without them the script would only fail later,
    inside pathlib, with an error that does not mention the missing argument.

    Returns:
        argparse.Namespace: ``test_data`` (folder containing ``test.csv``) and
        ``model_path`` (folder whose sub-folders are the saved models).

    Raises:
        SystemExit: Raised by argparse, with a usage message, when a required
            option is missing or unknown options are supplied.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate the saved diabetes classifiers on the test set."
    )
    parser.add_argument(
        "--test_data",
        type=str,
        dest="test_data",
        required=True,
        help="Folder that contains test.csv.",
    )
    parser.add_argument(
        "--model_path",
        type=str,
        dest="model_path",
        required=True,
        help="Folder that contains one sub-folder per saved model.",
    )
    return parser.parse_args()

# function that reads the data
def load_data(test_data_path):
    """Load the test partition and separate features from the label.

    Args:
        test_data_path: Folder that contains ``test.csv``.

    Returns:
        tuple: ``(X_test, y_test)`` where ``X_test`` is the frame without the
        ``Diabetic`` column and ``y_test`` is that column.
    """
    label = "Diabetic"
    frame = pd.read_csv(Path(test_data_path) / "test.csv")
    return frame.drop(columns=label), frame[label]



def eval_model(model, X_test, y_test):
    """Score one fitted classifier on the test set and log the results to MLflow.

    Accuracy and ROC AUC are printed and logged, and the ROC curve is saved as
    ``ROC-Curve-<model class name>.png`` in the working directory and logged
    as an artifact.

    Each metric is logged twice: under the generic keys ``accuracy``/``auc``
    (unchanged, for existing consumers) and under a key prefixed with the
    model's class name. This function is called once per model, so the
    prefixed keys are what make each value attributable to its model.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True labels. Column 1 of ``predict_proba`` is used as the
            score, i.e. a binary problem is assumed.
    """
    model_name = type(model).__name__

    # calculate accuracy
    y_hat = model.predict(X_test)
    acc = np.mean(y_hat == y_test)
    print(f"[{model_name}] Accuracy: {acc:.4f}")
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric(f"{model_name}_accuracy", acc)

    # calculate AUC from the scores of the second predict_proba column
    positive_scores = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_scores)
    print(f"[{model_name}] AUC: {auc:.4f}")
    mlflow.log_metric("auc", auc)
    mlflow.log_metric(f"{model_name}_auc", auc)

    # plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    output_file = f"ROC-Curve-{model_name}.png"
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        # Plot the diagonal 50% line
        ax.plot([0, 1], [0, 1], 'k--')
        # Plot the FPR and TPR achieved by our model
        ax.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'ROC Curve: {model_name}')
        ax.legend()
        fig.savefig(output_file)
    finally:
        # Release the figure; otherwise one stays open per evaluated model.
        plt.close(fig)

    mlflow.log_artifact(output_file)

def main(args):
    """Evaluate every saved model found under ``args.model_path``.

    Args:
        args: Parsed arguments providing ``test_data`` (folder with
            ``test.csv``) and ``model_path`` (folder whose sub-folders each
            hold one model saved with the MLflow scikit-learn flavour).
    """
    # Autologging is switched OFF here: eval_model logs its metrics explicitly.
    mlflow.autolog(disable=True)
    X_test, y_test = load_data(args.test_data)

    # iterdir() yields entries in arbitrary order; sort so that logs and
    # metric order are the same on every run.
    model_dirs = sorted(
        entry for entry in Path(args.model_path).iterdir() if entry.is_dir()
    )
    if not model_dirs:
        print(f"WARNING: no model folders found in {args.model_path}")

    for model_dir in model_dirs:
        print(f"Evaluating: {model_dir}")
        model = mlflow.sklearn.load_model(str(model_dir))
        eval_model(model, X_test, y_test)


if __name__ == "__main__":
    # Blank lines and a row of asterisks set this script's output apart in the logs.
    separator = "*" * 60
    print("\n\n")
    print(separator)

    args = parse_args()
    main(args)

    print(separator)
    print("\n\n")

Overwriting src/evaluation-model.py


# Define the components

In [11]:
%%writefile prep-data.yml
# Command component for the data-preparation step: runs prep-data.py from ./src.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: prep_data
display_name: Prepare training data
version: 1
type: command
# Single file input, handed to the script as --input_data.
inputs:
  input_data: 
    type: uri_file
# Folder output, handed to the script as --output_data; the script writes train.csv and test.csv there.
outputs:
  output_data:
    type: uri_folder
# Folder holding the script named in `command`.
code: ./src
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
# The ${{...}} placeholders refer to the input and output declared above.
command: >-
  python prep-data.py 
  --input_data ${{inputs.input_data}}
  --output_data ${{outputs.output_data}}

Overwriting prep-data.yml


In [13]:
%%writefile train-model.yml
# Command component for the training step: runs train-model.py from ./src.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: train_model
display_name: Train the logistic regression, random forest, XGBoost models
version: 1
type: command
# Folder input, handed to the script as --training_data; the script reads train.csv from it.
inputs:
  training_data: 
    type: uri_folder
# Folder output, handed to the script as --model_output; one sub-folder is written per model.
outputs:
  model_output:
    type: uri_folder
# Folder holding the script named in `command`.
code: ./src
# NOTE(review): train-model.py imports mlflow, scikit-learn and xgboost - confirm this environment provides all three.
environment: azureml:AzureML-lightgbm-3.2-ubuntu18.04-py37-cpu@latest
# The ${{...}} placeholders refer to the input and output declared above.
command: >-
  python train-model.py 
  --training_data ${{inputs.training_data}} 
  --model_output ${{outputs.model_output}} 

Overwriting train-model.yml


In [15]:
%%writefile evaluation-model.yml
# Command component for the evaluation step: runs evaluation-model.py from ./src.
# No outputs are declared; the script reports its results through MLflow.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: evaluation_model
display_name: Evaluate the logistic regression, random forest, and XGBoost models
version: 1
type: command
# test_data: folder from which the script reads test.csv (--test_data).
# model_path: folder whose sub-folders are the saved models (--model_path).
inputs:
  test_data: 
    type: uri_folder
  model_path:
    type: uri_folder

# Folder holding the script named in `command`.
code: ./src
# NOTE(review): evaluation-model.py imports mlflow, scikit-learn and matplotlib - confirm this environment provides them.
environment: azureml:AzureML-lightgbm-3.2-ubuntu18.04-py37-cpu@latest
# The ${{...}} placeholders refer to the inputs declared above.
command: >-
  python evaluation-model.py
  --test_data ${{inputs.test_data}} 
  --model_path ${{inputs.model_path}} 

Overwriting evaluation-model.yml


# Load the components

In [17]:
from azure.ai.ml import load_component
# Prefix prepended to each component spec path; empty means the notebook's own folder.
parent_dir = ""

# One component object per YAML spec written above, in pipeline order.
prep_data, train_models, eval_models = (
    load_component(source=parent_dir + spec_file)
    for spec_file in (
        "./prep-data.yml",
        "./train-model.yml",
        "./evaluation-model.yml",
    )
)

# Build the pipeline

In [19]:

from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.dsl import pipeline

# Pipeline definition chaining the three loaded components:
# prep_data -> train_models -> eval_models.
# NOTE(review): comments are used instead of a docstring or type annotations on
# purpose - the dsl decorator may read those to build the pipeline's description
# and input types; confirm before adding any.
# NOTE(review): the local variable names below presumably become the step names
# shown for the job - confirm before renaming them.
@pipeline()
def diabetes_classification(pipeline_job_input):
    
    # Step 1: clean, split and scale the input file.
    clean_split_data = prep_data(input_data=pipeline_job_input)
    # Step 2: train the models on the folder produced by step 1.
    train_model = train_models(training_data=clean_split_data.outputs.output_data)
    # Step 3: evaluate the saved models against the same prepared folder.
    # The variable is not used afterwards; the call itself adds the step.
    eval_model=eval_models(test_data=clean_split_data.outputs.output_data, model_path=train_model.outputs.model_output)

    # Outputs exposed at pipeline level; the evaluation step exposes none.
    return {
        
        "cleaned_data": clean_split_data.outputs.output_data,
        "trained_model": train_model.outputs.model_output,
        
    }

# Instantiate the pipeline with a single-file input; the path presumably refers
# to version 1 of a data asset named "diabetes-local" in the workspace - confirm.
pipeline_job = diabetes_classification(Input(type=AssetTypes.URI_FILE , path="azureml:diabetes-local:1"))

# Submit the pipeline job

In [21]:
# Job-wide defaults: compute target and datastore used by every step.
pipeline_job.settings.default_compute = "aml-cluster"
pipeline_job.settings.default_datastore = "workspaceblobstore"

# Both pipeline-level outputs use "upload" mode.
pipeline_job.outputs.trained_model.mode = "upload"
pipeline_job.outputs.cleaned_data.mode = "upload"

# Submit under the "pipeline_diabetes" experiment. The name is rebound to the
# job object returned by the service, which is then shown as the cell output.
pipeline_job = ml_client.jobs.create_or_update(pipeline_job, experiment_name="pipeline_diabetes")
pipeline_job

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
[32mUploading src (0.01 MBs): 100%|##

Experiment,Name,Type,Status,Details Page
pipeline_diabetes,mighty_bear_3kjxn8zsg3,pipeline,NotStarted,Link to Azure Machine Learning studio
