# Train a model in Azure Machine Learning

This code sample is based on the Getting Started with Azure Machine Learning tutorial:

Microsoft Learn: https://learn.microsoft.com/en-us/azure/machine-learning/tutorial-train-model?view=azureml-api-2

GitHub: https://github.com/Azure/azureml-examples/blob/main/tutorials/get-started-notebooks/train-model.ipynb

The only modification to the original is the addition of a few feature engineering steps to `main.py`, included to demonstrate a tender requirement.

This notebook serves as a baseline for a classic Azure Machine Learning notebook for training models.

**Note**: This notebook is intended to be run on a Compute Instance in Azure Machine Learning.

In [4]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Workspace coordinates -- edit these three values to target your own workspace.
SUBSCRIPTION = "<subscription-id>"  # subscription that owns the resource group
RESOURCE_GROUP = "rg-we-atpws-aml"  # resource group holding the Azure Machine Learning workspace
WS_NAME = "aml-ws-atp001"  # Azure Machine Learning workspace inside that resource group

# Authenticate with DefaultAzureCredential.
credential = DefaultAzureCredential()

# Handle to the workspace; the cells below send all their requests through it.
ml_client = MLClient(
    workspace_name=WS_NAME,
    resource_group_name=RESOURCE_GROUP,
    subscription_id=SUBSCRIPTION,
    credential=credential,
)

In [5]:
# Sanity check: fetch the workspace through the handle and show where it lives.
# If you get an error here, modify your SUBSCRIPTION, RESOURCE_GROUP, and WS_NAME in the previous cell.
ws = ml_client.workspaces.get(WS_NAME)
print(f"{ws.location} : {ws.resource_group}")

westeurope : rg-we-atpws-aml


In [6]:
import os

# Folder for the environment definition files; created on first run and
# left untouched if it already exists.
dependencies_dir = "./dependencies"
os.makedirs(name=dependencies_dir, exist_ok=True)

In [7]:
%%writefile {dependencies_dir}/conda.yaml
# Conda specification of the training environment. The next code cell passes
# this file to Environment(conda_file=...) when registering the environment.
name: model-env
channels:
  - conda-forge
dependencies:
  # Packages resolved by conda.
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  # The Environment cell tags the environment with this scikit-learn version;
  # update both places together.
  - scikit-learn=1.0.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  # Packages installed with pip inside the conda environment.
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - mlflow==2.8.0
    - mlflow-skinny==2.8.0
    - azureml-mlflow==1.51.0
    - psutil>=5.8,<5.9
    - tqdm>=4.59,<4.60
    - ipykernel~=6.0
    - matplotlib

Writing ./dependencies/conda.yaml


In [8]:
from azure.ai.ml.entities import Environment

# Name under which the environment is registered in the workspace.
custom_env_name = "aml-scikit-learn"

# Describe the environment (base image plus the conda spec written above) and
# register it in a single step; the call returns the registered environment.
custom_job_env = ml_client.environments.create_or_update(
    Environment(
        name=custom_env_name,
        description="Custom environment for Credit Card Defaults job",
        tags={"scikit-learn": "1.0.2"},
        conda_file=os.path.join(dependencies_dir, "conda.yaml"),
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    )
)

# Report the name and version assigned by the workspace.
print(
    f"Environment with name {custom_job_env.name} is registered to workspace, the environment version is {custom_job_env.version}"
)

Environment with name aml-scikit-learn is registered to workspace, the environment version is 1


In [9]:
import os

# Folder that receives the training script; created on first run and left
# untouched if it already exists.
train_src_dir = "./src"
os.makedirs(name=train_src_dir, exist_ok=True)

In [None]:
%%writefile {train_src_dir}/main.py
import os
import argparse
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np

def main():
    """Train, evaluate and register a credit-default classifier.

    Reads the CSV given by ``--data``, drops duplicate and incomplete rows,
    adds three engineered features, fits a ``GradientBoostingClassifier``,
    prints a classification report for the held-out rows and logs/registers
    the model through MLflow.

    Command-line arguments:
        --data: path to the input CSV (required).
        --test_train_ratio: fraction of rows held out for testing (default 0.25).
        --n_estimators: number of boosting stages (default 100).
        --learning_rate: boosting learning rate (default 0.1).
        --random_state: optional integer seed for the train/test split and the
            classifier. When omitted the run is unseeded.
        --registered_model_name: name used to register the model (required).
    """

    # input and output arguments
    parser = argparse.ArgumentParser()
    # --data and --registered_model_name are used unconditionally below, so
    # fail early with a clear argparse error instead of an obscure None error.
    parser.add_argument("--data", type=str, required=True, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--n_estimators", required=False, default=100, type=int)
    parser.add_argument("--learning_rate", required=False, default=0.1, type=float)
    parser.add_argument(
        "--random_state",
        required=False,
        default=None,
        type=int,
        help="optional seed for the train/test split and the classifier",
    )
    parser.add_argument("--registered_model_name", type=str, required=True, help="model name")
    args = parser.parse_args()

    # Start Logging. The context manager closes the run when the block exits,
    # and marks it as failed if any step below raises.
    with mlflow.start_run():
        # enable autologging
        mlflow.sklearn.autolog()

        ###################
        #<prepare the data>
        ###################
        print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

        print("input data:", args.data)

        # header=1: the column names are on the second line of the file;
        # index_col=0: the first column is used as the row index.
        credit_df = pd.read_csv(args.data, header=1, index_col=0)

        # Logged before cleaning, so these describe the raw input.
        mlflow.log_metric("num_samples", credit_df.shape[0])
        mlflow.log_metric("num_features", credit_df.shape[1] - 1)

        ###########################
        #<data cleaning>
        ###########################
        # Remove duplicates
        rows_before = credit_df.shape[0]
        credit_df = credit_df.drop_duplicates()
        print(f"Removed {rows_before - credit_df.shape[0]} duplicate rows")

        # Handle missing values. This counts null *cells*; after dropna() none
        # remain, so the count taken here is the number removed.
        nulls_removed = credit_df.isnull().sum().sum()
        credit_df = credit_df.dropna()
        print(f"Removed {nulls_removed} null values")
        print(f"Final dataset size: {credit_df.shape[0]} rows")
        ###########################
        #</data cleaning>
        ###########################

        ###########################
        #<feature engineering>
        ###########################
        # Feature 1: Credit Utilization Ratio
        # Ratio of the most recent bill amount to the credit limit.
        credit_df["credit_utilization"] = _safe_ratio(
            credit_df["BILL_AMT1"], credit_df["LIMIT_BAL"] + 1
        )

        # Feature 2: Average Payment Delay
        # Mean of the payment-status columns across all months.
        payment_cols = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
        credit_df["avg_payment_delay"] = credit_df[payment_cols].mean(axis=1)

        # Feature 3: Payment to Bill Ratio
        # Ratio of total payments to total bills.
        bill_cols = ["BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"]
        pay_cols = ["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]

        total_bills = credit_df[bill_cols].sum(axis=1)
        total_payments = credit_df[pay_cols].sum(axis=1)
        # The +1 offset alone does not rule out a zero denominator (a bill
        # total of -1 would produce one), hence the _safe_ratio guard.
        credit_df["payment_to_bill_ratio"] = _safe_ratio(total_payments, total_bills + 1)

        print("Added features: credit_utilization, avg_payment_delay, payment_to_bill_ratio")
        ###########################
        #</feature engineering>
        ###########################

        # Split train and test datasets (seeded only when --random_state is given)
        train_df, test_df = train_test_split(
            credit_df,
            test_size=args.test_train_ratio,
            random_state=args.random_state,
        )
        ####################
        #</prepare the data>
        ####################

        ##################
        #<train the model>
        ##################
        # Extract the label column; the remaining columns are the features.
        y_train = train_df.pop("default payment next month")
        X_train = train_df.values

        y_test = test_df.pop("default payment next month")
        X_test = test_df.values

        print(f"Training with data of shape {X_train.shape}")

        clf = GradientBoostingClassifier(
            n_estimators=args.n_estimators,
            learning_rate=args.learning_rate,
            random_state=args.random_state,
        )
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        print(classification_report(y_test, y_pred))
        ###################
        #</train the model>
        ###################

        ##########################
        #<save and register model>
        ##########################
        # Registering the model to the workspace
        print("Registering the model via MLFlow")
        mlflow.sklearn.log_model(
            sk_model=clf,
            registered_model_name=args.registered_model_name,
            artifact_path=args.registered_model_name,
        )

        # Saving the model to a file
        mlflow.sklearn.save_model(
            sk_model=clf,
            path=os.path.join(args.registered_model_name, "trained_model"),
        )
        ###########################
        #</save and register model>
        ###########################


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator with inf and NaN results replaced by 0.0."""
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0.0)

# Run the training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

In [None]:
!pip install mltable

In [12]:
from azure.ai.ml import command
from azure.ai.ml import Input

# Name under which main.py registers the trained model; also reused as the
# job's display name so the two cannot drift apart.
registered_model_name = "credit_default_prediction_standard"

# Command job that runs main.py in the custom environment. Each
# ${{inputs.<name>}} placeholder in the command line is filled from the
# matching entry of `inputs`; --n_estimators is not passed, so main.py falls
# back to its own default.
job = command(
    inputs=dict(
        data=Input(
            type="uri_file",
            path="https://azuremlexamples.blob.core.windows.net/datasets/credit_card/default_of_credit_card_clients.csv",
        ),
        test_train_ratio=0.2,
        learning_rate=0.25,
        registered_model_name=registered_model_name,
    ),
    code=train_src_dir,  # folder holding main.py, created in an earlier cell
    command="python main.py --data ${{inputs.data}} --test_train_ratio ${{inputs.test_train_ratio}} --learning_rate ${{inputs.learning_rate}} --registered_model_name ${{inputs.registered_model_name}}",
    environment=f"{custom_env_name}@latest",  # latest version of the environment registered earlier
    display_name=registered_model_name,
)

### Submit the job

In [13]:
# Submit the command job to the workspace; the returned job is shown as the cell output.
ml_client.create_or_update(job)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
[32mUploading src (0.0 MBs): 100%|███

Experiment,Name,Type,Status,Details Page
jeffreyadmin,frosty_collar_49twbj3f2p,command,Starting,Link to Azure Machine Learning studio
