# Log in to the Azure Cloud Platform

In [1]:
################################### Azure ####################

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml.entities import Environment
from azure.ai.ml import command
from azure.ai.ml import dsl,Input, Output
import mlflow
import logging
import webbrowser

############################# Data Analysis & Others ############################

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import argparse
import os

In [2]:
# Try DefaultAzureCredential first; fall back to an interactive browser login
# only when it cannot produce a token.
try:
    credential = DefaultAzureCredential()
    # Requesting a management-plane token up front checks that the credential works.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Report why the default credential failed instead of discarding the error.
    logging.warning(
        "DefaultAzureCredential failed (%s); falling back to InteractiveBrowserCredential.",
        ex,
    )
    credential = InteractiveBrowserCredential()

# Access the Resource Group and Workspace

In [3]:
# Workspace coordinates are read from environment variables when they are set;
# otherwise the placeholder values below are used (replace them with your own).
ml_client = MLClient(
    credential=credential,
    subscription_id=os.environ.get("AZURE_SUBSCRIPTION_ID", "xxxxx-xxx-xxx-xxx-xxxxxxxxxxxxx"),
    resource_group_name=os.environ.get("AZURE_RESOURCE_GROUP", "your_resource_group"),
    workspace_name=os.environ.get("AZURE_ML_WORKSPACE", "your_workspace_name"),
)

# Retrieve the URL from Azure Blob Storage

In [4]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# HTTPS location of the training CSV; used as the `path` of the data asset registered below.
web_path = "https://your_resource_group.blob.core.windows.net/mobile/train.csv"

# Register the data as a 'Data' asset in the Azure ML Workspace

In [5]:
# Describe the CSV at `web_path` as a single-file (URI_FILE) data asset.
mobile_asset = Data(
    name="MobileClassifications",
    version="1.0.2",
    description="Dataset for mobile",
    path=web_path,
    type=AssetTypes.URI_FILE,
    tags={"source_type": "web", "source": "AzureML examples blob"},
)

# Register (or update) the asset; `data` refers to the entity returned by the workspace.
data = ml_client.data.create_or_update(mobile_asset)
print(f"{data.name} dataset was registered to workspace")

MobileClassifications dataset was registered to workspace


# Create a Compute Cluster

In [6]:
from azure.ai.ml.entities import AmlCompute

# Name of the compute cluster used by the pipeline steps.
cpu_compute = "cpu-cluster12345"

try:
    # Reuse the cluster when it already exists in the workspace.
    cpu_cluster = ml_client.compute.get(cpu_compute)
    print(f"You already have a cluster named {cpu_compute}")
except Exception:
    # NOTE(review): any failure of the lookup (not only "not found") ends up here;
    # narrowing the exception type would avoid creating a cluster on auth/network errors.
    cpu_cluster = AmlCompute(
        name=cpu_compute,  # single source of truth for the cluster name
        type="amlcompute",
        size="Standard_F2s_v2",
        min_instances=2,
        max_instances=4,
        idle_time_before_scale_down=120,
        tier="Dedicated",
    )
    # begin_create_or_update only starts the provisioning operation and returns a
    # poller; .result() waits for it to finish so `cpu_cluster` is the created
    # compute (same kind of object as in the `try` branch), not the poller.
    cpu_cluster = ml_client.begin_create_or_update(cpu_cluster).result()

You already have a cluster named cpu-cluster12345


# Create Custom Environment

In [7]:
# Local folder that holds the conda specification for the custom environment.
dependencies_dir = "./dependencies"
# exist_ok=True makes this cell safe to re-run.
os.makedirs(dependencies_dir, exist_ok=True)

In [8]:
%%writefile {dependencies_dir}/train_env.yaml

# Conda specification for the custom training environment; this file is passed
# as `conda_file` when the Environment is registered.
name: sklearn-1.5
channels:
- conda-forge
- anaconda
dependencies:
# Packages resolved by conda.
- python=3.10
- pip=21.3.1
- pandas~=1.5.3
- scipy~=1.10.0
- numpy~=1.22.0
# Packages installed with pip inside the conda environment.
# NOTE(review): azureml-sdk, mltable and tensorboard are unpinned, unlike the other entries.
- pip:
  - scikit-learn-intelex==2024.7.0
  - azureml-sdk[notebooks,automl]
  - azureml-core==1.57.0.post1
  - azureml-defaults==1.57.0.post1
  - azureml-mlflow==1.57.0.post1
  - azureml-telemetry==1.57.0
  - scikit-learn~=1.5.0
  - joblib~=1.2.0
  - azure-ai-ml==1.9.0
  - mltable
  # azureml-automl-common-tools packages
  - py-spy==0.3.12
  - debugpy~=1.6.3
  - ipykernel~=6.0
  - tensorboard
  - psutil~=5.8.0
  - matplotlib~=3.5.0
  - tqdm~=4.66.3
  - py-cpuinfo==5.0.0
  - torch-tb-profiler~=0.4.0


Overwriting ./dependencies/train_env.yaml


In [9]:
custom_env_name = "train_env"

# The environment combines a base Docker image with the conda specification
# written to train_env.yaml.
pipeline_job_env_train = Environment(
    name=custom_env_name,
    description="custom environment ",
    # Tag matches the scikit-learn pin in train_env.yaml (~=1.5.0); it previously
    # advertised 0.24.2, which is not what the environment installs.
    tags={"scikit-learn": "1.5"},
    conda_file=os.path.join(dependencies_dir, "train_env.yaml"),
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    version="0.215112",
)
# Register the environment; the returned entity is used later to reference it by name and version.
pipeline_job_env_train = ml_client.environments.create_or_update(pipeline_job_env_train)

print(
    f"Environment with name {pipeline_job_env_train.name} is registered to workspace, the environment version is {pipeline_job_env_train.version}"
)

Environment with name train_env is registered to workspace, the environment version is 0.215112


# Creating the Pipeline

## Split Data

In [10]:
# Source folder for the split step; it is passed as `code=` when the component is defined.
split_data_dir = "./automl/split_data"

os.makedirs(split_data_dir, exist_ok=True)

In [11]:
%%writefile {split_data_dir}/split_data.py

import os
import argparse
import pandas as pd
import logging
import mlflow
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import mltable

def main():
    """Split the input CSV into train/test sets and write them to the output folders.

    Command-line arguments:
        --data:       path to the input CSV; must contain a ``price_range`` column.
        --train_data: output folder; receives ``train_data.csv`` and an MLTable definition.
        --test_data:  output folder; receives ``test_data.csv``.
    """
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, required=True, help="path to input data")
    parser.add_argument("--train_data", type=str, required=True, help="folder to save the training split")
    parser.add_argument("--test_data", type=str, required=True, help="folder to save the test split")
    args = parser.parse_args()

    # The context manager ends the MLflow run even when a step below raises.
    with mlflow.start_run():
        logging.info("Input data: %s", args.data)
        print("Input data:", args.data)

        df = pd.read_csv(args.data)
        X = df.drop(columns="price_range")
        y = df["price_range"]

        # Cast the known flag columns (those that are present) to bool.
        boolean_columns = ["blue", "dual_sim", "four_g", "three_g", "touch_screen", "wifi"]
        X = X.astype({col: bool for col in boolean_columns if col in X.columns})

        # Fixed seed keeps the 80/20 split reproducible between runs.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        train_df = pd.concat([X_train, y_train], axis=1)
        test_df = pd.concat([X_test, y_test], axis=1)

        # Make sure the output folders exist (e.g. when the script is run outside a pipeline).
        os.makedirs(args.train_data, exist_ok=True)
        os.makedirs(args.test_data, exist_ok=True)

        train_data_csv = os.path.join(args.train_data, "train_data.csv")
        train_df.to_csv(train_data_csv, index=False)

        test_data_csv = os.path.join(args.test_data, "test_data.csv")
        test_df.to_csv(test_data_csv, index=False)

        # Describe the training CSV with an MLTable saved into the same folder.
        train_table = mltable.from_delimited_files([{"file": train_data_csv}])
        train_table.save(args.train_data)

        logging.info("Training data saved and MLTable created at %s", args.train_data)
        logging.info("Test data saved at %s", args.test_data)


if __name__ == "__main__":
    main()



Writing ./automl/split_data/split_data.py


In [16]:
# Command component wrapping split_data.py.
# The script reads a single CSV with pandas and the pipeline supplies a uri_file
# input, so `data` is declared as a file rather than a folder.
split_data_component = command(
    name="split_data",
    display_name="Split Data",
    description="Split Data",
    inputs={"data": Input(type="uri_file")},
    outputs=dict(
        train_data=Output(type="uri_folder", mode="rw_mount"),
        test_data=Output(type="uri_folder", mode="rw_mount"),
    ),
    code=split_data_dir,
    # Adjacent literals form one command line without the long runs of spaces
    # that backslash continuations inside a triple-quoted string leave behind.
    command=(
        "python split_data.py "
        "--data ${{inputs.data}} "
        "--train_data ${{outputs.train_data}} "
        "--test_data ${{outputs.test_data}}"
    ),
    environment=f"{pipeline_job_env_train.name}:{pipeline_job_env_train.version}",
)

# Register the component so it can be called as a step inside a pipeline function.
split_data_component = ml_client.create_or_update(split_data_component.component)
print(f"Component {split_data_component.name} with Version {split_data_component.version} is registered")

Component split_data with Version 2024-10-05-17-27-40-2794807 is registered


# AutoML Training

In [17]:
# Folder reserved for AutoML training assets.
# NOTE(review): `train_dir` is not referenced again in the visible cells — confirm it is still needed.
train_dir = "./automl/train_automl"

os.makedirs(train_dir, exist_ok=True)

In [18]:
from azure.ai.ml import Input, Output, automl, dsl
from azure.ai.ml.automl import classification
from azure.ai.ml.entities._job.automl.tabular import TabularFeaturizationSettings
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.entities import Model
import os
import joblib
import pandas as pd
import mlflow
import mlflow.sklearn

@pipeline(
    description="AutoML Classification Pipeline",
)
def automl_classification(classification_train_data):
    """Split the raw data, then run AutoML classification on the training split.

    Args:
        classification_train_data: pipeline input forwarded to the split
            component's ``data`` input.

    Returns:
        dict with ``best_model`` (the AutoML node's best-model output) and
        ``test_data`` (the held-out split produced by the split step).
    """
    # Step 1: split into train / test folders.
    data_preprocessing_job = split_data_component(
        data=classification_train_data
    )
    processed_train_data = data_preprocessing_job.outputs.train_data
    processed_test_data = data_preprocessing_job.outputs.test_data

    # Step 2: AutoML classification on the training split.
    classification_job = classification(
        training_data=processed_train_data,
        target_column_name="price_range",
        primary_metric="Accuracy",
        featurization=TabularFeaturizationSettings(mode="Auto"),
        n_cross_validations=5,
        # The best model is an MLflow model folder (model.pkl is later loaded from
        # inside it), so the output is declared as mlflow_model, not a single file.
        outputs={"best_model": Output(type="mlflow_model")},
        display_name="Auto ML Training",
    )

    # Bound the sweep: total budget, per-trial budget and number of trials.
    classification_job.set_limits(
        timeout_minutes=15,
        trial_timeout_minutes=2,
        max_trials=40,
        enable_early_termination=True,
    )

    return {
        "best_model": classification_job.outputs.best_model,
        "test_data": processed_test_data,
    }

# Bind the registered dataset to the pipeline input.
pipeline_classification = automl_classification(
    classification_train_data=Input(type="uri_file", path=data.path)
)

# Run every step on the cluster defined in the compute cell; reusing `cpu_compute`
# keeps the cluster name in one place instead of repeating the literal here.
pipeline_classification.settings.default_compute = cpu_compute

# Submit the pipeline; the returned job is used later to check status and download outputs.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_classification, experiment_name="pipeline_samples"
)



Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
pathOnCompute is not a known attribute

# Download best model and test data

In [21]:

# Wait for the pipeline to finish before inspecting its status; without this the
# check below runs while the job is still in progress when the notebook is
# executed top to bottom.
# NOTE(review): jobs.stream may raise if the job fails — confirm against the SDK docs.
ml_client.jobs.stream(pipeline_job.name)

# Refresh the job to read its final status.
pipeline_job = ml_client.jobs.get(pipeline_job.name)
if pipeline_job.status == "Completed":
    # Both named pipeline outputs are downloaded under the current directory.
    for output_name in ("best_model", "test_data"):
        ml_client.jobs.download(pipeline_job.name, download_path=".", output_name=output_name)
    print("Download completed.")
else:
    print("Job did not complete successfully.")

Downloading artifact azureml://subscriptions/254266a3-960a-40c6-a182-ebfafdc787d6/resourcegroups/P1/workspaces/machinelearning/datastores/workspaceartifactstore/paths/ExperimentRun/dcid.4c6c2b16-2f0d-45be-a075-9fe8558dd715_22/outputs/mlflow-model to named-outputs/best_model
Downloading artifact azureml://subscriptions/254266a3-960a-40c6-a182-ebfafdc787d6/resourcegroups/P1/workspaces/machinelearning/datastores/workspaceblobstore/paths/azureml/96502c88-48c6-4b37-b3a8-d85d5d15a7e3/test_data/ to named-outputs/test_data


Download completed.


# Model Evaluation

In [22]:
import joblib
# Path of the model file inside the downloaded `best_model` output.
# NOTE(review): joblib.load unpickles the file; only load artifacts from a trusted source.
path = ('./named-outputs/best_model/model.pkl')
model = joblib.load(path)

In [23]:
model

In [24]:
import pandas as pd

# Held-out split downloaded from the pipeline's `test_data` output.
test = pd.read_csv("./named-outputs/test_data/test_data.csv")

# Separate the target column from the features.
y_test = test["price_range"]
X_test = test.drop(columns="price_range")

In [28]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, classification_report

# Predict on the held-out features with the downloaded model.
y_pred = model.predict(X_test)

# F1 computed with average='weighted' across the classes; the bare `f1` displays the value.
f1 = f1_score(y_test, y_pred, average='weighted')
f1

0.9374987767777142

In [34]:
# classification_report returns a formatted text table, so print() is used to keep its layout.
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.97      0.97      0.97       105
           1       0.91      0.93      0.92        91
           2       0.90      0.89      0.90        92
           3       0.95      0.95      0.95       112

    accuracy                           0.94       400
   macro avg       0.94      0.94      0.94       400
weighted avg       0.94      0.94      0.94       400

