In [1]:
import os

import mltable
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
import mlflow

from scipy.stats import loguniform

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
mlflow.start_run()

mlflow.sklearn.autolog()

os.makedirs("./output", exist_ok=True)

In [None]:
ml_client = MLClient.from_config(credential=DefaultAzureCredential())
data_asset = ml_client.data.get("kidney-stone", version="2")

table = mltable.load(f'azureml:/{data_asset.id}')

data = table.to_pandas_dataframe()

In [4]:
features = data.drop(['target'], axis=1)
target = data['target']

knn_pipeline = make_pipeline(PCA(n_components=3),
                             StandardScaler(),
                             KNeighborsClassifier(),
                             memory='cache')

In [5]:
grid_parameters = {'kneighborsclassifier__n_neighbors': [i for i in range(1, 17, 2)],
                   'kneighborsclassifier__weights': ['uniform', 'distance']}

grid_search = GridSearchCV(knn_pipeline, grid_parameters)

In [6]:
grid_search.fit(features, target)
print('Best score:', grid_search.best_score_, 'Parameters:', grid_search.best_params_)

2024/03/02 22:48:41 INFO mlflow.sklearn.utils: Logging the 5 best runs, 11 runs will be omitted.


Best score: 0.7616666666666667 Parameters: {'kneighborsclassifier__n_neighbors': 1, 'kneighborsclassifier__weights': 'uniform'}


In [None]:
mlflow.sklearn.log_model(sk_model=knn_pipeline, registered_model_name='knn_model', artifact_path='artifacts_knn')

In [8]:
svc_pipeline = knn_pipeline
svc_pipeline.steps[-1] = ['svc', SVC(random_state=42, kernel='rbf')]

In [9]:
regularization_range = loguniform(0.001, 10000.0)
gamma_range = loguniform(0.001, 10.0)

random_parameters = {'svc__C': regularization_range, 'svc__gamma':gamma_range}
random_search = RandomizedSearchCV(svc_pipeline, random_parameters, random_state=42)

In [10]:
random_search.fit(features, target)
print('Best score:', random_search.best_score_, 'Parameters:', random_search.best_params_)

UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.3fd7696f-8ba7-431f-8cf3-ae09cb992dcb/training_confusion_matrix.png already exists.
UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.3fd7696f-8ba7-431f-8cf3-ae09cb992dcb/training_roc_curve.png already exists.


Best score: 0.7341666666666666 Parameters: {'svc__C': 671.5811311069941, 'svc__gamma': 0.007068974950624601}


In [None]:
mlflow.sklearn.log_model(sk_model=svc_pipeline, registered_model_name='svc_model', artifact_path='artifacts_svc')

In [12]:
mlflow.end_run()