# Integrate Azure ML with synapse

In [2]:
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

subscription_id = "8c5f588e-7bbb-44bb-ac46-5c280a516c9f"
resource_group = "resource_group_final"
workspace_name = "synapse-workspace-final"

credential = DefaultAzureCredential()

ml_client = MLClient(credential, subscription_id, resource_group, workspace_name)

## Connect to Synapse DataLake

In [3]:
from azureml.core import Workspace
from azureml.core.datastore import Datastore

# Load your Azure ML Workspace
workspace = Workspace.from_config()

# Register a new datastore with ADLS Gen2 info
datastore = Datastore.register_azure_blob_container(
    workspace=workspace,
    datastore_name='irrelevant_datastore',
    account_name='finalaccountdatalake',  # ADLS Gen2 storage account name
    container_name='datalakefs',
    account_key='CjVN5Ex9XDQVHorTKhz/7sUDoYjUTYuMskDriQblj6+W0hiF0IZnNRlV6fFrEqblrrTE0+zKbffN+AStys7vjQ=='
)

## List files in storage

In [4]:
# pip install azure-storage-blob

In [5]:
from azureml.core import Workspace, Datastore
from azure.core.exceptions import ResourceNotFoundError
from azure.storage.blob import BlobServiceClient


# Get the datastore
datastore = Datastore.get(workspace, 'irrelevant_datastore')

# Create a BlobServiceClient
blob_service_client = BlobServiceClient(account_url=f"https://{datastore.account_name}.blob.core.windows.net",
                                        credential=datastore.account_key)

try:
    # Get a blob container client
    container_client = blob_service_client.get_container_client(datastore.container_name)
    
    # List blobs in the container
    blob_list = container_client.list_blobs(name_starts_with='/')  # Adjust the path as needed
    for blob in blob_list:
        print(blob.name)
except ResourceNotFoundError:
    print("The specified container does not exist.")

apple.csv


## Get data from Synapse DataLake

In [6]:
from azureml.core import Dataset

# Create a dataset from a file path in the datastore
dataset = Dataset.Tabular.from_delimited_files(path=(datastore, '/apple.csv'))

# Load the dataset into a dataframe (for example purposes)
dataframe = dataset.to_pandas_dataframe()
dataframe

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590,good
1,1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809,good
2,2.0,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,bad
3,3.0,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723,good
4,4.0,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,good
...,...,...,...,...,...,...,...,...,...
3996,3996.0,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235,good
3997,3997.0,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611,bad
3998,3998.0,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229720,good
3999,3999.0,0.278540,-1.715505,0.121217,-1.154075,1.266677,-0.776571,1.599796,good


# Train part

In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
import mlflow
import numpy as np
from azureml.core import Workspace



# connect to your workspace
ws = Workspace.from_config()

In [9]:
df = dataset.to_pandas_dataframe()

df.dropna(inplace=True)

df.drop('A_id', axis=1, inplace=True)

df = df[df['Acidity'] != 'Created_by_Nidula_Elgiriyewithana']
df['Acidity'] = df['Acidity'].astype(float)

le = LabelEncoder()
df['Quality'] = le.fit_transform(df['Quality'])

In [10]:
X = df.drop('Quality', axis=1)
y = df['Quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# create experiment and start logging to a new run in the experiment
experiment_name = "azure-ml-apple-quality"

# set up MLflow to track the metrics
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(experiment_name)
mlflow.autolog()

# set up the Logistic regression model
mlp_clf = MLPClassifier(hidden_layer_sizes=(100, 50), learning_rate_init=0.075)

# train the model
with mlflow.start_run() as run:
    scores = cross_val_score(mlp_clf, X, y, cv=5) 

print("MLP Classifier cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())

# register the model
model_uri = "runs:/{}/model".format(run.info.run_id)
model = mlflow.register_model(model_uri, "sklearn_apple_model")

2024/03/20 20:08:51 INFO mlflow.tracking.fluent: Experiment with name 'azure-ml-apple-quality' does not exist. Creating a new experiment.
2024/03/20 20:08:54 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.67d3b172-8575-4637-b701-9eeb80289a7b/training_confusion_matrix.png already exists.
UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.67d3b172-8575-4637-b701-9eeb80289a7b/training_roc_curve.png already exists.
UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.67d3b172-8575-4637-b701-9eeb80289a7b/training_confusion_matrix.png already exists.
UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.67d3b172-8575-4637-b701-9eeb80289a7b/training_roc_curve.png already exists.
UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.67d3b172-8575-4637-b701-9eeb80289a7b/training_confusion_matrix.png already exists.
UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.67d3b1

MLP Classifier cross-validation scores: [0.93625 0.92125 0.92375 0.93    0.945  ]
Mean accuracy: 0.93125
