In [1]:
# Print the installed azureml-datadrift package metadata (version, location, dependencies).
!pip show azureml-datadrift

Name: azureml-datadrift
Version: 1.33.0
Summary: Azure Machine Learning datadrift
Home-page: https://docs.microsoft.com/python/api/overview/azure/ml/?view=azure-ml-py
Author: Microsoft Corp
Author-email: None
License: https://aka.ms/azureml-sdk-license
Location: /anaconda/envs/azureml_py36/lib/python3.6/site-packages
Requires: pandas, azureml-telemetry, azureml-pipeline-core, scikit-learn, msrest, jsonpickle, scipy, pyspark, azureml-core, matplotlib, azureml-dataset-runtime, lightgbm, numpy
Required-by: 


In [6]:


from azureml.core import Workspace

# Load the workspace handle through Workspace.from_config(); later cells use `ws`.
ws = Workspace.from_config()

# Echo the workspace name so the reader can confirm which workspace is in use.
workspace_name = ws.name
print('Ready to work with', workspace_name)

Ready to work with wsfinal123


In [9]:
from azureml.core import Datastore, Dataset

# Stage both baseline CSV files under 'diabetes-baseline' on the default datastore.
default_ds = ws.get_default_datastore()
baseline_files = ['diabetes.csv', 'diabetes2.csv']
default_ds.upload_files(
    files=baseline_files,
    target_path='diabetes-baseline',
    overwrite=True,
    show_progress=True,
)

# Build one tabular dataset over every CSV in that folder, then register it
# in the workspace (create_new_version=True is passed on each registration).
print('Registering baseline dataset...')
baseline_csv_path = (default_ds, 'diabetes-baseline/*.csv')
baseline_unregistered = Dataset.Tabular.from_delimited_files(path=baseline_csv_path)
baseline_data_set = baseline_unregistered.register(
    workspace=ws,
    name='diabetes baseline',
    description='diabetes baseline data',
    tags={'format': 'CSV'},
    create_new_version=True,
)

print('Baseline dataset registered!')

Uploading an estimated of 2 files
Uploading diabetes.csv
Uploaded diabetes.csv, 1 files out of an estimated total of 2
Uploading diabetes2.csv
Uploaded diabetes2.csv, 2 files out of an estimated total of 2
Uploaded 2 files
Registering baseline dataset...
Baseline dataset registered!


In [16]:
import datetime as dt
import pandas as pd

print('Generating simulated data...')

# Load the smaller of the two data files
data = pd.read_csv('diabetes2.csv')

# We'll generate data for the past 6 weeks, oldest week first
weeknos = reversed(range(6))

# Anchor every file date to one "today" value so the six files stay exactly
# one week apart even if the loop happens to run across midnight.
today = dt.date.today()

file_paths = []
for weekno in weeknos:

    # Get the date X weeks ago
    data_date = today - dt.timedelta(weeks=weekno)

    # Modify data to create some drift.
    # The drift must be written back to the SAME columns the drift monitor
    # watches ('preg_count', 'age', 'bmi'). Writing it to new columns would
    # leave the monitored features identical to the source file and make every
    # weekly file the same. Updating in place also makes the drift accumulate
    # week over week, because each iteration starts from the previous one.
    data['preg_count'] = data['preg_count'] + 1
    data['age'] = (data['age'] * 1.2).round().astype(int)
    data['bmi'] = data['bmi'] * 1.1

    # Save the file with the date encoded in the filename.
    # index=False keeps the DataFrame index out of the CSV so the target files
    # do not gain an extra unnamed column that the baseline files lack.
    file_path = 'diabetes_{}.csv'.format(data_date.strftime("%Y-%m-%d"))
    data.to_csv(file_path, index=False)
    file_paths.append(file_path)


Generating simulated data...


In [17]:
# Push the simulated weekly files into their own folder on the default datastore.
# The upload call stays the last expression so its return value is shown as cell output.
path_on_datastore = 'diabetes-target'
default_ds.upload_files(
    files=file_paths,
    target_path=path_on_datastore,
    overwrite=True,
    show_progress=True,
)

Uploading an estimated of 6 files
Uploading diabetes_2021-08-01.csv
Uploaded diabetes_2021-08-01.csv, 1 files out of an estimated total of 6
Uploading diabetes_2021-08-08.csv
Uploaded diabetes_2021-08-08.csv, 2 files out of an estimated total of 6
Uploading diabetes_2021-08-15.csv
Uploaded diabetes_2021-08-15.csv, 3 files out of an estimated total of 6
Uploading diabetes_2021-08-22.csv
Uploaded diabetes_2021-08-22.csv, 4 files out of an estimated total of 6
Uploading diabetes_2021-08-29.csv
Uploaded diabetes_2021-08-29.csv, 5 files out of an estimated total of 6
Uploading diabetes_2021-09-05.csv
Uploaded diabetes_2021-09-05.csv, 6 files out of an estimated total of 6
Uploaded 6 files


$AZUREML_DATAREFERENCE_8844fdc189b44d7882aa8c527e68b3b0

In [18]:
# Define the target dataset over every uploaded CSV. The {date:yyyy-MM-dd}
# placeholder in the partition format describes the date encoded in each
# file name, which gives the dataset a 'date' timestamp column.
partition_format = path_on_datastore + '/diabetes_{date:yyyy-MM-dd}.csv'
target_csv_path = (default_ds, path_on_datastore + '/*.csv')
target_data_set = Dataset.Tabular.from_delimited_files(
    path=target_csv_path,
    partition_format=partition_format,
)

In [19]:

# Mark 'date' as the dataset's timestamp column, then register the result
# in the workspace under the name 'diabetes target'.
print('Registering target dataset...')
timestamped_target = target_data_set.with_timestamp_columns('date')
target_data_set = timestamped_target.register(
    workspace=ws,
    name='diabetes target',
    description='diabetes target data',
    tags={'format': 'CSV'},
    create_new_version=True,
)

print('Target dataset registered!')

Registering target dataset...
Target dataset registered!


In [20]:
# Load the registered target dataset into a pandas DataFrame.
# NOTE(review): `df` is not referenced by any later cell visible here;
# presumably it is kept for ad-hoc inspection. Confirm before removing.
df = target_data_set.to_pandas_dataframe()

In [31]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "myclusterdd1"

try:
    # Look the compute target up by name in the workspace.
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The lookup raised ComputeTargetException, so try to provision the cluster.
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as provisioning_error:
        # Best-effort: report the provisioning failure instead of raising.
        print(provisioning_error)
else:
    # Only reached when the lookup above succeeded.
    print('Found existing cluster, use it.')

InProgress.....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [32]:
from azureml.datadrift import DataDriftDetector

# Columns passed to the monitor as its feature list
features = ['preg_count', 'age', 'bmi']

# Create the drift monitor from the registered baseline and target datasets,
# running on the cluster named above.
monitor = DataDriftDetector.create_from_datasets(
    ws,
    'mslearn-diabates-drift',
    baseline_data_set,
    target_data_set,
    compute_target=cluster_name,
    frequency='Week',
    feature_list=features,
    drift_threshold=0.3,
    latency=24,
)

# Bare expression: display the monitor's representation as the cell output.
monitor

{'_workspace': Workspace.create(name='wsfinal123', subscription_id='29514374-60e8-4ea7-b14f-6778779cf8e4', resource_group='defaultresourcegroup-cus'), '_frequency': 'Week', '_schedule_start': None, '_schedule_id': None, '_interval': 1, '_state': 'Disabled', '_alert_config': None, '_type': 'DatasetBased', '_id': 'c314b402-3d2a-47f1-afb2-427b70300d13', '_model_name': None, '_model_version': 0, '_services': None, '_compute_target_name': 'myclusterdd1', '_drift_threshold': 0.3, '_baseline_dataset_id': '7a146255-632a-4a5b-9684-29161654e32d', '_target_dataset_id': '50eaa2ab-d68e-40c7-995c-3ef7863582b2', '_feature_list': ['preg_count', 'age', 'bmi'], '_latency': 24, '_name': 'mslearn-diabates-drift', '_latest_run_time': None, '_client': <azureml.datadrift._restclient.datadrift_client.DataDriftClient object at 0x7f153ff43d30>, '_logger': <_TelemetryLoggerContextAdapter azureml.datadrift._logging._telemetry_logger.azureml.datadrift.datadriftdetector (DEBUG)>}

In [None]:
from azureml.widgets import RunDetails

# Backfill window: from six weeks before now up to now.
backfill_start = dt.datetime.now() - dt.timedelta(weeks=6)
backfill_end = dt.datetime.now()
backfill = monitor.backfill(backfill_start, backfill_end)

# Show the run widget, then wait for the backfill run to complete.
RunDetails(backfill).show()
backfill.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [None]:
# Fetch the metrics recorded by the backfill run and print one
# "<name> <value>" line per metric.
drift_metrics = backfill.get_metrics()
for metric, metric_value in drift_metrics.items():
    print(metric, metric_value)