In [1]:
# Show the kernel's current working directory before changing it.
pwd

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/drift-instance/code/Users/camila.carrasco/mt.data.drift/notebooks'

In [2]:
# Move one level up (the recorded outputs show notebooks/ -> mt.data.drift/),
# presumably so the relative ./configs and ./data paths and the `src` import
# used by later cells resolve.
# NOTE: not idempotent -- re-running this cell moves up another level.
cd ..

/mnt/batch/tasks/shared/LS_root/mounts/clusters/drift-instance/code/Users/camila.carrasco/mt.data.drift


In [3]:
import pandas as pd
import numpy as np
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azureml.fsspec import AzureMachineLearningFileSystem
from azureml.core import Workspace, Datastore, Dataset
from src.utils.utils import load_workspace, load_config
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")
from typing import Tuple
import evidently
from evidently.metrics import *
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently.test_suite import TestSuite
from evidently.tests import *
from evidently.pipeline.column_mapping import ColumnMapping

In [4]:
# Record the installed evidently version next to the results.
print(evidently.__version__)

0.4.19


In [5]:
# Read the monitoring settings (they carry the Azure authentication parameters)
config_path = './configs/params/monitoring.json'
config = load_config(config_path)

# Pull the workspace coordinates out of the BASE section
subscription, resource_group, workspace = (
    config['BASE'][field]
    for field in ('subscription', 'resource_group', 'workspace')
)

# Authenticate against the workspace
ml_client = load_workspace(subscription, resource_group, workspace)

In [6]:
def load_data(file_path, columns=('Acc_x', 'Acc_y', 'Acc_z'), dtype_backend="pyarrow") -> pd.DataFrame:
    """Read a comma-separated file and keep only the requested columns.

    Parameters
    ----------
    file_path
        Location of the CSV file, passed straight to ``pd.read_csv``.
    columns
        Names of the columns to keep, in the order they should appear in the
        result. Defaults to ``Acc_x``, ``Acc_y`` and ``Acc_z``.
    dtype_backend
        Forwarded to ``pd.read_csv``. Defaults to ``"pyarrow"``. Pass ``None``
        to leave the argument out so pandas uses its default dtypes.

    Returns
    -------
    pd.DataFrame
        The loaded data restricted to ``columns``.

    Raises
    ------
    KeyError
        If a requested column is not present in the file.
    """
    read_kwargs = {"sep": ",", "engine": "pyarrow"}
    if dtype_backend is not None:
        read_kwargs["dtype_backend"] = dtype_backend

    # NOTE(review): the drift run in this notebook ends in
    # "AttributeError: 'float' object has no attribute 'shape'". Arrow-backed
    # dtypes are one candidate cause -- confirm by loading with
    # dtype_backend=None and re-running the drift cell.
    df = pd.read_csv(file_path, **read_kwargs)

    # list() so tuples and other iterables of names select columns (a bare
    # tuple would be interpreted by pandas as a single key).
    return df[list(columns)]

    
def detect_dataset_drift(
    reference: pd.DataFrame,
    current: pd.DataFrame,
    column_mapping: "ColumnMapping",
    test: str,
    threshold: float = 0.1,
) -> dict:
    """Run an evidently ``DataDriftPreset`` report and return it as a dict.

    Parameters
    ----------
    reference
        Data passed to the report as ``reference_data``.
    current
        Data passed to the report as ``current_data``.
    column_mapping
        Column mapping handed to ``Report.run``.
    test
        Name of the statistical test applied to numerical columns
        (e.g. ``psi``, ``kl_div``, ``jensenshannon``, ``wasserstein``).
    threshold
        Value passed as ``num_stattest_threshold``. Defaults to ``0.1``,
        the value this function previously hard-coded.

    Returns
    -------
    dict
        The result of ``Report.as_dict()`` for the executed report.
    """
    data_drift_report = Report(
        metrics=[
            DataDriftPreset(
                num_stattest=test,
                num_stattest_threshold=threshold,
            )
        ],
        options={"render": {"raw_data": True}},
    )

    data_drift_report.run(
        reference_data=reference, current_data=current, column_mapping=column_mapping
    )

    return data_drift_report.as_dict()



def get_drift_metrics(reference, production, file, test):
    """Summarise dataset-level and per-column drift for one production file.

    Every column of ``reference`` is registered as a numerical feature, the
    drift report is computed with ``detect_dataset_drift`` and two one-row
    frames are built from it.

    Parameters
    ----------
    reference
        Reference DataFrame.
    production
        Production DataFrame; its columns define the per-column output.
    file
        Path or name of the production file. The text after the last ``_``
        and before the first following ``.`` is parsed as a ``YYYYMMDD`` date.
    test
        Statistical test name forwarded to ``detect_dataset_drift``.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``drift_dataset`` with the dataset-level figures, and
        ``columns_drift_df`` with one drift score per production column;
        both carry ``date`` and ``filename`` columns.
    """
    # Dataset-wide detection, treating all reference columns as numerical.
    mapping = ColumnMapping()
    mapping.numerical_features = reference.columns.tolist()
    report = detect_dataset_drift(reference, production, mapping, test)

    # Derive the file name and its date stamp from the path.
    filename = file.split('/')[-1]
    date = filename.split("_")[-1].split('.')[0]
    file_date = pd.to_datetime(date, format="%Y%m%d")

    # One-row summary taken from the first metric of the report.
    overall = report["metrics"][0]["result"]
    summary_row = {
        "filename": filename,
        "date": file_date,
        "dataset_drift_score": overall["share_of_drifted_columns"],
        "dataset_drift": overall["dataset_drift"],
        "n_columns": overall["number_of_columns"],
        "number_of_drifted_columns": overall["number_of_drifted_columns"],
    }
    drift_dataset = pd.DataFrame([summary_row])

    # Per-column drift scores taken from the second metric of the report.
    scores = [
        report["metrics"][1]["result"]["drift_by_columns"][feature]["drift_score"]
        for feature in production.columns
    ]
    columns_drift_df = pd.DataFrame([scores], columns=production.columns)
    columns_drift_df["date"] = file_date
    columns_drift_df["filename"] = filename

    print('Data drift inspection done')
    return drift_dataset, columns_drift_df

In [7]:
# Locations of the reference and production CSV snapshots
reference_path = './data/reference_data.csv'
production_path = './data/production_data.csv'

# Production log name (its trailing YYYYMMDD stamp is parsed by
# get_drift_metrics) and the statistical test to apply
file = '2062_Acc_20240301.log'
test = 'kl_div'

# Load both snapshots
reference_data = load_data(reference_path)
production_data = load_data(production_path)

In [8]:
# Display the reference frame returned by load_data.
reference_data

Unnamed: 0,Acc_x,Acc_y,Acc_z
0,80,988,-18
1,84,984,-10
2,76,982,-20
3,84,984,-8
4,80,984,-14
...,...,...,...
6375718,122,984,0
6375719,122,986,2
6375720,120,984,0
6375721,122,984,0


In [9]:
# Display the production frame returned by load_data.
production_data

Unnamed: 0,Acc_x,Acc_y,Acc_z
0,172,958,68
1,172,956,70
2,174,952,68
3,172,958,72
4,166,956,76
...,...,...,...
7128715,140,958,48
7128716,138,962,46
7128717,136,960,46
7128718,136,960,50


In [11]:
# Compute the dataset-level and per-column drift frames for the production file.
# NOTE(review): as the recorded output shows, this call currently ends in
# "AttributeError: 'float' object has no attribute 'shape'".
dataset_drift, columns_drift = get_drift_metrics(reference_data, production_data, file, test)

AttributeError: 'float' object has no attribute 'shape'