<a href="https://colab.research.google.com/github/DJCordhose/mlops-drift/blob/main/notebooks/drift.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detecting drift

* You can let Evidently choose the statistical test, or choose one yourself
  * the options API lets you select a test per column type or per feature: https://docs.evidentlyai.com/reference/api-reference/evidently.options
  * which test to pick: https://www.evidentlyai.com/blog/data-drift-detection-large-datasets
* Drift algorithm: https://docs.evidentlyai.com/reference/data-drift-algorithm  

In [2]:
import sys

# The notebook counts as "running on Colab" when the `google.colab` module
# has already been loaded into this interpreter.
IN_COLAB = 'google.colab' in sys.modules.keys()
IN_COLAB

False

In [3]:
if IN_COLAB:
  !pip install -q evidently==0.4.8

In [4]:
# Show the installed evidently version; the Colab install cell pins 0.4.8.
import evidently
evidently.__version__

'0.4.8'

In [5]:
from evidently.metrics import DatasetDriftMetric
from evidently.base_metric import InputData, ColumnMapping
from evidently.runner.loader import DataLoader, DataOptions
from evidently.calculations.data_drift import get_drift_for_columns
from evidently.options.data_drift import DataDriftOptions
from evidently.utils.data_operations import process_columns

## Preparing reference and current dataset

In [6]:
def url_for_dataset(relative_path):
  """Build the location of a gzipped CSV from the insurance_prediction datasets.

  When `IN_COLAB` is true the raw GitHub URL of the file is returned,
  otherwise a path below `./datasets/insurance_prediction/`.

  `relative_path` is the dataset path without the `.csv.gz` suffix,
  e.g. `'reference'`.
  """
  if not IN_COLAB:
    return f'./datasets/insurance_prediction/{relative_path}.csv.gz'
  return f'https://github.com/DJCordhose/mlops-drift/raw/main/datasets/insurance_prediction/{relative_path}.csv.gz'

In [7]:
# How many months after training should the "current" data be taken from?
# iteration = 36 # final month in dataset, should have the biggest difference
iteration = 12 # one year later, drift should be starting to show

In [8]:
# The reference data is a single file; the current data is the monthly file
# selected by `iteration`.
reference_dataset_url = url_for_dataset('reference')
current_dataset_url = url_for_dataset(f'monthly/month-{iteration}')

In [9]:
# Load the reference data with evidently's loader (plain pandas would work
# just as well) and drop the columns that are not used as model input.
reference_dataset = (
    DataLoader()
    .load(
        filename=reference_dataset_url,
        data_options=DataOptions(date_column=None, separator=';'),
    )
    .drop(columns=['risk', 'group', 'group_name', 'good_customer'])
)
reference_dataset.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,milage
0,0,41.9511,0,40.933328,122.238329,98.509765
1,0,24.368286,1,44.797317,113.765298,46.324178
2,0,18.314649,1,41.587241,143.427269,76.862968
3,0,51.265254,1,47.266716,111.578133,102.697069
4,0,23.578861,0,42.835319,145.994235,63.690055


In [10]:
# Load the current month's data with evidently's loader (plain pandas would
# work just as well) and drop the columns that are not used as model input.
current_dataset = (
    DataLoader()
    .load(
        filename=current_dataset_url,
        data_options=DataOptions(date_column=None, separator=';'),
    )
    .drop(columns=['risk', 'group', 'group_name', 'good_customer'])
)
current_dataset.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,milage
0,0,18.936814,0,43.758711,96.338095,63.786116
1,0,32.041673,1,29.122178,187.562915,81.079895
2,0,27.554665,1,40.783638,123.967462,49.117011
3,0,30.040739,1,37.61238,167.705733,67.576213
4,0,20.513682,1,37.577958,137.666668,61.36746


## Target / Prediction

In [11]:
# TensorFlow is used below to load and run the trained classifier; show the version in use.
import tensorflow as tf
tf.__version__

2023-11-02 09:10:00.227876: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-02 09:10:00.231420: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-02 09:10:00.272069: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-02 09:10:00.272129: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-02 09:10:00.272160: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi



'2.14.0'

In [12]:
# When running on Colab, download the saved model file into the working directory first.
if IN_COLAB:
  !wget https://github.com/DJCordhose/buch-machine-learning-notebooks/raw/main/kap11/classifier-binary.keras
# `model_path` is only the file-name prefix; the file that gets loaded is
# 'classifier-binary.keras', resolved relative to the working directory.
model_path = 'classifier'
model = tf.keras.models.load_model(f'{model_path}-binary.keras')

In [14]:
def prediction_name(value: int, binary_classifications: bool = False) -> str:
  """Translate a predicted class index into its human-readable label.

  The flag used to be read from a global that no cell of this notebook
  defines, which raises NameError on a fresh kernel. It is now a keyword
  argument whose default selects the three-class labels, i.e. the labels
  shown in the recorded outputs of this notebook.

  Args:
    value: class index, e.g. one entry of `model.predict(...).argmax(axis=1)`.
    binary_classifications: if true, use the two-class labels
      ('BAD', 'GOOD'); otherwise the three-class labels
      ('HIGH', 'MEDIUM', 'LOW').

  Returns:
    The label stored at position `value` of the selected label list.

  Raises:
    IndexError: if `value` is not a valid index into the selected label list.
  """
  if binary_classifications:
    names = ['BAD', 'GOOD']
  else:
    names = ['HIGH', 'MEDIUM', 'LOW']
  return names[value]


In [15]:
# Add the model's predicted class label to a copy of the current data.
X = current_dataset
# argmax over axis 1 picks, per row, the index of the largest model output
y_pred = model.predict(X, verbose=0).argmax(axis=1)
# (pass `prediction=y_pred` instead to keep the raw class indices)
current_dataset_prediction = X.assign(
    prediction=[prediction_name(class_index) for class_index in y_pred]
)
current_dataset_prediction.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,milage,prediction
0,0,18.936814,0,43.758711,96.338095,63.786116,HIGH
1,0,32.041673,1,29.122178,187.562915,81.079895,LOW
2,0,27.554665,1,40.783638,123.967462,49.117011,LOW
3,0,30.040739,1,37.61238,167.705733,67.576213,MEDIUM
4,0,20.513682,1,37.577958,137.666668,61.36746,MEDIUM


In [16]:
# Add the model's predicted class label to a copy of the reference data.
X = reference_dataset
# argmax over axis 1 picks, per row, the index of the largest model output
y_pred = model.predict(X, verbose=0).argmax(axis=1)
# (pass `prediction=y_pred` instead to keep the raw class indices)
reference_dataset_prediction = X.assign(
    prediction=[prediction_name(class_index) for class_index in y_pred]
)
reference_dataset_prediction.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,milage,prediction
0,0,41.9511,0,40.933328,122.238329,98.509765,MEDIUM
1,0,24.368286,1,44.797317,113.765298,46.324178,MEDIUM
2,0,18.314649,1,41.587241,143.427269,76.862968,HIGH
3,0,51.265254,1,47.266716,111.578133,102.697069,HIGH
4,0,23.578861,0,42.835319,145.994235,63.690055,HIGH


# Calculating Drift

In [17]:
# Tell evidently how to treat each column: two categorical and four numerical
# features plus the prediction column; there is no target column.
column_mapping = ColumnMapping(
    target=None,
    prediction="prediction",
    numerical_features=['age', 'braking_distance', 'power', 'milage'],
    categorical_features=['training', 'emergency_braking'],
)
# passed to the drift calculation as `drift_share_threshold`
drift_share = 0.5
# every column that gets checked for drift: categorical, numerical, prediction
columns = [
    *column_mapping.categorical_features,
    *column_mapping.numerical_features,
    column_mapping.prediction,
]
columns

['training',
 'emergency_braking',
 'age',
 'braking_distance',
 'power',
 'milage',
 'prediction']

In [18]:
# Bundle both prediction-augmented frames and the column mapping into
# evidently's InputData; none of the optional extras are used.
input_data = InputData(
    column_mapping=column_mapping,
    reference_data=reference_dataset_prediction,
    current_data=current_dataset_prediction,
    data_definition=None,
    reference_additional_features=None,
    current_additional_features=None,
    additional_datasets=None,
)

In [19]:
# Resolve the column mapping against the reference data into evidently's
# DatasetColumns description, which the drift calculation takes as `dataset_columns`.
dataset_columns = process_columns(input_data.reference_data, input_data.column_mapping)
dataset_columns

DatasetColumns(type='evidently.metric_results.DatasetColumns', utility_columns=DatasetUtilityColumns(type='evidently.metric_results.DatasetUtilityColumns', date=None, id=None, target=None, prediction='prediction'), target_type='cat', num_feature_names=['age', 'braking_distance', 'milage', 'power'], cat_feature_names=['training', 'emergency_braking'], text_feature_names=[], datetime_feature_names=[], target_names=None, task=None)

In [20]:
# Run evidently's drift calculation for the selected columns, comparing the
# current data against the reference data with default drift options.
result = get_drift_for_columns(
    reference_data=input_data.reference_data,
    current_data=input_data.current_data,
    dataset_columns=dataset_columns,
    columns=columns,
    data_drift_options=DataDriftOptions(),
    drift_share_threshold=drift_share,
    agg_data=True,
)

In [21]:
# Number of columns that were checked for drift.
result.number_of_columns

7

In [22]:
# How many of the checked columns were flagged as drifted.
result.number_of_drifted_columns

2

In [23]:
# Drifted columns as a fraction of all checked columns.
result.share_of_drifted_columns

0.2857142857142857

In [24]:
# Overall verdict for the whole dataset.
# NOTE(review): presumably true once the drifted share reaches `drift_share` — confirm against the evidently docs.
result.dataset_drift

False

In [25]:
# One line per column: drift score, the statistical test that produced it,
# and a ", drifted" marker when drift was detected for that column.
for column in columns:
    drift = result.drift_by_columns[column]
    drifted_suffix = ", drifted" if drift.drift_detected else ""
    print(f'{column}: {drift.drift_score} ({drift.stattest_name}){drifted_suffix}')

training: 0.005181253622581234 (Jensen-Shannon distance)
emergency_braking: 0.13147841305705196 (Jensen-Shannon distance), drifted
age: 0.09701144919398294 (Wasserstein distance (normed))
braking_distance: 0.0692912211638476 (Wasserstein distance (normed))
power: 0.027957726384958408 (Wasserstein distance (normed))
milage: 0.22422961324847054 (Wasserstein distance (normed)), drifted
prediction: 0.0416103466303872 (Jensen-Shannon distance)


# Comparing different stats tests

https://www.evidentlyai.com/blog/data-drift-detection-large-datasets

In [26]:
from evidently.calculations.stattests.jensenshannon import jensenshannon_stat_test
from evidently.calculations.stattests.kl_div import kl_div_stat_test
from evidently.calculations.stattests.ks_stattest import ks_stat_test
from evidently.calculations.stattests.psi import psi_stat_test
from evidently.calculations.stattests.wasserstein_distance_norm import wasserstein_stat_test

In [28]:
# Reminder of the columns in the current data that the per-feature tests below can use.
current_dataset_prediction.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,milage,prediction
0,0,18.936814,0,43.758711,96.338095,63.786116,HIGH
1,0,32.041673,1,29.122178,187.562915,81.079895,LOW
2,0,27.554665,1,40.783638,123.967462,49.117011,LOW
3,0,30.040739,1,37.61238,167.705733,67.576213,MEDIUM
4,0,20.513682,1,37.577958,137.666668,61.36746,MEDIUM


In [29]:
# Reminder of the columns in the reference data that the per-feature tests below can use.
reference_dataset_prediction.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,milage,prediction
0,0,41.9511,0,40.933328,122.238329,98.509765,MEDIUM
1,0,24.368286,1,44.797317,113.765298,46.324178,MEDIUM
2,0,18.314649,1,41.587241,143.427269,76.862968,HIGH
3,0,51.265254,1,47.266716,111.578133,102.697069,HIGH
4,0,23.578861,0,42.835319,145.994235,63.690055,HIGH


In [30]:
# Pick one numerical feature and pull its column out of both datasets,
# reference first, for the individual statistical tests below.
feature_name = "age"
reference_feature, current_feature = (
    frame[feature_name]
    for frame in (reference_dataset_prediction, current_dataset_prediction)
)

In [31]:
# Normed Wasserstein distance, the test the drift calculation above reported for the numerical columns.
# threshold=None passes no explicit threshold; the result reports the threshold actually applied.
wasserstein_stat_test(reference_feature, current_feature, 'num', threshold=None)

StatTestResult(drift_score=0.09701144919398294, drifted=False, actual_threshold=0.1)

In [32]:
# Kolmogorov-Smirnov test on the same feature.
# NOTE(review): in the recorded run a score below the threshold is reported as drifted,
# so the score looks like a p-value rather than a distance — confirm against the evidently docs.
ks_stat_test(reference_feature, current_feature, 'num', threshold=None)

StatTestResult(drift_score=0.010164445017412765, drifted=True, actual_threshold=0.05)

In [33]:
# PSI test on the same numerical feature, for comparison with the two results above.
psi_stat_test(reference_feature, current_feature, 'num', threshold=None)

StatTestResult(drift_score=3.2251039743059975, drifted=True, actual_threshold=0.1)

In [35]:
# Switch to a categorical feature and pull its column out of both datasets,
# reference first, for the categorical tests below.
feature_name = "emergency_braking"
reference_feature, current_feature = (
    frame[feature_name]
    for frame in (reference_dataset_prediction, current_dataset_prediction)
)

In [36]:
# Jensen-Shannon distance, the test the drift calculation above reported for the categorical columns.
jensenshannon_stat_test(reference_feature, current_feature, 'cat', threshold=None)

StatTestResult(drift_score=0.13147841305705196, drifted=True, actual_threshold=0.1)

In [37]:
# PSI test on the same categorical feature, for comparison with Jensen-Shannon.
psi_stat_test(reference_feature, current_feature, 'cat', threshold=None)

StatTestResult(drift_score=0.1391049615865103, drifted=True, actual_threshold=0.1)

In [38]:
# KL-divergence test on the same categorical feature; in the recorded run it is the only
# one of the three categorical tests that does not flag drift.
kl_div_stat_test(reference_feature, current_feature, 'cat', threshold=None)

StatTestResult(drift_score=0.06931609387501522, drifted=False, actual_threshold=0.1)