<a href="https://colab.research.google.com/github/DJCordhose/mlops-drift/blob/main/notebooks/drift.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detecting drift

* you can let evidently choose the statistical test or pick one yourself
  * the test can be selected per column type or per feature: https://docs.evidentlyai.com/reference/api-reference/evidently.options
  * which test: https://www.evidentlyai.com/blog/data-drift-detection-large-datasets
* Drift algorithm: https://docs.evidentlyai.com/reference/data-drift-algorithm  
* complete code in `src/insurance_prediction/monitoring/data_drift.py`
  * uses a rolling window of incoming requests kept in an in-memory buffer
  * implemented as a decorator: https://docs.python.org/3/search.html?q=decorator
  * applied in `src/insurance_prediction/app/application/router.py`
  

In [1]:
import sys
# True when the 'google.colab' module is loaded, i.e. the notebook runs on Google Colab
IN_COLAB = 'google.colab' in sys.modules
IN_COLAB

False

In [2]:
if IN_COLAB:
  !pip install -q evidently

In [3]:
import evidently
evidently.__version__

'0.3.0'

In [4]:
from evidently.metrics import DatasetDriftMetric
from evidently.base_metric import InputData, ColumnMapping
from evidently.runner.loader import DataLoader, DataOptions
from evidently.calculations.data_drift import get_drift_for_columns
from evidently.options import DataDriftOptions
from evidently.utils.data_operations import process_columns

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
2023-10-28 13:56:18.909896: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-28 13:56:18.951291: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Preparing reference and current dataset

In [5]:
def url_for_dataset(relative_path):
  """Build the location of a gzipped CSV from the insurance prediction datasets.

  On Colab the file is addressed through its GitHub raw URL; everywhere else
  a relative local path below ``../datasets`` is returned.
  """
  if not IN_COLAB:
    return f'../datasets/insurance_prediction/{relative_path}.csv.gz'
  return f'https://github.com/DJCordhose/mlops-drift/raw/main/datasets/insurance_prediction/{relative_path}.csv.gz'

In [6]:
# how many months after training? (selects which monthly file is loaded as the current dataset)
# iteration = 36 # final month in dataset, should have the biggest difference
iteration = 12 # one year later, drift should be starting to show

In [7]:
# The reference file has a fixed name, so a plain string is enough;
# only the monthly file name needs interpolation.
reference_dataset_url = url_for_dataset('reference')
current_dataset_url = url_for_dataset(f'monthly/month-{iteration}')

In [8]:
# could just as well be pandas code
reference_dataset = (
    DataLoader()
    .load(
        filename=reference_dataset_url,
        data_options=DataOptions(date_column=None, separator=';'),
    )
    # risk / group / group_name are removed; the remaining columns are fed to the model later
    .drop(['risk', 'group', 'group_name'], axis='columns')
)
reference_dataset.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,miles
0,0,41.9511,0,40.933328,122.238329,98.509765
1,0,24.368286,1,44.797317,113.765298,46.324178
2,0,18.314649,1,41.587241,143.427269,76.862968
3,0,51.265254,1,47.266716,111.578133,102.697069
4,0,23.578861,0,42.835319,145.994235,63.690055


In [9]:
# could just as well be pandas code
current_dataset = (
    DataLoader()
    .load(
        filename=current_dataset_url,
        data_options=DataOptions(date_column=None, separator=';'),
    )
    # risk / group / group_name are removed; the remaining columns are fed to the model later
    .drop(['risk', 'group', 'group_name'], axis='columns')
)
current_dataset.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,miles
0,0,18.936814,0,43.758711,96.338095,63.786116
1,0,32.041673,1,29.122178,187.562915,81.079895
2,0,27.554665,1,40.783638,123.967462,49.117011
3,0,30.040739,1,37.61238,167.705733,67.576213
4,0,20.513682,1,37.577958,137.666668,61.36746


## Target / Prediction

In [10]:
import tensorflow as tf
tf.__version__

'2.12.0'

In [11]:
if IN_COLAB:
  # fetch the trained model file into the working directory
  # (presumably because Colab has no checkout of the repository)
  !wget https://github.com/DJCordhose/mlops-drift/raw/main/notebooks/classifier.h5
# model file name without extension; '.h5' is appended when the model is loaded
model_path = 'classifier'

In [12]:
# load the Keras model from '<model_path>.h5', resolved against the current working directory
model = tf.keras.models.load_model(f'{model_path}.h5')

In [13]:
def prediction_name(value: int):
  """Translate a class index into its label.

  0 -> 'HIGH', 1 -> 'MEDIUM', 2 -> 'LOW'. Plain list indexing is used,
  so an index outside the list raises IndexError.
  """
  labels = ['HIGH', 'MEDIUM', 'LOW']
  return labels[value]


In [14]:
X = current_dataset
# argmax over axis 1 picks the index of the highest model output for every row
y_pred = model.predict(X, verbose=0).argmax(axis=1)
# assign() returns a new frame, so current_dataset itself gets no prediction column
# (pass y_pred directly instead to keep the numeric class index)
current_dataset_prediction = X.assign(
    prediction=[prediction_name(class_index) for class_index in y_pred]
)
current_dataset_prediction.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,miles,prediction
0,0,18.936814,0,43.758711,96.338095,63.786116,HIGH
1,0,32.041673,1,29.122178,187.562915,81.079895,LOW
2,0,27.554665,1,40.783638,123.967462,49.117011,LOW
3,0,30.040739,1,37.61238,167.705733,67.576213,MEDIUM
4,0,20.513682,1,37.577958,137.666668,61.36746,MEDIUM


In [15]:
X = reference_dataset
# argmax over axis 1 picks the index of the highest model output for every row
y_pred = model.predict(X, verbose=0).argmax(axis=1)
# assign() returns a new frame, so reference_dataset itself gets no prediction column
# (pass y_pred directly instead to keep the numeric class index)
reference_dataset_prediction = X.assign(
    prediction=[prediction_name(class_index) for class_index in y_pred]
)
reference_dataset_prediction.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,miles,prediction
0,0,41.9511,0,40.933328,122.238329,98.509765,MEDIUM
1,0,24.368286,1,44.797317,113.765298,46.324178,MEDIUM
2,0,18.314649,1,41.587241,143.427269,76.862968,HIGH
3,0,51.265254,1,47.266716,111.578133,102.697069,HIGH
4,0,23.578861,0,42.835319,145.994235,63.690055,HIGH


## Calculating Drift

In [16]:
# Tell evidently which columns are categorical, numerical, and the model output
column_mapping = ColumnMapping(
    target=None,
    prediction="prediction",
    numerical_features=['age', 'braking_distance', 'power', 'miles'],
    categorical_features=['training', 'emergency_braking'],
)
# handed to the drift calculation as its drift_share_threshold
drift_share = 0.5
# every column to check: categorical first, then numerical, then the prediction
columns = [
    *column_mapping.categorical_features,
    *column_mapping.numerical_features,
    column_mapping.prediction,
]
columns

['training',
 'emergency_braking',
 'age',
 'braking_distance',
 'power',
 'miles',
 'prediction']

In [17]:
# Bundle both frames (including their prediction column) with the column mapping;
# the optional additional features and data definition are left unset.
input_data = InputData(
    column_mapping=column_mapping,
    reference_data=reference_dataset_prediction,
    current_data=current_dataset_prediction,
    reference_additional_features=None,
    current_additional_features=None,
    data_definition=None,
)

In [18]:
# resolve the column mapping against the reference data into evidently's DatasetColumns structure
dataset_columns = process_columns(input_data.reference_data, input_data.column_mapping)
dataset_columns

DatasetColumns(utility_columns=DatasetUtilityColumns(date=None, id=None, target=None, prediction='prediction'), target_type='cat', num_feature_names=['age', 'braking_distance', 'miles', 'power'], cat_feature_names=['training', 'emergency_braking'], text_feature_names=[], datetime_feature_names=[], target_names=None, task=None)

In [21]:
# Drift calculation for the selected columns with evidently's default drift options
result = get_drift_for_columns(
    reference_data=input_data.reference_data,
    current_data=input_data.current_data,
    dataset_columns=dataset_columns,
    columns=columns,
    drift_share_threshold=drift_share,
    data_drift_options=DataDriftOptions(),
    # needed depending on version
    # agg_data = True
)

In [22]:
# number of columns that were checked (should match len(columns))
result.number_of_columns

7

In [23]:
# how many of the checked columns were flagged as drifted
result.number_of_drifted_columns

2

In [24]:
# drifted columns as a fraction of all checked columns
result.share_of_drifted_columns

0.2857142857142857

In [25]:
# dataset-level verdict; presumably True once the drifted share reaches drift_share — TODO confirm against the evidently docs
result.dataset_drift

False

In [26]:
# One line per column: drift score, the statistical test used, and a marker when drift was detected
for column in columns:
    drift = result.drift_by_columns[column]
    marker = ", drifted" if drift.drift_detected else ""
    print(f'{column}: {drift.drift_score} ({drift.stattest_name}){marker}')

training: 0.005181253622581234 (Jensen-Shannon distance)
emergency_braking: 0.13147841305705196 (Jensen-Shannon distance), drifted
age: 0.09701144919398294 (Wasserstein distance (normed))
braking_distance: 0.0692912211638476 (Wasserstein distance (normed))
power: 0.027957726384958408 (Wasserstein distance (normed))
miles: 0.22422961324847054 (Wasserstein distance (normed)), drifted
prediction: 0.041610346630386784 (Jensen-Shannon distance)
