<a href="https://colab.research.google.com/github/DJCordhose/mlops-drift/blob/main/notebooks/drift.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detecting drift

* You can let Evidently choose the statistical test or pick one yourself
  * the options allow selecting a test per column type or per feature: https://docs.evidentlyai.com/reference/api-reference/evidently.options
  * how to decide which test to use: https://www.evidentlyai.com/blog/data-drift-detection-large-datasets
* Drift algorithm: https://docs.evidentlyai.com/reference/data-drift-algorithm  
* complete code in `src/insurance_prediction/monitoring/data_drift.py`
  * uses a rolling window of incoming requests kept in an in-memory buffer
  * implemented as a decorator: https://docs.python.org/3/search.html?q=decorator
  * applied in `src/insurance_prediction/app/application/router.py`
  

In [1]:
import sys
# True when the google.colab module is loaded, i.e. the notebook runs inside Google Colab;
# later cells use this flag to decide whether to install packages and download files
IN_COLAB = 'google.colab' in sys.modules
IN_COLAB

True

In [2]:
# Only Colab gets an on-the-fly install.
# NOTE(review): the version is not pinned; the recorded outputs of this notebook date from
# Oct 2023, so a newer evidently release may no longer provide the modules imported below —
# consider pinning to the release that was current then (TODO confirm exact version).
if IN_COLAB:
  !pip install -q evidently

In [3]:
from evidently.metrics import DatasetDriftMetric
from evidently.base_metric import InputData, ColumnMapping
from evidently.runner.loader import DataLoader, DataOptions
from evidently.calculations.data_drift import get_drift_for_columns
from evidently.options import DataDriftOptions
from evidently.utils.data_operations import process_columns

## Preparing reference and current dataset

In [4]:
def url_for_dataset(relative_path):
  """Return where the gzipped CSV for a dataset can be read from.

  `relative_path` is the dataset name without the `.csv.gz` suffix, e.g.
  'reference' or 'monthly/month-12'. Inside Colab the raw GitHub URL is
  returned; everywhere else a relative path into `../datasets` is returned.
  """
  if not IN_COLAB:
    return f'../datasets/insurance_prediction/{relative_path}.csv.gz'
  return f'https://github.com/DJCordhose/mlops-drift/raw/main/datasets/insurance_prediction/{relative_path}.csv.gz'

In [5]:
# how many months after training?
# iteration = 36 # final month in dataset, should have the biggest difference
iteration = 12 # one year later, drift should be starting to show

In [6]:
# Resolve the locations of the reference data and of the data for the selected month
reference_dataset_url = url_for_dataset('reference')
current_dataset_url = url_for_dataset(f'monthly/month-{iteration}')

In [7]:
# Evidently's DataLoader reads the semicolon-separated file; plain pandas would do just as well.
# risk / group / group_name are removed so that only the columns shown below remain
# (presumably label and grouping columns rather than model inputs — TODO confirm).
csv_options = DataOptions(date_column=None, separator=';')
reference_dataset = (
    DataLoader()
    .load(filename=reference_dataset_url, data_options=csv_options)
    .drop(columns=['risk', 'group', 'group_name'])
)
reference_dataset.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,miles
0,0,41.9511,0,40.933328,122.238329,98.509765
1,0,24.368286,1,44.797317,113.765298,46.324178
2,0,18.314649,1,41.587241,143.427269,76.862968
3,0,51.265254,1,47.266716,111.578133,102.697069
4,0,23.578861,0,42.835319,145.994235,63.690055


In [8]:
# Load the data of the selected month the same way as the reference data
# (Evidently's DataLoader here, but pandas would work equally well) and
# remove the same three columns so both frames share one layout.
current_csv_options = DataOptions(date_column=None, separator=';')
current_dataset_raw = DataLoader().load(
    filename=current_dataset_url,
    data_options=current_csv_options,
)
current_dataset = current_dataset_raw.drop(columns=['risk', 'group', 'group_name'])
current_dataset.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,miles
0,0,18.936814,0,43.758711,96.338095,63.786116
1,0,32.041673,1,29.122178,187.562915,81.079895
2,0,27.554665,1,40.783638,123.967462,49.117011
3,0,30.040739,1,37.61238,167.705733,67.576213
4,0,20.513682,1,37.577958,137.666668,61.36746


## Feature Drift

In [9]:
# Tell Evidently which columns are categorical and which are numerical;
# at this stage there is neither a target nor a prediction column.
categorical_features = ['training', 'emergency_braking']
numerical_features = ['age', 'braking_distance', 'power', 'miles']
column_mapping = ColumnMapping(
    target=None,
    prediction=None,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)
# every feature column, categorical ones first
columns = [*column_mapping.categorical_features, *column_mapping.numerical_features]
# handed to get_drift_for_columns as drift_share_threshold
drift_share = 0.5

In [10]:
# Bundle reference and current data with the column mapping;
# additional features and the data definition are explicitly set to None.
input_data = InputData(
    reference_data=reference_dataset,
    reference_additional_features=None,
    current_data=current_dataset,
    current_additional_features=None,
    column_mapping=column_mapping,
    data_definition=None,
)

In [11]:
# Derive the column description from the reference data, then compute drift for
# all categorical and numerical features using default DataDriftOptions.
dataset_columns = process_columns(input_data.reference_data, input_data.column_mapping)
feature_columns = [*column_mapping.categorical_features, *column_mapping.numerical_features]
result = get_drift_for_columns(
    reference_data=input_data.reference_data,
    current_data=input_data.current_data,
    columns=feature_columns,
    dataset_columns=dataset_columns,
    data_drift_options=DataDriftOptions(),
    drift_share_threshold=drift_share,
    agg_data=True,
)

In [12]:
# number of columns that were checked for drift
result.number_of_columns

6

In [13]:
# how many of the checked columns were flagged as drifted
result.number_of_drifted_columns

2

In [14]:
# drifted columns as a fraction of all checked columns (2 of 6 in the recorded run)
result.share_of_drifted_columns

0.3333333333333333

In [15]:
# dataset-level verdict; False in the recorded run, where the drifted share (0.33) is below
# drift_share (0.5) — presumably that threshold is what decides this flag (TODO confirm)
result.dataset_drift

False

In [16]:
# One line per column: drift score, the test that produced it, and a marker when drift was detected
for column in columns:
    drift = result.drift_by_columns[column]
    drifted_marker = ", drifted" if drift.drift_detected else ""
    print(f'{column}: {drift.drift_score} ({drift.stattest_name}){drifted_marker}')

training: 0.005181253622581234 (Jensen-Shannon distance)
emergency_braking: 0.13147841305705196 (Jensen-Shannon distance), drifted
age: 0.09701144919398294 (Wasserstein distance (normed))
braking_distance: 0.0692912211638476 (Wasserstein distance (normed))
power: 0.027957726384958408 (Wasserstein distance (normed))
miles: 0.22422961324847054 (Wasserstein distance (normed)), drifted


## Target Drift

In [17]:
import tensorflow as tf
# show the TensorFlow version of this run; tf.keras is used below to load the classifier
tf.__version__

'2.13.0'

In [18]:
if IN_COLAB:
  !wget https://github.com/DJCordhose/mlops-drift/raw/main/notebooks/classifier.h5
model_path = 'classifier'

--2023-10-15 11:52:12--  https://github.com/DJCordhose/mlops-drift/raw/main/notebooks/classifier.h5
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/DJCordhose/mlops-drift/main/notebooks/classifier.h5 [following]
--2023-10-15 11:52:12--  https://raw.githubusercontent.com/DJCordhose/mlops-drift/main/notebooks/classifier.h5
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 333608 (326K) [application/octet-stream]
Saving to: ‘classifier.h5.4’


2023-10-15 11:52:12 (6.76 MB/s) - ‘classifier.h5.4’ saved [333608/333608]



In [19]:
# Load the Keras classifier from '<model_path>.h5' in the working directory
# (in Colab the previous cell downloads that file; elsewhere it has to be present already)
model = tf.keras.models.load_model(f'{model_path}.h5')

In [20]:
def prediction_name(value: int) -> str:
  """Translate a predicted class index into its readable label.

  Args:
    value: class index as produced by `argmax` over the model output;
      0 maps to 'HIGH', 1 to 'MEDIUM', 2 to 'LOW'.

  Returns:
    The label belonging to `value`.

  Raises:
    IndexError: if `value` is not a valid class index. Negative values are
      rejected as well; plain list indexing would silently wrap them around
      (e.g. -1 would come back as 'LOW').
  """
  names = ['HIGH', 'MEDIUM', 'LOW']
  if not 0 <= value < len(names):
    raise IndexError(f'class index {value} is out of range for {len(names)} classes')
  return names[value]


In [21]:
# Predict a class for every row of the current data and attach it as a readable label
X = current_dataset
y_pred = model.predict(X, verbose=0).argmax(axis=1)
current_dataset_prediction = X.assign(
    prediction=[prediction_name(class_index) for class_index in y_pred]
)
# (assign y_pred itself instead to keep the numeric class indices)
current_dataset_prediction.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,miles,prediction
0,0,18.936814,0,43.758711,96.338095,63.786116,HIGH
1,0,32.041673,1,29.122178,187.562915,81.079895,LOW
2,0,27.554665,1,40.783638,123.967462,49.117011,LOW
3,0,30.040739,1,37.61238,167.705733,67.576213,MEDIUM
4,0,20.513682,1,37.577958,137.666668,61.36746,MEDIUM


In [22]:
# Same step for the reference data: predict per row, then store the label next to the features
X = reference_dataset
y_pred = model.predict(X, verbose=0).argmax(axis=1)
predicted_labels = [prediction_name(class_index) for class_index in y_pred]
reference_dataset_prediction = X.assign(prediction=predicted_labels)
# (use y_pred instead of predicted_labels to keep the numeric class indices)
reference_dataset_prediction.head()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,miles,prediction
0,0,41.9511,0,40.933328,122.238329,98.509765,MEDIUM
1,0,24.368286,1,44.797317,113.765298,46.324178,MEDIUM
2,0,18.314649,1,41.587241,143.427269,76.862968,HIGH
3,0,51.265254,1,47.266716,111.578133,102.697069,HIGH
4,0,23.578861,0,42.835319,145.994235,63.690055,HIGH


In [23]:
# Same feature mapping as for the feature drift check, but now the
# 'prediction' column is registered so it is included in the drift analysis.
column_mapping = ColumnMapping(
    target=None,
    prediction="prediction",
    numerical_features=['age', 'braking_distance', 'power', 'miles'],
    categorical_features=['training', 'emergency_braking'],
)
columns = [
    *column_mapping.categorical_features,
    *column_mapping.numerical_features,
    column_mapping.prediction,
]
columns

['training',
 'emergency_braking',
 'age',
 'braking_distance',
 'power',
 'miles',
 'prediction']

In [24]:
# Bundle the frames that carry the prediction column with the updated mapping;
# additional features and the data definition are explicitly set to None.
input_data = InputData(
    reference_data=reference_dataset_prediction,
    reference_additional_features=None,
    current_data=current_dataset_prediction,
    current_additional_features=None,
    column_mapping=column_mapping,
    data_definition=None,
)

In [25]:
# Build Evidently's column description (numerical / categorical feature names, prediction column)
# from the reference data and the column mapping, and display it
dataset_columns = process_columns(input_data.reference_data, input_data.column_mapping)
dataset_columns

DatasetColumns(type='evidently.metric_results.DatasetColumns', utility_columns=DatasetUtilityColumns(type='evidently.metric_results.DatasetUtilityColumns', date=None, id=None, target=None, prediction='prediction'), target_type='cat', num_feature_names=['age', 'braking_distance', 'miles', 'power'], cat_feature_names=['training', 'emergency_braking'], text_feature_names=[], datetime_feature_names=[], target_names=None, task=None)

In [26]:
# Compute drift again, this time over the features plus the prediction column,
# with default DataDriftOptions and the same dataset-level threshold.
drift_options = DataDriftOptions()
result = get_drift_for_columns(
    reference_data=input_data.reference_data,
    current_data=input_data.current_data,
    columns=columns,
    dataset_columns=dataset_columns,
    data_drift_options=drift_options,
    drift_share_threshold=drift_share,
    agg_data=True,
)

In [27]:
# Report score and test per column (now including 'prediction'); drifted columns get a suffix
for column in columns:
    drift = result.drift_by_columns[column]
    report_parts = [f'{column}: {drift.drift_score} ({drift.stattest_name})']
    if drift.drift_detected:
        report_parts.append('drifted')
    print(', '.join(report_parts))

training: 0.005181253622581234 (Jensen-Shannon distance)
emergency_braking: 0.13147841305705196 (Jensen-Shannon distance), drifted
age: 0.09701144919398294 (Wasserstein distance (normed))
braking_distance: 0.0692912211638476 (Wasserstein distance (normed))
power: 0.027957726384958408 (Wasserstein distance (normed))
miles: 0.22422961324847054 (Wasserstein distance (normed)), drifted
prediction: 0.0416103466303872 (Jensen-Shannon distance)
