# How to calculate Embeddings drift?

In [1]:
import pandas as pd
import numpy as np
import PIL

from sklearn import datasets

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently.metrics import EmbeddingsDriftMetric

from evidently.metrics.data_drift.embedding_drift_methods import model, distance, ratio, mmd

from evidently.tests import TestEmbeddingsDrift
from evidently.test_preset import DataDriftTestPreset, NoTargetPerformanceTestPreset
from evidently.test_suite import TestSuite

2023-08-23 17:13:44.301877: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-23 17:13:44.373675: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-23 17:13:44.771047: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/vishesh/anaconda3/lib/:/home/vishesh/anaconda3/lib/python3.9/site-packages/nvidi

## Prepare a Dataset

In [18]:
# Perturbation settings; they are only consumed by the commented-out
# `cur_path` alternative below, which points at a perturbed evaluation set.
perturbation = "gaussiannoise"
strength = 10

# Single place to repoint the notebook at a different copy of the data.
EMBEDDINGS_DIR = "/home/vishesh/Desktop/datasets/fleurs/embeddings"

# Reference (train) and current (test) embedding tables.
ref_path = f"{EMBEDDINGS_DIR}/train_decoder_embeddings.csv"
# cur_path = f"{EMBEDDINGS_DIR}/perturbations/{perturbation}/test_results_{perturbation}_{strength}.csv"
cur_path = f"{EMBEDDINGS_DIR}/test_decoder_embeddings.csv"

In [19]:
# Load the reference and current embedding tables, keeping a single row per
# distinct value of the 'Labels' column in each of them.
reference = pd.read_csv(ref_path).drop_duplicates(subset=['Labels'])
current = pd.read_csv(cur_path).drop_duplicates(subset=['Labels'])

## Embeddings Drift Report

In [20]:
# Declare one embedding group named 'features'. Its members are the
# reference-frame columns selected by the positional slice 5:773
# (i.e. positions 5 through 772).
column_mapping = ColumnMapping(embeddings=dict(features=reference.columns[5:773]))

In [21]:
# Baseline report: only the embedding group name is supplied, no explicit
# drift_method, so the metric runs with whatever the library defaults to.
default_metric = EmbeddingsDriftMetric('features')
report = Report(metrics=[default_metric])

report.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping,
)
report


The loss 'log' was deprecated in v1.1 and will be removed in version 1.3. Use `loss='log_loss'` which is equivalent.



### Embeddings Drift Detection: model

In [6]:
# NOTE(review): this example is disabled. Every line of the cell is commented
# out, so nothing runs under this heading. Uncomment the lines below to build
# and display a report that uses the `model` drift method with these settings.
# report = Report(metrics = [
#     EmbeddingsDriftMetric('features', 
#                           drift_method = model(
#                               threshold = 0.55,
#                               bootstrap = None,
#                               quantile_probability = 0.05,
#                               pca_components = None,
#                           )
#                          )
# ])

# report.run(reference_data = reference, current_data = current, 
#            column_mapping = column_mapping)
# report

### Embeddings Drift Detection: mmd

In [22]:
# Drift detection on the 'features' embedding group using the `mmd` method
# with explicitly spelled-out settings.
mmd_method = mmd(
    threshold=0.015,
    bootstrap=None,
    quantile_probability=0.95,
    pca_components=None,
)
report = Report(metrics=[EmbeddingsDriftMetric('features', drift_method=mmd_method)])

report.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping,
)
report

### Embeddings Drift Detection: ratio

In [23]:
# Drift detection on the 'features' embedding group using the `ratio` method:
# a per-component stat test ('wasserstein') with its own threshold, plus an
# overall threshold passed to the method.
ratio_method = ratio(
    component_stattest='wasserstein',
    component_stattest_threshold=0.1,
    threshold=0.2,
    pca_components=None,
)
report = Report(metrics=[EmbeddingsDriftMetric('features', drift_method=ratio_method)])

report.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping,
)
report

### Embeddings Drift Detection: distance

In [24]:
# Drift detection on the 'features' embedding group using the `distance`
# method with explicitly spelled-out settings.
distance_method = distance(
    dist='euclidean',  # one of "euclidean", "cosine", "cityblock" or "chebyshev"
    threshold=0.2,
    pca_components=None,
    bootstrap=None,
    quantile_probability=0.95,
)
report = Report(metrics=[EmbeddingsDriftMetric('features', drift_method=distance_method)])

report.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping,
)
report

# Data Drift Metric Preset

In [4]:
# Data-drift preset applied to the 'features' embedding group declared in
# column_mapping, with the `ratio` drift method (5 PCA components) selected
# for that group. `DataDriftPreset` and `ratio` come from the notebook's
# import cell, so this cell no longer carries its own imports.
report = Report(metrics=[
    DataDriftPreset(embeddings=['features'],
                    embeddings_drift_method={'features': ratio(pca_components=5)})
])

report.run(reference_data=reference,
           current_data=current,
           column_mapping=column_mapping)
report

NameError: name 'reference' is not defined

# Embeddings Drift Test

In [None]:
# Embeddings-drift test on the 'features' embedding group, which is the only
# group declared in column_mapping. It compares the `reference` and `current`
# frames loaded earlier; the names this cell used before (`embeddings_data`,
# `embeddings_data_shifted`, 'small_subset') are not defined in this notebook.
tests = TestSuite(tests=[
    TestEmbeddingsDrift(embeddings_name='features')
])

tests.run(reference_data=reference,
          current_data=current,
          column_mapping=column_mapping)
tests

# Data Drift Test Preset

In [None]:
# Data-drift test preset restricted to the 'features' embedding group, which
# is the only group declared in column_mapping. It runs on the `reference`
# and `current` frames loaded earlier; `embeddings_data` and 'small_subset'
# are not defined in this notebook.
tests = TestSuite(tests=[
    DataDriftTestPreset(embeddings=['features'])
])

tests.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping
)
tests

# No Target Performance Test Preset

In [None]:
# No-target-performance test preset restricted to the 'features' embedding
# group, which is the only group declared in column_mapping. It runs on the
# `reference` and `current` frames loaded earlier; `embeddings_data`,
# 'small_subset' and 'big_subset' are not defined in this notebook.
tests = TestSuite(tests=[
    NoTargetPerformanceTestPreset(embeddings=['features'])
])

tests.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping
)
tests