# How to calculate Embeddings drift?

In [1]:
import ast
import os

import pandas as pd
import numpy as np
import PIL

from sklearn import datasets

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import EmbeddingsDriftMetric

from evidently.metrics.data_drift.embedding_drift_methods import model, distance, ratio, mmd

from evidently.tests import TestEmbeddingsDrift
from evidently.test_preset import DataDriftTestPreset, NoTargetPerformanceTestPreset
from evidently.test_suite import TestSuite

  from .autonotebook import tqdm as notebook_tqdm
2023-07-25 10:22:17.408365: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-25 10:22:17.477540: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-25 10:22:17.864443: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/vishesh/anaconda3/lib/:/home/v

## Prepare a Dataset

In [256]:
# Root folder of the ships dataset. Set the SHIPS_DATA_DIR environment variable
# to run the notebook on another machine; the default keeps the original location.
DATA_DIR = os.environ.get("SHIPS_DATA_DIR", "/home/vishesh/Desktop/datasets/ships-data")

# Training split (inputs and ground truth).
train_data_dir = f"{DATA_DIR}/X_true_train_240_320"
train_ground_truth_dir = f"{DATA_DIR}/Y_true_train_240_320"

# Validation split: a single .npy file for inputs and one for labels.
test_data_path = f"{DATA_DIR}/X_true_val_240_320/1.npy"
test_ground_truth_path = f"{DATA_DIR}/Y_true_val_240_320/1.npy"

# Perturbation type and strength select which perturbed copy of the validation file is used.
perturbation = "contrast"
strength = 10
perturbed_dataset_path = f"{DATA_DIR}/perturbed_test/varying/{perturbation}/{strength}/1.npy"

In [257]:
# Ground-truth labels of the validation file, cast to int.
y_test = np.load(test_ground_truth_path).astype(int)

# The perturbed copy of the validation inputs is what gets evaluated.
# The clean array at `test_data_path` used to be loaded into X_test and then
# immediately overwritten, so that redundant load has been dropped.
perturbed_dataset = np.load(perturbed_dataset_path)
X_test = perturbed_dataset

# Fail early if inputs and labels are misaligned.
assert len(X_test) == len(y_test), "perturbed inputs and labels must contain the same number of samples"

In [258]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.python.ops.numpy_ops import np_config

# Saved Xception checkpoint; loaded uncompiled and compiled explicitly below.
# NOTE(review): the name `model` also refers to evidently's drift method imported
# in the first cell; from this cell on it is bound to the Keras model instead.
checkpoint_path = "/home/vishesh/Desktop/datasets/ships-data/Xception-10-0.74.hdf5"
model = tf.keras.models.load_model(checkpoint_path, compile=False)
model.compile(loss='categorical_crossentropy', optimizer=tfa.optimizers.RectifiedAdam())

# Secondary model that shares the classifier's input and stops at the 'dense_1'
# layer, whose output is used as the embedding.
embedding_layer = model.get_layer('dense_1')
feature_extractor = tf.keras.models.Model(inputs=model.input, outputs=embedding_layer.output)

In [259]:
# Batch size shared by both forward passes.
inference_batch_size = 10

# Predicted class = index of the largest score in each output row.
class_scores = model.predict(X_test, batch_size=inference_batch_size)
y_preds = np.argmax(class_scores, axis=1)

# Embeddings of the same inputs, taken from the feature extractor.
test_features = feature_extractor.predict(X_test, batch_size=inference_batch_size)



In [260]:
# One column per embedding dimension, named Feature_1 .. Feature_N.
embedding_columns = {
    f'Feature_{position}': test_features[:, position - 1]
    for position in range(1, test_features.shape[1] + 1)
}

# Current dataset: labels and predictions first, then the embedding columns.
current = pd.DataFrame({
    'target': y_test,
    'prediction': y_preds,
    **embedding_columns,
})

In [261]:
# Build the reference dataset that every report below compares `current` against.
# This cell was commented out, which left `reference` undefined on a fresh kernel.
reference_csv_path = "/home/vishesh/Desktop/datasets/ships-data/train.csv"
reference_raw = pd.read_csv(reference_csv_path).drop(columns=['Unnamed: 0', 'url'])

# The 'embeddings' column holds each vector serialised as text (presumably a
# Python list literal - confirm against the CSV). literal_eval parses literals
# only, so unlike eval it cannot execute code embedded in the file.
embedding_rows = reference_raw['embeddings'].apply(ast.literal_eval)

# Determine the maximum number of features
max_features = int(embedding_rows.apply(len).max())

# Generate column names for features
feature_columns = [f'Feature_{i+1}' for i in range(max_features)]

# Separate features into separate columns, then align the column names and the
# label dtype with the `current` frame.
reference = (
    pd.concat(
        [
            reference_raw.drop(columns='embeddings'),
            pd.DataFrame(embedding_rows.to_list(), columns=feature_columns, index=reference_raw.index),
        ],
        axis=1,
    )
    .rename(columns={'actual': 'target', 'predicted': 'prediction'})
    .assign(target=lambda frame: frame['target'].astype(int))
)


In [262]:
# Disabled alternative data source (not executed): builds `embeddings_data` from
# the scikit-learn LFW people dataset, keeps the first 5100 rows and 31 columns,
# and creates `embeddings_data_shifted` by zeroing the first 5 columns of rows
# 2500-4999 of a copy.
# embeddings_data = datasets.fetch_lfw_people()
# embeddings_data = pd.DataFrame(embeddings_data['data'])
# embeddings_data.columns = ['col_' + str(x) for x in embeddings_data.columns]

# embeddings_data = embeddings_data.iloc[:5100, :31]

# embeddings_data_shifted = embeddings_data.copy()
# embeddings_data_shifted.iloc[2500:5000, :5] = 0

In [263]:
# embeddings_data.shape

## Embeddings Drift Report

In [264]:
# Select the embedding columns by name rather than by the positional slice
# [2:1026], which silently depended on the column order and on the embedding
# being exactly 1024 wide.
# NOTE(review): assumes the reference embedding columns are named 'Feature_<n>',
# matching the `current` frame - confirm.
embedding_columns = [name for name in reference.columns if str(name).startswith('Feature_')]

# 'features' is the embedding-set name that the metrics and tests refer to.
column_mapping = ColumnMapping(
    embeddings={'features': embedding_columns},
    target=reference.columns[0]
)

In [265]:
# Embeddings drift for the 'features' embedding set, with no explicit drift method.
default_drift_metric = EmbeddingsDriftMetric('features')
report = Report(metrics=[default_drift_metric])

report.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping,
)
report


The loss 'log' was deprecated in v1.1 and will be removed in version 1.3. Use `loss='log_loss'` which is equivalent.



### Embeddings Drift Detection: model

In [266]:
# evidently's `model` drift method is imported under an alias here: earlier in
# the notebook the bare name `model` is rebound to the Keras classifier, so
# calling `model(...)` would invoke the network instead of the drift method.
from evidently.metrics.data_drift.embedding_drift_methods import model as model_drift_method

report = Report(metrics = [
    EmbeddingsDriftMetric('features',
                          drift_method = model_drift_method(
                              threshold = 0.55,
                              bootstrap = None,
                              quantile_probability = 0.05,
                              pca_components = None,
                          )
                         )
])

report.run(reference_data = reference, current_data = current,
           column_mapping = column_mapping)
report

### Embeddings Drift Detection: mmd

In [267]:
# Settings for the `mmd` drift method.
mmd_method = mmd(
    threshold=0.015,
    bootstrap=None,
    quantile_probability=0.95,
    pca_components=None,
)

report = Report(metrics=[EmbeddingsDriftMetric('features', drift_method=mmd_method)])
report.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping,
)
report

### Embeddings Drift Detection: ratio

In [268]:
# Settings for the `ratio` drift method: a per-component stat test with its own
# threshold, plus an overall threshold.
ratio_method = ratio(
    component_stattest='wasserstein',
    component_stattest_threshold=0.1,
    threshold=0.2,
    pca_components=None,
)

report = Report(metrics=[EmbeddingsDriftMetric('features', drift_method=ratio_method)])
report.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping,
)
report

### Embeddings Drift Detection: distance

In [269]:
# Settings for the `distance` drift method.
distance_method = distance(
    dist='euclidean',  # "euclidean", "cosine", "cityblock" or "chebyshev"
    threshold=0.2,
    pca_components=None,
    bootstrap=None,
    quantile_probability=0.95,
)

report = Report(metrics=[EmbeddingsDriftMetric('features', drift_method=distance_method)])
report.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping,
)
report

# Data Drift Metric Preset

In [None]:
from evidently.metric_preset import DataDriftPreset

# `ratio` is already available from the notebook's first import cell. It is
# deliberately not re-imported here together with `model`: that import would
# rebind `model` from the Keras classifier to evidently's drift method and break
# any later re-run of the inference cell.
report = Report(metrics=[
    DataDriftPreset(embeddings=['features'],
                    embeddings_drift_method={'features': ratio(pca_components=5)})
])

report.run(reference_data=reference,
    current_data=current,
    column_mapping=column_mapping)
report

# Embeddings Drift Test

In [None]:
# Test the 'features' embedding set declared in `column_mapping`, on the same
# reference/current frames used by the reports. The previous version referred to
# an undefined 'small_subset' embedding and to `embeddings_data` frames that are
# never created in this notebook.
tests = TestSuite(tests=[
    TestEmbeddingsDrift(embeddings_name='features')
])

tests.run(reference_data=reference,
    current_data=current,
    column_mapping=column_mapping)
tests

# Data Drift Test Preset

In [None]:
# Data drift test preset restricted to the 'features' embedding set declared in
# `column_mapping`. The previous 'small_subset' name and `embeddings_data` frames
# do not exist in this notebook.
tests = TestSuite(tests=[
    DataDriftTestPreset(embeddings=['features'])
])

tests.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping
)
tests

# No Target Performance Test Preset

In [None]:
# No-target performance preset on the 'features' embedding set, the only one
# declared in `column_mapping`. The previous 'small_subset'/'big_subset' names
# and `embeddings_data` frames do not exist in this notebook.
tests = TestSuite(tests=[
    NoTargetPerformanceTestPreset(embeddings=['features'])
])

tests.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping
)
tests