## Notebook that suggests a fix for evidently issue 334:
### The fixed fill value used for `feel_zeroes` in `get_binned_data` may lead to deviations in some cases.

Here we make a custom change to the `get_binned_data()` function from utils.py and apply it to a custom Kullback-Leibler divergence drift test on adult_data.

In [382]:
try:
    import evidently
except:
    !npm install -g yarn
    !pip install git+https://github.com/evidentlyai/evidently.git

In [383]:
import pandas as pd
import numpy as np

from scipy import stats
from sklearn import datasets, ensemble, model_selection

from evidently import ColumnMapping
from evidently.calculations.stattests import StatTest
from evidently.options import DataDriftOptions
from evidently.test_suite import TestSuite
from evidently.tests import *

In [384]:
import warnings
# Suppress every warning for the rest of the session so library warnings do not clutter the cell outputs.
warnings.filterwarnings('ignore')
# NOTE(review): this appears to install the same catch-all 'ignore' filter as the line above; confirm whether both calls are needed.
warnings.simplefilter('ignore')

## Prepare Datasets

In [385]:
# Dataset for Data Quality and Integrity
adult_data = datasets.fetch_openml(name='adult', version=2, as_frame='auto')
adult = adult_data.frame

# Split on education level: rows with one of the three listed levels become the
# "current" data, all remaining rows become the "reference" data.
current_education_levels = ['Some-college', 'HS-grad', 'Bachelors']
in_current_split = adult.education.isin(current_education_levels)
adult_ref = adult[~in_current_split]
adult_cur = adult[in_current_split]

# Disabled experiment: blank out two columns in the first 2000 current rows.
#adult_cur.iloc[:2000, 3:5] = np.nan

In [386]:
# Preview the reference split (rows whose education is not one of the three levels used for the current split).
adult_ref

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
5,34.0,Private,198693.0,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States,<=50K
7,63.0,Self-emp-not-inc,104626.0,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103.0,0.0,32.0,United-States,>50K
9,55.0,Private,104996.0,7th-8th,4.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,10.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48832,32.0,Private,34066.0,10th,6.0,Married-civ-spouse,Handlers-cleaners,Husband,Amer-Indian-Eskimo,Male,0.0,0.0,40.0,United-States,<=50K
48833,43.0,Private,84661.0,Assoc-voc,11.0,Married-civ-spouse,Sales,Husband,White,Male,0.0,0.0,45.0,United-States,<=50K
48834,32.0,Private,116138.0,Masters,14.0,Never-married,Tech-support,Not-in-family,Asian-Pac-Islander,Male,0.0,0.0,11.0,Taiwan,<=50K
48835,53.0,Private,321865.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,40.0,United-States,>50K


## Define two versions of get_binned_data

In [394]:
# Current implementation in evidently, kept as the baseline for the comparison.
def get_binned_data(reference: pd.Series, current: pd.Series, feature_type: str, n: int, feel_zeroes: bool = True):
    """Bucket reference and current data on a shared grid and return bucket shares.

    A "num" feature whose reference has more than 20 distinct values is
    histogrammed on bin edges computed with the "sturges" rule over both
    series concatenated. Every other feature gets one bucket per distinct
    value observed in either series.

    Args:
        reference: reference data
        current: current data
        feature_type: feature type; "num" enables the histogram branch
        n: number of buckets requested by the caller (not used in this body)
        feel_zeroes: when True, buckets with a share of exactly 0 are set to 0.0001
    Returns:
        reference_percents: share of reference records in each bucket
        current_percents: share of current records in each bucket
    """
    ref_size = len(reference)
    cur_size = len(current)
    distinct_ref_values = reference.nunique()

    use_histogram = feature_type == "num" and distinct_ref_values > 20
    if use_histogram:
        # Shared edges so both series are counted over identical intervals.
        pooled_values = list(reference) + list(current)
        edges = np.histogram_bin_edges(pooled_values, bins="sturges")

        ref_counts, _ = np.histogram(reference, edges)
        cur_counts, _ = np.histogram(current, edges)

        reference_percents = ref_counts / ref_size
        current_percents = cur_counts / cur_size
    else:
        observed_values = set(reference.unique()) | set(current.unique())
        keys = list(observed_values - {np.nan})

        # Values missing from one side simply count as 0 for that side.
        ref_counts = dict(reference.value_counts())
        cur_counts = dict(current.value_counts())

        reference_percents = np.array([ref_counts.get(key, 0) / ref_size for key in keys])
        current_percents = np.array([cur_counts.get(key, 0) / cur_size for key in keys])

    if feel_zeroes:
        # Fixed fill value: this is the behaviour the notebook compares against.
        reference_percents[reference_percents == 0] = 0.0001
        current_percents[current_percents == 0] = 0.0001

    return reference_percents, current_percents

In [388]:
# Variant of get_binned_data with the proposed data-dependent zero fill.
def _fill_zero_bins(percents: np.ndarray, default_fill: float = 0.0001, shrink_factor: int = 10**6) -> None:
    """Replace zero entries of percents, in place, with a small positive share.

    The fill is default_fill unless the smallest populated bucket is already
    at or below default_fill; then the fill is that smallest share divided by
    shrink_factor, so an empty bucket always stays well below every populated
    one. If no bucket is populated at all, default_fill is used.
    """
    empty_bins = percents == 0
    if not empty_bins.any():
        return

    populated = percents[~empty_bins]
    fill_value = default_fill
    # Guard against an array without any populated bucket (e.g. an all-NaN
    # series): taking the minimum of an empty selection would raise.
    if populated.size:
        smallest_share = populated.min()
        if smallest_share <= default_fill:
            fill_value = smallest_share / shrink_factor
    percents[empty_bins] = fill_value


def get_binned_data2(reference: pd.Series, current: pd.Series, feature_type: str, n: int, feel_zeroes: bool = True):
    """Bucket reference and current data on a shared grid and return bucket shares.

    Binning is the same as in get_binned_data: a "num" feature whose reference
    has more than 20 distinct values is histogrammed on "sturges" bin edges
    computed over both series concatenated; any other feature gets one bucket
    per distinct value observed in either series. Only the zero fill differs.

    Args:
        reference: reference data
        current: current data
        feature_type: feature type; "num" enables the histogram branch
        n: number of buckets requested by the caller (not used in this body)
        feel_zeroes: when True, buckets with a share of exactly 0 are replaced
            by 0.0001, or by (smallest populated share) / 10**6 when that
            smallest share is itself <= 0.0001
    Returns:
        reference_percents: share of reference records in each bucket
        current_percents: share of current records in each bucket
    """
    n_vals = reference.nunique()
    if feature_type == "num" and n_vals > 20:

        bins = np.histogram_bin_edges(list(reference) + list(current), bins="sturges")

        reference_percents = np.histogram(reference, bins)[0] / len(reference)
        current_percents = np.histogram(current, bins)[0] / len(current)

    else:
        # Deliberately identical to the baseline so the zero fill is the only difference.
        keys = list((set(reference.unique()) | set(current.unique())) - {np.nan})

        ref_feature_dict = {**dict.fromkeys(keys, 0), **dict(reference.value_counts())}
        current_feature_dict = {**dict.fromkeys(keys, 0), **dict(current.value_counts())}

        reference_percents = np.array([ref_feature_dict[key] / len(reference) for key in keys])
        current_percents = np.array([current_feature_dict[key] / len(current) for key in keys])
    if feel_zeroes:
        _fill_zero_bins(reference_percents)
        _fill_zero_bins(current_percents)

    return reference_percents, current_percents

## Define custom kl_div drift tests and compare the two feel_zeroes fill strategies

In [389]:
def kl_div(
    reference_data: pd.Series, current_data: pd.Series, feature_type: str, threshold: float, n_bins: int = 30
):
    """Score drift with the Kullback-Leibler divergence, using the baseline binning.

    Both series are bucketed with get_binned_data and the divergence of the
    reference bucket shares from the current bucket shares is computed with
    scipy's stats.entropy.

    Args:
        reference_data: reference data
        current_data: current data
        feature_type: feature type
        threshold: drift is reported when the divergence is >= this value
        n_bins: number of bins forwarded to get_binned_data
    Returns:
        kl_div: calculated Kullback-Leibler divergence value
        test_result: whether the drift is detected
    """
    ref_shares, cur_shares = get_binned_data(reference_data, current_data, feature_type, n_bins)
    divergence = stats.entropy(pk=ref_shares, qk=cur_shares)
    drift_detected = divergence >= threshold
    return divergence, drift_detected


# Wrap kl_div (baseline binning) in an evidently StatTest so it can be passed
# to DataDriftOptions; it is declared for both categorical and numerical
# features with a default threshold of 0.1.
kl_div_stat_test = StatTest(
    name="kl_div",
    display_name="Kullback-Leibler divergence",
    func=kl_div,
    allowed_feature_types=["cat", "num"],
    default_threshold=0.1,
)

In [390]:
# Same drift score as kl_div, but bucketed with get_binned_data2.
def kl_div2(
    reference_data: pd.Series, current_data: pd.Series, feature_type: str, threshold: float, n_bins: int = 30
):
    """Score drift with the Kullback-Leibler divergence, using the modified binning.

    Both series are bucketed with get_binned_data2 and the divergence of the
    reference bucket shares from the current bucket shares is computed with
    scipy's stats.entropy.

    Args:
        reference_data: reference data
        current_data: current data
        feature_type: feature type
        threshold: drift is reported when the divergence is >= this value
        n_bins: number of bins forwarded to get_binned_data2
    Returns:
        kl_div: calculated Kullback-Leibler divergence value
        test_result: whether the drift is detected
    """
    ref_shares, cur_shares = get_binned_data2(reference_data, current_data, feature_type, n_bins)
    divergence = stats.entropy(pk=ref_shares, qk=cur_shares)
    drift_detected = divergence >= threshold
    return divergence, drift_detected


# Wrap kl_div2 (modified zero fill) in an evidently StatTest under a distinct
# name and display name so its results can be told apart from the baseline;
# feature types and default threshold match kl_div_stat_test.
kl_div_stat_test2 = StatTest(
    name="kl_div2",
    display_name="Kullback-Leibler divergence2",
    func=kl_div2,
    allowed_feature_types=["cat", "num"],
    default_threshold=0.1,
)

In [391]:
# One options object per variant: option1 selects the baseline KL test and
# option2 the modified one as the stat test for numerical features.
stat_test_option1 = DataDriftOptions(num_features_stattest=kl_div_stat_test)
stat_test_option2 = DataDriftOptions(num_features_stattest=kl_div_stat_test2)

In [392]:

# Drift suite using the baseline KL test (stat_test_option1). The first test
# passes no options, so it runs with whatever evidently selects by default.
data_drift_dataset_tests = TestSuite(tests=[
    TestFeatureValueDrift(column_name='education-num'),
    TestFeatureValueDrift(column_name='education-num', options=stat_test_option1),
    TestFeatureValueDrift(column_name='fnlwgt', options=stat_test_option1),
    TestFeatureValueDrift(column_name='capital-loss', options=stat_test_option1),
])

data_drift_dataset_tests.run(reference_data=adult_ref, current_data=adult_cur)
data_drift_dataset_tests.save_html('file.html')
# Keep the bare expression last: a notebook cell only renders the value of its
# final expression, so placing it before save_html displayed nothing.
data_drift_dataset_tests

In [393]:
# Drift suite using the modified KL test (stat_test_option2) on the same
# columns, for side-by-side comparison with the report saved as 'file.html'.
data_drift_dataset_tests = TestSuite(tests=[
    TestFeatureValueDrift(column_name='education-num'),
    TestFeatureValueDrift(column_name='education-num', options=stat_test_option2),
    TestFeatureValueDrift(column_name='fnlwgt', options=stat_test_option2),
    TestFeatureValueDrift(column_name='capital-loss', options=stat_test_option2),
])

data_drift_dataset_tests.run(reference_data=adult_ref, current_data=adult_cur)
data_drift_dataset_tests.save_html('file2.html')
# Keep the bare expression last: a notebook cell only renders the value of its
# final expression, so placing it before save_html displayed nothing.
data_drift_dataset_tests