In [1]:
import sys

sys.path.insert(0, "../src/")

%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import beta, norm, expon, gamma

from phoenix.metrics.drift import psi

In [None]:
# Evaluation grid shared by both densities.
x = np.linspace(-3, 3, num=1000)

# Standard normal (loc=0, scale=1) and Beta(a=100, b=10) as frozen distributions.
normal_random_variable = norm(loc=0, scale=1)
beta_random_variable = beta(a=100, b=10)

# Probability density of each distribution at every grid point.
y_normal, y_beta = (
    random_variable.pdf(x)
    for random_variable in (normal_random_variable, beta_random_variable)
)

In [None]:
# Use a dedicated figure/axes instead of implicit pyplot state, so the curves
# cannot end up on whichever figure pyplot currently considers active.
fig, ax = plt.subplots()
ax.plot(x, y_normal, label="y_normal")
ax.plot(x, y_beta, label="y_beta")
ax.set_xlabel("x")
ax.set_ylabel("probability density")
ax.set_title("Normal vs. beta density")
ax.legend()
plt.show()

In [None]:
# Exponential(rate=lambd) and Gamma(shape=2, rate=lambd) densities on [0, 10].
#
# scipy.stats parameterizes both distributions by loc/scale rather than by rate:
# the first positional argument of `expon` (and the second of `gamma`) is `loc`,
# which only shifts the distribution. The rate therefore has to be passed as
# scale = 1 / rate.
lambd = 2
exponential_random_variable = expon(scale=1 / lambd)
gamma_random_variable = gamma(a=2, scale=1 / lambd)
x = np.linspace(0, 10, 1000)
y_exponential = exponential_random_variable.pdf(x)
y_gamma = gamma_random_variable.pdf(x)

In [None]:
# Densities of the reference distribution (standard normal) and of a shifted
# "primary" normal distribution, evaluated on a common grid.
reference_random_variable = norm()
primary_mean = 10
primary_std = 1
primary_random_variable = norm(loc=primary_mean, scale=primary_std)

# Cover three standard deviations around BOTH distributions; a fixed [-3, 3]
# grid never reaches a primary distribution centred at 10.
x_min = min(
    reference_random_variable.mean() - 3 * reference_random_variable.std(),
    primary_mean - 3 * primary_std,
)
x_max = max(
    reference_random_variable.mean() + 3 * reference_random_variable.std(),
    primary_mean + 3 * primary_std,
)
x = np.linspace(x_min, x_max, 1000)

# Each y_* is evaluated from the distribution it is named after.
y_reference = reference_random_variable.pdf(x)
y_primary = primary_random_variable.pdf(x)

In [None]:
# Use a dedicated figure/axes instead of implicit pyplot state, and label the
# curves so the two colors can be told apart.
fig, ax = plt.subplots()
ax.plot(x, y_reference, c="blue", label="y_reference")
ax.plot(x, y_primary, c="orange", label="y_primary")
ax.set_xlabel("x")
ax.set_ylabel("probability density")
ax.set_title("Reference vs. primary density")
ax.legend()
plt.show()

In [None]:
# Generate synthetic data: one single-column ("data") frame per distribution.
num_samples = 10**6
# Fixed seed so that re-running the notebook draws the same samples.
rng = np.random.default_rng(seed=0)
reference_df = pd.DataFrame(
    {"data": reference_random_variable.rvs(size=num_samples, random_state=rng)}
)
primary_df = pd.DataFrame(
    {"data": primary_random_variable.rvs(size=num_samples, random_state=rng)}
)

In [None]:
# PSI of the sampled primary frame against the sampled reference frame
# (presumably the population stability index - confirm against phoenix.metrics.drift).
initial_psi = psi(reference=reference_df, primary=primary_df)
initial_psi

In [None]:
# Gradually introduce drift in primary distribution: keep the reference fixed
# and shift the primary mean from 0 to 0.5 in 10 steps, recording PSI at each step.

# Defined before its first use so this cell does not depend on a value left
# behind by an earlier cell.
num_samples = 10**6
# Fixed seed so that re-running the notebook draws the same samples.
rng = np.random.default_rng(seed=0)

reference_random_variable = norm()
reference_df = pd.DataFrame(
    {"data": reference_random_variable.rvs(size=num_samples, random_state=rng)}
)

primary_std = 1
primary_means = np.linspace(0, 0.5, 10)
psi_values = []
for primary_mean in primary_means:
    print(f"mean: {primary_mean}")
    primary_random_variable = norm(loc=primary_mean, scale=primary_std)
    primary_df = pd.DataFrame(
        {"data": primary_random_variable.rvs(size=num_samples, random_state=rng)}
    )
    psi_values.append(psi(primary=primary_df, reference=reference_df))

In [None]:
# Use a dedicated figure/axes instead of implicit pyplot state, and label the
# axes so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(primary_means, psi_values)
ax.set_xlabel("primary mean")
ax.set_ylabel("PSI")
ax.set_title("PSI as the primary mean drifts from the reference")
plt.show()

In [4]:
import datetime

from phoenix.metrics.drift.psi import _psi, compute_edge_bins, compute_histogram


def _timed(label, func, *args):
    """Run ``func(*args)``, print ``"<label>: <elapsed>"`` and return ``(result, elapsed)``.

    ``elapsed`` is the ``datetime.timedelta`` between two ``datetime.datetime.now()``
    readings taken immediately before and after the call.
    """
    start = datetime.datetime.now()
    result = func(*args)
    elapsed = datetime.datetime.now() - start
    print(f"{label}: {elapsed}")
    return result, elapsed


# Benchmark configuration. A larger setting (num_features = 200,
# num_samples = 6 * 10**7) was previously left here commented out.
num_features = 100
num_samples = 10**6
epsilon = 10e-7  # note: 10e-7 is the same value as 1e-6
reference_random_variable = norm()
primary_random_variable = norm()
reference = pd.DataFrame.from_dict(
    {
        f"feature_{index}": reference_random_variable.rvs(size=num_samples)
        for index in range(num_features)
    }
)
# Sample the primary frame from the primary distribution; it was previously
# drawn from `reference_random_variable`, leaving `primary_random_variable` unused.
primary = pd.DataFrame.from_dict(
    {
        f"feature_{index}": primary_random_variable.rvs(size=num_samples)
        for index in range(num_features)
    }
)

# start computation (data generation above is deliberately excluded from the total)
psi_start = datetime.datetime.now()
print(f"num features: {num_features}")
print(f"num samples: {num_samples}")

# compute bins (from the reference frame only; reused for both histograms)
bins_column, compute_bins_duration = _timed("compute bins", compute_edge_bins, reference)

# compute primary histogram
primary_histogram, compute_primary_histogram_duration = _timed(
    "compute primary histogram", compute_histogram, primary, bins_column
)

# compute reference histogram
reference_histogram, compute_reference_histogram_duration = _timed(
    "compute reference histogram", compute_histogram, reference, bins_column
)

# compute primary distribution (histogram divided by its sum)
primary_distributions, compute_primary_distribution_duration = _timed(
    "compute primary distribution",
    lambda: primary_histogram / primary_histogram.sum(),
)

# compute reference distribution (histogram divided by its sum)
reference_distributions, compute_reference_distribution_duration = _timed(
    "compute reference distribution duration",
    lambda: reference_histogram / reference_histogram.sum(),
)

# compute psi
out, compute_psi_duration = _timed(
    "compute psi duration", _psi, primary_distributions, reference_distributions, epsilon
)

# end computation
psi_end = datetime.datetime.now()
total_duration = psi_end - psi_start
print(f"total duration: {total_duration}")

# display computed values
print("PSI")
print(out)

compute bins: 0:00:01.813287
compute primary histogram: 0:01:06.583072
compute reference histogram: 0:01:06.844766
compute primary distribution: 0:00:00.000350
compute reference distribution duration: 0:00:00.000235
compute psi duration: 0:00:00.000492
total duration: 0:02:15.242770
PSI
feature_0     0.000010
feature_1     0.000013
feature_2     0.000015
feature_3     0.000012
feature_4     0.000019
                ...   
feature_95    0.000008
feature_96    0.000014
feature_97    0.000009
feature_98    0.000016
feature_99    0.000011
Length: 100, dtype: float64
