In [None]:
import sys

sys.path.insert(0, "../src/")

import concurrent.futures as cf
import datetime

import numpy as np
import pandas as pd
from scipy.stats import norm

from phoenix.datasets.binning import compute_default_bins, compute_histogram
from phoenix.metrics.drift import psi

In [None]:
def initialize_dataframe(feature_column_names, num_samples, dtype, max_workers=None, seed=None):
    """Build a dataframe of uniform random values, one column per feature name.

    Columns are generated concurrently in a thread pool. Every column draws
    from its own generator, spawned from a single ``SeedSequence``, so the
    workers share no random state and a fixed ``seed`` reproduces the same
    data regardless of ``max_workers`` or thread scheduling.

    Args:
        feature_column_names: Iterable of column names for the dataframe.
        num_samples: Number of rows to generate per column.
        dtype: NumPy dtype each generated column is cast to.
        max_workers: Forwarded to ``ThreadPoolExecutor``; ``None`` keeps the
            executor's default.
        seed: Optional entropy for ``np.random.SeedSequence``. ``None`` (the
            default) draws fresh entropy, i.e. a different dataset per call.

    Returns:
        A ``pandas.DataFrame`` with ``num_samples`` rows whose columns appear
        in the same order as ``feature_column_names``.
    """
    # Materialize once: the input may be a one-shot iterator and is needed
    # both for its length and for pairing names with results.
    column_names = list(feature_column_names)
    child_seeds = np.random.SeedSequence(seed).spawn(len(column_names))

    def _generate_column(child_seed):
        # Values are drawn as float64 in [0, 1) and then cast to the target dtype.
        return np.random.default_rng(child_seed).random(num_samples).astype(dtype)

    with cf.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order, so column order
        # follows the input rather than whichever thread finishes first.
        columns = list(executor.map(_generate_column, child_seeds))
    return pd.DataFrame.from_dict(dict(zip(column_names, columns)))

In [None]:
# Benchmark configuration: size and element type of the synthetic datasets.
num_features = 100
num_samples = 10**7
dtype = np.float32
# Ask NumPy for the element size so any dtype NumPy understands works here,
# rather than maintaining a float32/float64 lookup by hand.
num_bytes_per_element = np.dtype(dtype).itemsize
# Size of one dataframe in whole decimal gigabytes (10**9 bytes), rounded down.
num_gigs = (num_features * num_samples * num_bytes_per_element) // 10**9
# Passed to psi() as its third positional argument.
epsilon = 1e-7
feature_column_names = [f"feature{i}" for i in range(num_features)]
# Two independently generated random dataframes sharing the same column names.
primary = initialize_dataframe(feature_column_names, num_samples, dtype)
reference = initialize_dataframe(feature_column_names, num_samples, dtype)

In [None]:
import time


def _timed(step, *args, **kwargs):
    """Run ``step(*args, **kwargs)`` and return ``(result, elapsed)``.

    ``elapsed`` is a ``datetime.timedelta`` measured with ``time.perf_counter``,
    a monotonic clock, so the measurement cannot be skewed by adjustments to
    the system wall clock while the step is running.
    """
    began = time.perf_counter()
    result = step(*args, **kwargs)
    return result, datetime.timedelta(seconds=time.perf_counter() - began)


# start computation
psi_start = time.perf_counter()
print(f"num features: {num_features}")
print(f"num samples: {num_samples}")
print(f"dtype: {dtype}")
print(f"num gigs: {num_gigs}")

# compute bins (derived from the reference dataframe only)
bins_column, compute_bins_duration = _timed(compute_default_bins, reference)
print(f"compute bins: {compute_bins_duration}")

# compute primary histogram
primary_histogram, compute_primary_histogram_duration = _timed(
    compute_histogram, primary, bins_column
)
print(f"compute primary histogram: {compute_primary_histogram_duration}")

# compute reference histogram (same bins as the primary histogram)
reference_histogram, compute_reference_histogram_duration = _timed(
    compute_histogram, reference, bins_column
)
print(f"compute reference histogram: {compute_reference_histogram_duration}")

# compute primary distribution: histogram divided by its own .sum()
primary_distributions, compute_primary_distribution_duration = _timed(
    lambda: primary_histogram / primary_histogram.sum()
)
print(f"compute primary distribution: {compute_primary_distribution_duration}")

# compute reference distribution: histogram divided by its own .sum()
reference_distributions, compute_reference_distribution_duration = _timed(
    lambda: reference_histogram / reference_histogram.sum()
)
print(f"compute reference distribution duration: {compute_reference_distribution_duration}")

# compute psi
out, compute_psi_duration = _timed(psi, primary_distributions, reference_distributions, epsilon)
print(f"compute psi duration: {compute_psi_duration}")

# end computation; the total also covers the print calls between the steps
psi_end = time.perf_counter()
total_duration = datetime.timedelta(seconds=psi_end - psi_start)
print(f"total duration: {total_duration}")

# display computed values
print("PSI")
print(out)

In [None]:
def calculate_psi(primary_df, reference_df, max_workers=None):
    """Run the full PSI pipeline for a primary/reference dataframe pair.

    Bins are computed from ``reference_df`` and reused for both histograms.
    Each histogram is divided by its own ``.sum()`` before being handed to
    ``psi`` together with the notebook-level ``epsilon``.

    Args:
        primary_df: Dataframe whose histogram is passed to ``psi`` first.
        reference_df: Dataframe that supplies the bins and the second histogram.
        max_workers: Forwarded unchanged to ``compute_default_bins`` and
            ``compute_histogram``.

    Returns:
        Whatever ``psi`` returns for the two normalized histograms.
    """
    worker_options = {"max_workers": max_workers}
    bins = compute_default_bins(reference_df, **worker_options)
    primary_counts = compute_histogram(primary_df, bins, **worker_options)
    reference_counts = compute_histogram(reference_df, bins, **worker_options)
    return psi(
        primary_counts / primary_counts.sum(),
        reference_counts / reference_counts.sum(),
        epsilon,
    )

In [None]:
import time

# Time one calculate_psi run for each max_workers value from 1 through 10.
max_workers_list = list(range(1, 11))
durations = []  # elapsed seconds, index-aligned with max_workers_list
for max_workers in max_workers_list:
    print(max_workers)  # progress indicator: the max_workers value being timed
    # perf_counter is monotonic and intended for measuring intervals, whereas
    # time.time() follows the wall clock and can jump if the clock is adjusted.
    start = time.perf_counter()
    calculate_psi(primary, reference, max_workers=max_workers)
    duration = time.perf_counter() - start
    durations.append(duration)

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt

# Draw on a dedicated figure/axes pair instead of pyplot's implicit "current"
# figure, so re-running this cell never overlays lines on an earlier plot.
fig, ax = plt.subplots()
ax.plot(max_workers_list, durations)
ax.set_title("calculate_psi duration by max_workers")
ax.set_xlabel("num threads")
ax.set_ylabel("seconds")
plt.show()