In [None]:
import altair as alt
import numpy as np
import pandas as pd
from scipy.stats import rankdata, gaussian_kde
from vega_datasets import data


def generate_van_der_corput(n, base=2):
    """
    Return the first ``n`` terms of the Van der Corput sequence.

    Term ``i`` (for ``i`` = 1..n) is obtained by writing ``i`` in the given base and
    mirroring its digits around the radix point, e.g. in base 2: 1 -> 0.1b = 0.5,
    2 -> 0.01b = 0.25, 3 -> 0.11b = 0.75.

    Parameters:
    n (int): Number of terms to generate. ``n <= 0`` yields an empty list.
    base (int): Radix used for the digit reversal. Must be at least 2.

    Returns:
    list of float: The terms for i = 1..n, in that order.

    Raises:
    ValueError: If ``base`` is smaller than 2.
    """
    # base == 1 would never shrink the numerator (endless loop) and base == 0
    # would divide by zero, so reject both up front with a clear message.
    if base < 2:
        raise ValueError("base must be an integer >= 2")

    sequence = []
    for i in range(1, n + 1):
        vdc = 0.0
        denominator = 1
        numerator = i
        # Peel off the least-significant digit each round and place it one
        # position further to the right of the radix point.
        while numerator > 0:
            denominator *= base
            numerator, remainder = divmod(numerator, base)
            vdc += remainder / denominator
        sequence.append(vdc)
    return sequence


def _quasirandom_offsets(y_values):
    """Return one horizontal offset per entry of a single group's y-values."""
    y_values = np.asarray(y_values, dtype=float)
    # Non-finite entries and groups too small for a density estimate stay
    # centred (offset 0) instead of ending up as NaN.
    offsets = np.zeros(len(y_values))
    finite = np.isfinite(y_values)
    valid = y_values[finite]
    if valid.size < 2:
        return offsets

    # Assign the sequence by rank so that neighbouring y-values receive
    # well-separated horizontal positions.
    sequence = np.asarray(generate_van_der_corput(valid.size))
    ranks = np.asarray(rankdata(valid, method="ordinal"), dtype=int)
    sequence = sequence[ranks - 1]

    lowest, highest = valid.min(), valid.max()
    if lowest == highest:
        # gaussian_kde cannot handle zero-variance input (singular covariance);
        # identical values are all equally dense, so use the full width.
        relative_density = np.ones(valid.size)
    else:
        density = gaussian_kde(valid, bw_method=0.1)
        grid = np.linspace(lowest, highest, 2**10)
        dens_y = density(grid)
        dens_y /= dens_y.max()
        relative_density = np.interp(valid, grid, dens_y)

    subgroup_width = np.sqrt(valid.size)
    offsets[finite] = (sequence - 0.5) * 2 * relative_density * subgroup_width
    return offsets


def calculate_offsets(data, x_var, y_var, group_vars):
    """
    Calculate offsets for data points based on Van der Corput sequence and Gaussian KDE.

    This function computes offsets for data points to avoid overlap in a scatter plot.
    It uses the Van der Corput sequence for quasi-random distribution and Gaussian
    Kernel Density Estimation (KDE) to determine point densities.

    Parameters:
    data (pd.DataFrame): The input data containing the variables.
    x_var (str): The name of the x-axis variable. Kept for backward compatibility;
        rows are assigned to groups using ``group_vars`` only.
    y_var (str): The name of the y-axis variable.
    group_vars (list of str): The list of variables to group by.

    Returns:
    pd.DataFrame: A DataFrame with an additional column 'xOffset' containing the calculated offsets.

    Notes:
    - The function makes a copy of the input data to avoid modifying the original DataFrame.
    - The Van der Corput sequence is used to generate quasi-random offsets.
    - Gaussian KDE is used to estimate the density of y-values.
    - Offsets are scaled by the density and the square root of the subgroup size.
    - Only finite y-values take part in the density estimate. Rows with a
      non-finite y-value, rows in groups with fewer than two finite values, and
      rows whose grouping key is missing get an offset of 0.
    - Groups whose finite y-values are all identical are treated as uniformly dense.
    """
    # make a copy of the data to avoid modifying the original
    data = data.copy()

    n_rows = len(data)
    y_all = data[y_var].to_numpy(dtype=float, na_value=np.nan)
    offsets = np.zeros(n_rows)

    # Group row *positions* rather than index labels so the result is correct
    # even when the DataFrame index contains duplicates, and use every grouping
    # key so the selection does not depend on the order of group_vars.
    row_positions = pd.Series(np.arange(n_rows))
    keys = [data[var].to_numpy() for var in group_vars]
    for _, members in row_positions.groupby(keys, sort=False):
        positions = members.to_numpy()
        offsets[positions] = _quasirandom_offsets(y_all[positions])

    data["xOffset"] = offsets
    return data


def quasirandom_point_facet(
    data, x_var, y_var, facet_var, color_var, point_size=20, step=40
):
    """
    Build a faceted quasirandom point chart with Altair.

    Offsets are computed per (x_var, facet_var) group via ``calculate_offsets`` and
    the resulting chart is split into one panel per ``facet_var`` value, laid out
    in two columns.

    Parameters:
    data (pd.DataFrame): The input data containing the variables.
    x_var (str): Column encoded as the nominal x channel.
    y_var (str): Column encoded as the quantitative y channel.
    facet_var (str): Column used to facet the chart; must differ from ``x_var``.
    color_var (str): Column encoded as the nominal color channel.
    point_size (int): Size passed to ``mark_circle``.
    step (int): Width step passed to ``alt.Step``.

    Returns:
    The faceted Altair chart with independent x axes, a shared x scale and
    independent y scales.

    Raises:
    ValueError: If ``x_var`` and ``facet_var`` are the same.
    """
    if x_var == facet_var:
        raise ValueError("x_var and facet_var cannot be the same")

    jittered = calculate_offsets(data, x_var, y_var, [x_var, facet_var])

    # Channel shorthands collected in one place; ":N" = nominal, ":Q" = quantitative.
    channels = {
        "x": f"{x_var}:N",
        "xOffset": "xOffset:Q",
        "y": f"{y_var}:Q",
        "color": f"{color_var}:N",
    }
    panel = alt.Chart(jittered).mark_circle(size=point_size).encode(**channels)
    panel = panel.properties(width=alt.Step(step))

    faceted = panel.facet(f"{facet_var}:N", columns=2)
    faceted = faceted.resolve_axis(x="independent")
    return faceted.resolve_scale(x="shared", y="independent")


def quasirandom_point(data, x_var, y_var, color_var, point_size=20, step=40):
    """
    Build a quasirandom point chart with Altair.

    Offsets are computed per ``x_var`` group via ``calculate_offsets`` and applied
    through the ``xOffset`` channel.

    Parameters:
    data (pd.DataFrame): The input data containing the variables.
    x_var (str): Column encoded as the nominal x channel.
    y_var (str): Column encoded as the quantitative y channel.
    color_var (str): Column encoded as the nominal color channel.
    point_size (int): Size passed to ``mark_circle``.
    step (int): Width step passed to ``alt.Step``.

    Returns:
    The Altair chart with the same axis/scale resolution settings as the
    original chain (x axis independent, x scale shared, y scale independent).
    """
    jittered = calculate_offsets(data, x_var, y_var, [x_var])

    # Channel shorthands collected in one place; ":N" = nominal, ":Q" = quantitative.
    channels = {
        "x": f"{x_var}:N",
        "xOffset": "xOffset:Q",
        "y": f"{y_var}:Q",
        "color": f"{color_var}:N",
    }
    chart = alt.Chart(jittered).mark_circle(size=point_size).encode(**channels)
    chart = chart.properties(width=alt.Step(step))
    chart = chart.resolve_axis(x="independent")
    return chart.resolve_scale(x="shared", y="independent")

In [10]:
# Demo: sepal width per iris species, drawn as a quasirandom point plot.
iris = data.iris()
p = quasirandom_point(
    data=iris,
    x_var="species",
    y_var="sepalWidth",
    color_var="species",
    step=80,
)
# Bare expression as the last line so the notebook renders the chart.
p