In [None]:
import numpy as np
import matplotlib.pyplot as plt
from fastkde import KDE
from gaussianmixture import GaussianMixture
from tqdm import tqdm_notebook as tqdm
import scipy.stats
%matplotlib inline

In [None]:
# Ground truth for the experiment: a Gaussian mixture object that is used both
# to draw samples and to supply the reference PDF shown in the final plot.
xlim = [-3, 3]  # Range along x used for evaluating the PDF and for the plot
gm = GaussianMixture([-1, 1], [0.5, 0.3])  # presumably component centers and widths -- confirm against GaussianMixture
# gm.pdf returns a pair: the first element is unpacked as a one-element
# sequence holding the evaluation grid, the second is stored as ypdf
# (presumably the mixture PDF on that grid).
(xpdf,), ypdf = gm.pdf(
    minx=xlim[:1],  # one-element list holding the lower limit
    maxx=xlim[1:],  # one-element list holding the upper limit
    n=101,
)

# Show KDE with confidence bands

In [None]:
# Experiment configuration shared by the cells below
seed = 0            # Value passed to np.random.seed at the start of each experiment
ndatapoints = 500   # Number of samples requested from the mixture per data set
nrepeat = 1000      # Number of repeats for the bootstrap (also used for the repeated-sampling runs)
confidence = 0.95   # Confidence level used for every interval

In [None]:
# Use the plug-in method: fit one KDE to a single sample set and take the
# confidence interval reported by the estimator itself.
np.random.seed(seed)  # Reseed for reproducibility (presumably generate_samples draws from NumPy's global RNG)
x = gm.generate_samples(ndatapoints)  # The sample set that the bootstrap cell later resamples
kde = KDE(data=x)
kde.compute_bw()  # Compute the bandwidth using leave-one-out cross-validation
bandwidth = kde.bw  # Store the bandwidth for later usage (the bootstrap cell passes it as bw=)
print("Bandwidth: {:.5f}".format(bandwidth))
kde.compute_kde()
ypdf_estimated = kde.score_samples(xpdf)  # KDE evaluated on the xpdf grid; centre of all plotted bands
low_plugin, up_plugin = kde.confidence_interval(xpdf, confidence=confidence)  # Lower/upper bound on the same grid

In [None]:
# Bootstrap confidence intervals: refit the KDE on resampled copies of x
# (drawn with replacement, bandwidth held at the stored value) and measure
# how much the resulting density estimates vary on the xpdf grid.
np.random.seed(seed)
n_obs = len(x)
pdfs_bootstrap = np.zeros((nrepeat, len(xpdf)))  # One row per bootstrap replicate
for rep in tqdm(range(nrepeat)):
    resample_idx = np.random.choice(n_obs, size=n_obs, replace=True)
    kde = KDE(data=x[resample_idx], bw=bandwidth)
    kde.compute_kde()
    pdfs_bootstrap[rep] = kde.score_samples(xpdf)

# "Bootstrap and plug-in approach": symmetric band around the estimate whose
# half-width is the normal quantile times the pointwise bootstrap std.
std = np.std(pdfs_bootstrap, axis=0)
zvalue = scipy.stats.norm.ppf(confidence/2+0.5)
half_width = zvalue*std
low_bootstrap1 = ypdf_estimated - half_width
up_bootstrap1 = ypdf_estimated + half_width

# Bootstrap-only approach: the half-width is the `confidence` percentile of
# the absolute deviations of the replicates from their pointwise mean.
bootstrap_mean = np.mean(pdfs_bootstrap, axis=0)
abs_deviation = np.abs(pdfs_bootstrap - bootstrap_mean)
deviation = np.percentile(abs_deviation, confidence*100, axis=0)
low_bootstrap2 = ypdf_estimated - deviation
up_bootstrap2 = ypdf_estimated + deviation

In [None]:
# Perform the KDE many times to see the real uncertainty: draw nrepeat fresh
# data sets from the mixture, fit a KDE to each one, and take pointwise
# percentiles of the resulting density estimates.
np.random.seed(seed)
pdfs = np.zeros((nrepeat, len(xpdf)))  # One row per independently drawn data set
for i in tqdm(range(nrepeat)):
    # Use a dedicated name for the fresh sample set. Reassigning `x` here would
    # overwrite the sample set from the plug-in cell, which the bootstrap cell
    # resamples; re-running the bootstrap cell after this one would then
    # silently use data that `bandwidth` and `ypdf_estimated` were not fitted on.
    x_rep = gm.generate_samples(ndatapoints)
    kde = KDE(data=x_rep)
    # NOTE(review): min_bw/max_bw are set here (presumably bounding the
    # bandwidth search), while the plug-in cell calls compute_bw() with its
    # defaults -- confirm that this difference is intended.
    kde.compute_bw(min_bw=0.05, max_bw=0.5)
    kde.compute_kde()
    pdfs[i] = kde.score_samples(xpdf)
# Central interval at the requested level, e.g. the 2.5th and 97.5th
# percentiles for confidence = 0.95.
low_real = np.percentile(pdfs, (1-confidence)*50, axis=0)
up_real = np.percentile(pdfs, (1+confidence)*50, axis=0)

In [None]:
def _plot_band(ax, grid, lower, upper, color):
    """Draw the two edges of a confidence band as dashed lines of one color.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    grid : x positions shared by both edges.
    lower, upper : band edges evaluated on `grid`.
    color : color used for both edges.

    Returns
    -------
    The list of line artists returned by ``ax.plot`` (one per edge).
    """
    # Stacking the edges as columns lets one plot call draw both lines with a
    # shared style.
    return ax.plot(np.array([grid, grid]).T, np.array([lower, upper]).T, '--', color=color)


f, ax = plt.subplots(1, 1, figsize=(16, 10))

# Reference density and the single-sample KDE estimate
plt_pdf, = ax.plot(xpdf, ypdf)
plt_estimated, = ax.plot(xpdf, ypdf_estimated)

# Confidence bands of the three estimation methods
plt_plugin = _plot_band(ax, xpdf, low_plugin, up_plugin, [1, .5, .5])
plt_bootstrap1 = _plot_band(ax, xpdf, low_bootstrap1, up_bootstrap1, [.5, 1, .5])
plt_bootstrap2 = _plot_band(ax, xpdf, low_bootstrap2, up_bootstrap2, [.5, .5, 1])

# Spread observed over the repeated-sampling runs, shown as a shaded region
plt_realconf = ax.fill_between(xpdf, low_real, up_real, facecolor=[.6, .6, .6], alpha=.5)

# Only the first line of each band goes into the legend, so every method
# appears exactly once.
ax.legend([plt_pdf, plt_estimated, plt_plugin[0], plt_bootstrap1[0], plt_bootstrap2[0], plt_realconf],
          ['Real', 'Estimated', '{:.0f}% Confidence (plug-in)'.format(confidence*100),
           '{:.0f}% Confidence (bootstrap and plug-in)'.format(confidence*100),
           '{:.0f}% Confidence (bootstrap)'.format(confidence*100),
           '{:.0f}% Confidence (real)'.format(confidence*100)])

# Label the figure so it can be read on its own when the notebook is skimmed
ax.set_xlabel('x')
ax.set_ylabel('Probability density')
ax.set_title('KDE with confidence bands')
ax.grid(True)
_ = ax.set_xlim(xlim)  # The "_ =" suppresses the output