# Variable bandwidth - does it make things better?

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from stats import KDE
import scipy.stats

We draw samples from a Cauchy distribution, which is known for its heavy tails. Its probability density function is:
$$
p(x; x_0, \gamma) = \frac{1}{\pi\gamma \left( 1+ \left(\frac{x-x_0}{\gamma}\right)^2 \right)}.
$$

In [None]:
# Experiment sizes.
# NOTE(review): the test_method calls in this notebook pass nrepeat explicitly,
# so this global looks unused in the visible cells - confirm before relying on it.
nrepeat = 100
# Read as a module-level global inside test_method, where it is passed to the
# sampler as the number of points drawn per repetition.
ndata = 100

In [None]:
# Evaluation grid: 301 equally spaced points covering [-10, 10].
xpdf = np.linspace(-10, 10, 301)
# Reference density on that grid: the Cauchy pdf with x_0 = 0 and gamma = 1.
ypdf = 1 / (np.pi * (1 + np.square(xpdf)))

In [None]:
# Plot the reference density over the evaluation grid.
plt.plot(xpdf, ypdf)
# Anchor the y-axis at zero while keeping the automatically chosen upper limit.
plt.ylim(0, plt.ylim()[1])
# Restrict the x-axis to the exact extent of the grid.
plt.xlim(np.min(xpdf), np.max(xpdf))

In [None]:
def silverman_bandwidth(xdata):
    """Return Silverman's rule-of-thumb bandwidth for the sample ``xdata``.

    The rule is ``0.9 * min(std, IQR / 1.34) * n**(-1/5)``, where ``n`` is
    ``len(xdata)``. The interquartile range is divided by 1.34 so that it is
    on the same scale as the standard deviation for normally distributed
    data; without that factor the IQR term is overestimated.

    Parameters
    ----------
    xdata : array_like
        The sample. ``np.std`` and ``scipy.stats.iqr`` are evaluated over all
        of its elements.

    Returns
    -------
    float
        The bandwidth.
    """
    # np.minimum (rather than the builtin min) propagates NaN like np.min does.
    spread = np.minimum(np.std(xdata), scipy.stats.iqr(xdata) / 1.34)
    return 0.9 * spread / len(xdata)**.2

def ise(xpdf, yreal, yest):
    """Compute the integrated squared error between two curves.

    Integrates ``(yreal - yest)**2`` over ``xpdf`` with the trapezoidal rule.

    Parameters
    ----------
    xpdf : array_like
        Sample points at which both curves are given.
    yreal : array_like
        Reference values at ``xpdf``.
    yest : array_like
        Estimated values at ``xpdf``.

    Returns
    -------
    float
        The trapezoidal approximation of the integrated squared error.
    """
    # np.trapz was deprecated in NumPy 2.0 in favour of np.trapezoid; use the
    # new name when it exists and fall back to the old one on older releases.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    return integrate((yreal - yest)**2, xpdf)

def normal_kde(xdata, xpdf):
    """Evaluate a KDE of ``xdata`` on ``xpdf`` with a computed bandwidth.

    A ``KDE`` is built with its default settings and its bandwidth is chosen
    by ``KDE.compute_bandwidth``, which receives the Silverman bandwidth of
    the sample as ``max_bw`` (presumably an upper bound - confirm against
    the KDE implementation).

    Returns whatever ``KDE.score_samples(xpdf)`` returns; ``test_method``
    compares it with the reference density on the same grid.
    """
    estimator = KDE(xdata)
    bw_limit = silverman_bandwidth(xdata)
    estimator.compute_bandwidth(max_bw=bw_limit)
    scores = estimator.score_samples(xpdf)
    return scores

def variable_kde(xdata, xpdf):
    """Evaluate a variable-bandwidth KDE of ``xdata`` on ``xpdf``.

    Enables ``constants.variable_bandwidth`` and sets ``constants.percentile``
    to 80 before asking ``KDE.compute_bandwidth`` for a bandwidth, passing
    100 times the Silverman bandwidth as ``max_bw``. A message is printed
    when ``compute_bandwidth`` returns 0.

    Returns whatever ``KDE.score_samples(xpdf)`` returns.
    """
    estimator = KDE(xdata)
    estimator.constants.variable_bandwidth = True
    estimator.constants.percentile = 80
    bw_limit = silverman_bandwidth(xdata) * 100
    status = estimator.compute_bandwidth(max_bw=bw_limit)
    # NOTE(review): the meaning of a 0 return value is defined by KDE, not
    # visible here - confirm it matches the message below.
    if status == 0:
        print("Bandwidth is not max_bw")
    return estimator.score_samples(xpdf)

def silverman(xdata, xpdf):
    """Evaluate a KDE of ``xdata`` on ``xpdf`` using Silverman's bandwidth.

    The bandwidth is set directly with ``KDE.set_bandwidth`` to the value of
    ``silverman_bandwidth(xdata)``; no bandwidth search is run.

    Returns whatever ``KDE.score_samples(xpdf)`` returns.
    """
    estimator = KDE(xdata)
    rule_of_thumb_bw = silverman_bandwidth(xdata)
    estimator.set_bandwidth(rule_of_thumb_bw)
    scores = estimator.score_samples(xpdf)
    return scores

def test_method(dist, fits, xpdf, ypdf, nrepeat=100, plot=False):
    """Compare density estimators by their integrated squared error.

    For each of ``nrepeat`` repetitions a fresh sample is drawn with
    ``dist(ndata)`` (``ndata`` is read from module scope), every estimator in
    ``fits`` is evaluated on ``xpdf``, and its ISE against ``ypdf`` is stored.

    Parameters
    ----------
    dist : callable
        Called as ``dist(ndata)``; must return the sample for one repetition.
    fits : sequence of callables
        Each is called as ``fit(xdata, xpdf)`` and must return the estimate
        on ``xpdf``.
    xpdf : array_like
        Grid on which the estimates and the reference density are compared.
    ypdf : array_like
        Reference density on ``xpdf``.
    nrepeat : int, optional
        Number of repetitions.
    plot : bool, optional
        If true, draw one subplot per repetition (four per row) with the
        reference density in red and the estimates overlaid.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nrepeat, len(fits))``; entry ``[i, j]`` is the ISE
        of ``fits[j]`` in repetition ``i``.
    """
    errors = np.zeros((nrepeat, len(fits)))
    # Line styles for the estimates, used cyclically so that more than three
    # estimators can be plotted without an IndexError.
    fit_styles = ['b-', 'g-', 'y-']
    if plot:
        n_horz_plots = 4
        # The np.int alias was removed in NumPy 1.24; the builtin int does
        # the same conversion.
        n_vert_plots = int(np.ceil(nrepeat / n_horz_plots))
        _, axs = plt.subplots(n_vert_plots, n_horz_plots, figsize=(16, n_vert_plots*3))
        # Flatten so that axs[i] addresses the subplot of repetition i.
        axs = np.ravel(axs)
    for i in range(nrepeat):
        xdata = dist(ndata)
        if plot:
            axs[i].plot(xpdf, ypdf, 'r-')
            axs[i].set_xlim(np.min(xpdf), np.max(xpdf))
        for j, fit in enumerate(fits):
            yest = fit(xdata, xpdf)
            errors[i, j] = ise(xpdf, ypdf, yest)
            if plot:
                axs[i].plot(xpdf, yest, fit_styles[j % len(fit_styles)])
    return errors

In [None]:
# Fix the seed so the sampled data sets are reproducible.
np.random.seed(0)
# Visual check on 12 repetitions: one subplot each, reference density in red,
# and the three estimators overlaid in blue, green and yellow (list order).
test_method(np.random.standard_cauchy, [normal_kde, silverman, variable_kde], 
            xpdf, ypdf, nrepeat=12, plot=True)

In [None]:
# Same seed as the plotted run, now with 100 repetitions and no plotting.
np.random.seed(0)
# errors[i, j] is the ISE of estimator j (normal_kde, silverman, variable_kde)
# in repetition i.
errors = test_method(np.random.standard_cauchy, [normal_kde, silverman, variable_kde], 
                     xpdf, ypdf, nrepeat=100)

In [None]:
# Mean ISE per estimator, averaged over the repetitions (one value per column).
np.mean(errors, axis=0)

In [None]:
# Mean ISE per estimator, averaged over the repetitions.
# NOTE(review): identical to the previous cell - possibly a different
# statistic was intended here; confirm.
np.mean(errors, axis=0)

In [None]:
# Integral of the squared reference density over the grid. This is the value
# ise() gives for an all-zero estimate, so it serves as a baseline for the
# errors above.
# NOTE(review): np.trapz is deprecated since NumPy 2.0 in favour of
# np.trapezoid.
np.trapz(ypdf**2, xpdf)

In [None]:
# Re-seed, then keep drawing 100-point standard-Cauchy samples until one has a
# standard deviation above 100, i.e. a sample dominated by extreme outliers.
np.random.seed(0)
xdata = np.random.standard_cauchy(100)
while not np.std(xdata) > 100:
    xdata = np.random.standard_cauchy(100)

In [None]:
# Fit a variable-bandwidth KDE (percentile 80) to the outlier-heavy sample,
# this time with max_bw fixed at 1 instead of a Silverman-based value.
kde = KDE(xdata)
kde.constants.variable_bandwidth = True
kde.constants.percentile = 80
kde.compute_bandwidth(max_bw=1)
# NOTE(review): this rebinds ypdf, which until now held the analytic Cauchy
# density. Re-running earlier cells that use ypdf as the reference afterwards
# would compare against this estimate instead.
ypdf = kde.score_samples(xpdf)

In [None]:
# Plot the current contents of ypdf over the grid. When the cells are run in
# order this is the KDE estimate assigned in the previous cell, not the
# analytic density.
plt.plot(xpdf, ypdf)

In [None]:
# Show the bandwidth stored on the KDE after compute_bandwidth(max_bw=1).
kde.bandwidth

In [None]:
# 80th percentile of kde.data_helpers.mindists along axis 0, mapped through
# sqrt(-2 * value). The square root is only real where that percentile is <= 0.
# NOTE(review): presumably this mirrors how KDE derives its per-point
# bandwidths with percentile = 80 - confirm against the KDE implementation.
bandwidth_normalized = np.sqrt(-2*np.percentile(kde.data_helpers.mindists, 80, axis=0))
# Optional rescaling by the median, currently disabled:
# bandwidth_normalized /= np.median(bandwidth_normalized)

In [None]:
# Sorted copy of the sample (ascending), for inspecting its extremes.
q = np.sort(xdata)

In [None]:
# Smallest value in the sample.
q[0]

In [None]:
# Square root of twice the largest entry of bandwidth_normalized.
# NOTE(review): the quantity this is meant to be compared with is not visible
# in this notebook - confirm the intent.
np.sqrt(2*np.max(bandwidth_normalized))

In [None]:
# Average ratio between the hand-computed values and the KDE's own bandwidth.
np.mean(bandwidth_normalized / kde.bandwidth)

In [None]:
# Recompute the bandwidth on the same KDE with a much larger max_bw and an
# explicit max_iter of 1000.
kde.compute_bandwidth(max_bw=100000, max_iter=1000)

In [None]:
# Element-wise ratio of the hand-computed values to the KDE's bandwidth as it
# stands after the recomputation above (bandwidth_normalized itself is not
# recomputed).
bandwidth_normalized / kde.bandwidth

In [None]:
# Plot the KDE evaluated on the grid with its current bandwidth.
plt.plot(xpdf, kde.score_samples(xpdf))

In [None]:
# NOTE(review): this flag was already set to True when kde was created above,
# so this is redundant unless it was changed interactively in between.
kde.constants.variable_bandwidth = True

In [None]:
# Recompute the bandwidth once more, now with max_bw = 1000 and the default
# max_iter.
kde.compute_bandwidth(max_bw=1000)

In [None]:
# Show the bandwidth resulting from the last compute_bandwidth call.
kde.bandwidth