In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.optimize
import scipy.spatial.distance as dist
from sklearn.neighbors import KernelDensity
from statsmodels.nonparametric.kernel_density import KDEMultivariate

from kde import KDE

# Simple example

In [None]:
# Fix the global NumPy RNG state so the samples drawn below are reproducible.
np.random.seed(0)

# The reference distribution is U(0, 1); its differential entropy is
# -integral(1 * log(1)) = 0.
H = 0

# Sample size (n) and an auxiliary count (m); m is not referenced again in the
# visible part of this notebook.
n = 10000
m = 100

# Grid on which estimated densities are evaluated; it extends one unit past
# each end of the [0, 1] support.
xeval = np.linspace(-1, 2, num=3000)

In [None]:
def silverman(x):
    """Rule-of-thumb bandwidth: ``1.06 * std(x) * len(x) ** (-1/5)``.

    Parameters
    ----------
    x : sequence or array of samples; ``len(x)`` is taken as the sample count
        and ``np.std`` (population standard deviation) as the spread.

    Returns
    -------
    The bandwidth as a float.
    """
    spread = np.std(x)
    n_obs = len(x)
    return 1.06 * spread / n_obs ** (1 / 5)

In [None]:
# Draw n i.i.d. samples from U(0, 1).
x = np.random.rand(n)


# Build the project's KDE object on the samples (reshaped to a column, one row
# per observation) with the Silverman rule-of-thumb bandwidth.
# compute_kde() presumably fits the underlying estimator exposed as kde.kde,
# which later cells query with score_samples / score -- TODO confirm.
kde = KDE(data=x[:, np.newaxis], bandwidth=silverman(x))
kde.compute_kde()

In [None]:
# Plot the estimated pdf on the evaluation grid. The values returned by
# score_samples are exponentiated, i.e. they are treated as log-densities.
plt.plot(xeval, np.exp(kde.kde.score_samples(xeval[:, np.newaxis])))

In [None]:
def estimate_entropy(x, y):
    """Plug-in differential entropy ``-integral(y * log(y) dx)``.

    The integral is approximated with the trapezoidal rule on the grid ``x``.
    Grid points where ``y`` is not strictly positive contribute zero, which
    matches the limit ``y * log(y) -> 0`` as ``y -> 0``.

    Parameters
    ----------
    x : array-like
        Grid on which the density was evaluated.
    y : array-like
        Density values at ``x``; any numeric dtype is accepted.

    Returns
    -------
    float
        The entropy estimate in nats.
    """
    # Work in floating point: with an integer ``y`` the original integer
    # buffer silently truncated ``y * log(y)``.
    y = np.asarray(y, dtype=float)
    z = np.zeros_like(y)
    positive = y > 0
    z[positive] = y[positive] * np.log(y[positive])

    # ``np.trapz`` is deprecated in NumPy 2 in favour of ``np.trapezoid``;
    # use whichever this NumPy provides.
    integrate = getattr(np, "trapezoid", None)
    if integrate is None:
        integrate = np.trapz
    return -integrate(z, x=x)

In [None]:
# Sweep the kernel bandwidth on a zero-mean Gaussian sample (std 1/5) and
# record two quantities of the fitted KDE for every bandwidth:
#   h[i]                - plug-in entropy of the estimated pdf on xeval
#   minloglikelihood[i] - negated score of the training sample divided by n
np.random.seed(0)
n = 1000  # NOTE: rebinds the global sample size set in the first section
x = np.random.randn(n)/5
bandwidth = np.logspace(-2, -0.5, 100)  # 100 log-spaced values from 0.01 to ~0.316
h = np.zeros_like(bandwidth)
minloglikelihood = np.zeros_like(bandwidth)

for i, bw in enumerate(bandwidth):
    kde = KDE(data=x[:, np.newaxis], bandwidth=bw)
    kde.compute_kde()
    # score_samples output is exponentiated, i.e. treated as log-densities.
    pdf = np.exp(kde.kde.score_samples(xeval[:, np.newaxis]))
    #plt.plot(xeval, pdf)
    h[i] = estimate_entropy(xeval, pdf)
    # presumably kde.kde.score returns the total log-likelihood of its input,
    # so dividing by n yields the mean -- TODO confirm against the KDE class.
    minloglikelihood[i] = -kde.kde.score(x[:, np.newaxis]) / n

In [None]:
# Compare the plug-in entropy with the in-sample negative mean log-likelihood
# as a function of the bandwidth, and report where their difference is
# smallest.
plt.semilogx(bandwidth, h, label="entropy estimate")
plt.semilogx(bandwidth, minloglikelihood, label="negative mean log-likelihood")
plt.semilogx(bandwidth, h-minloglikelihood, label="difference")
plt.xlabel("Bandwidth")
plt.legend()
# Pass a real boolean: the strings 'on'/'off' are not interpreted by
# matplotlib, so any non-empty string (including 'off') is simply truthy.
plt.grid(True)
print("Minimum at h={:.3f}".format(bandwidth[np.argmin(h - minloglikelihood)]))

# Compute bandwidth faster

In [None]:
# Reproducible sample of 80 draws from U(0, 1), shared by the bandwidth
# selection methods below.
np.random.seed(0)
n = 80
x = np.random.rand(n)

# Grid (step 0.01) on which the resulting pdfs are plotted.
xpdf = np.linspace(-1, 2, num=301)

In [None]:
def method1(x):
    """Select the bandwidth by brute-force leave-one-out maximum likelihood.

    For each of 200 candidate bandwidths in [0.001, 0.2], a KDE is built on
    all samples but one and the score of the held-out sample is accumulated
    over every sample.

    Parameters
    ----------
    x : 1-D numpy array
        The samples.

    Returns
    -------
    float
        The candidate bandwidth with the largest accumulated score.
    """
    # Iterate over the samples actually passed in. The notebook-global ``n``
    # is rebound by other cells and may not match ``len(x)``.
    n_samples = len(x)
    bandwidth = np.linspace(0.001, 0.2, 200)
    loglikelihood = np.zeros_like(bandwidth)
    for i, h in enumerate(bandwidth):
        for j in range(n_samples):
            # Training set without sample j, as a column.
            held_in = np.delete(x, j)[:, np.newaxis]
            kde = KDE(data=held_in, bandwidth=h)
            kde.compute_kde()
            loglikelihood[i] += kde.kde.score(x[j:j+1, np.newaxis])
    return bandwidth[np.argmax(loglikelihood)]
method1(x)

In [None]:
%%timeit
# Benchmark the brute-force leave-one-out search on the current sample.
method1(x)

In [None]:
def method2(x):
    """Select the bandwidth with the project KDE class' own search.

    ``compute_kde`` is asked to consider 200 bandwidths between 0.001 and 0.2
    with ``cv`` set to the number of samples -- presumably one fold per sample,
    i.e. leave-one-out cross-validation (TODO confirm against the KDE class).

    Parameters
    ----------
    x : 1-D numpy array
        The samples.

    Returns
    -------
    The ``bandwidth`` attribute of the KDE object after ``compute_kde``.
    """
    kde = KDE(data=x[:, np.newaxis])
    # Derive the fold count from the argument. The notebook-global ``n`` is
    # rebound by other cells and may not match ``len(x)``.
    kde.compute_kde(min_bandwidth=0.001, max_bandwidth=0.2, n_bandwidths=200, cv=len(x))
    return kde.bandwidth
method2(x)

In [None]:
%%timeit
# Benchmark the KDE class' built-in bandwidth search on the current sample.
method2(x)

In [None]:
def method3(x):
    """Select the bandwidth by closed-form leave-one-out maximum likelihood.

    For a Gaussian kernel the leave-one-out log-likelihood at bandwidth ``h``
    is, up to an additive constant that does not depend on ``h``::

        sum_j log( sum_{i != j} exp(-(x_i - x_j)**2 / (2 h**2)) ) - n log(h)

    so a single pairwise-distance matrix serves all 200 candidates in
    [0.001, 0.2].

    Parameters
    ----------
    x : 1-D array-like
        The samples.

    Returns
    -------
    float
        The candidate bandwidth that maximises the expression above.
    """
    x = np.asarray(x, dtype=float)
    # Use the length of the argument. The notebook-global ``n`` is rebound by
    # other cells and may not match ``len(x)``.
    n_samples = len(x)
    bandwidth = np.linspace(0.001, 0.2, 200)
    score = np.zeros_like(bandwidth)
    mindists = -dist.squareform(dist.pdist(x[:, np.newaxis], metric='sqeuclidean')) / 2
    # Drop each point's own kernel by sending the diagonal to -inf (exp -> 0).
    # The earlier form summed all kernels and subtracted 1, which cancels to
    # exactly 0 for isolated points at small h and makes the log -inf.
    np.fill_diagonal(mindists, -np.inf)
    for i, h in enumerate(bandwidth):
        # logaddexp.reduce is a numerically stable log(sum(exp(...))).
        score[i] = np.sum(np.logaddexp.reduce(mindists / h**2, axis=0)) - n_samples*np.log(h)
    return bandwidth[np.argmax(score)]
# Run the closed-form search on the current sample and display the result.
method3(x)

In [None]:
%%timeit
# Benchmark the closed-form leave-one-out search on the current sample.
method3(x)

In [None]:
# Leave-one-out log-likelihood (up to an additive constant) of a Gaussian KDE
# for every candidate bandwidth, computed from the pairwise squared distances
# of the sample x, then plotted against the bandwidth.
bandwidth = np.linspace(0.001, 0.2, 200)
score = np.zeros_like(bandwidth)
mindists = -dist.squareform(dist.pdist(x[:, np.newaxis], metric='sqeuclidean')) / 2
# Drop each point's own kernel by sending the diagonal to -inf (exp -> 0).
# Summing all kernels and subtracting 1 cancels to exactly 0 for isolated
# points at small h, which made the log -inf.
np.fill_diagonal(mindists, -np.inf)
for i, h in enumerate(bandwidth):
    # logaddexp.reduce is a numerically stable log(sum(exp(...))).
    score[i] = np.sum(np.logaddexp.reduce(mindists / h**2, axis=0)) - len(x)*np.log(h)
# NOTE(review): the ratio score / max(score) is kept as written. It is only a
# monotone rescaling while max(score) > 0; if a likelihood normalised to peak
# at 1 was intended, exp(score - max(score)) would be the usual form.
plt.plot(bandwidth, np.exp(score / np.max(score)))
plt.xlabel("Bandwidth")

In [None]:
def opt(abc):
    """Sum of squared residuals of the model ``a + b * n**c``.

    The model is compared with eight tabulated ``(n, x)`` points
    (presumably measurements taken at n = 10, 20, ..., 80 -- their meaning is
    not stated in the notebook).

    Parameters
    ----------
    abc : sequence of at least three numbers
        The model parameters ``a``, ``b`` and ``c`` in positions 0, 1 and 2.

    Returns
    -------
    float
        ``sum((x - (a + b * n**c))**2)``.
    """
    params = np.asarray(abc, dtype=float)
    # Keep the data as arrays. With plain lists, ``n ** c`` only worked when
    # ``c`` happened to be a NumPy scalar and raised TypeError otherwise.
    n = np.array([10, 20, 30, 40, 50, 60, 70, 80], dtype=float)
    #x = [1.79, 3.63, 5.63, 7.85, 10.6, 13.7, 16.6, 19.9]  (alternative data set)
    x = np.array([2.7, 3.45, 4.71, 6.38, 8.72, 10.9, 13.8, 17.1])
    y = params[0] + params[1] * n**params[2]
    return np.sum((x - y)**2)

In [None]:
# Minimise the sum of squared residuals of a + b*n**c, starting from
# a=1.6, b=0.2, c=1; the displayed value is SciPy's optimisation result.
scipy.optimize.minimize(opt, [1.6, 0.2, 1])

In [None]:
def method4(x):
    """Select the bandwidth by leave-one-out likelihood with scikit-learn.

    Same brute-force search as ``method1`` but using
    ``sklearn.neighbors.KernelDensity`` directly: for each of 200 candidate
    bandwidths in [0.001, 0.2] the estimator is refitted on all samples but
    one and the log-likelihood of the held-out sample is accumulated.

    Parameters
    ----------
    x : 1-D numpy array
        The samples.

    Returns
    -------
    float
        The candidate bandwidth with the largest accumulated log-likelihood.
    """
    # Iterate over the samples actually passed in. The notebook-global ``n``
    # is rebound by other cells and may not match ``len(x)``.
    n_samples = len(x)
    bandwidth = np.linspace(0.001, 0.2, 200)
    loglikelihood = np.zeros_like(bandwidth)

    for i, h in enumerate(bandwidth):
        kde_skl = KernelDensity(bandwidth=h)
        for j in range(n_samples):
            # Refit on every sample except j, then score sample j alone.
            kde_skl.fit(np.delete(x, j)[:, np.newaxis])
            loglikelihood[i] += kde_skl.score(x[j:j+1, np.newaxis])
    return bandwidth[np.argmax(loglikelihood)]
method4(x)

In [None]:
# Fit scikit-learn's KernelDensity with a fixed bandwidth and plot the
# resulting pdf on the xpdf grid (score_samples returns log-densities).
# NOTE(review): 0.045 is hard-coded -- presumably a bandwidth returned by one
# of the search methods above; confirm and consider reusing that value.
kde_skl = KernelDensity(bandwidth=0.045)
kde_skl.fit(x[:, np.newaxis])
plt.plot(xpdf, np.exp(kde_skl.score_samples(xpdf[:, np.newaxis])))

In [None]:
%%timeit
# Benchmark the scikit-learn leave-one-out search on the current sample.
method4(x)

In [None]:
def method5(x):
    """Build a statsmodels ``KDEMultivariate`` with an ML cross-validated bandwidth.

    The samples are passed as a single continuous variable (``'c'``) and the
    bandwidth is chosen by the estimator itself (``bw='cv_ml'``).

    Unlike ``method1`` .. ``method4`` this returns the fitted estimator, not
    the selected bandwidth.

    Parameters
    ----------
    x : 1-D numpy array
        The samples.

    Returns
    -------
    KDEMultivariate
        The estimator built on ``x``.
    """
    samples = x[:, np.newaxis]
    return KDEMultivariate(data=samples, var_type='c', bw='cv_ml')
# Build the estimator on the current sample and plot its pdf on the xpdf grid.
kde = method5(x)
plt.plot(xpdf, kde.pdf(xpdf[:, np.newaxis]))

In [None]:
%%timeit
# Benchmark the statsmodels cross-validated estimator on the current sample.
method5(x)

# Leave-one-out log-likelihood versus entropy

In [None]:
# Settings for the leave-one-out vs. entropy comparison.
# NOTE: H is rebound here from the scalar reference entropy of the first
# section to the array of candidate bandwidths.
xpdf = np.linspace(-3, 5, num=8001)  # grid on which the pdf is evaluated (step 0.001)
H = np.linspace(0.01, 0.2, num=20)   # candidate bandwidths (step 0.01)
n = 100                              # sample size

In [None]:
# Re-seed before drawing, as the other sections of this notebook do, so the
# sample (and every result derived from it) does not depend on how many random
# numbers earlier cells happened to consume or on re-running this cell.
np.random.seed(0)
x = np.random.rand(n)

# Plug-in entropy of the KDE-estimated pdf for every candidate bandwidth in H.
e = np.zeros_like(H)
for i, h in enumerate(H):
    kde = KDE(data=x[:, np.newaxis], bandwidth=h)
    kde.compute_kde()
    # score_samples output is exponentiated, i.e. treated as log-densities.
    y = np.exp(kde.kde.score_samples(xpdf[:, np.newaxis]))
    e[i] = estimate_entropy(xpdf, y)

In [None]:
# Plug-in entropy estimate as a function of the bandwidth.
plt.plot(H, e)

In [None]:
# Negative mean leave-one-out log-likelihood of a Gaussian KDE for every
# bandwidth in H, in closed form from the pairwise squared distances:
#   -(1/n) sum_j log( sum_{i != j} exp(-(x_i - x_j)^2 / (2 h^2)) )
#   + log(h) + log(n - 1) + log(2 pi) / 2
score = np.zeros_like(H)
mindists = -dist.squareform(dist.pdist(x[:, np.newaxis], metric='sqeuclidean')) / 2
# Drop each point's own kernel by sending the diagonal to -inf (exp -> 0).
# Summing all kernels and then subtracting 1 loses precision through
# cancellation and yields log(0) for isolated points at small h.
np.fill_diagonal(mindists, -np.inf)
for i, h in enumerate(H):
    # logaddexp.reduce is a numerically stable log(sum(exp(...))).
    score[i] = -np.sum(np.logaddexp.reduce(mindists / h**2, axis=0))/len(x) + np.log(h)
# Bandwidth-independent normalisation of the Gaussian kernel and of the
# average over the n - 1 remaining points.
score += 1/2*np.log(2*np.pi) + np.log(len(x)-1)

In [None]:
# Overlay the plug-in entropy estimate (e) and the negative mean
# leave-one-out log-likelihood (score) as functions of the bandwidth H.
plt.plot(H, e)
plt.plot(H, score)