In [None]:
import numpy as np
from stats import KDE, GaussianMixture, kde_from_file
import matplotlib.pyplot as plt
from time import time
import scipy.spatial.distance as dist
%matplotlib inline

In [None]:
# 1-D reference distribution built from two 2-element lists
# (presumably component means [-1, 1] and widths [0.5, 0.3] -- TODO confirm
# the argument meaning against stats.GaussianMixture).
GM = GaussianMixture([-1, 1], [0.5, 0.3])
# pdf() returns a pair whose first item is a 1-element sequence (unpacked into
# xpdf, presumably the evaluation grid) and whose second item is ypdf.
(xpdf,), ypdf = GM.pdf(npoints=101)
# Seed NumPy's global RNG right before drawing the samples
# (assumes generate_samples draws from it -- TODO confirm).
np.random.seed(0)
n = 1000
x = GM.generate_samples(n)

In [None]:
# Baseline estimator built directly on the samples x.
kde1 = KDE(x)
# Presumably selects a bandwidth and stores it on the object (it is read back
# from kde1.bandwidth below) -- confirm in stats.KDE.
kde1.compute_bandwidth()
ykde1 = kde1.score_samples(xpdf)
# Keep the bandwidth: a later cell re-applies it to both kde1 and kde2.
bandwidth = kde1.bandwidth

In [None]:
# Second estimator on the same samples, constructed with scaling=True and then clustered.
kde2 = KDE(x, scaling=True)
# data_helpers.std is read before clustering() and written back afterwards.
# NOTE(review): this looks like a workaround for clustering() changing
# data_helpers.std -- confirm against stats.KDE before reordering these lines.
std = kde2.data_helpers.std
kde2.clustering()
kde2.data_helpers.std = std
# len(kde2.data) is reported as the cluster count.
print("Number of clusters: {:d}".format(len(kde2.data)))

In [None]:
# Leave-one-out score of both 1-D estimators over a grid of trial bandwidths.
bb = np.linspace(0.1, 0.4, 20)
score1, score2 = np.zeros(len(bb)), np.zeros(len(bb))
loo_options = {"include_const": True}
for i, b in enumerate(bb):
    # kde1 is evaluated before kde2 at every trial bandwidth.
    score1[i] = kde1.score_leave_one_out(bandwidth=b, **loo_options)
    score2[i] = kde2.score_leave_one_out(bandwidth=b, **loo_options)

for curve, name in ((score1, "Orig"), (score2, "Clustered")):
    plt.plot(bb, curve, label=name)
plt.xlabel("Bandwidth")
plt.ylabel("Score")
plt.legend()

In [None]:
# Re-apply the bandwidth obtained from kde1.compute_bandwidth() to both estimators.
# NOTE(review): presumably needed because the bandwidth scan leaves the last
# trial bandwidth set on the objects -- confirm in stats.KDE.
kde1.set_bandwidth(bandwidth)
kde2.set_bandwidth(bandwidth)

In [None]:
# Evaluate the clustered estimator on the grid and overlay it on the baseline result.
ykde2 = kde2.score_samples(xpdf)
for density, name in zip((ykde1, ykde2), ("Orig", "Clustered")):
    plt.plot(xpdf, density, label=name)
plt.legend()

In [None]:
# cdf() of both estimators on the same grid, drawn in one figure.
cdf1 = kde1.cdf(xpdf)
cdf2 = kde2.cdf(xpdf)
cdf_curves = {"Orig": cdf1, "Clustered": cdf2}
for name, curve in cdf_curves.items():
    plt.plot(xpdf, curve, label=name)
plt.legend()

In [None]:
# laplacian() of both estimators on the same grid, drawn on the current axes.
laplacian1, laplacian2 = kde1.laplacian(xpdf), kde2.laplacian(xpdf)
lap_ax = plt.gca()
lap_ax.plot(xpdf, laplacian1, label="Orig")
lap_ax.plot(xpdf, laplacian2, label="Clustered")
lap_ax.legend()

In [None]:
# Draw 100000 samples from each 1-D estimator and compare the normalized
# histograms. kde1 is sampled before kde2, so the seed below fixes both draws.
np.random.seed(0)
# One shared bin specification; the markers are placed at the bin centers.
edges1d = np.linspace(-3, 3, 25)
centers1d = (edges1d[1:] + edges1d[:-1]) / 2
hist1 = np.histogram(kde1.sample(100000), bins=edges1d, density=True)
plt.plot(centers1d, hist1[0], '.', label="Orig")
hist2 = np.histogram(kde2.sample(100000), bins=edges1d, density=True)
plt.plot(centers1d, hist2[0], '.', label="Clustered")
plt.legend()

In [None]:
# Rebinds GM to a 2-D mixture: two 2-vectors and two 2x2 matrices
# (presumably component means and covariance matrices -- TODO confirm
# against stats.GaussianMixture).
GM = GaussianMixture([[-1, -1], [1, 1]], [[[0.5, -0.2], [-0.2, 0.5]], [[0.5, -0.2], [-0.2, 0.5]]])
# Here pdf() yields two grid arrays (xpdf1, xpdf2) plus ypdf; later cells
# index both grids as 2-D arrays and pass them to contourf.
(xpdf1, xpdf2,), ypdf = GM.pdf(npoints=51)
# Seed NumPy's global RNG right before drawing the samples
# (assumes generate_samples draws from it -- TODO confirm).
np.random.seed(0)
n2 = 10000
x2 = GM.generate_samples(n2)

In [None]:
def plot2d(y, y2=None):
    """Draw a filled-contour plot of ``y`` on the global grid ``xpdf1`` / ``xpdf2``.

    Parameters
    ----------
    y : array
        Values contoured in the left (or only) panel. Must be accepted by
        ``contourf`` together with ``xpdf1`` and ``xpdf2``.
    y2 : array, optional
        When given, a second panel titled "Difference" shows ``|y - y2|``
        drawn with the contour levels of the first panel, so both panels
        share one color scale.

    Notes
    -----
    ``xpdf1`` and ``xpdf2`` are read from the notebook namespace at call time.
    The function returns nothing; the figure is created as a side effect.
    """
    if y2 is None:
        fig, ax1 = plt.subplots(1, 1, figsize=(6, 5))
        panels = [ax1]
    else:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
        panels = [ax1, ax2]

    cs = ax1.contourf(xpdf1, xpdf2, y)
    if y2 is not None:
        # Reuse the levels of the first panel so the difference is read on the same scale.
        ax2.contourf(xpdf1, xpdf2, np.abs(y-y2), levels=cs.levels)
        ax2.set_title("Difference")
    for panel in panels:
        panel.set_xlabel("Parameter 1")
        panel.set_ylabel("Parameter 2")
    # All panels share cs.levels, so attach the colorbar to every panel.
    # Without ax=..., the space is taken from ax1 alone and, in the two-panel
    # layout, the bar is wedged between the panels.
    fig.colorbar(cs, ax=panels)
plot2d(ypdf)

In [None]:
# Pack the two coordinate grids into one array with the coordinate index on
# axis 2, then build the baseline 2-D estimator and compare it with ypdf.
xpdf2d = np.stack((xpdf1, xpdf2), axis=2)
kde3 = KDE(x2)
kde3.compute_bandwidth()
ypdf3 = kde3.score_samples(xpdf2d)
plot2d(ypdf3, ypdf)

In [None]:
# Reuse the bandwidth stored on kde3 for the clustered estimator.
bandwidth2 = kde3.bandwidth
# NOTE(review): unlike the 1-D clustered KDE, this one is built without
# scaling=True and without saving/restoring data_helpers.std -- confirm
# that the difference is intentional.
kde4 = KDE(x2)
kde4.clustering()
kde4.set_bandwidth(bandwidth2)
ypdf4 = kde4.score_samples(xpdf2d)
# Left panel: ypdf4; right panel: |ypdf4 - ypdf3|.
plot2d(ypdf4, ypdf3)

In [None]:
# Leave-one-out score of both 2-D estimators over a grid of trial bandwidths.
bb = np.linspace(0.1, 0.4, 20)
score3 = np.zeros(len(bb))
score4 = np.zeros(len(bb))
for i, b in enumerate(bb):
    score3[i] = kde3.score_leave_one_out(bandwidth=b, include_const=True)
    score4[i] = kde4.score_leave_one_out(bandwidth=b, include_const=True)

# Re-apply the selected bandwidth, mirroring the explicit reset that follows
# the 1-D scan, so the cells below do not run at the last trial value.
# NOTE(review): assumes score_leave_one_out(bandwidth=...) leaves the trial
# bandwidth set on the estimator -- confirm in stats.KDE. If it does not,
# these calls should simply re-set the value already in use.
kde3.set_bandwidth(bandwidth2)
kde4.set_bandwidth(bandwidth2)

plt.plot(bb, score3, label="Orig")
plt.plot(bb, score4, label="Counted")
plt.xlabel("Bandwidth")
plt.ylabel("Score")
plt.legend()

In [None]:
# cdf() of both 2-D estimators on the packed grid.
cdf3 = kde3.cdf(xpdf2d)
cdf4 = kde4.cdf(xpdf2d)
# Left panel: cdf4 (clustered); right panel: |cdf4 - cdf3|.
plot2d(cdf4, cdf3)

In [None]:
# laplacian() of both 2-D estimators on the packed grid.
laplacian3 = kde3.laplacian(xpdf2d)
laplacian4 = kde4.laplacian(xpdf2d)
# Left panel: laplacian4 (clustered); right panel: |laplacian4 - laplacian3|.
plot2d(laplacian4, laplacian3)

In [None]:
# Compare 2-D histograms of 10000 samples drawn from each estimator.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(14, 5))

np.random.seed(0)
# Shared bin edges for both axes; contours are positioned at the bin centers
# rather than at the left bin edges.
edges2d = np.linspace(-3, 3, 13)
centers2d = (edges2d[1:] + edges2d[:-1]) / 2

# np.histogram2d returns counts indexed [x_bin, y_bin], whereas contourf
# expects Z indexed [y, x]; hence the transposes below.
samples = kde3.sample(10000)
hist1 = np.histogram2d(samples[:, 0], samples[:, 1], edges2d)
cs = ax1.contourf(centers2d, centers2d, hist1[0].T)
ax1.set_title("Histogram samples original")

samples = kde4.sample(10000)
hist2 = np.histogram2d(samples[:, 0], samples[:, 1], edges2d)
# Reuse the levels of the first panel so all three panels share one color scale.
ax2.contourf(centers2d, centers2d, hist2[0].T, levels=cs.levels)
ax2.set_title("Histogram samples weighted")

ax3.contourf(centers2d, centers2d, np.abs(hist1[0]-hist2[0]).T, levels=cs.levels)
ax3.set_title("Absolute difference")
fig.colorbar(cs)

In [None]:
# Histogram counts of 100000 conditional samples from each 2-D estimator.
# NOTE(review): the meaning of the first two conditional_sample arguments
# (1, -1) is not visible here -- confirm against stats.KDE.
np.random.seed(0)
# Plot against the bin centers so the x-axis is in parameter units instead of
# bin index, matching the 1-D sample-histogram cell.
cond_edges = np.linspace(-3, 3, 25)
cond_centers = (cond_edges[1:] + cond_edges[:-1]) / 2
# kde3 is sampled before kde4, as in the original evaluation order.
counts3 = np.histogram(kde3.conditional_sample(1, -1, 100000), bins=cond_edges)[0]
plt.plot(cond_centers, counts3, '.', label="Orig")
counts4 = np.histogram(kde4.conditional_sample(1, -1, 100000), bins=cond_edges)[0]
plt.plot(cond_centers, counts4, '.', label="Weighted")
plt.legend()

In [None]:
%%timeit
# Timing: leave-one-out score of the baseline 1-D KDE (include_const not passed).
kde1.score_leave_one_out(bandwidth=bandwidth)

In [None]:
%%timeit
# Timing: leave-one-out score of the clustered 1-D KDE (include_const not passed).
kde2.score_leave_one_out(bandwidth=bandwidth)

In [None]:
%%timeit
# Timing: baseline 1-D KDE evaluated on the xpdf grid.
kde1.score_samples(xpdf)

In [None]:
%%timeit
# Timing: clustered 1-D KDE evaluated on the xpdf grid.
kde2.score_samples(xpdf)

In [None]:
%%timeit
# Timing: sample(1) on the baseline 1-D KDE.
kde1.sample(1)

In [None]:
%%timeit
# Timing: sample(1) on the clustered 1-D KDE.
kde2.sample(1)

In [None]:
%%timeit
# Timing: leave-one-out score of the baseline 2-D KDE (include_const not passed).
kde3.score_leave_one_out(bandwidth=bandwidth2)

In [None]:
%%timeit
# Timing: leave-one-out score of the clustered 2-D KDE (include_const not passed).
kde4.score_leave_one_out(bandwidth=bandwidth2)

In [None]:
%%timeit
# Timing: baseline 2-D KDE evaluated on the packed xpdf2d grid.
kde3.score_samples(xpdf2d)

In [None]:
%%timeit
# Timing: clustered 2-D KDE evaluated on the packed xpdf2d grid.
kde4.score_samples(xpdf2d)

In [None]:
%%timeit
# Timing: sample(10000) on the baseline 2-D KDE.
kde3.sample(10000)

In [None]:
%%timeit
# Timing: sample(10000) on the clustered 2-D KDE.
kde4.sample(10000)

In [None]:
%%timeit
# Timing: conditional_sample(1, -1, 10000) on the baseline 2-D KDE.
kde3.conditional_sample(1, -1, 10000)

In [None]:
%%timeit
# Timing: conditional_sample(1, -1, 10000) on the clustered 2-D KDE.
kde4.conditional_sample(1, -1, 10000)