In [None]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
from fastkde import KDE
import os
from tqdm import tqdm_notebook as tqdm
import h5py
import scipy.special
%matplotlib inline
%load_ext autoreload
%autoreload 2

### Apply method for approximating the MISE

In [None]:
# Load the dataset.
# NOTE(review): pickle.load can execute arbitrary code, so 'pickles/df.p' must come
# from a trusted source.
with open(os.path.join('pickles', 'df.p'), 'rb') as f:
    dfs, scaling = pickle.load(f)

# One sample per row, columns: [time vstart vend]
scaling = scaling.T

# Remove full stops (only keep samples with vend > 0)
scaling = scaling[scaling[:, 2] > 0]

# Replace vstart by the speed difference: [time deltav vend] (less correlation)
scaling[:, 1] -= scaling[:, 2]

# Replace time by deltav / time: [deceleration deltav vend] (better behaved)
scaling[:, 0] = scaling[:, 1] / scaling[:, 0]

# Standardize every column: subtract the mean and divide by the standard deviation
mean_scaling = scaling.mean(axis=0)
std_scaling = scaling.std(axis=0)
scaling = (scaling - mean_scaling) / std_scaling

In [None]:
# Correlation matrix of the features (every column of `scaling` is one variable)
np.corrcoef(scaling, rowvar=False)

In [None]:
# Parameters
# Evaluated sample sizes: nmin, nmin + nstep, ... (bounded by the size of the dataset)
nmin, nstep = 600, 300
# Evaluation grid: nx points on [-xmax, xmax] along every dimension
xmax, nx = 3, 61
# Number of reshuffles of the dataset and the seed of the random number generator
nrepeat, seed = 10, 0
# Number of features (columns) of the dataset
d = scaling.shape[1]
# If True, the pdfs are recomputed even when a cached hdf5 file exists
overwrite = False

In [None]:
# Sample sizes for which the MISE is estimated
nn = np.arange(nmin, scaling.shape[0], nstep)

# Bandwidth for every (reshuffle, sample size) combination
bw = np.zeros((nrepeat, len(nn)))

# Estimated pdf on the d-dimensional grid for every (reshuffle, sample size).
# The shape is built from plain tuples; the former `np.int` alias no longer exists in
# recent NumPy releases and would raise an AttributeError.
pdfs = np.zeros((nrepeat, len(nn)) + (nx,) * d)

# One-dimensional evaluation grid (used along every dimension) and its step size
x = np.linspace(-xmax, xmax, nx)
dx = np.mean(np.gradient(x))

In [None]:
# Estimate the pdf on the grid for every reshuffle of the data and every sample size n.
# The results are cached in an hdf5 file and are only recomputed if `overwrite` is set
# or if the file does not exist.
np.random.seed(seed)
data = scaling.copy()
filename = os.path.join('hdf5', 'pdfs_real_dataset.hdf5')
if overwrite or not os.path.exists(filename):
    # The evaluation grid depends neither on the reshuffle nor on n, so build it once:
    # one row per grid point, one column per dimension
    xx = np.array(np.meshgrid(*[x for _ in range(d)])).reshape((d, nx**d)).T

    for repeat in tqdm(range(nrepeat)):
        np.random.shuffle(data)
        kde = KDE(data=data)

        # Evaluate the pdf for different values of n
        for i, n in enumerate(nn):
            # Compute the bandwidth
            # (presumably based on n samples of the shuffled data; confirm in KDE.set_n)
            kde.set_n(n)
            kde.compute_bw()
            bw[repeat, i] = kde.bw

            # Compute the pdf on the grid
            pdf = kde.score_samples(xx).reshape((nx,) * d)
            pdfs[repeat, i] = pdf

    # Write data to file (create the output folder first if it is missing)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with h5py.File(filename, "w") as f:
        f.create_dataset("pdfs", data=pdfs)
        f.create_dataset("bw", data=bw)
else:
    # Load the cached results
    with h5py.File(filename, "r") as f:
        pdfs = f["pdfs"][:]
        bw = f["bw"][:]

In [None]:
# Variance and bias contributions to the estimated MISE, one entry per
# (reshuffle, sample size) combination
mise_var = np.zeros_like(bw)
mise_bias = np.zeros_like(bw)
kde = KDE(data=scaling)  # Only used to obtain kde.muk

# Estimate the MISE for the different values of n
for repeat in tqdm(range(nrepeat)):
    for i, n in enumerate(nn):
        pdf = pdfs[repeat, i]

        # Laplacian of the pdf: sum of the finite-difference second derivatives along
        # every grid axis
        second_derivatives = [np.gradient(np.gradient(pdf, axis=axis), axis=axis) / dx**2
                              for axis in range(d)]
        laplacian = np.sum(np.array(second_derivatives), axis=0)

        # Integrate the squared Laplacian over the grid, one axis at a time
        integral = laplacian**2
        for _ in range(d):
            integral = np.trapz(integral, x, axis=-1)

        # Variance term muk / (n h^d) and bias term h^4 / 4 * integral
        bandwidth = bw[repeat, i]
        mise_var[repeat, i] = kde.muk / (n * bandwidth**d)
        mise_bias[repeat, i] = bandwidth**4 / 4 * integral

# The estimated MISE is the sum of both contributions
mise = mise_var + mise_bias

In [None]:
# Estimated MISE of every reshuffle (light blue) and the mean over the reshuffles (red)
for m in mise:
    plt.loglog(nn, m, color=(.5, .5, 1))
mean_mise = mise.mean(axis=0)
plt.loglog(nn, mean_mise, 'r-', linewidth=3, label="Mean")
plt.ylabel("MISE")
plt.xlabel("Number of samples")
plt.legend()
plt.grid(True)

In [None]:
# Estimated MISE of the first reshuffle as a function of the number of samples
f, ax = plt.subplots(figsize=(7, 5))
ax.loglog(nn, mise[0], linewidth=5)
ax.set_xlim([nn[0], nn[-1]])
ax.set_ylabel("MISE")
ax.set_xlabel("Number of samples")
ax.grid(True)

# Slope of a straight-line fit in log-log space, i.e., the power p in MISE ~ n^p
slope = np.polyfit(np.log(nn), np.log(mise[0]), 1)[0]
print("Power for estimated MISE: {:.2f}".format(slope))

# Keep a reference to this MISE estimate for the comparison later on
mise_normal = mise

### First map the input data

The data is mapped such that each marginal distribution is closer to a normal distribution. The idea is that this should make the estimation of the MISE more consistent. For example, the plot above shows that the approximation of the MISE can differ significantly when the data is reshuffled. This variation is expected to be smaller when the data is closer to a well-behaved Gaussian distribution. The mapped data will still not be jointly Gaussian, because only the marginal distributions are made (approximately) Gaussian, but the mapping is nevertheless expected to improve things.

In [None]:
# Parameters
# Evaluated sample sizes: nmin, nmin + nstep, ... (bounded by the size of the dataset)
nmin, nstep = 600, 300
# Evaluation grid of the pdf: nx points on [-xmax, xmax] along every dimension
xmax, nx = 3, 61
# Grid for the marginal pdfs/cdfs used in the mapping: nx_transform points on
# [-xmax_transform, xmax_transform], extending 3 beyond the largest absolute data value
xmax_transform = np.abs(scaling).max() + 3
nx_transform = 500
# Number of reshuffles of the dataset and the seed of the random number generator
nrepeat, seed = 10, 0
# Number of features (columns) of the dataset
d = scaling.shape[1]
# If True, the pdfs are recomputed even when a cached hdf5 file exists
overwrite = False

In [None]:
# Work on a copy, so that the standardized data in `scaling` stays untouched
data = scaling.copy()

# One univariate KDE per feature, with a fixed bandwidth
# (0.25 is approximately similar to Silverman's rule)
kdes = []
for i in range(d):
    kdes.append(KDE(data=scaling[:, i], bw=0.25))

In [None]:
# Grid on which the marginal pdfs and cdfs are tabulated, and its step size
xpdf_transform = np.linspace(-xmax_transform, xmax_transform, nx_transform)
dx_transform = np.mean(np.gradient(xpdf_transform))
ypdf_transform = np.zeros((d, nx_transform))
ycdf_transform = np.zeros_like(ypdf_transform)

f, axs = plt.subplots(1, 3, figsize=(14, 4))
for i, (kde, ax) in enumerate(zip(kdes, axs)):
    # Marginal pdf of feature i, renormalized such that it integrates to one on the grid
    pdf_row = ypdf_transform[i]  # View: writes end up in ypdf_transform
    pdf_row[:] = kde.score_samples(xpdf_transform)
    pdf_row /= np.trapz(pdf_row, xpdf_transform)

    # Marginal cdf: cumulative sum of the pdf, rescaled such that it ends exactly at one
    cdf_row = ycdf_transform[i]  # View: writes end up in ycdf_transform
    cdf_row[:] = np.cumsum(pdf_row) * dx_transform
    cdf_row /= cdf_row[-1]

    # Histogram of the data with the estimated pdf on top; the x-limits of the histogram
    # are restored after plotting the pdf on the (wider) transformation grid
    ax.hist(scaling[:, i], density=True)
    xlim = ax.get_xlim()
    ax.plot(xpdf_transform, pdf_row, 'r-')
    ax.set_xlim(xlim)
    ax.set_ylabel('pdf')
    ax.set_xlabel('original value')

In [None]:
# Map every feature through its marginal cdf and the inverse error function, and show
# a histogram of the mapped values
f, axs = plt.subplots(1, 3, figsize=(14, 4))
for i, ax in enumerate(axs):
    # Marginal cdf value of every sample (linear interpolation of the tabulated cdf),
    # rescaled from [0, 1] to [-1, 1]
    cdf_values = np.interp(scaling[:, i], xpdf_transform, ycdf_transform[i])
    new = 2 * cdf_values - 1

    # The inverse error function turns these values into the mapped feature
    data[:, i] = scipy.special.erfinv(new)
    ax.hist(data[:, i])

In [None]:
# Sample sizes for which the MISE is estimated
nn = np.arange(nmin, scaling.shape[0], nstep)

# Bandwidth for every (reshuffle, sample size) combination
bw = np.zeros((nrepeat, len(nn)))

# Estimated pdf on the d-dimensional grid for every (reshuffle, sample size).
# The shape is built from plain tuples; the former `np.int` alias no longer exists in
# recent NumPy releases and would raise an AttributeError.
pdfs = np.zeros((nrepeat, len(nn)) + (nx,) * d)

# One-dimensional evaluation grid (used along every dimension) and its step size
x = np.linspace(-xmax, xmax, nx)
dx = np.mean(np.gradient(x))

In [None]:
# Compute the pdfs of the mapped data for every reshuffle and every sample size n.
# The results are cached in an hdf5 file and are only recomputed if `overwrite` is set
# or if the file does not exist.
np.random.seed(seed)
filename = os.path.join('hdf5', 'pdfs_real_dataset_transformed.hdf5')
if overwrite or not os.path.exists(filename):
    # The evaluation grid depends neither on the reshuffle nor on n, so build it once:
    # one row per grid point, one column per dimension
    xx = np.array(np.meshgrid(*[x for _ in range(d)])).reshape((d, nx**d)).T

    for repeat in tqdm(range(nrepeat)):
        np.random.shuffle(data)
        kde = KDE(data=data)

        # Evaluate the pdf for different values of n
        for i, n in enumerate(nn):
            # Compute the bandwidth
            # (presumably based on n samples of the shuffled data; confirm in KDE.set_n)
            kde.set_n(n)
            kde.compute_bw()
            bw[repeat, i] = kde.bw

            # Compute the pdf on the grid
            pdf = kde.score_samples(xx).reshape((nx,) * d)
            pdfs[repeat, i] = pdf

    # Write data to file (create the output folder first if it is missing)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with h5py.File(filename, "w") as f:
        f.create_dataset("pdfs", data=pdfs)
        f.create_dataset("bw", data=bw)
else:
    # Load the cached results
    with h5py.File(filename, "r") as f:
        pdfs = f["pdfs"][:]
        bw = f["bw"][:]

In [None]:
# Variance term, bias term, and their sum (the estimated MISE), one entry per
# (reshuffle, sample size) combination
mise_var = np.zeros_like(bw)
mise_bias = np.zeros_like(bw)
mise = np.zeros_like(bw)

# Build a KDE of the full d-dimensional data here, so that kde.muk does not come from
# whichever `kde` happens to be left over from an earlier cell (when the pdfs are loaded
# from the cache file, that would be one of the univariate marginal KDEs).
kde = KDE(data=data)  # Just to get kde.muk

# Evaluate MISE for different values of n
for repeat in tqdm(range(nrepeat)):
    for i, n in enumerate(nn):
        # Laplacian of the pdf: sum of the finite-difference second derivatives along
        # every grid axis
        pdf = pdfs[repeat, i]
        laplacian = np.sum(np.array([np.gradient(np.gradient(pdf, axis=axis), axis=axis) / dx**2
                                     for axis in range(d)]), axis=0)

        # Integrate the squared Laplacian over the grid, one axis at a time
        integral = laplacian**2
        for _ in range(d):
            integral = np.trapz(integral, x)

        # Estimate MISE: variance term muk / (n h^d) plus bias term h^4 / 4 * integral
        mise_var[repeat, i] = kde.muk / (n * bw[repeat, i]**d)
        mise_bias[repeat, i] = bw[repeat, i]**4 / 4 * integral
        mise[repeat, i] = mise_var[repeat, i] + mise_bias[repeat, i]

In [None]:
# Estimated MISE of every reshuffle (light blue) and the mean over the reshuffles (red)
for m in mise:
    plt.loglog(nn, m, color=(.5, .5, 1))
mean_mise = mise.mean(axis=0)
plt.loglog(nn, mean_mise, 'r-', linewidth=3, label="Mean")
plt.ylabel("MISE")
plt.xlabel("Number of samples")
plt.legend()
plt.grid(True)

### Compute KDE for first feature separately

In [None]:
# Parameters (nmin, nstep, xmax, nx, and d are not set here; the cells below rely on the
# values that are already defined)
# Number of reshuffles of the dataset and the seed of the random number generator
nrepeat, seed = 10, 0
# True: the pdfs are always recomputed, even when a cached hdf5 file exists
overwrite = True

In [None]:
# Sample sizes for which the MISE is estimated
nn = np.arange(nmin, scaling.shape[0], nstep)

# Bandwidths of the two KDEs (last axis) for every (reshuffle, sample size) combination
bw = np.zeros((nrepeat, len(nn), 2))

# Estimated pdfs: one-dimensional for the first feature, (d-1)-dimensional for the rest.
# The shape is built from plain tuples; the former `np.int` alias no longer exists in
# recent NumPy releases and would raise an AttributeError.
pdfs1 = np.zeros((nrepeat, len(nn), nx))
pdfs2 = np.zeros((nrepeat, len(nn)) + (nx,) * (d-1))

# Laplacians of the two pdfs on the same grids
laplacian1 = np.zeros_like(pdfs1)
laplacian2 = np.zeros_like(pdfs2)

# One-dimensional evaluation grid (used along every dimension) and its step size
x = np.linspace(-xmax, xmax, nx)
dx = np.mean(np.gradient(x))

# Work on a copy, because the data is reshuffled in place later on
data = scaling.copy()

In [None]:
# Estimate the pdf and its Laplacian separately for the first feature (kde1) and for the
# remaining features (kde2), for every reshuffle and every sample size n.
# The results are cached in an hdf5 file and are only recomputed if `overwrite` is set
# or if the file does not exist.
np.random.seed(seed)
filename = os.path.join('hdf5', 'pdfs_split_notebook.hdf5')
if overwrite or not os.path.exists(filename):
    # The evaluation grid of the second KDE is the same for every reshuffle, so build it
    # once: one row per grid point, one column per dimension
    xx = np.array(np.meshgrid(*[x for _ in range(d-1)])).reshape((d-1, nx**(d-1))).T
    grid_shape = (nx,) * (d-1)

    for repeat in tqdm(range(nrepeat)):
        np.random.shuffle(data)
        kde1 = KDE(data=data[:, 0])
        kde2 = KDE(data=data[:, 1:])
        kde1.set_score_samples(x)
        kde2.set_score_samples(xx)

        # Evaluate the pdfs for different values of n
        for i, n in enumerate(tqdm(nn, leave=False)):
            # Compute the bandwidth of both KDEs
            for j, kde in enumerate([kde1, kde2]):
                kde.set_n(n)
                kde.compute_bw()
                bw[repeat, i, j] = kde.bw

            # Compute the pdfs and their Laplacians on the grids set above
            pdfs1[repeat, i] = kde1.score_samples()
            laplacian1[repeat, i] = kde1.laplacian()
            pdfs2[repeat, i] = kde2.score_samples().reshape(grid_shape)
            laplacian2[repeat, i] = kde2.laplacian().reshape(grid_shape)

    # Write data to file (create the output folder first if it is missing).
    # The Laplacians are stored as well: the MISE estimate needs them, and without them
    # a run that loads the cache would silently use all-zero Laplacians.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with h5py.File(filename, "w") as f:
        f.create_dataset("pdfs1", data=pdfs1)
        f.create_dataset("pdfs2", data=pdfs2)
        f.create_dataset("laplacian1", data=laplacian1)
        f.create_dataset("laplacian2", data=laplacian2)
        f.create_dataset("bw", data=bw)
else:
    # Load the cached results
    with h5py.File(filename, "r") as f:
        pdfs1 = f["pdfs1"][:]
        pdfs2 = f["pdfs2"][:]
        bw = f["bw"][:]
        if "laplacian1" in f and "laplacian2" in f:
            laplacian1 = f["laplacian1"][:]
            laplacian2 = f["laplacian2"][:]
        else:
            # Cache file without Laplacians (written by an earlier version of this
            # cell): fall back to finite differences of the stored pdfs
            laplacian1 = np.gradient(np.gradient(pdfs1, axis=-1), axis=-1) / dx**2
            laplacian2 = np.sum([np.gradient(np.gradient(pdfs2, axis=axis), axis=axis)
                                 for axis in range(2, pdfs2.ndim)], axis=0) / dx**2

In [None]:
# Estimated MISE of the combined density (mise) and of the two separate KDEs
# (mise1: first feature, mise2: remaining features), one entry per
# (reshuffle, sample size) combination
mise = np.zeros((nrepeat, len(nn)))
mise1 = np.zeros_like(mise)
mise2 = np.zeros_like(mise)
kde1 = KDE(data=scaling[:, 0])  # Just to get kde.muk
kde2 = KDE(data=scaling[:, 1:])  # Just to get kde.muk

# Evaluate MISE for different values of n
for repeat in tqdm(range(nrepeat)):
    for i, n in enumerate(nn):
        # Compute the MISE of the first part (one-dimensional): variance term
        # muk / (n h) plus bias term h^4 / 4 * integral of the squared Laplacian
        integral = np.trapz(laplacian1[repeat, i]**2, x)
        mise1[repeat, i] = kde1.muk / (n * bw[repeat, i, 0]) + bw[repeat, i, 0]**4 / 4 * integral

        # Compute MISE of the second part ((d-1)-dimensional); the squared Laplacian is
        # integrated over the grid one axis at a time
        integral = laplacian2[repeat, i]**2
        for _ in range(d-1):
            integral = np.trapz(integral, x)
        mise2[repeat, i] = kde2.muk / (n * bw[repeat, i, 1]**(d-1)) + bw[repeat, i, 1]**4 / 4 * integral

        # Compute the integrals of the squared probabilities
        integral1 = np.trapz(pdfs1[repeat, i]**2, x)
        integral2 = pdfs2[repeat, i]**2
        for _ in range(d-1):
            integral2 = np.trapz(integral2, x)

        # Compute the total MISE for this (reshuffle, sample size) combination only.
        # Assigning to `mise` as a whole here would overwrite all entries using the
        # integrals of the current combination.
        mise[repeat, i] = (integral1 * mise2[repeat, i]
                           + integral2 * mise1[repeat, i]
                           + mise1[repeat, i] * mise2[repeat, i])

In [None]:
# Estimated MISE of every reshuffle (light blue) and the mean over the reshuffles (red)
for m in mise:
    plt.loglog(nn, m, color=(.5, .5, 1))
mean_mise = mise.mean(axis=0)
plt.loglog(nn, mean_mise, 'r-', linewidth=3, label="Mean")
plt.ylabel("MISE")
plt.xlabel("Number of samples")
plt.legend()
plt.grid(True)

In [None]:
# NOTE(review): this cell repeats the plotting cell directly above it; one of the two
# can probably be removed.
# Estimated MISE of every reshuffle (light blue) and the mean over the reshuffles (red)
for m in mise:
    plt.loglog(nn, m, color=(.5, .5, 1))
mean_mise = mise.mean(axis=0)
plt.loglog(nn, mean_mise, 'r-', linewidth=3, label="Mean")
plt.ylabel("MISE")
plt.xlabel("Number of samples")
plt.legend()
plt.grid(True)

In [None]:
# Compare the first-reshuffle MISE estimate saved earlier (mise_normal) with the current
# estimate, for which the first feature is treated separately
f, ax = plt.subplots(figsize=(7, 5))
ax.loglog(nn, mise_normal[0], linewidth=5, label="Normal")
ax.loglog(nn, mise[0], linewidth=5, label="Acceleration independent")
ax.set_xlim([nn[0], nn[-1]])
ax.set_ylabel("MISE")
ax.set_xlabel("Number of samples")
ax.grid(True)
ax.legend()

# Slope of a straight-line fit in log-log space, i.e., the power p in MISE ~ n^p
slope = np.polyfit(np.log(nn), np.log(mise[0]), 1)[0]
print("Power for estimated MISE: {:.2f}".format(slope))

In [None]:
# Bandwidths of the first reshuffle as a function of the number of samples:
# first the KDE of the first feature, then the KDE of the remaining features
bw_first_repeat = bw[0]
plt.semilogx(nn, bw_first_repeat[:, 0])
plt.semilogx(nn, bw_first_repeat[:, 1])

### Map the data, such that one feature is independent

In [None]:
# Rescale every feature by its standard deviation (this also creates a fresh array)
data = scaling / scaling.std(axis=0)

In [None]:
# Leave-one-out score of a KDE of all features together
kde = KDE(data=data)
kde.compute_bandwidth()
score_kde = kde.score_leave_one_out()
print("Total score:       {:.4f}".format(score_kde))

# Leave-one-out score of a KDE of feature 0 only
feature_single = data[:, 0]
kde1 = KDE(data=feature_single)
kde1.compute_bandwidth()
score_kde1 = kde1.score_leave_one_out()
print("Score feature 0:   {:.4f}".format(score_kde1))

# Leave-one-out score of a KDE of features 1 and 2 together
features_pair = data[:, [1, 2]]
kde2 = KDE(data=features_pair)
kde2.compute_bandwidth()
score_kde2 = kde2.score_leave_one_out()
print("Score feature 1&2: {:.4f}".format(score_kde2))

# Sum of the separate scores, for comparison with the total score
# (presumably the scores are log-likelihoods, so the sum corresponds to treating
# feature 0 as independent of the others; confirm in KDE.score_leave_one_out)
score_sum = score_kde1 + score_kde2
print("Sum scores 0, 1&2: {:.4f}".format(score_sum))

In [None]:
# Start again from the rescaled data
data = scaling / scaling.std(axis=0)

# Least-squares fit of feature 0 as a linear function of features 1 and 2 (with offset)
A = np.vstack((np.ones(len(data)), data[:, 1], data[:, 2])).T
lstsq_result = np.linalg.lstsq(A, data[:, 0], rcond=None)
theta = lstsq_result[0]
print(theta)

# Replace feature 0 by the residual of this fit
data[:, 0] -= A @ theta

In [None]:
# Leave-one-out score of a KDE of all features together
kde = KDE(data=data)
kde.compute_bandwidth()
score_kde = kde.score_leave_one_out()
print("Total score:       {:.4f}".format(score_kde))

# Leave-one-out score of a KDE of feature 0 only
feature_single = data[:, 0]
kde1 = KDE(data=feature_single)
kde1.compute_bandwidth()
score_kde1 = kde1.score_leave_one_out()
print("Score feature 0:   {:.4f}".format(score_kde1))

# Leave-one-out score of a KDE of features 1 and 2 together
features_pair = data[:, [1, 2]]
kde2 = KDE(data=features_pair)
kde2.compute_bandwidth()
score_kde2 = kde2.score_leave_one_out()
print("Score feature 1&2: {:.4f}".format(score_kde2))

# Sum of the separate scores, for comparison with the total score
score_sum = score_kde1 + score_kde2
print("Sum scores 0, 1&2: {:.4f}".format(score_sum))

In [None]:
# Start again from the rescaled data
data = scaling / scaling.std(axis=0)

# Least-squares fit of feature 1 as a linear function of features 0 and 2 (with offset)
A = np.vstack((np.ones(len(data)), data[:, 0], data[:, 2])).T
lstsq_result = np.linalg.lstsq(A, data[:, 1], rcond=None)
theta = lstsq_result[0]
print(theta)

# Replace feature 1 by the residual of this fit
data[:, 1] -= A @ theta

In [None]:
# Leave-one-out score of a KDE of all features together
kde = KDE(data=data)
kde.compute_bandwidth()
score_kde = kde.score_leave_one_out()
print("Total score:       {:.4f}".format(score_kde))

# Leave-one-out score of a KDE of feature 1 only
kde1 = KDE(data=data[:, 1])
kde1.compute_bandwidth()
score_kde1 = kde1.score_leave_one_out()
print("Score feature 1:   {:.4f}".format(score_kde1))

# Leave-one-out score of a KDE of features 0 and 2 together
# (the printed labels now name the features that are actually used in this cell)
kde2 = KDE(data=data[:, [0, 2]])
kde2.compute_bandwidth()
score_kde2 = kde2.score_leave_one_out()
print("Score feature 0&2: {:.4f}".format(score_kde2))
print("Sum scores 1, 0&2: {:.4f}".format(score_kde1+score_kde2))

In [None]:
# Start again from the rescaled data
data = scaling / scaling.std(axis=0)

# Least-squares fit of feature 2 as a linear function of features 0 and 1 (with offset)
A = np.vstack((np.ones(len(data)), data[:, 0], data[:, 1])).T
lstsq_result = np.linalg.lstsq(A, data[:, 2], rcond=None)
theta = lstsq_result[0]
print(theta)

# Replace feature 2 by the residual of this fit
data[:, 2] -= A @ theta

In [None]:
# Leave-one-out score of a KDE of all features together
kde = KDE(data=data)
kde.compute_bandwidth()
score_kde = kde.score_leave_one_out()
print("Total score:       {:.4f}".format(score_kde))

# Leave-one-out score of a KDE of feature 2 only
# (the printed labels now name the features that are actually used in this cell)
kde1 = KDE(data=data[:, 2])
kde1.compute_bandwidth()
score_kde1 = kde1.score_leave_one_out()
print("Score feature 2:   {:.4f}".format(score_kde1))

# Leave-one-out score of a KDE of features 0 and 1 together
kde2 = KDE(data=data[:, [0, 1]])
kde2.compute_bandwidth()
score_kde2 = kde2.score_leave_one_out()
print("Score feature 0&1: {:.4f}".format(score_kde2))
print("Sum scores 2, 0&1: {:.4f}".format(score_kde1+score_kde2))

In [None]:
# Univariate KDE of the first column of `data`, with its bandwidth computed by the KDE
first_column = data[:, 0]
kde = KDE(data=first_column)
kde.compute_bandwidth()

In [None]:
# Fine evaluation grid: 10000 points on [-3, 7], registered with the KDE
x = np.linspace(start=-3, stop=7, num=10000)
kde.set_score_samples(x)

In [None]:
# Evaluate the KDE (presumably on the grid registered with set_score_samples; confirm
# in KDE.score_samples) and plot the resulting pdf against the grid
pdf = kde.score_samples()
plt.plot(x, pdf)

In [None]:
# Finite-difference second derivative of the pdf on the grid
grid_step = np.mean(np.diff(x))
laplacian = np.gradient(np.gradient(pdf)) / grid_step**2

# Squared finite-difference Laplacian, squared Laplacian reported by the KDE, and the
# difference between the two
plt.plot(x, laplacian**2)
plt.plot(x, kde.laplacian()**2)
plt.plot(x, laplacian**2 - kde.laplacian()**2)