In [None]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
from fastkde import KDE
import os
from tqdm import tqdm_notebook as tqdm
import h5py
import scipy.special
%matplotlib inline
%load_ext autoreload
%autoreload 2

### Apply method for approximating the MISE

In [None]:
# Unpickle `dfs` and the raw `scaling` samples.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
pickle_path = os.path.join('pickles', 'df.p')
with open(pickle_path, 'rb') as f:
    dfs, scaling = pickle.load(f)

# One row per sample, columns [time vstart vend]; discard full stops (vend <= 0).
scaling = scaling.T
is_moving = scaling[:, 2] > 0
scaling = scaling[is_moving, :]

# Re-parameterise the columns:
#   [time vstart vend] -> [time deltav vend]          (less correlation)
#                      -> [deceleration deltav vend]  (better behaved)
scaling[:, 1] -= scaling[:, 2]
scaling[:, 0] = scaling[:, 1] / scaling[:, 0]

# Standardise every column to zero mean and unit standard deviation.
mean_scaling = np.mean(scaling, axis=0)
std_scaling = np.std(scaling, axis=0)
scaling = (scaling - mean_scaling) / std_scaling

In [None]:
# Parameters

# Sample sizes: nn = nmin, nmin + nstep, ... (up to the size of the dataset)
nmin = 600
nstep = 300

# Evaluation grid: nx points on [-xmax, xmax] along each of the d dimensions
d = scaling.shape[1]
xmax = 3
nx = 61

# Reshuffling of the data: number of repetitions and the random seed
nrepeat = 10
seed = 0

# Set to True to recompute the pdfs even when a cached result file exists
overwrite = False

In [None]:
# Sample sizes at which the bandwidth and the pdf are evaluated.
nn = np.arange(nmin, scaling.shape[0], nstep)

# One bandwidth and one d-dimensional pdf grid per (repeat, sample size).
bw = np.zeros((nrepeat, len(nn)))
# `np.int` was removed in NumPy 1.24; a plain tuple of ints describes the same shape.
pdfs = np.zeros((nrepeat, len(nn)) + (nx,) * d)

# Evaluation grid (shared by all dimensions) and its spacing.
x = np.linspace(-xmax, xmax, nx)
dx = np.mean(np.gradient(x))

In [None]:
# Estimate the pdf on the grid for every reshuffle and every sample size.
# The result is cached in an HDF5 file and only recomputed when `overwrite`
# is set or the file does not exist yet.
np.random.seed(seed)
data = scaling.copy()
filename = os.path.join('hdf5', 'pdfs_real_dataset.hdf5')
if overwrite or not os.path.exists(filename):
    # Create the output folder *before* the expensive computation, so the
    # results are not lost because the file cannot be written afterwards.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # The grid points depend neither on the repeat nor on n: build them once.
    # NOTE(review): np.meshgrid defaults to indexing='xy', so the first two
    # grid axes end up swapped in `pdf`; this is harmless here because all
    # dimensions share the same grid `x`.
    xx = np.array(np.meshgrid(*[x for _ in range(d)])).reshape((d, nx**d)).T

    for repeat in tqdm(range(nrepeat)):
        np.random.shuffle(data)  # shuffles the rows (samples) in place
        kde = KDE(data=data)

        # Evaluate the bandwidth and the pdf for different values of n
        for i, n in enumerate(nn):
            # Compute the bandwidth
            # (presumably set_n makes the KDE use only n samples -- TODO confirm)
            kde.set_n(n)
            kde.compute_bw()
            bw[repeat, i] = kde.bw

            # Compute the pdf on the d-dimensional grid
            pdf = kde.score_samples(xx).reshape(tuple([nx for _ in range(d)]))
            pdfs[repeat, i] = pdf

    # Write data to file
    with h5py.File(filename, "w") as f:
        f.create_dataset("pdfs", data=pdfs)
        f.create_dataset("bw", data=bw)
else:
    # Reuse the cached results
    with h5py.File(filename, "r") as f:
        pdfs = f["pdfs"][:]
        bw = f["bw"][:]

In [None]:
# Variance term, bias term and their sum, one entry per (repeat, sample size)
mise_var = np.zeros_like(bw)
mise_bias = np.zeros_like(bw)
mise = np.zeros_like(bw)
kde = KDE(data=scaling)  # Only constructed to read kde.muk

# Evaluate MISE for different values of n
for repeat in tqdm(range(nrepeat)):
    for i, n in enumerate(nn):
        pdf = pdfs[repeat, i]
        bandwidth = bw[repeat, i]

        # Laplacian of the pdf: sum over all axes of the second finite difference
        second_derivatives = [
            np.gradient(np.gradient(pdf, axis=axis), axis=axis) / dx**2
            for axis in range(d)
        ]
        laplacian = np.sum(np.array(second_derivatives), axis=0)

        # Integrate the squared Laplacian over the grid, one dimension at a time
        integral = laplacian**2
        for _ in range(d):
            integral = np.trapz(integral, x)

        # Estimate MISE as the sum of the variance and the bias term
        mise_var[repeat, i] = kde.muk / (n * bandwidth**d)
        mise_bias[repeat, i] = bandwidth**4 / 4 * integral
        mise[repeat, i] = mise_var[repeat, i] + mise_bias[repeat, i]

In [None]:
# MISE versus number of samples on log-log axes:
# one light-blue line per reshuffle, the mean over all reshuffles in red.
ax = plt.gca()
for mise_single_repeat in mise:
    ax.loglog(nn, mise_single_repeat, color=[.5, .5, 1])
ax.loglog(nn, mise.mean(axis=0), 'r-', lw=3, label="Mean")
ax.set_xlabel("Number of samples")
ax.set_ylabel("MISE")
ax.legend()
ax.grid(True)

### First map the input data

The data is mapped such that each marginal distribution is closer to a normal distribution. The idea is that the estimation of the MISE should then be more consistent. For example, the plot above shows that the approximation of the MISE can differ significantly when the data is reshuffled. This variation is expected to be smaller when the data is closer to a well-behaved Gaussian distribution. The data will still not be jointly Gaussian, as only the marginal distributions are mapped, but it is nevertheless expected that this improves things.

In [None]:
# Parameters

# Sample sizes: nn = nmin, nmin + nstep, ... (up to the size of the dataset)
nmin = 600
nstep = 300

# Evaluation grid of the d-dimensional pdf: nx points on [-xmax, xmax] per dimension
d = scaling.shape[1]
xmax = 3
nx = 61

# 1-D grid on which the marginal pdfs/cdfs used for the mapping are evaluated;
# it extends 3 beyond the largest absolute value in the data
xmax_transform = np.max(np.abs(scaling)) + 3
nx_transform = 500

# Reshuffling of the data: number of repetitions and the random seed
nrepeat = 10
seed = 0

# Set to True to recompute the pdfs even when a cached result file exists
overwrite = False

In [None]:
# Working copy of the standardised data; its columns are replaced by the mapped values later on.
data = scaling.copy()

# One 1-D KDE per column of the data.
# The fixed bandwidth of 0.25 is approximately similar to Silverman's rule.
kdes = [KDE(data=column, bw=0.25) for column in scaling.T]

In [None]:
# Grid for the marginal distributions and its spacing
xpdf_transform = np.linspace(-xmax_transform, xmax_transform, nx_transform)
dx_transform = np.mean(np.gradient(xpdf_transform))

# Marginal pdf and cdf on that grid, one row per dimension
ypdf_transform = np.zeros((d, nx_transform))
ycdf_transform = np.zeros_like(ypdf_transform)

f, axs = plt.subplots(1, 3, figsize=(14, 4))
for i, (kde, ax) in enumerate(zip(kdes, axs)):
    pdf_row = ypdf_transform[i]  # views: writing to them fills the 2-D arrays
    cdf_row = ycdf_transform[i]

    # Marginal pdf from the 1-D KDE, renormalised to integrate to one on the grid
    pdf_row[:] = kde.score_samples(xpdf_transform)
    pdf_row /= np.trapz(pdf_row, xpdf_transform)

    # Marginal cdf as the running sum of the pdf, forced to end exactly at one
    cdf_row[:] = np.cumsum(pdf_row) * dx_transform
    cdf_row /= cdf_row[-1]

    # Histogram of the data with the KDE on top; the x-limits of the histogram
    # are kept, because the KDE grid is much wider than the data
    ax.hist(scaling[:, i], density=True)
    xlim = ax.get_xlim()
    ax.plot(xpdf_transform, pdf_row, 'r-')
    ax.set_xlim(xlim)
    ax.set_xlabel('original value')
    ax.set_ylabel('pdf')

In [None]:
# Map every column through its marginal cdf and the inverse error function,
# and show a histogram of the mapped values.
f, axs = plt.subplots(1, 3, figsize=(14, 4))
for i, ax in enumerate(axs):
    # cdf value of every sample (interpolated on the grid), rescaled from [0, 1] to [-1, 1]
    cdf_values = np.interp(scaling[:, i], xpdf_transform, ycdf_transform[i])
    symmetric = cdf_values * 2 - 1

    # NOTE(review): erfinv(2*F - 1) without a sqrt(2) factor yields a normal
    # distribution with variance 1/2 rather than 1 -- confirm this is intended.
    data[:, i] = scipy.special.erfinv(symmetric)
    ax.hist(data[:, i])

In [None]:
# Sample sizes at which the bandwidth and the pdf are evaluated.
nn = np.arange(nmin, scaling.shape[0], nstep)

# One bandwidth and one d-dimensional pdf grid per (repeat, sample size).
bw = np.zeros((nrepeat, len(nn)))
# `np.int` was removed in NumPy 1.24; a plain tuple of ints describes the same shape.
pdfs = np.zeros((nrepeat, len(nn)) + (nx,) * d)

# Evaluation grid (shared by all dimensions) and its spacing.
x = np.linspace(-xmax, xmax, nx)
dx = np.mean(np.gradient(x))

In [None]:
# Estimate the pdf of the mapped data on the grid for every reshuffle and
# every sample size. The result is cached in an HDF5 file and only recomputed
# when `overwrite` is set or the file does not exist yet.
np.random.seed(seed)
filename = os.path.join('hdf5', 'pdfs_real_dataset_transformed.hdf5')
if overwrite or not os.path.exists(filename):
    # Create the output folder *before* the expensive computation, so the
    # results are not lost because the file cannot be written afterwards.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # The grid points depend neither on the repeat nor on n: build them once.
    # NOTE(review): np.meshgrid defaults to indexing='xy', so the first two
    # grid axes end up swapped in `pdf`; this is harmless here because all
    # dimensions share the same grid `x`.
    xx = np.array(np.meshgrid(*[x for _ in range(d)])).reshape((d, nx**d)).T

    for repeat in tqdm(range(nrepeat)):
        np.random.shuffle(data)  # shuffles the rows of the mapped data in place
        kde = KDE(data=data)

        # Evaluate the bandwidth and the pdf for different values of n
        for i, n in enumerate(nn):
            # Compute the bandwidth
            # (presumably set_n makes the KDE use only n samples -- TODO confirm)
            kde.set_n(n)
            kde.compute_bw()
            bw[repeat, i] = kde.bw

            # Compute the pdf on the d-dimensional grid
            pdf = kde.score_samples(xx).reshape(tuple([nx for _ in range(d)]))
            pdfs[repeat, i] = pdf

    # Write data to file
    with h5py.File(filename, "w") as f:
        f.create_dataset("pdfs", data=pdfs)
        f.create_dataset("bw", data=bw)
else:
    # Reuse the cached results
    with h5py.File(filename, "r") as f:
        pdfs = f["pdfs"][:]
        bw = f["bw"][:]

In [None]:
# Variance term, bias term and their sum, one entry per (repeat, sample size)
mise_var = np.zeros_like(bw)
mise_bias = np.zeros_like(bw)
mise = np.zeros_like(bw)

# Build a d-dimensional KDE explicitly, just to get kde.muk. Without this,
# `kde` is whatever an earlier cell left behind: when the pdfs are loaded from
# the cache, that is the last 1-D marginal KDE, not a d-dimensional one.
kde = KDE(data=data)

# Evaluate MISE for different values of n
for repeat in tqdm(range(nrepeat)):
    for i, n in enumerate(nn):
        pdf = pdfs[repeat, i]

        # Laplacian of the pdf: sum over all axes of the second finite difference
        laplacian = np.sum(
            np.array([np.gradient(np.gradient(pdf, axis=axis), axis=axis) / dx**2
                      for axis in range(d)]),
            axis=0,
        )

        # Integrate the squared Laplacian over the grid, one dimension at a time
        integral = laplacian**2
        for _ in range(d):
            integral = np.trapz(integral, x)

        # Estimate MISE as the sum of the variance and the bias term
        mise_var[repeat, i] = kde.muk / (n * bw[repeat, i]**d)
        mise_bias[repeat, i] = bw[repeat, i]**4 / 4 * integral
        mise[repeat, i] = mise_var[repeat, i] + mise_bias[repeat, i]

In [None]:
# MISE of the mapped data versus number of samples on log-log axes:
# one light-blue line per reshuffle, the mean over all reshuffles in red.
ax = plt.gca()
for mise_single_repeat in mise:
    ax.loglog(nn, mise_single_repeat, color=[.5, .5, 1])
ax.loglog(nn, mise.mean(axis=0), 'r-', lw=3, label="Mean")
ax.set_xlabel("Number of samples")
ax.set_ylabel("MISE")
ax.legend()
ax.grid(True)