# Method for selection of hyperparameters of parametrization

In [None]:
import numpy as np
import pickle
from spline_inter1d import SplineInter1D, _Spline
import matplotlib.pyplot as plt
%matplotlib inline
from kde import KDE
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import StandardScaler
import os
overwrite = False

## Simplified example

In [None]:
# Spline used by the simplified example. The knot vector is kept in a named
# variable so the constructor call reads more easily.
# NOTE(review): the meaning of left_constraints/right_constraints is defined
# by SplineInter1D and is not visible in this notebook.
knots = [0, 1/2, 1]
spl_event = SplineInter1D(knots, left_constraints=[1], right_constraints=[0])
# Evaluator that shares knots, degree data and fixed part with spl_event
spl = _Spline(spl_event.t, spl_event.d, spl_event.thetafixed)
def theta(thetatilde):
    """Map reduced coefficients to the full spline coefficient vector.

    Returns ``spl_event.thetafixed + spl_event.v2 . thetatilde``. The global
    ``spl_event`` is looked up at call time, so the mapping follows whichever
    spline definition is currently active in the notebook.
    """
    free_part = np.dot(spl_event.v2, thetatilde)
    return spl_event.thetafixed + free_part
def thetatilde(theta):
    """Map a full spline coefficient vector to its reduced coefficients.

    Returns ``spl_event.v2.T . (theta - spl_event.thetafixed)``, using the
    global ``spl_event`` that is active at call time.
    """
    offset = theta - spl_event.thetafixed
    return np.dot(spl_event.v2.T, offset)

In [None]:
# Score that measures how well the parameters 'z' fit the curve y
def error(y, z, sigma=0.1):
    """Return a similarity score between curve ``y`` and the spline built from ``z``.

    y      test curve, evaluated at the x-values currently set on ``spl``
    z      reduced parameter vector (e.g. a sample drawn from the KDE)
    sigma  tolerance that scales the absolute deviation

    The result is ``exp(-mean(|y - yz| / sigma))``, so larger values mean a
    better fit and a perfect match gives 1.

    Side effect: overwrites ``spl.theta`` on the global spline evaluator.
    """
    # Load the parameters into the shared spline and evaluate it
    spl.theta = theta(z)
    fitted = spl()

    # Earlier variants (Gaussian kernel average, Laplace log-likelihood and a
    # Gaussian of the mean squared deviation) were tried and dropped in favour
    # of the exponential of the mean scaled absolute deviation.
    scaled_abs_dev = np.abs(y - fitted) / sigma
    return np.exp(-np.mean(scaled_abs_dev))

In [None]:
# Very simple case: two training curves and one test curve on [0, 1]
t = np.linspace(0, 1, 100)
spl(t)  # Sets the x-values
v1 = 1 - t**2
v2 = t**2 -2*t + 1
v3 = 1 - t

# Reduced spline coefficients of the two training curves, one row per curve
coef_train = np.array([thetatilde(spl_event.fit(t, curve).theta)
                       for curve in (v1, v2)])

# Training curves in blue (only one legend entry), test curve in red
plt.plot(t, v1, 'b', label='Training curve')
plt.plot(t, v2, 'b')
plt.plot(t, v3, 'r', label='Test curve')
plt.xlabel('t')
plt.ylabel('y')
plt.legend()
plt.grid('on')

In [None]:
# One panel per bandwidth: draw 40 samples from the KDE fitted on the training
# coefficients and plot the corresponding spline curves.
H = [0.01, 0.1, 0.2, 0.5, 1, 2]
f, axs = plt.subplots(2, 3, figsize=(16, 8))
sample_color = [.5, .5, 1]
for h, ax in zip(H, axs.ravel()):
    kde = KDE(coef_train, bandwidth=h, scale_data=False)
    kde.compute_kde()
    for _ in range(40):
        z = kde.draw_random_sample(1)[0]
        spl.theta = theta(z)
        ax.plot(t, spl(t), color=sample_color)
    ax.set_title('h = {:.2f}'.format(h))
    ax.set_xlabel('t')
    ax.set_ylabel('y')
    ax.grid('on')
plt.tight_layout()

In [None]:
# Mean score of the test curve v3 for every (sigma, bandwidth) pair.
# Results are cached in a pickle; set `overwrite = True` to recompute.
savefile = os.path.join('pickles', 'artificial_example.p')
N = 1000  # Number of KDE samples per (sigma, bandwidth) pair
H = np.logspace(-2, 1, 31)
Sigma = [0.005, 0.01, 0.05, 0.1, 0.5, 1]

if overwrite or not os.path.exists(savefile):
    E = np.zeros((len(Sigma), len(H)))
    for isigma, sigma in enumerate(tqdm(Sigma)):
        for ih, h in enumerate(tqdm(H, leave=False)):
            kde = KDE(coef_train, bandwidth=h, scale_data=False)
            kde.compute_kde()
            e = np.zeros(N)
            for i in range(N):
                z = kde.draw_random_sample(1)[0]
                e[i] = error(v3, z, sigma=sigma)
            E[isigma, ih] = np.mean(e, axis=0)
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists + os.mkdir and also handles nested directories.
    os.makedirs(os.path.dirname(savefile), exist_ok=True)
    with open(savefile, 'wb') as f:
        pickle.dump(E, f)
else:
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(savefile, 'rb') as f:
        E = pickle.load(f)

In [None]:
# Left: raw score versus bandwidth for every sigma.
# Right: the same curves rescaled to [0, 1] per sigma.
# A cross marks the bandwidth with the highest score on each curve.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
for sigma, e in zip(Sigma, E):
    curve_label = 'sigma = {:.3f}'.format(sigma)
    e_min = np.min(e)
    e_max = np.max(e)
    best_h = H[np.argmax(e)]

    plthandle = ax1.semilogx(H, e, label=curve_label)
    ax1.plot(best_h, e_max, 'x', color=plthandle[0].get_color(), ms=10, mew=3)

    plthandle = ax2.semilogx(H, (e - e_min) / (e_max - e_min), label=curve_label)
    ax2.plot(best_h, 1, 'x', color=plthandle[0].get_color(), ms=10, mew=3)

for axis, ylabel in ((ax1, 'Score'), (ax2, 'Score normalized per curve')):
    axis.grid('on')
    axis.set_xlabel('Bandwidth')
    axis.set_ylabel(ylabel)
ax1.legend()

## Real example

In [None]:
# Only use first n profiles
n = 400
# NOTE: pickle.load can execute arbitrary code; 'df.p' must come from a
# trusted source.
with open('df.p', 'rb') as f:
    dfs, scaling = pickle.load(f)
# Keep copies of the truncated data so the loaded objects are not aliased
dfs = dfs[:n].copy()
scaling = scaling.T[:n].copy()

In [None]:
# Resample every profile's 'vel' column onto the common grid t (one row each)
profiles = np.zeros((len(dfs), len(t)))
for row, df in zip(profiles, dfs):
    row[:] = np.interp(t, df['time'], df['vel'])

### Influence of number of knots

In [None]:
# Leave-one-out score J[inknots, ih] for every number of interior knots and
# every bandwidth. Results are cached in a pickle; set `overwrite = True` to
# recompute.
savefile = os.path.join('pickles', 'test_nknots.p')
sigma = 0.001  # Tolerance passed to error()
N = 2000  # Number of KDE samples per (bandwidth, test profile) pair
NKNOTS = [0, 1, 2, 3, 4]
H = np.logspace(-3, 0, 10)
J = np.zeros((len(NKNOTS), len(H)))
ntest = 25  # Number of profiles that are held out in turn

if overwrite or not os.path.exists(savefile):
    for inknots, nknots in enumerate(tqdm(NKNOTS)):
        print("Number of interior knots: {:d}".format(nknots))
        # Define spline function. Note the +2 because of extra exterior knots
        spl_event = SplineInter1D(np.linspace(0, 1, nknots+2), left_constraints=[1], right_constraints=[0])
        spl = _Spline(spl_event.t, spl_event.d, spl_event.thetafixed)
        spl(t)  # Sets the x-values

        # Compute the spline coefficients
        ncoefs = spl_event.v2.shape[1]
        coefs = np.zeros((len(dfs), ncoefs))
        for i, df in enumerate(dfs):
            coefs[i] = thetatilde(spl_event.fit(df['time'], df['vel']).theta)
        scaler = StandardScaler()
        scaler.fit(coefs)
        # The scaling is row-wise, so it can be applied once instead of once
        # per (bandwidth, test profile) pair.
        scaled_coefs = scaler.transform(coefs)

        # Perform the cross validation
        for ih, h in enumerate(tqdm(H, leave=False)):
            for idf, profile in enumerate(tqdm(profiles[:ntest], leave=False)):
                # Construct the KDE on all profiles except the held-out one
                kde = KDE(np.delete(scaled_coefs, idf, axis=0), bandwidth=h, scale_data=False)
                kde.compute_kde()

                # Generate curves and compare
                e = np.zeros(N)
                for i in range(N):
                    z = scaler.inverse_transform(kde.draw_random_sample(1))[0]
                    e[i] = error(profile, z, sigma=sigma)

                # Add result to final score
                J[inknots, ih] += np.log(np.mean(e))
                # Running estimate, extrapolated to all ntest profiles
                print("\rh = {:.2e}, J = {:.1f}".format(h, J[inknots, ih]/(idf+1)*ntest), end="")
            print("\rh = {:.2e}, J = {:.1f}".format(h, J[inknots, ih]))
            print()
    # Make sure the cache directory exists; otherwise open() fails and the
    # results of the whole computation are lost.
    os.makedirs(os.path.dirname(savefile), exist_ok=True)
    with open(savefile, 'wb') as f:
        pickle.dump(J, f)
else:
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(savefile, 'rb') as f:
        J = pickle.load(f)

In [None]:
# Score versus bandwidth, one curve per number of interior knots; a cross
# marks the best bandwidth of each curve.
for inknots, nknots in enumerate(NKNOTS):
    curve = J[inknots]
    plthandle = plt.semilogx(H, curve, label='nknots={:d}'.format(nknots))
    curve_color = plthandle[0].get_color()
    plt.plot(H[np.argmax(curve)], np.max(curve), 'x', color=curve_color, ms=10, mew=3)
plt.grid('on')
plt.xlabel('Bandwidth')
plt.ylabel('Score')
plt.legend()

### Influence of number of coefficients with SVD

In [None]:
# Leave-one-out score when the KDE is built on the first `ncoef` SVD
# components of the spline coefficients. JJ holds one (ncoefs x len(H2))
# matrix per entry of NKNOTS2. Results are cached in a pickle; set
# `overwrite = True` to recompute. Uses N, sigma and ntest from the cell above.
savefile = os.path.join('pickles', 'test_svd.p')
NKNOTS2 = [2, 3, 4]
H2 = np.logspace(-2, 1, 10)

if overwrite or not os.path.exists(savefile):
    JJ = []
    # Loop through all number of knots
    for inknots, nknots in enumerate(tqdm(NKNOTS2)):
        print("Number of interior knots: {:d}".format(nknots))
        # Define spline function. Note the +2 because of extra exterior knots
        spl_event = SplineInter1D(np.linspace(0, 1, nknots+2), left_constraints=[1], right_constraints=[0])
        spl = _Spline(spl_event.t, spl_event.d, spl_event.thetafixed)
        spl(t)  # Sets the x-values

        # Compute the spline coefficients
        ncoefs = spl_event.v2.shape[1]
        coefs = np.zeros((len(dfs), ncoefs))
        for i, df in enumerate(dfs):
            coefs[i] = thetatilde(spl_event.fit(df['time'], df['vel']).theta)

        # Perform SVD on the centred coefficients: coefs - mu = u * diag(s) * v
        mu = np.mean(coefs, axis=0)
        u, s, v = np.linalg.svd(coefs-mu, full_matrices=False)

        # Initialize result matrix. It is indexed by the bandwidths in H2, so
        # it must be sized with len(H2) (not len(H), which belongs to the
        # knot-count experiment).
        jj = np.zeros((ncoefs, len(H2)))

        # Try all different amount of coefficients
        for incoef, ncoef in enumerate(tqdm(range(1, ncoefs+1), leave=False)):
            print("Number of coefficients: {:d}".format(ncoef))
            # Define the scaler on the first ncoef left singular vectors
            scaler = StandardScaler()
            scaler.fit(u[:, :ncoef])
            # The scaling is row-wise, so apply it once per ncoef
            scaled_u = scaler.transform(u[:, :ncoef])
            # Perform the cross validation
            for ih, h in enumerate(tqdm(H2, leave=False)):
                for idf, profile in enumerate(tqdm(profiles[:ntest], leave=False)):
                    # Construct the KDE on all profiles except the held-out one
                    kde = KDE(np.delete(scaled_u, idf, axis=0),
                              bandwidth=h, scale_data=False)
                    kde.compute_kde()

                    # Generate curves and compare
                    e = np.zeros(N)
                    for i in range(N):
                        z = scaler.inverse_transform(kde.draw_random_sample(1))[0]
                        # Map the sampled components back to spline coefficients
                        e[i] = error(profile, mu + np.dot(z*s[:ncoef], v[:ncoef]), sigma=sigma)

                    # Add result to final score
                    jj[incoef, ih] += np.log(np.mean(e))
                    # Running estimate, extrapolated to all ntest profiles
                    print("\rh = {:.2e}, J = {:.1f}".format(h, jj[incoef, ih]/(idf+1)*ntest), end="")
                print("\rh = {:.2e}, J = {:.1f}".format(h, jj[incoef, ih]))
                print()
        JJ.append(jj)
    # Make sure the cache directory exists; otherwise open() fails and the
    # results of the whole computation are lost.
    os.makedirs(os.path.dirname(savefile), exist_ok=True)
    with open(savefile, 'wb') as f:
        pickle.dump(JJ, f)
else:
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(savefile, 'rb') as f:
        JJ = pickle.load(f)

In [None]:
# One panel per knot count: score versus bandwidth for every number of SVD
# coefficients, with a cross at the best bandwidth of each curve.
f, axs = plt.subplots(1, len(JJ), figsize=(5*len(JJ), 4))
for ax, jj, nknots in zip(axs, JJ, NKNOTS2):
    for ncoef, curve in enumerate(jj, start=1):
        plthandle = ax.semilogx(H2, curve, label='ncoef={:d}'.format(ncoef))
        curve_color = plthandle[0].get_color()
        ax.plot(H2[np.argmax(curve)], np.max(curve), 'x', color=curve_color, ms=10, mew=3)
    ax.grid('on')
    ax.set_xlabel('Bandwidth')
    ax.set_ylabel('Score')
    ax.set_title('Number of interior knots: {:d}'.format(nknots))
    ax.legend()
plt.tight_layout()

In [None]:
# Compare, per knot count, the score of the original coefficients (J) with the
# score obtained with all SVD coefficients (last row of each matrix in JJ).
# np.searchsorted looks up the rows of J that belong to the knot counts in
# NKNOTS2 (NKNOTS is sorted).
f, axs = plt.subplots(1, len(JJ), figsize=(5*len(JJ), 4))
for ax, j, jj, nknots in zip(axs, [J[i] for i in np.searchsorted(NKNOTS, NKNOTS2)], JJ, NKNOTS2):
    # The labels are plain strings: the former `.format(ncoef)` calls had no
    # placeholder and only made this cell depend on a variable leaked from
    # another cell.
    plthandle = ax.semilogx(H, j, label='orig')
    ax.plot(H[np.argmax(j)], np.max(j), 'x', color=plthandle[0].get_color(), ms=10, mew=3)
    plthandle = ax.semilogx(H2, jj[-1], label='svd')
    ax.plot(H2[np.argmax(jj[-1])], np.max(jj[-1]), 'x', color=plthandle[0].get_color(), ms=10, mew=3)
    ax.grid('on')
    ax.set_xlabel('Bandwidth')
    ax.set_ylabel('Score')
    ax.set_title('Number of interior knots: {:d}'.format(nknots))
    ax.legend()
plt.tight_layout()

### Copulas

In [None]:
# The copula experiments use one interior knot (plus the two exterior knots)
nknots = 1
knots = np.linspace(0, 1, nknots+2)

spl_event = SplineInter1D(knots, left_constraints=[1], right_constraints=[0])
spl = _Spline(spl_event.t, spl_event.d, spl_event.thetafixed)
spl(t)  # Sets the x-values

# One row of reduced spline coefficients per profile
ncoefs = spl_event.v2.shape[1]
coefs = np.zeros((len(dfs), ncoefs))
for row, df in zip(coefs, dfs):
    row[:] = thetatilde(spl_event.fit(df['time'], df['vel']).theta)
# Scaler fitted on all coefficients; reused by the cells below
scaler = StandardScaler().fit(coefs)

In [None]:
# Estimate the marginal CDF of every scaled coefficient with a 1-D KDE and use
# it to map the coefficients to (approximately) uniform marginals.
scoefs = scaler.transform(coefs)
# Normalized coefficients, such that marginals are uniform distributions
normcoefs = np.zeros_like(scoefs)
x = np.linspace(-5, 5, 1000)  # Grid on which the marginal CDFs are tabulated
cdf = np.zeros((len(x), ncoefs))
h = 0.05
dx = np.gradient(x)  # Grid spacing, identical for every coefficient
for i in range(scoefs.shape[1]):
    kde = KDE(data=scoefs[:, i:i+1], scale_data=False, bandwidth=h)
    kde.compute_kde()
    # score_samples is exponentiated to obtain the density on the grid
    density = np.exp(kde.kde.score_samples(x[:, np.newaxis]))
    # Integrate numerically and normalise so that the CDF ends at exactly 1
    cumulative = np.cumsum(density * dx)
    cdf[:, i] = cumulative / cumulative[-1]
    normcoefs[:, i] = np.interp(scoefs[:, i], x, cdf[:, i])

In [None]:
# Score of the KDE copula model for a range of bandwidths: samples are drawn
# in the uniform-marginal space, folded back into [0, 1], mapped through the
# inverse marginal CDFs and compared with every test profile.
H = np.logspace(-5, -2, 10)

J = np.zeros_like(H)
for ih, h in enumerate(tqdm(H)):
    # NOTE(review): unlike the cross-validation cells above, this KDE is built
    # on all profiles, including the ones it is scored on - confirm intended.
    kde = KDE(data=normcoefs, scale_data=False, bandwidth=h)
    kde.compute_kde()
    for idf, profile in enumerate(tqdm(profiles[:ntest], leave=False)):
        e = np.zeros(N)
        for isample in range(N):
            X = kde.draw_random_sample(1)[0]
            # Reflect the sample at 0 and 1 so that it lies inside [0, 1]
            X = np.min([np.mod(X, 2), 2-np.mod(X, 2)], axis=0)
            # Inverse marginal CDF per coefficient (own index, so the sample
            # counter is not shadowed)
            Xscaled = np.array([np.interp(X[k], cdf[:, k], x) for k in range(ncoefs)])
            # Pass a single-row 2-D array, as in the other cells
            z = scaler.inverse_transform(Xscaled[np.newaxis])[0]
            # Score the profile of this iteration (was always profiles[0])
            e[isample] = error(profile, z, sigma=sigma)
        J[ih] += np.log(np.mean(e))
        # Running estimate, extrapolated to all ntest profiles
        print("\rh = {:.2e}, J = {:.1f}".format(h, J[ih]/(idf+1)*ntest), end="")
    print("\rh = {:.2e}, J = {:.1f}".format(h, J[ih]))
    print()

In [None]:
# Scratch cell: draw one sample from the current KDE and display it
sample_batch = kde.draw_random_sample(1)
X = sample_batch[0]
X

In [None]:
# Scratch cell: fold X into [0, 1] by reflection (formula used in the scoring loop)
wrapped = np.mod(X, 2)
np.minimum(wrapped, 2 - wrapped)

In [None]:
# Scratch cell: alternative reflection formula, for comparison with the cell above
np.minimum(np.mod(X, 2), np.mod(2 - X, 2))

In [None]:
# erf was used without ever being imported; import it locally for this cell
from scipy.special import erf

# Map the uniform-marginal coefficients to Gaussian marginals and estimate
# their covariance (the parameter of the Gaussian copula).
gausscoefs = np.zeros_like(scoefs)
# NOTE: (erf(x)+1)/2 is the CDF of a zero-mean normal with variance 1/2, not
# of the standard normal. The same table is used for the inverse mapping when
# sampling, so the two directions are consistent.
cdf_norm = (erf(x)+1)/2
for i in range(ncoefs):
    # Inverse CDF by interpolating the tabulated (cdf_norm, x) pairs
    gausscoefs[:, i] = np.interp(normcoefs[:, i], cdf_norm, x)
cov = np.cov(gausscoefs.T)

In [None]:
from scipy.stats import gaussian_kde

# Scatter plot of two uniform-marginal coefficients, coloured by the point
# density estimated with a Gaussian KDE.
i = 0
j = 1
xy = np.vstack([normcoefs[:, i], normcoefs[:, j]])
z = gaussian_kde(xy)(xy)

fig, ax = plt.subplots()
# 'none' is the supported way to suppress marker edges; the empty string is
# not a valid colour specification.
ax.scatter(normcoefs[:, i], normcoefs[:, j], c=z, s=100, edgecolor='none')
plt.xlim([0, 1])
plt.ylim([0, 1])

In [None]:
# Score of the Gaussian copula model: sample correlated normals, map them to
# uniform marginals, then through the inverse marginal CDFs, and compare the
# resulting curves with every test profile.
J = 0
for profile in tqdm(profiles[:ntest], leave=False):
    e = np.zeros(N)
    for isample in range(N):
        X = np.random.multivariate_normal(np.zeros(ncoefs), cov)
        Xuniform = np.interp(X, x, cdf_norm)
        # Inverse marginal CDF per coefficient (own index, so the sample
        # counter is not shadowed)
        Xscaled = np.array([np.interp(Xuniform[k], cdf[:, k], x) for k in range(ncoefs)])
        # Pass a single-row 2-D array, as in the other cells
        z = scaler.inverse_transform(Xscaled[np.newaxis])[0]
        # Score the profile of this iteration (was always profiles[0])
        e[isample] = error(profile, z, sigma=sigma)
    J += np.log(np.mean(e))

In [None]:
# Histogram of one uniform-marginal coefficient, normalised to a density.
# `density=True` replaces the `normed` keyword, which was removed from hist.
plt.hist(normcoefs[:, 1], density=True)

In [None]:
# Per-sample increment of the first marginal CDF, i.e. its density up to the
# constant grid spacing
first_marginal_cdf = cdf[:, 0]
plt.plot(x, np.gradient(first_marginal_cdf))

In [None]:
# Show copula density: evaluate the KDE on an n x n x n grid over the unit
# cube. Each grid point is evaluated together with its mirror images about 0
# and 1 in every coordinate (27 combinations) and the densities are summed.
kde = KDE(data=normcoefs, scale_data=False, bandwidth=0.1)
kde.compute_kde()
n = 50
q = np.linspace(0, 1, n)
qq, rr, ss = (grid.ravel() for grid in np.meshgrid(q, q, q))
scores = np.zeros((n, n, n))
for q_img in tqdm([-qq, qq, 2-qq]):
    for r_img in tqdm([-rr, rr, 2-rr], leave=False):
        for s_img in tqdm([-ss, ss, 2-ss], leave=False):
            # One row per grid point, one column per coordinate
            pp = np.column_stack((q_img, r_img, s_img))
            density = np.exp(kde.kde.score_samples(pp))
            scores += density.reshape((n, n, n))

In [None]:
from matplotlib import animation, rc
from IPython.display import HTML

# Figure with a single image artist; the animation only swaps its data
fig = plt.figure()
ax = plt.axes(xlim=(0, n), ylim=(0, n))
im = ax.imshow(scores[0])


def animate(i):
    """Show slice ``scores[i]`` and return the updated artist for blitting."""
    im.set_data(scores[i])
    return (im,)


def init():
    """Draw the first slice as the background of the animation."""
    return animate(0)


# One frame per slice, 200 ms apart. blit=True means only re-draw the parts
# that have changed.
anim = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=n, interval=200, blit=True)

# Render the animation as an HTML5 video and display it as the cell output
video_html = anim.to_html5_video()
HTML(video_html)

In [None]:
from matplotlib import animation, rc
from IPython.display import HTML

# Same animation as in the previous cell, with 50 ms between frames
fig = plt.figure()
ax = plt.axes(xlim=(0, n), ylim=(0, n))
im = ax.imshow(scores[0])


def animate(i):
    """Show slice ``scores[i]`` and return the updated artist for blitting."""
    im.set_data(scores[i])
    return (im,)


def init():
    """Draw the first slice as the background of the animation."""
    return animate(0)


# One frame per slice. blit=True means only re-draw the parts that have changed.
anim = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=n, interval=50, blit=True)

# Render the animation as an HTML5 video and display it as the cell output
video_html = anim.to_html5_video()
HTML(video_html)