# Completeness

In [None]:
from profile_generation import ProfileGeneration
import numpy as np
import matplotlib.pyplot as plt
from kde import KDE
from tqdm import tqdm_notebook as tqdm
%matplotlib inline
from matplotlib2tikz import save as tikz_save
import os
import pickle

import matplotlib
# Render figure text with LaTeX.
matplotlib.rcParams['text.usetex'] = True
# 'text.latex.unicode' was deprecated in Matplotlib 3.0 and removed afterwards;
# assigning an unknown rcParam raises KeyError, so only set it where it exists.
if 'text.latex.unicode' in matplotlib.rcParams:
    matplotlib.rcParams['text.latex.unicode'] = True

%load_ext autoreload
%autoreload 2

In [None]:
# General parameters
# When False, cells that cache their results in pickle files load the cached
# file if it exists; set to True to force those cells to recompute.
overwrite = False

In [None]:
# Method for converting figure to tikz
tikz_folder = 'figures'
def tikz(name, extras=None):
    """Save a figure through ``tikz_save`` as ``<tikz_folder>/<name>.tikz``.

    Parameters
    ----------
    name : str
        File name without the ``.tikz`` extension.
    extras : list, optional
        Passed to ``tikz_save`` as ``extra_axis_parameters``; an empty list
        is used when omitted.
    """
    if extras is None:
        extras = []
    # Create the output folder on first use so a fresh checkout does not fail.
    os.makedirs(tikz_folder, exist_ok=True)
    tikz_save(os.path.join(tikz_folder, '{:s}.tikz'.format(name)),
              figureheight='\\figureheight', figurewidth='\\figurewidth',
              extra_axis_parameters=extras, show_info=False)

In [None]:
# Generate profile
pg = ProfileGeneration()
pg.sigma_vel = 0
# v is plotted below as the speed signal; changes in y are used below to mark
# event boundaries.
v, y = pg.generate_profile(1000)
# Time axis: sample index times the sample time pg.ts.
t = np.arange(len(y)) * pg.ts
def running_mean(x, N):
    """Moving average of ``x`` with a uniform window of ``N`` samples.

    Uses ``np.convolve`` with ``mode='same'``, so samples near the edges are
    averaged with fewer real values (the window is zero-padded there).
    """
    window = np.ones((N,)) / N
    return np.convolve(x, window, mode='same')
# Acceleration: derivative of v, smoothed with a 5-sample moving average.
# The second positional argument of np.gradient is the sample spacing, not the
# accuracy order, so `np.gradient(v, 2)` returned half the derivative.
# edge_order=2 requests second-order accurate differences at the boundaries.
a = running_mean(np.gradient(v, edge_order=2) / pg.ts, 5)

In [None]:
# Plot the speed profile up to t = tend and draw a vertical black line at
# every sample where y changes.
tend = 500
iend = np.searchsorted(t, tend)
shown = slice(0, iend, 10)  # plot every 10th sample only
plt.plot(t[shown], v[shown], linewidth=5)
plt.xlabel('Time [s]')
plt.ylabel('Speed [m/s]')
plt.grid('on')
plt.xlim([0, tend])

ievent = np.where(np.diff(y) != 0)[0]
ylim = plt.ylim()  # remember the limits so the markers span the full axis height
for event_time in t[ievent]:
    if event_time > tend:
        break
    plt.plot([event_time, event_time], ylim, 'k')
plt.ylim(ylim)
tikz('artificial_data')

# Wang's method

## Univariate distributions

In [None]:
# Parameters
mm = [500, 1000, 2000, 4000, 6000, 8000, 10000]  # step sizes, in samples
eps = 1e-3  # threshold on the change of the KL divergence
tol = 1e-6  # density values below this are left out of the KL sum

# Evaluation grids: 100 points spanning the data range, padded on both sides
# by half a standard deviation.
pad_v = 0.5*np.std(v)
pad_a = 0.5*np.std(a)
evaluate_kde_v = np.linspace(np.min(v)-pad_v, np.max(v)+pad_v, 100)
evaluate_kde_a = np.linspace(np.min(a)-pad_a, np.max(a)+pad_a, 100)

In [None]:
def silverman(x):
    """Rule-of-thumb KDE bandwidth: ``1.06 * std(x) * len(x) ** (-1/5)``."""
    spread = np.std(x)
    n_samples = len(x)
    return 1.06 * spread / (n_samples ** (1 / 5))

In [None]:
def _density_on_grid(samples, grid):
    """Fit a KDE with Silverman bandwidth to `samples` and return its pdf on `grid`."""
    kde = KDE(data=samples[:, np.newaxis], bandwidth=silverman(samples))
    kde.compute_kde()
    return np.exp(kde.kde.score_samples(grid[:, np.newaxis]))


def _consecutive_kl(pdfs, grid):
    """KL divergence of every row of `pdfs` with respect to the row before it.

    Grid points where the newer density is below `tol` are left out of the sum;
    the sum is multiplied by the mean grid spacing to approximate the integral.
    """
    dx = np.mean(np.diff(grid))
    kl = []
    for g, f in zip(pdfs[:-1], pdfs[1:]):
        keep = f >= tol
        kl.append(np.sum(f[keep] * np.log(f[keep] / g[keep])) * dx)
    return kl


pickle_file = os.path.join('pickles', 'kls.p')
if os.path.exists(pickle_file) and not overwrite:
    # Cache written by an earlier run of this cell. Note that loading it also
    # replaces mm with the value that was stored.
    with open(pickle_file, 'rb') as f:
        mm, klsv, klsa = pickle.load(f)
else:
    klsv, klsa = [], []
    for m in tqdm(mm):
        # The upper bound len(v)+1 keeps n == len(v) when len(v) is a multiple
        # of m. Previously that last row of the (len(v)//m)-row array was never
        # filled and produced a spurious KL value of 0.
        sizes = range(m, len(v) + 1, m)
        p_kdes_v = np.zeros((len(sizes), len(evaluate_kde_v)))
        p_kdes_a = np.zeros((len(sizes), len(evaluate_kde_a)))
        for i, n in enumerate(tqdm(sizes, leave=False)):
            p_kdes_v[i] = _density_on_grid(v[:n], evaluate_kde_v)
            p_kdes_a[i] = _density_on_grid(a[:n], evaluate_kde_a)
        klsv.append(_consecutive_kl(p_kdes_v, evaluate_kde_v))
        klsa.append(_consecutive_kl(p_kdes_a, evaluate_kde_a))
    # Make sure the cache folder exists before writing.
    os.makedirs(os.path.dirname(pickle_file), exist_ok=True)
    with open(pickle_file, 'wb') as f:
        pickle.dump((mm, klsv, klsa), f)

In [None]:
# Speed: KL divergence between successive density estimates for every step
# size m, and the first time at which its change drops below eps.
for m, kl in zip(mm, klsv):
    # np.int was removed in NumPy 1.24; the builtin int selects the same dtype.
    steps = np.arange(1, len(kl)+1, dtype=int)
    plt.plot(t[steps*m]/60, kl)
    i = next((i for i, x in enumerate(np.abs(np.diff(kl))) if x < eps), -1)
    print('{:9s} : '.format('m = {:d}'.format(m)), end='')
    if i == -1:
        print('Completeness not reached')
    else:
        print('Completeness reached at t={:.0f} min'.format(t[(i+1)*m]/60))
plt.ylim([0, 1])
plt.xlabel('Minutes')

In [None]:
# Acceleration: KL divergence between successive density estimates for every
# step size m, and the first time at which its change drops below eps.
for m, kl in zip(mm, klsa):
    # np.int was removed in NumPy 1.24; the builtin int selects the same dtype.
    steps = np.arange(1, len(kl)+1, dtype=int)
    plt.plot(t[steps*m]/60, kl)
    i = next((i for i, x in enumerate(np.abs(np.diff(kl))) if x < eps), -1)
    print('{:9s} : '.format('m = {:d}'.format(m)), end='')
    if i == -1:
        print('Completeness not reached')
    else:
        print('Completeness reached at t={:.0f} min'.format(t[(i+1)*m]/60))
plt.ylim([0, 0.01])
plt.xlabel('Minutes')

In [None]:
# Figure for a single step size: KL divergence of speed and acceleration,
# both multiplied by 1e3.
i = 2
m = mm[i]
print("Plot result for m = {:d}".format(m))
# np.int was removed in NumPy 1.24; the builtin int selects the same dtype.
minutes_v = t[np.arange(1, len(klsv[i])+1, dtype=int)*m]/60
minutes_a = t[np.arange(1, len(klsa[i])+1, dtype=int)*m]/60
plt.plot(minutes_v, np.array(klsv[i])*1e3, 'b-', label='Speed')
plt.plot(minutes_a, np.array(klsa[i])*1e3, 'r--', label='Acceleration')
plt.legend()
plt.xlabel(r'Time [min]')
plt.ylabel(r'KL divergence $\times$ $10^3$')
plt.ylim([0, 10])
plt.grid('on')
tikz('kl_sample_based')

## Likelihood of measurements

In [None]:
# Parameters
# Step sizes (in samples) for the likelihood analysis. Re-assigned here because
# the KL cell above may have replaced mm with the value stored in its pickle.
mm = [500, 1000, 2000, 4000, 6000, 8000, 10000]

In [None]:
# For every step size m: fit a KDE on the first n standardised samples and
# score the next m samples under it.
filename = 'likelihood_samples.p'
if os.path.exists(filename) and not overwrite:
    # Cache written by an earlier run of this cell.
    with open(filename, 'rb') as f:
        JJ = pickle.load(f)
else:
    JJ = []
    for m in tqdm(mm):
        J = np.zeros(len(v)//m-1)
        # The upper bound len(v)-m+1 keeps the last complete window
        # v[n:n+m] when len(v) is a multiple of m. With `len(v)-m` that final
        # entry of J was never filled and stayed 0.
        for i, n in enumerate(range(m, len(v)-m+1, m)):
            std = np.std(v[:n])
            kde = KDE(data=v[:n, np.newaxis]/std, bandwidth=silverman(v[:n]/std))
            kde.compute_kde()
            # Score of the following m samples (presumably their summed
            # log-likelihood; it is divided by m where it is plotted).
            J[i] = kde.kde.score(v[n:n+m, np.newaxis]/std)
        JJ.append(J)
    with open(filename, 'wb') as f:
        pickle.dump(JJ, f)

In [None]:
# KDE of the complete speed signal scaled by its standard deviation, and the
# entropy of that estimate integrated over the scaled evaluation grid.
std_v = np.std(v)
grid = evaluate_kde_v/std_v
kde = KDE(data=v[:, np.newaxis]/std_v, bandwidth=silverman(v/std_v))
kde.compute_kde()
logp = kde.kde.score_samples(grid[:, np.newaxis])
entropy = -np.trapz(np.exp(logp) * logp, x=grid)
plt.plot(grid, np.exp(logp))
print("Entropy: {:.3f}".format(entropy))

In [None]:
# Score per sample (J/m) over time for every step size m.
for m, J in zip(mm, JJ):
    # np.int was removed in NumPy 1.24; the builtin int selects the same dtype.
    steps = np.arange(1, len(J)+1, dtype=int)
    plt.plot(t[steps*m]/60, J/m)
plt.ylim([-4, 0])

In [None]:
# Entropy of the KDE fitted on the first n samples, for one step size m.
i = 2
m = mm[i]
entropy = np.zeros(len(v) // m - 1)
# The evaluation grid does not depend on n, so build it once. It is scaled by
# the standard deviation of the full signal, whereas the KDE below is fitted on
# data scaled by the standard deviation of the first n samples.
grid = evaluate_kde_v / np.std(v)
# The upper bound len(v)-m+1 fills the last entry when len(v) is a multiple
# of m; with `len(v)-m` it stayed 0.
for j, n in enumerate(tqdm(range(m, len(v)-m+1, m))):
    std = np.std(v[:n])
    kde = KDE(data=v[:n, np.newaxis]/std, bandwidth=silverman(v[:n]/std))
    kde.compute_kde()
    logp = kde.kde.score_samples(grid[:, np.newaxis])
    entropy[j] = -np.trapz(np.exp(logp) * logp, x=grid)

In [None]:
# Compare the per-sample score J/m with the negative entropy estimate for the
# step size selected above.
print('m = {:d}'.format(m))
J = JJ[i]
minutes = t[np.arange(1, len(J)+1, dtype=int)*m]/60
plt.plot(minutes, J/m, 'b-', label=r'$\ln J(n)$')
plt.plot(minutes, -entropy, 'r--', label=r'$-\hat{\textup{H}}[x;n]$')
plt.ylim([-3, 0])
plt.xlabel(r'Time [min]')
plt.legend()
plt.grid('on')
tikz('likelihood_samplebased')

## Extract activities

In [None]:
# Every change in y closes one activity. Each activity runs from the sample
# after the previous change (or sample 0) up to and including the change index.
ievent = np.where(np.diff(y) != 0)[0]
nevent = len(ievent)
nfeatures = 3  # End speed, speed diff, time diff
features = np.zeros((nevent, nfeatures))
istart = 0
for i, iend in enumerate(ievent):
    features[i] = (v[iend],              # speed at the end of the activity
                   v[iend] - v[istart],  # speed change over the activity
                   t[iend] - t[istart])  # duration of the activity
    istart = iend+1
# Time stamp assigned to each activity: the sample right after its end.
tevent = t[ievent+1]

## KL for activity features

In [None]:
# Parameters
# Step sizes for the activity-based analysis; here they count activities
# (rows of `features`), not samples.
mm = [5, 10, 15, 20]

In [None]:
# Evaluation grid per feature: 100 points spanning the observed range, padded
# on both sides by half a standard deviation.
feature_evaluate = np.zeros((100, nfeatures))
for j in range(nfeatures):
    pad = 0.5*np.std(features[:, j])
    feature_evaluate[:, j] = np.linspace(np.min(features[:, j]) - pad,
                                         np.max(features[:, j]) + pad,
                                         feature_evaluate.shape[0])

# For every step size m: density of each feature estimated from the first n
# activities, n = m, 2m, ...
pdfss = []
for m in tqdm(mm):
    pdfs = np.zeros((nevent//m-1, nfeatures, feature_evaluate.shape[0]))
    # The upper bound nevent-m+1 fills the last row when nevent is a multiple
    # of m; with `nevent-m` that row stayed all-zero.
    for i, n in enumerate(range(m, nevent-m+1, m)):
        for j in range(nfeatures):
            kde = KDE(data=features[:n, j:j+1], bandwidth=silverman(features[:n, j]))
            kde.compute_kde()
            pdfs[i, j, :] = np.exp(kde.kde.score_samples(feature_evaluate[:, j:j+1]))
    pdfss.append(pdfs)

In [None]:
# KL divergence between consecutive density estimates, per step size and feature.
klss = []
for m, pdfs in zip(mm, pdfss):
    kls = np.zeros((nevent//m-2, nfeatures))
    for j in range(nfeatures):
        dx = np.mean(np.diff(feature_evaluate[:, j]))  # mean grid spacing
        column = []
        for g, f in zip(pdfs[:-1, j], pdfs[1:, j]):
            keep = f >= tol  # skip grid points where the newer density is below tol
            column.append(np.sum(f[keep] * np.log(f[keep] / g[keep]))*dx)
        kls[:, j] = column
    klss.append(kls)

In [None]:
# Figure for a single step size: KL divergence of each activity feature.
i = 1
m = mm[i]
kls = klss[i]
style = ['b-', 'r--', 'g-.']
labels = [r'$v_{\textup{end}}$', r'$\Delta v$', r'$\Delta t$']
# Row k of kls is plotted at the time of activity (k+1)*m. Deriving the index
# from the number of rows keeps x and y the same length even when nevent is a
# multiple of m (np.arange(m, nevent-2*m, m) is one element short then).
# np.int was removed in NumPy 1.24; the builtin int selects the same dtype.
minutes = tevent[np.arange(1, kls.shape[0]+1, dtype=int)*m]/60
for j in range(nfeatures):
    plt.semilogy(minutes, kls[:, j], style[j], label=labels[j])
plt.legend()
plt.grid('on')
plt.xlabel('Time [min]')
plt.ylabel('KL divergence')
tikz('KL_activity_based')

In [None]:
# For every step size, report when the KL divergence of all features has
# dropped below 1e-3.
for m, kls in zip(mm, klss):
    # Per feature: index of the first KL value below 1e-3, or -1 if there is none.
    imin = [next((i for i, b in enumerate(kl < 1e-3) if b), -1) for kl in kls.T]
    # `imin` is a list, so `imin == -1` was a single False and the
    # "not reached" branch could never trigger; test membership instead.
    if -1 in imin:
        print("m = {:2d}: No completeness reached".format(m))
    else:
        # NOTE(review): the plotting cell places kls[k] at tevent[(k+1)*m], whereas
        # this report uses tevent[k*m] - confirm which time stamp is intended.
        print("m = {:2d}: Completeness at {:.0f} min".format(m, tevent[np.max(imin)*m]/60))
        print(imin)
        tt = tevent[np.array(imin)*m]/60
        # Do not call the loop variable `t`: that would overwrite the global
        # time vector used by the other cells.
        for minute in tt:
            print("{:.0f}  ".format(minute), end="")
        print()

## Likelihood for activities

In [None]:
# For every step size m and feature j: fit a KDE on the first n standardised
# activities and score the next m activities under it, divided by m.
JJ = []
for m in tqdm(mm):
    J = np.zeros((nevent//m-1, nfeatures))
    # The upper bound nevent-m+1 keeps the last complete window
    # features[n:n+m] when nevent is a multiple of m; with `nevent-m` the last
    # row of J stayed 0.
    for i, n in enumerate(range(m, nevent-m+1, m)):
        for j in range(nfeatures):
            std = np.std(features[:n, j])
            kde = KDE(data=features[:n, j:j+1]/std, bandwidth=silverman(features[:n, j]/std))
            kde.compute_kde()
            J[i, j] = kde.kde.score(features[n:n+m, j:j+1]/std) / m
    JJ.append(J)

In [None]:
# Compare the per-activity score of feature j with the negative entropy of the
# KDE fitted on the first n activities, for a single step size.
i = 1
j = 1
m = mm[i]
J = JJ[i]

entropy = np.zeros(nevent//m - 1)
# The upper bound nevent-m+1 fills the last entry when nevent is a multiple of m.
for k, n in enumerate(range(m, nevent-m+1, m)):
    std = np.std(features[:n, j])
    kde = KDE(data=features[:n, j:j+1]/std, bandwidth=silverman(features[:n, j:j+1]/std))
    kde.compute_kde()
    # The KDE lives on feature/std, so the grid must be scaled by the same std.
    # It was divided by np.std(v), the spread of the speed signal, which puts
    # the integration range on a different scale than the density.
    grid = feature_evaluate[:, j:j+1]/std
    logp = kde.kde.score_samples(grid)
    entropy[k] = -np.trapz(np.exp(logp) * logp, x=grid[:, 0])

# Row k of J belongs to activity (k+1)*m; deriving the index from len(J) keeps
# x and y the same length. np.int was removed in NumPy 1.24, so use int.
minutes = tevent[np.arange(1, len(J)+1, dtype=int)*m]/60
plt.plot(minutes, J[:, j], 'b-', label=r'$\ln J(n)$')
plt.plot(minutes, -entropy, 'r--', label=r'$-\hat{\textup{H}}[x;n]$')
plt.xlabel(r'Time [min]')
plt.legend()
plt.grid('on')
tikz('likelihood_activitybased')

In [None]:
# NOTE(review): in the visible notebook h_silv and h are first assigned in a
# later cell, so on a fresh kernel run top to bottom this cell raises NameError.
# TODO: move it below the cell that computes the bandwidths, or remove it.
plt.plot(h_silv)
plt.plot(h)

In [None]:
# NOTE(review): scratch check. h is only assigned in a later cell of the visible
# notebook, and the origin of the constant 0.33037721 is not shown - TODO
# confirm or remove.
h/0.33037721

In [None]:
# Same analysis as above for feature j, but with a bandwidth chosen by
# kde.compute_bandwidth within +/-25% of the previous bandwidth (the Silverman
# value is used as the starting point for the first step).
i = 1
j = 1
m = mm[i]
J = np.zeros(nevent//m-1)
h_silv = np.zeros_like(J)
h = np.zeros_like(J)
entropy = np.zeros_like(J)
# Loop index is k rather than i, so that i keeps selecting the entry of mm.
# The upper bound nevent-m+1 fills the last entry when nevent is a multiple of m.
for k, n in enumerate(tqdm(range(m, nevent-m+1, m))):
    std = np.std(features[:n, j])
    h_silv[k] = silverman(features[:n, j]/std)
    kde = KDE(data=features[:n, j:j+1]/std)
    if k == 0:
        hold = h_silv[k]
    else:
        hold = h[k-1]
    kde.compute_bandwidth(min_bandwidth=hold*0.75, max_bandwidth=hold*1.25, n_bandwidths=21)
    kde.compute_kde()
    h[k] = kde.bandwidth
    J[k] = kde.kde.score(features[n:n+m, j:j+1]/std) / m
    # The KDE lives on feature/std, so the grid must be scaled by the same std.
    # It was divided by np.std(v), the spread of the speed signal.
    grid = feature_evaluate[:, j:j+1]/std
    logp = kde.kde.score_samples(grid)
    entropy[k] = -np.trapz(np.exp(logp) * logp, x=grid[:, 0])

In [None]:
# Per-activity score versus negative entropy for the bandwidth-searched KDE.
# Entry k belongs to activity (k+1)*m; deriving the index from len(J) keeps x
# and y the same length. np.int was removed in NumPy 1.24, so use int.
minutes = tevent[np.arange(1, len(J)+1, dtype=int)*m]/60
plt.plot(minutes, J, 'b-', label=r'$\ln J(n)$')
plt.plot(minutes, -entropy, 'r--', label=r'$-\hat{\textup{H}}[x;n]$')
plt.xlabel(r'Time [min]')
plt.legend()
plt.grid('on')
tikz('likelihood_activitybased_small_bandwidth')

In [None]:
# Silverman bandwidth versus the bandwidth returned by the search, over time.
# Entry k belongs to activity (k+1)*m; deriving the index from the array length
# keeps x and y the same length. np.int was removed in NumPy 1.24, so use int.
minutes = tevent[np.arange(1, len(h_silv)+1, dtype=int)*m]/60
plt.plot(minutes, h_silv, 'b-', label=r'Silverman bandwidth')
plt.plot(minutes, h, 'r--', label=r'Cross-validated bandwidth')
plt.xlabel(r'Time [min]')
plt.ylabel(r'$h$')
plt.legend()
plt.grid('on')
tikz('bandwidth')

In [None]:
# Plots exp(logp) against the unscaled evaluation grid of feature j; logp is
# whatever the most recently executed cell assigned.
plt.plot(feature_evaluate[:, j], np.exp(logp))

# Dissimilarity

In [None]:
def dissimilarity(xx, yy):
    """Largest nearest-neighbour distance from the points in ``yy`` to ``xx``.

    For every point in ``yy`` the Euclidean distance to its closest point in
    ``xx`` is computed; the maximum of these distances is returned. Returns 0
    when ``yy`` is empty.
    """
    worst = 0
    for target in yy:
        nearest = min(np.sqrt(np.sum((candidate - target)**2)) for candidate in xx)
        if nearest > worst:
            worst = nearest
    return worst
    
# For every step size m: dissimilarity between the first n activities and the
# next m activities, with every feature scaled by its standard deviation over
# the first n activities.
JJ = []
for m in tqdm(mm):
    J = np.zeros(nevent//m-1)
    # The upper bound nevent-m+1 keeps the last complete window
    # features[n:n+m] when nevent is a multiple of m; with `nevent-m` the last
    # entry of J stayed 0.
    for i, n in enumerate(range(m, nevent-m+1, m)):
        std = np.std(features[:n], axis=0)
        recorded = features[:n] / std
        new_data = features[n:n+m] / std
        J[i] = dissimilarity(recorded, new_data)
    JJ.append(J)

In [None]:
# Dissimilarity over time for every step size m.
for m, J in zip(mm, JJ):
    # Entry k belongs to activity (k+1)*m; deriving the index from len(J) keeps
    # x and y the same length. np.int was removed in NumPy 1.24, so use int.
    minutes = tevent[np.arange(1, len(J)+1, dtype=int)*m]/60
    plt.plot(minutes, J, label='m={:d}'.format(m))
plt.xlabel('Time [min]')
plt.ylabel('J')
plt.legend()

In [None]:
# Figure for a single step size: dissimilarity over time.
i = 1
width = 5  # not used in the visible cells
m = mm[i]
J = JJ[i]
# Entry k belongs to activity (k+1)*m; deriving the index from len(J) keeps x
# and y the same length. np.int was removed in NumPy 1.24, so use int.
minutes = tevent[np.arange(1, len(J)+1, dtype=int)*m]/60
plt.plot(minutes, J)
plt.xlabel(r'Time [min]')
plt.grid('on')
plt.ylabel(r'$J_d(n,m)$')
tikz('dissimilarity')

In [None]:
# Pairwise Euclidean distances between the most recent `new_data` (rows) and
# `recorded` (columns). The loop variable is deliberately not called `y`, so the
# notebook-global `y` from the profile generation is not overwritten here.
d = np.array([[np.sqrt(np.sum((x - new_point)**2)) for x in recorded]
              for new_point in new_data])

In [None]:
# Show the distance matrix d as an image.
plt.imshow(d)

In [None]:
# Smallest value in every row of d.
np.min(d, axis=1)

In [None]:
# NOTE(review): this overwrites the notebook-global y returned by
# pg.generate_profile; cells above that read y give different results if they
# are re-run after this one.
y = new_data[0]

In [None]:
# Euclidean distance from y to every row of `recorded`.
dist = [np.sqrt(np.sum((x-y)**2)) for x in recorded]

In [None]:
# Display y (at this point the first row of new_data, assigned two cells above).
y