In [None]:
import os
import pickle
import matplotlib.pyplot as plt
import numpy as np
import ot
from tqdm import tqdm_notebook as tqdm
from scipy.spatial import distance
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from stats import KDE

In [None]:
# Load the pickled (dfs, scaling) pair from pickles/df.p.
# NOTE(review): pickle.load can execute arbitrary code -- only use with a trusted file.
with open(os.path.join("pickles", "df.p"), "rb") as file:
    dfs, scaling = pickle.load(file)
# Transpose so that rows index profiles and columns are the three scaling entries.
scaling = scaling.T   # [time vstart vend]

In [None]:
# Number of points every velocity profile is resampled to.
n = 50

# Resample each profile onto a common grid of n points on [0, 1].
# NOTE(review): assumes df["time"] is increasing and spans [0, 1] -- confirm against the pickle.
profiles = np.zeros((len(dfs), n))
for i, df in enumerate(dfs):
    profiles[i] = np.interp(np.linspace(0, 1, n), df["time"], df["vel"])

# Apply the per-profile affine map p*(scaling[:, 1] - scaling[:, 2]) + scaling[:, 2]
# (columns 1 and 2 are vstart and vend). Indexing with a list keeps the columns
# two-dimensional, so the row-wise factors broadcast across all n points at once
# instead of looping over the columns in Python.
profiles_scaled = profiles*(scaling[:, [1]] - scaling[:, [2]]) + scaling[:, [2]]

# Without scaling

In [None]:
# Split the profile indices into two equally sized halves (a and b) with a fixed seed.
indexa, indexb = train_test_split(np.arange(len(profiles)), test_size=.5, random_state=0)

In [None]:
# Select the rows belonging to each half from every per-profile array.
scalinga, scalingb = scaling[indexa, :], scaling[indexb, :]
# Unscaled (resampled) profiles.
profilesa, profilesb = profiles[indexa, :], profiles[indexb, :]
# Scaled profiles.
pa, pb = profiles_scaled[indexa, :], profiles_scaled[indexb, :]
# Column 0 of scaling is the time entry (see the [time vstart vend] note where it is loaded).
ta, tb = scalinga[:, 0], scalingb[:, 0]

In [None]:
def approach0(profilesa):
    """Baseline generator: return the given profiles unchanged (same object)."""
    return profilesa

def approach1(profilesa):
    """Bootstrap generator: draw len(profilesa) rows uniformly with replacement.

    Uses the global NumPy random state, so seed with np.random.seed for
    reproducible output.
    """
    count = len(profilesa)
    picked_rows = np.random.choice(np.arange(count), count)
    return profilesa[picked_rows, :]

def approach2(profilesa):
    """Linear-ramp generator: every row runs linearly from 1 down to 0.

    Only the shape of ``profilesa`` is used.

    Parameters
    ----------
    profilesa : ndarray, shape (n_profiles, n_points)

    Returns
    -------
    ndarray, shape (n_profiles, n_points)
        Identical rows ``np.linspace(1, 0, n_points)``.
    """
    n_rows, n_cols = profilesa.shape[0], profilesa.shape[1]
    ramp = np.linspace(1, 0, n_cols)
    # np.tile keeps the result two-dimensional even when there are zero rows
    # (a list comprehension would collapse that case to shape (0,)).
    return np.tile(ramp, (n_rows, 1))

def approach3(profilesa):
    """Parabola generator: random quadratics that equal 1 at x=0 and 0 at x=1.

    Each row is ``a*x**2 + b*x + 1`` on ``x = linspace(0, 1, n_points)`` with
    ``b`` drawn uniformly from (-2, 0] and ``a = -1 - b``, which pins the end
    value to 0. Only the shape of ``profilesa`` is used.

    Parameters
    ----------
    profilesa : ndarray, shape (n_profiles, n_points)

    Returns
    -------
    ndarray of float, shape (n_profiles, n_points)
    """
    n_rows, n_cols = profilesa.shape
    # Take the grid size from the input instead of the notebook-level `n`, so
    # the function does not depend on hidden global state.
    x = np.linspace(0, 1, n_cols)
    # One uniform draw per row; drawing them in a single call consumes the
    # global random stream in the same order as one draw per loop iteration.
    b = (-np.random.rand(n_rows)*2)[:, np.newaxis]
    a = -1-b
    return a*x**2 + b*x + 1

def approach4(profilesa, q=3):
    """SVD generator: mean profile plus Gaussian noise on the first q components.

    The centred profiles are decomposed with an SVD; every generated row is
    the column mean plus a combination of the first ``q`` right singular
    vectors, weighted by standard-normal draws times ``s / sqrt(n_profiles)``.
    Uses the global NumPy random state.
    """
    n_rows = profilesa.shape[0]
    center = np.mean(profilesa, axis=0)
    _, sing_vals, components = np.linalg.svd(profilesa - center, full_matrices=False)
    root_n = np.sqrt(n_rows)
    generated = np.zeros_like(profilesa)
    for row in range(n_rows):
        weights = np.random.randn(q)*sing_vals[:q]/root_n
        generated[row] = np.dot(weights, components[:q]) + center
    return generated

def approach5(profilesa):
    """Perturbed-bootstrap generator: resampled rows plus a small random parabola.

    Rows are first resampled with ``approach1``; each row then gets
    ``a*x**2 + b*x`` added, with ``b`` uniform in [-0.1, 0.1) and ``a = -b``,
    so the perturbation vanishes at both x=0 and x=1.

    Parameters
    ----------
    profilesa : ndarray, shape (n_profiles, n_points)

    Returns
    -------
    ndarray, shape (n_profiles, n_points)
    """
    resampled = approach1(profilesa)
    # Take the grid size from the input instead of the notebook-level `n`, so
    # the function does not depend on hidden global state.
    x = np.linspace(0, 1, profilesa.shape[1])
    # One uniform draw per row, taken after the resampling so the global random
    # stream is consumed in the same order as a per-row loop would.
    b = ((2*np.random.rand(len(profilesa)) - 1)*.1)[:, np.newaxis]
    a = -b
    return resampled + (a*x**2 + b*x)

def approach6(profilesa, q=3):
    """SVD generator with clipping: like the plain SVD generator, limited to [0, 1].

    Every row is the column mean plus a combination of the first ``q`` right
    singular vectors of the centred data, weighted by standard-normal draws
    times ``s / sqrt(n_profiles)``; values above 1 become 1 and values below 0
    become 0. Uses the global NumPy random state.
    """
    n_rows = profilesa.shape[0]
    center = np.mean(profilesa, axis=0)
    _, sing_vals, components = np.linalg.svd(profilesa - center, full_matrices=False)
    root_n = np.sqrt(n_rows)
    generated = np.zeros_like(profilesa)
    for row in range(n_rows):
        weights = np.random.randn(q)*sing_vals[:q]/root_n
        generated[row] = np.dot(weights, components[:q]) + center
    # Limit the generated values to the unit interval, in place.
    np.clip(generated, 0, 1, out=generated)
    return generated

def approach7(profilesa, q=2):
    """SVD + KDE generator: sample the first q SVD scores from a kernel density.

    A ``KDE`` (from the local ``stats`` module) is fitted on the first ``q``
    columns of ``u`` from the SVD of the centred profiles. ``len(k.data)``
    samples are drawn, multiplied by the matching singular values, mapped back
    through the first ``q`` right singular vectors, shifted by the column mean
    and clipped to [0, 1].

    NOTE(review): presumably ``k.sample(m)`` returns an (m, q) array and
    ``k.data`` holds the fitted points -- confirm against ``stats.KDE``.
    """
    mean = np.mean(profilesa, axis=0)
    u, s, v = np.linalg.svd(profilesa - mean, full_matrices=False)
    k = KDE(u[:, :q], scaling=True)
    k.compute_bandwidth()
    # No pre-allocated output array is needed: the matrix product below already
    # yields the full result.
    profiles = np.dot(k.sample(len(k.data)) * s[:q], v[:q]) + mean
    return np.clip(profiles, 0, 1)

In [None]:
def score(approach, test):
    """Optimal-transport cost (ot.emd2) between test and generated profiles.

    The cost matrix holds the pairwise distances from ``distance.cdist`` with
    its default metric. Both weight vectors are passed as empty lists, which
    the POT documentation describes as uniform weights.
    """
    pairwise = distance.cdist(test, approach)
    return ot.emd2([], [], pairwise)

In [None]:
# Score every generator against the held-out half (score1) and against the
# training half it was built from (score2); the last column is 2*score1 - score2.
np.random.seed(2)
for i, method in enumerate([approach0, approach1, approach2, approach3,
                            approach4, approach5, approach6, approach7]):
    # Use a dedicated name: assigning to `profiles` here would overwrite the
    # resampled dataset array and make the split cells non-rerunnable.
    generated = method(profilesa)
    score1 = score(generated, profilesb)
    score2 = score(generated, profilesa)
    print("Method {:d}: {:.4f} {:.4f} {:.4f}".format(i, score1, score2, 2*score1-score2))

In [None]:
# Sweep the number of SVD components q used by approach4.
# NOTE(review): q up to 29 needs at least that many singular values, i.e.
# min(profilesa.shape) >= 29 -- confirm for the dataset in use.
np.random.seed(0)
for i in range(30):
    # Use a dedicated name: assigning to `profiles` here would overwrite the
    # resampled dataset array and make the split cells non-rerunnable.
    generated = approach4(profilesa, q=i)
    score1 = score(generated, profilesb)
    score2 = score(generated, profilesa)
    print("q={:2d}: {:.4f} {:.4f} {:.4f}".format(i, score1, score2, 2*score1-score2))

In [None]:
# Sweep the number of SVD components q used by approach7.
# NOTE(review): q=0 hands KDE an array with zero columns -- confirm that
# stats.KDE supports this.
np.random.seed(0)
for i in range(10):
    # Use a dedicated name: assigning to `profiles` here would overwrite the
    # resampled dataset array and make the split cells non-rerunnable.
    generated = approach7(profilesa, q=i)
    score1 = score(generated, profilesb)
    score2 = score(generated, profilesa)
    print("q={:2d}: {:.4f} {:.4f} {:.4f}".format(i, score1, score2, 2*score1-score2))

# With scaling

In [None]:
def score(p1, p2, t1, t2, a=50):
    """Optimal-transport cost (ot.emd2) between two sets of (time, profile) samples.

    Each sample is the row vector ``[a*t, p_0, ..., p_{n-1}]``; the cost matrix
    holds the pairwise distances from ``distance.cdist`` with its default
    metric. Both weight vectors are passed as empty lists, which the POT
    documentation describes as uniform weights.

    Parameters
    ----------
    p1, p2 : ndarray, shape (m1, n_points) and (m2, n_points)
        Profiles of the two sets.
    t1, t2 : ndarray, shape (m1,) and (m2,)
        Time value of every profile.
    a : float, optional
        Weight of the time coordinate relative to the profile values.
        Defaults to 50, the value that used to be hard-coded here.

    Returns
    -------
    float
        Value returned by ``ot.emd2``.
    """
    s1 = np.hstack((t1[:, np.newaxis]*a, p1))
    s2 = np.hstack((t2[:, np.newaxis]*a, p2))
    return ot.emd2([], [], distance.cdist(s1, s2))

In [None]:
def approach0(pa, tt):
    """Baseline generator: return the given profiles and times unchanged.

    Parameters
    ----------
    pa : ndarray
        Profiles.
    tt : ndarray
        Times belonging to the profiles.

    Returns
    -------
    tuple
        ``(pa, tt)``, the very objects that were passed in.
    """
    # Return the argument itself, not the notebook-level `ta`, so the result
    # follows whatever the caller passed.
    return pa, tt

def approach1(pa, ta):
    """Bootstrap generator: resample (profile, time) pairs with replacement.

    The same random row indices are applied to ``pa`` and ``ta`` so pairs stay
    together. Uses the global NumPy random state.
    """
    count = len(pa)
    picked_rows = np.random.choice(np.arange(count), count)
    return pa[picked_rows, :], ta[picked_rows]

def approach2(pa, ta):
    """Linear generator: straight lines between KDE-sampled start and end values.

    A ``KDE`` (local ``stats`` module) is fitted on the triples
    (first value, last value, log time) of the given profiles. For every
    output row a triple is drawn, the end value is clamped at 0, and the draw
    is repeated while the end value is not below the start value. The profile
    is the straight line from start to end; the time is exp of the third entry.

    NOTE(review): presumably ``k.sample()`` returns a 2-D array whose first
    row is one draw -- confirm against ``stats.KDE``.
    """
    features = np.array([pa[:, 0], pa[:, -1], np.log(ta)]).T
    k = KDE(features, scaling=True)
    k.compute_bandwidth()
    p = np.zeros_like(pa)
    t = np.zeros_like(ta)
    n_points = pa.shape[1]
    for row in range(len(pa)):
        while True:
            draw = k.sample()[0]
            draw[1] = max(draw[1], 0)
            # Accept as soon as the end value is no longer >= the start value.
            if not draw[1] >= draw[0]:
                break
        p[row] = np.linspace(draw[0], draw[1], n_points)
        t[row] = np.exp(draw[2])
    return p, t

def approach3(pa, ta, q=2):
    """SVD + KDE generator: jointly sample log time and the first q SVD scores.

    The centred profiles are decomposed with an SVD. A ``KDE`` (local
    ``stats`` module) is fitted on the columns [log(ta), u[:, :q]], with the
    bandwidth search capped at ``k.silverman()``. Each draw yields a time
    (exp of the first entry) and a profile: the remaining entries times the
    singular values, mapped through the first q right singular vectors, plus
    the column mean.

    NOTE(review): presumably ``k.sample()`` returns a 2-D array whose first
    row is one draw -- confirm against ``stats.KDE``.
    """
    center = np.mean(pa, axis=0)
    left, sing_vals, components = np.linalg.svd(pa - center, full_matrices=False)
    log_time = np.log(ta)[:, np.newaxis]
    k = KDE(np.hstack((log_time, left[:, :q])), scaling=True)
    k.compute_bandwidth(max_bw=k.silverman())
    p = np.zeros_like(pa)
    t = np.zeros_like(ta)
    for row in range(len(pa)):
        draw = k.sample()[0]
        t[row] = np.exp(draw[0])
        p[row] = np.dot(draw[1:]*sing_vals[:q], components[:q]) + center
    return p, t

def approach4(pa, ta, q=2):
    """Shape/scale generator: SVD-sampled unit shape, KDE-sampled start, end and time.

    A ``KDE`` (local ``stats`` module) is fitted on the triples
    (first value, last value, log time). Separately, every profile is rescaled
    so that its first value maps to 1 and its last to 0, and the centred
    result is decomposed with an SVD. Each output row combines a random unit
    shape (mean plus Gaussian-weighted first q components) with a drawn
    (start, end) pair via ``shape*(start - end) + end``; the time is exp of
    the third drawn entry.

    NOTE(review): presumably ``k.sample()`` returns a 2-D array whose first
    row is one draw -- confirm against ``stats.KDE``.
    """
    features = np.array([pa[:, 0], pa[:, -1], np.log(ta)]).T
    k = KDE(features, scaling=True)
    k.compute_bandwidth()
    p = np.zeros_like(pa)
    t = np.zeros_like(ta)

    # Rescale each row in place on a copy; `row` is a view into `unit`.
    unit = pa.copy()
    for row in unit:
        row[:] = (row - row[-1]) / (row[0] - row[-1])
    center = np.mean(unit, axis=0)
    _, sing_vals, components = np.linalg.svd(unit - center, full_matrices=False)
    root_n = np.sqrt(pa.shape[0])

    for idx in range(len(pa)):
        while True:
            draw = k.sample()[0]
            draw[1] = max(draw[1], 0)
            # Same acceptance test as before; after the clamp above the
            # `< 0` part cannot be true.
            if not (draw[1] > draw[0] or draw[1] < 0):
                break
        shape = np.dot(np.random.randn(q)*sing_vals[:q]/root_n, components[:q]) + center
        p[idx] = shape
        p[idx] = p[idx] * (draw[0] - draw[1]) + draw[1]
        t[idx] = np.exp(draw[2])
    return p, t

def approach5(pa, ta, q=3, a=50):
    """Joint SVD + KDE generator: decompose [a*log(time), profile] as one vector.

    The weighted log time is prepended to every profile, the centred matrix is
    decomposed with an SVD, and a ``KDE`` (local ``stats`` module) is fitted
    on the first q columns of ``u``. Each draw is mapped back through the
    singular values and first q right singular vectors and shifted by the
    mean; entry 0 gives the time as exp(value / a), the rest is the profile.

    NOTE(review): presumably ``k.sample()`` returns a 2-D array whose first
    row is one draw -- confirm against ``stats.KDE``.
    """
    stacked = np.hstack((a*np.log(ta)[:, np.newaxis], pa))
    center = np.mean(stacked, axis=0)
    left, sing_vals, components = np.linalg.svd(stacked - center, full_matrices=False)
    k = KDE(left[:, :q], scaling=True)
    k.compute_bandwidth()
    p = np.zeros_like(pa)
    t = np.zeros_like(ta)
    for row in range(len(pa)):
        coeffs = k.sample()[0]
        rebuilt = np.dot(coeffs*sing_vals[:q], components[:q]) + center
        t[row] = np.exp(rebuilt[0]/a)
        p[row] = rebuilt[1:]
    return p, t

In [None]:
# Score every time-aware generator against the held-out half (score1) and the
# training half (score2); the last printed column is 2*score1 - score2.
for i, method in enumerate([approach0, approach1, approach2, approach3, approach4, approach5]):
    # Reseed inside the loop so each method starts from the same random state.
    np.random.seed(2)
    p, t = method(pa, ta)
    score1 = score(p, pb, t, tb)
    score2 = score(p, pa, t, ta)
    print("Method {:d}: {:.4f} {:.4f} {:.4f}".format(i, score1, score2, 2*score1-score2))

In [None]:
# Sweep the number of SVD components q used by approach5 (seeded once before the loop).
# NOTE(review): q=0 hands KDE an array with zero columns -- confirm that stats.KDE supports this.
np.random.seed(0)
for i in range(0, 10):
    p, t = approach5(pa, ta, q=i)
    score1 = score(p, pb, t, tb)
    score2 = score(p, pa, t, ta)
    print("q={:2d}: {:.4f} {:.4f} {:.4f}".format(i, score1, score2, 2*score1-score2))

## Perform comparison multiple times

In [None]:
# Repeat the comparison over nrepeat different train/test splits.
# NOTE: the loop rebinds the notebook-level indexa/indexb, scalinga/scalingb,
# pa/pb and ta/tb, so after this cell they hold the values of the last split.
nrepeat = 50
qmax = 4
np.random.seed(0)
# Column 0 holds the training-set baseline; columns 1..qmax+1 hold approach3 with q = 0..qmax.
scores1 = np.zeros((nrepeat, qmax+2))
scores2 = np.zeros_like(scores1)
for i in tqdm(range(nrepeat)):
    # The loop index doubles as the seed of the split, giving a different split per repetition.
    indexa, indexb = train_test_split(np.arange(len(dfs)), test_size=.5, random_state=i)
    scalinga, scalingb = scaling[indexa, :], scaling[indexb, :]
    pa, pb = profiles_scaled[indexa, :], profiles_scaled[indexb, :]
    ta, tb = scalinga[:, 0], scalingb[:, 0]
    
    # Using the default.
    scores1[i, 0] = score(pa, pb, ta, tb)
    
    # Using approach 3 with different q values.
    for q in range(0, qmax+1):
        p, t = approach3(pa, ta, q=q)
        scores1[i, q+1] = score(p, pb, t, tb)
        scores2[i, q+1] = score(p, pa, t, ta)
# scores2[:, 0] is never written and stays 0, so column 0 of scores is 2*scores1[:, 0].
scores = 2*scores1 - scores2

In [None]:
# One box per column of `scores`: the training-set baseline followed by q = 0..qmax.
plt.boxplot(scores)
# `scores` has qmax+2 columns, so qmax+2 tick positions need exactly qmax+2
# labels: "Training set" plus q=0..qmax (range(qmax+1)).
_ = plt.xticks(np.arange(1, qmax+3), ["Training set"]+["q={:d}".format(q) for q in range(qmax+1)])

In [None]:
# Express every q column relative to the training-set baseline (column 0 of scores).
score_difference = scores[:, 1:].copy()
for q in range(qmax+1):
    score_difference[:, q] = score_difference[:, q] - scores[:, 0]
plt.boxplot(score_difference)
# qmax+1 boxes, labelled q=0..qmax.
_ = plt.xticks(np.arange(1, qmax+2), ["q={:d}".format(q) for q in range(qmax+1)])

In [None]:
import scipy.stats

In [None]:
# Scratch data: 1000 standard-normal draws (no seed is set here, so the values differ per run).
a = np.random.randn(1000)

In [None]:
# Interquartile range of the scratch sample as computed by SciPy.
scipy.stats.iqr(a)

In [None]:
# 75th minus 25th percentile by hand -- presumably a cross-check of the SciPy value above.
np.percentile(a, 75) - np.percentile(a, 25)