In [None]:
import os
from typing import NamedTuple
import pickle
import matplotlib.pyplot as plt
import numpy as np
import ot
from tqdm import tqdm_notebook as tqdm
from scipy.spatial import distance
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from stats import KDE

In [None]:
# Load the pre-processed data (a pair `dfs, scaling`) from pickles/df.p.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join("pickles", "df.p"), "rb") as file:
    dfs, scaling = pickle.load(file)
# Transpose so that each row belongs to one profile and the columns are the
# three scaling quantities listed below.
scaling = scaling.T   # [time vstart vend]

In [None]:
# Number of points every profile is resampled to.
n = 50
# One row per data frame: the "vel" column interpolated onto n equally spaced
# points in [0, 1] of the "time" column.
# NOTE(review): presumably df["time"] is already normalised to [0, 1] -- confirm.
profiles = np.zeros((len(dfs), n))
for i, df in enumerate(dfs):
    profiles[i] = np.interp(np.linspace(0, 1, n), df["time"], df["vel"])
# Map the profiles to the range given by scaling columns 1 and 2
# (vstart and vend): value * (vstart - vend) + vend.
profiles_scaled = np.zeros_like(profiles)
for i in range(n):
    profiles_scaled[:, i] = profiles[:, i]*(scaling[:, 1] - scaling[:, 2]) + scaling[:, 2]

# Without scaling

In [None]:
# Fixed 50/50 split of the profile indices into set "a" and set "b".
indexa, indexb = train_test_split(np.arange(len(profiles)), test_size=.5, random_state=0)

In [None]:
# Slice every array with the a/b index sets.  These module-level names
# (profilesa/profilesb, pa/pb, ta/tb) are read as globals by the test
# functions defined further down.
scalinga, scalingb = scaling[indexa, :], scaling[indexb, :]
profilesa, profilesb = profiles[indexa, :], profiles[indexb, :]
pa, pb = profiles_scaled[indexa, :], profiles_scaled[indexb, :]
# Column 0 of the scaling array (time).
ta, tb = scalinga[:, 0], scalingb[:, 0]

In [None]:
def approach0(profilesa):
    """Baseline generator: hand the given profiles back unchanged."""
    generated = profilesa
    return generated

def approach1(profilesa):
    """Bootstrap generator: draw len(profilesa) rows with replacement."""
    count = len(profilesa)
    picked = np.random.choice(np.arange(count), count)
    return profilesa[picked, :]

def approach2(profilesa):
    """Generate one straight ramp from 1 down to 0 per input profile."""
    nrows, ncols = profilesa.shape[0], profilesa.shape[1]
    ramp = np.linspace(1, 0, ncols)
    return np.array([ramp] * nrows)

def approach3(profilesa):
    """Generate random parabolas that start at 1 and end at 0.

    Each generated profile is a*x**2 + b*x + 1 on x in [0, 1], with b drawn
    uniformly from (-2, 0] and a = -1 - b, so that the value at x=1 is 0.

    Parameters
    ----------
    profilesa : ndarray, shape (n_profiles, n_points)
        Only the shape and dtype are used.

    Returns
    -------
    ndarray with the same shape as `profilesa`.
    """
    # Take the grid size from the input instead of the notebook-global `n`,
    # which later cells rebind to other values.
    x = np.linspace(0, 1, profilesa.shape[1])
    profiles = np.zeros_like(profilesa)
    for i in range(len(profilesa)):
        b = -np.random.rand()*2
        a = -1-b
        profiles[i] = a*x**2 + b*x + 1
    return profiles

def approach4(profilesa, q=3):
    """Sample profiles from a Gaussian on the first q singular directions.

    The centred profiles are decomposed with an SVD; every generated row is
    the mean plus a random combination of the first q right singular vectors,
    with standard-normal coefficients scaled by s/sqrt(n_profiles).
    """
    center = np.mean(profilesa, axis=0)
    _, sing, comps = np.linalg.svd(profilesa - center, full_matrices=False)
    root_count = np.sqrt(profilesa.shape[0])
    generated = np.zeros_like(profilesa)
    for row in range(len(profilesa)):
        coeffs = np.random.randn(q) * sing[:q] / root_count
        generated[row] = np.dot(coeffs, comps[:q]) + center
    return generated

def approach5(profilesa):
    """Bootstrap the profiles and add a small random parabolic perturbation.

    The perturbation is a*x**2 + b*x with b uniform in [-0.1, 0.1) and
    a = -b, so it vanishes at both x=0 and x=1.

    Parameters
    ----------
    profilesa : ndarray, shape (n_profiles, n_points)

    Returns
    -------
    ndarray with the same shape as `profilesa`.
    """
    profiles = approach1(profilesa)
    # Take the grid size from the input instead of the notebook-global `n`,
    # which later cells rebind to other values.
    x = np.linspace(0, 1, profilesa.shape[1])
    for i in range(len(profilesa)):
        b = (2*np.random.rand() - 1)*.1
        a = -b
        profiles[i] += a*x**2 + b*x
    return profiles

def approach6(profilesa, q=3):
    """Gaussian sampling on the first q singular directions, clipped to [0, 1].

    Same construction as the unclipped SVD generator: mean plus a random
    combination of the first q right singular vectors, after which every
    value above 1 is set to 1 and every value below 0 is set to 0.
    """
    center = np.mean(profilesa, axis=0)
    _, sing, comps = np.linalg.svd(profilesa - center, full_matrices=False)
    root_count = np.sqrt(profilesa.shape[0])
    generated = np.zeros_like(profilesa)
    for row in range(len(profilesa)):
        coeffs = np.random.randn(q) * sing[:q] / root_count
        generated[row] = np.dot(coeffs, comps[:q]) + center
    np.clip(generated, 0, 1, out=generated)
    return generated

def approach7(profilesa, q=2):
    """Sample SVD coefficients from a KDE and rebuild profiles, clipped to [0, 1].

    A KDE is fitted on the first q columns of the left singular vectors of the
    centred profiles; as many samples as the KDE holds data points are drawn
    and mapped back through the singular values and right singular vectors.
    """
    center = np.mean(profilesa, axis=0)
    left, sing, comps = np.linalg.svd(profilesa - center, full_matrices=False)
    density = KDE(left[:, :q], scaling=True)
    density.compute_bandwidth()
    draws = density.sample(len(density.data))
    generated = np.dot(draws * sing[:q], comps[:q]) + center
    generated[generated > 1] = 1
    generated[generated < 0] = 0
    return generated

In [None]:
def score(approach, test):
    """Optimal-transport cost between generated profiles and a reference set.

    The cost matrix holds the pairwise distances from `scipy.spatial.distance.cdist`
    with `test` along the rows and `approach` along the columns.
    """
    pairwise = distance.cdist(test, approach)
    # Both weight vectors are passed as empty lists.
    return ot.emd2([], [], pairwise)

In [None]:
def test_without_scaling(alpha, seed=0, overwrite=False):
    """Score every unscaled generator and print the results.

    For each method three numbers are printed: the score against the held-out
    set (profilesb), the score against the training set (profilesa) and the
    combination (1+alpha)*test - alpha*train.

    Parameters
    ----------
    alpha : float
        Weight of the train/test gap in the combined score.
    seed : int
        Seed for the global NumPy RNG; also part of the cache filename.
    overwrite : bool
        Recompute even if a cached pickle exists.

    Notes
    -----
    Reads the notebook globals `profilesa`, `profilesb`, `score` and
    `approach0`..`approach7`; several of these names are redefined with other
    signatures further down the notebook, so this must run before those cells.
    """
    filename = os.path.join("pickles", 
                            "test_without_scaling_seed{:d}.p".format(seed))
    if os.path.exists(filename) and not overwrite:
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        with open(filename, "rb") as file:
            scores1, scores2 = pickle.load(file)
    else:
        # The seed was previously only used in the filename; seed the RNG so the
        # cached result for a given seed can actually be reproduced.
        np.random.seed(seed)
        approaches = [approach0, approach1, approach2, approach3, 
                      approach4, approach5, approach6, approach7]
        scores1 = np.zeros(len(approaches))
        scores2 = np.zeros_like(scores1)
        for i, method in enumerate(approaches):
            profiles = method(profilesa)
            scores1[i] = score(profiles, profilesb)
            scores2[i] = score(profiles, profilesa)
        with open(filename, "wb") as file:
            pickle.dump((scores1, scores2), file)
    
    for i in range(len(scores1)):
        print("Method {:d}: {:.4f} {:.4f} {:.4f}".format(i, scores1[i], scores2[i], 
                                                         (1+alpha)*scores1[i]-alpha*scores2[i]))

In [None]:
# Print test score, training score and combined score with alpha=1.
test_without_scaling(alpha=1)

In [None]:
# Same default seed, so the raw scores come from the same cache file;
# only the combined column changes with alpha=.25.
test_without_scaling(alpha=.25)

In [None]:
def test_approach_without_scaling(approach, qmax, alpha=1, seed=0, overwrite=False):
    """Score one unscaled generator for q = 0..qmax and print a table.

    The row(s) with the lowest combined score (1+alpha)*test - alpha*train are
    marked with an asterisk.

    Parameters
    ----------
    approach : callable
        Generator called as approach(profilesa, q=i).
    qmax : int
        Largest q to evaluate (inclusive).
    alpha : float
        Weight of the train/test gap in the combined score.
    seed : int
        Seed for the global NumPy RNG; also part of the cache filename.
    overwrite : bool
        Recompute even if a cached pickle exists.
    """
    filename = os.path.join("pickles", 
                            "test_without_scaling_{:s}_qmax{:d}_seed{:d}.p"
                            .format(approach.__name__, qmax, seed))
    if os.path.exists(filename) and not overwrite:
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        with open(filename, "rb") as file:
            scores1, scores2 = pickle.load(file)
    else:
        # The seed was previously only used in the filename; seed the RNG so the
        # cached result for a given seed can actually be reproduced.
        np.random.seed(seed)
        scores1 = np.zeros(qmax+1)
        scores2 = np.zeros_like(scores1)
        for i in range(qmax+1):
            profiles = approach(profilesa, q=i)
            scores1[i] = score(profiles, profilesb)
            scores2[i] = score(profiles, profilesa)
        with open(filename, "wb") as file:
            pickle.dump((scores1, scores2), file)
    
    combined_score = (1+alpha)*scores1-alpha*scores2
    best = np.min(combined_score)
    for i in range(len(scores1)):
        print("q={:2d}: {:.4f} {:.4f} {:.4f}".format(i, scores1[i], scores2[i], combined_score[i]),
              end="")
        if combined_score[i] == best:
            print("  *")
        else:
            print()

In [None]:
# Sweep q = 0..30 for the Gaussian SVD generator.
test_approach_without_scaling(approach4, qmax=30, alpha=.25, seed=0)

In [None]:
# Sweep q = 0..10 for the KDE-on-SVD generator.
test_approach_without_scaling(approach7, qmax=10, alpha=.25, seed=0)

# With scaling

In [None]:
def score(p1, p2, t1, t2, a=25):
    """Optimal-transport cost between two sets of (duration, profile) samples.

    Every sample is embedded as the vector [a*log(t), profile values...]; the
    cost matrix is the pairwise distance between the two embedded sets.

    Parameters
    ----------
    p1, p2 : ndarray, shape (n1, m) and (n2, m)
        Profiles of the two sets.
    t1, t2 : ndarray, shape (n1,) and (n2,)
        Positive durations belonging to the profiles.
    a : float, optional
        Weight of the log-duration coordinate relative to the profile
        coordinates (default 25, the value that used to be hard-coded).

    Returns
    -------
    float
        Value returned by `ot.emd2` with both weight vectors passed as empty lists.
    """
    s1 = np.hstack((np.log(t1[:, np.newaxis])*a, p1))
    s2 = np.hstack((np.log(t2[:, np.newaxis])*a, p2))
    return ot.emd2([], [], distance.cdist(s1, s2))

In [None]:
def approach0(pa, tt):
    """Baseline generator: return the given profiles and durations unchanged.

    Parameters
    ----------
    pa : profiles of the training set.
    tt : durations of the training set.

    Returns
    -------
    tuple (pa, tt)
    """
    # Return the argument; the original returned the notebook-global `ta`
    # and silently ignored what the caller passed in.
    return pa, tt

def approach1(pa, ta, n=None):
    """Bootstrap generator: resample (profile, duration) pairs with replacement.

    Draws `n` pairs, or as many as there are input profiles when `n` is None.
    """
    size = len(pa) if n is None else n
    picked = np.random.choice(np.arange(len(pa)), size)
    return pa[picked, :], ta[picked]

def approach2(pa, ta):
    """Generate linear profiles whose end points and duration come from a KDE.

    A KDE is fitted on (first value, last value, log duration) of the input.
    For every output row a sample is drawn, its end value is floored at 0, and
    the draw is repeated while the end value is not below the start value.
    The profile is the straight line between start and end value.
    """
    features = np.array([pa[:, 0], pa[:, -1], np.log(ta)]).T
    density = KDE(features, scaling=True)
    density.compute_bandwidth()
    p = np.zeros_like(pa)
    t = np.zeros_like(ta)
    npoints = pa.shape[1]
    for row in range(len(pa)):
        while True:
            draw = density.sample()[0]
            draw[1] = max(draw[1], 0)
            if not draw[1] >= draw[0]:
                break
        p[row] = np.linspace(draw[0], draw[1], npoints)
        t[row] = np.exp(draw[2])
    return p, t

def approach3(pa, ta, q=2, n=None):
    """Sample (log duration, first q SVD coefficients) jointly from a KDE.

    The centred profiles are decomposed with an SVD; a KDE is fitted on the
    log durations stacked with the first q left-singular-vector columns.
    Each of the `n` outputs (default: len(pa)) is rebuilt from one KDE sample.
    """
    count = len(pa) if n is None else n
    p = np.zeros((count, pa.shape[1]))
    t = np.zeros(count)
    center = np.mean(pa, axis=0)
    left, sing, comps = np.linalg.svd(pa - center, full_matrices=False)
    features = np.hstack((np.log(ta)[:, np.newaxis], left[:, :q]))
    density = KDE(features, scaling=True)
    density.compute_bandwidth(max_bw=density.silverman())
    for row in range(count):
        draw = density.sample()[0]
        # draw[0] is the log duration, the remainder are SVD coefficients.
        p[row] = np.dot(draw[1:] * sing[:q], comps[:q]) + center
        t[row] = np.exp(draw[0])
    return p, t

def approach4(pa, ta, q=2):
    """KDE for end points and duration, Gaussian SVD model for the shape.

    Start value, end value and log duration are sampled from a KDE (end value
    floored at 0, redrawn while it exceeds the start value).  The shape is
    sampled from a Gaussian on the first q singular directions of the
    profiles normalised row-wise to (x - last) / (first - last), and then
    rescaled to the sampled start and end values.
    """
    features = np.array([pa[:, 0], pa[:, -1], np.log(ta)]).T
    density = KDE(features, scaling=True)
    density.compute_bandwidth()
    p = np.zeros_like(pa)
    t = np.zeros_like(ta)
    # Row-wise normalisation, written into a copy so the dtype of pa is kept.
    first, last = pa[:, :1], pa[:, -1:]
    shapes = pa.copy()
    shapes[:] = (pa - last) / (first - last)
    center = np.mean(shapes, axis=0)
    _, sing, comps = np.linalg.svd(shapes - center, full_matrices=False)
    for row in range(len(pa)):
        while True:
            draw = density.sample()[0]
            draw[1] = max(draw[1], 0)
            if not (draw[1] > draw[0] or draw[1] < 0):
                break
        coeffs = np.random.randn(q) * sing[:q] / np.sqrt(pa.shape[0])
        p[row] = np.dot(coeffs, comps[:q]) + center
        p[row] = p[row] * (draw[0] - draw[1]) + draw[1]
        t[row] = np.exp(draw[2])
    return p, t

def approach5(pa, ta, q=3, a=25, n=None):
    """KDE on the first q SVD coefficients of the joint (a*log t, profile) vectors.

    The weighted log duration is prepended to every profile before the SVD, so
    duration and shape are reduced together.  Each of the `n` outputs
    (default: len(pa)) is rebuilt from one KDE sample; its first entry is
    converted back to a duration with exp(value / a).
    """
    count = len(pa) if n is None else n
    p = np.zeros((count, pa.shape[1]))
    t = np.zeros(count)
    joint = np.hstack((a*np.log(ta[:, np.newaxis]), pa))
    center = np.mean(joint, axis=0)
    left, sing, comps = np.linalg.svd(joint - center, full_matrices=False)
    density = KDE(left[:, :q], scaling=True)
    density.compute_bandwidth(max_bw=density.silverman())
    for row in range(count):
        draw = density.sample()[0]
        rebuilt = np.dot(draw * sing[:q], comps[:q]) + center
        p[row] = rebuilt[1:]
        t[row] = np.exp(rebuilt[0]/a)
    return p, t

In [None]:
def test_with_scaling(alpha, seed=0, overwrite=False):
    """Score every scaled generator and print the results.

    For each method three numbers are printed: the score against the held-out
    set (pb, tb), the score against the training set (pa, ta) and the
    combination (1+alpha)*test - alpha*train.

    Parameters
    ----------
    alpha : float
        Weight of the train/test gap in the combined score.
    seed : int
        Seed for the global NumPy RNG; also part of the cache filename.
    overwrite : bool
        Recompute even if a cached pickle exists.
    """
    filename = os.path.join("pickles", 
                            "test_with_scaling_seed{:d}.p".format(seed))
    if os.path.exists(filename) and not overwrite:
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        with open(filename, "rb") as file:
            scores1, scores2 = pickle.load(file)
    else:
        # The seed was previously only used in the filename; seed the RNG so the
        # cached result for a given seed can actually be reproduced.
        np.random.seed(seed)
        approaches = [approach0, approach1, approach2, approach3, approach4, approach5]
        scores1 = np.zeros(len(approaches))
        scores2 = np.zeros_like(scores1)
        for i, method in enumerate(approaches):
            p, t = method(pa, ta)
            scores1[i] = score(p, pb, t, tb)
            scores2[i] = score(p, pa, t, ta)
        with open(filename, "wb") as file:
            pickle.dump((scores1, scores2), file)
    
    for i in range(len(scores1)):
        print("Method {:d}: {:.4f} {:.4f} {:.4f}".format(i, scores1[i], scores2[i], 
                                                         (1+alpha)*scores1[i]-alpha*scores2[i]))

In [None]:
# Print test score, training score and combined score (alpha=.25) per method.
test_with_scaling(alpha=.25, seed=0)

In [None]:
def test_approach_with_scaling(approach, qmax, alpha=1, seed=0, overwrite=False):
    """Score one scaled generator for q = 0..qmax and print a table.

    The row(s) with the lowest combined score (1+alpha)*test - alpha*train are
    marked with an asterisk.

    Parameters
    ----------
    approach : callable
        Generator called as approach(pa, ta, q=i); must return (profiles, durations).
    qmax : int
        Largest q to evaluate (inclusive).
    alpha : float
        Weight of the train/test gap in the combined score.
    seed : int
        Seed for the global NumPy RNG; also part of the cache filename.
    overwrite : bool
        Recompute even if a cached pickle exists.
    """
    filename = os.path.join("pickles", 
                            "test_with_scaling_{:s}_qmax{:d}_seed{:d}.p"
                            .format(approach.__name__, qmax, seed))
    if os.path.exists(filename) and not overwrite:
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        with open(filename, "rb") as file:
            scores1, scores2 = pickle.load(file)
    else:
        # The seed was previously only used in the filename; seed the RNG so the
        # cached result for a given seed can actually be reproduced.
        np.random.seed(seed)
        scores1 = np.zeros(qmax+1)
        scores2 = np.zeros_like(scores1)
        for i in range(qmax+1):
            p, t = approach(pa, ta, q=i)
            scores1[i] = score(p, pb, t, tb)
            scores2[i] = score(p, pa, t, ta)
        with open(filename, "wb") as file:
            pickle.dump((scores1, scores2), file)
    
    combined_score = (1+alpha)*scores1-alpha*scores2
    best = np.min(combined_score)
    for i in range(len(scores1)):
        print("q={:2d}: {:7.4f} {:7.4f} {:7.4f}".format(i, scores1[i], scores2[i], combined_score[i]),
              end="")
        if combined_score[i] == best:
            print("  *")
        else:
            print()

In [None]:
# Sweep q = 0..8 for the joint (duration, profile) SVD + KDE generator.
test_approach_with_scaling(approach5, qmax=8, alpha=.3, seed=0)

## Perform comparison multiple times

In [None]:
def test_multiple_times(approach, qmax, overwrite=False):
    """Repeat the train/test comparison over 50 different random splits.

    Column 0 of the first returned array holds the score of the raw training
    set against the test set; columns 1..qmax hold the scores of `approach`
    with q = 1..qmax.  The second array holds the scores of the generated
    samples against the training set (column 0 is left at zero).

    Results are cached in the pickles directory unless `overwrite` is set.

    Returns
    -------
    (scores1, scores2) : two ndarrays of shape (50, qmax+1)
    """
    filename = os.path.join("pickles", 
                            "test_multiple_times_{:s}_qmax{:d}".format(approach.__name__, qmax))
    use_cache = os.path.exists(filename) and not overwrite
    if use_cache:
        with open(filename, "rb") as file:
            return pickle.load(file)

    nrepeat = 50
    np.random.seed(0)
    scores1 = np.zeros((nrepeat, qmax+1))
    scores2 = np.zeros_like(scores1)
    everything = np.arange(len(dfs))
    for rep in tqdm(range(nrepeat)):
        # A fresh 50/50 split per repetition, keyed by the repetition number.
        train_idx, test_idx = train_test_split(everything, test_size=.5, random_state=rep)
        p_train, p_test = profiles_scaled[train_idx, :], profiles_scaled[test_idx, :]
        t_train, t_test = scaling[train_idx, 0], scaling[test_idx, 0]

        # Reference: the training set itself as "generated" data.
        scores1[rep, 0] = score(p_train, p_test, t_train, t_test)

        # The generator for every q from 1 up to qmax.
        for q in range(1, qmax+1):
            p, t = approach(p_train, t_train, q=q)
            scores1[rep, q] = score(p, p_test, t, t_test)
            scores2[rep, q] = score(p, p_train, t, t_train)

    with open(filename, "wb") as file:
        pickle.dump((scores1, scores2), file)

    return scores1, scores2

In [None]:
def boxplots(scores):
    """Box plot of the score columns, labelled with their medians.

    Column 0 is labelled "Training set"; column q (q >= 1) is labelled "d=q".

    Parameters
    ----------
    scores : ndarray, shape (n_repeats, n_columns)
    """
    # Derive the number of boxes from the data rather than from the
    # notebook-global `qmax`, which other cells rebind.
    ncols = scores.shape[1]
    plt.boxplot(scores)
    plt.xticks(np.arange(1, ncols+1), 
               ["Training set\n{:.1f}".format(np.median(scores[:, 0]))] +
               ["d={:d}\n{:.1f}".format(q, np.median(scores[:, q])) for q in range(1, ncols)])

In [None]:
# Repeat the comparison for approach5 with q = 1..6 (cached on disk).
qmax = 6
s1, s2 = test_multiple_times(approach5, qmax)
# Combined score: test score plus alpha times the train/test gap.
alpha = .5
scores = s1 + alpha*(s1-s2)
# boxplots reads the module-level `qmax` set above.
boxplots(scores)

In [None]:
# Difference between each generator column (q >= 1) and the training-set
# reference in column 0.  The previous loop ran one step past the last
# column and labelled the boxes starting at d=0.
score_difference = scores[:, 1:] - scores[:, [0]]
plt.boxplot(score_difference)
_ = plt.xticks(np.arange(1, score_difference.shape[1]+1),
               ["d={:d}".format(q) for q in range(1, score_difference.shape[1]+1)])

## Validation of scoring measure

In [None]:
def validation_scoring(approach, qreal, nmin=100, overwrite=False):
    """Validate the scoring measure on synthetic data with a known generator.

    `approach` with q=qreal, fitted on the module-level (pa, ta), acts as the
    "true" distribution: a small set Y (nmin samples) and a large set Z
    (5000 samples) are drawn from it.  Generators fitted on Y are then scored
    against Z ("old_*") and against Y itself ("self_*").

    Column 0 of every returned array belongs to bootstrap resampling
    (approach1), columns 1..4 to `approach` with q = 1..4.

    Results are cached in the pickles directory unless `overwrite` is set.

    Returns
    -------
    (old_few, old_many, self_few, self_many) : four ndarrays of shape (10, 5)
    """
    filename = os.path.join("pickles", "test_validation_{:s}_qreal{:d}_nmin{:d}.p"
                            .format(approach.__name__, qreal, nmin))
    if os.path.exists(filename) and not overwrite:
        with open(filename, "rb") as file:
            return pickle.load(file)
    
    nmax = 5000
    nrepeat = 10
    qmax = 4
    np.random.seed(0)
    old_few = np.zeros((nrepeat, qmax+1))
    old_many = np.zeros_like(old_few)
    self_few = np.zeros_like(old_few)
    self_many = np.zeros_like(old_few)
    for i in tqdm(range(nrepeat)):
        # `approach` must accept both `n` and `q` keyword arguments.
        Y1, Y2 = approach(pa, ta, n=nmin, q=qreal)
        Z1, Z2 = approach(pa, ta, n=nmax, q=qreal)

        # Using the default.
        X1, X2 = approach1(Y1, Y2, n=nmax)
        # NOTE(review): here "few" truncates Z to nmin samples, whereas in the
        # q-loop below "few" truncates X instead -- confirm which is intended.
        old_few[i, 0] = score(X1, Z1[:nmin], X2, Z2[:nmin])
        old_many[i, 0] = score(X1, Z1, X2, Z2)
        # NOTE(review): self_few and self_many are computed identically here.
        self_few[i, 0] = score(X1, Y1, X2, Y2)
        self_many[i, 0] = score(X1, Y1, X2, Y2)

        # Using the given approach with different q values.
        for q in range(1, qmax+1):
            X1, X2 = approach(Y1, Y2, q=q, n=nmax)
            old_few[i, q] = score(X1[:nmin], Z1, X2[:nmin], Z2)
            old_many[i, q] = score(X1, Z1, X2, Z2)
            self_few[i, q] = score(X1[:nmin], Y1, X2[:nmin], Y2)
            self_many[i, q] = score(X1, Y1, X2, Y2)
            
    with open(filename, "wb") as file:
        pickle.dump((old_few, old_many, self_few, self_many), file)
    
    return old_few, old_many, self_few, self_many

In [None]:
# Use approach3 with q=3 as the "true" generator for the validation.
qreal = 3
old_few, old_many, self_few, self_many = validation_scoring(approach3, qreal)

In [None]:
# Plot directly: the `boxplot` helper is only defined in a later cell and
# labels its ticks with the bandwidth list `hh`, which does not match the
# columns here (0 = bootstrap of the training set, q = 1.. afterwards).
plt.boxplot(old_few)
_ = plt.xticks(np.arange(1, old_few.shape[1]+1),
               ["Training set"] + ["d={:d}".format(q) for q in range(1, old_few.shape[1])])

In [None]:
# Plot directly: the `boxplot` helper is only defined in a later cell and
# labels its ticks with the bandwidth list `hh`, which does not match the
# columns here (0 = bootstrap of the training set, q = 1.. afterwards).
plt.boxplot(old_many)
_ = plt.xticks(np.arange(1, old_many.shape[1]+1),
               ["Training set"] + ["d={:d}".format(q) for q in range(1, old_many.shape[1])])

In [None]:
# Sweep the penalty weight alpha over [0, 1] and, for each value, correlate
# the ad-hoc score (few samples) with the many-sample score.
alpha = np.linspace(0, 1, 101)
beta = np.zeros_like(alpha)
# Only column 2 (the q=2 results) is used in this sweep.
old_few2 = old_few[:, 2]
self_few2 = self_few[:, 2]
old_many2 = old_many[:, 2]
for i in range(len(alpha)):
    beta[i] = np.corrcoef(old_few2 + alpha[i]*(old_few2-self_few2), 
                          old_many2)[0][1]
    
plt.plot(alpha, beta)
plt.xlabel(r"$\alpha$")
plt.ylabel("Correlation")
plt.title(r"Max correlation: {:.3f} at $\alpha$= {:.2f}".format(np.max(beta), alpha[np.argmax(beta)]))

Training Y
Test Z (small)
Test Z' (large) -> approximation of the original distribution
Generation X

What we want: X comes from the same distribution as Y
Does X resemble Z? -- problem: the best you can then do is simply take X=Y

ideal(X) = W(X,Z')

oud(X) = W(X,Z)
adhoc(X;a) = W(X,Z) + a*(W(X,Z) - W(X,Y))

W(X,Z') = f(X,Y,Z)

## Try alternative way: create large number of test cases

In [None]:
# Synthetic validation with approach5 as the "true" generator.
# Column 0 of every score array belongs to bootstrap resampling (approach1);
# column q+1 belongs to approach3 with q = 0..qmax.
nmin, nmax = 100, 5000
nrepeat = 10
qmax = 4
np.random.seed(0)
old_scores_few = np.zeros((nrepeat, qmax+2))
old_scores_many = np.zeros_like(old_scores_few)
new_scores_few = np.zeros_like(old_scores_few)
new_scores_many = np.zeros_like(old_scores_few)
for i in tqdm(range(nrepeat)):
    Y1, Y2 = approach5(pa, ta, n=nmin)
    # NOTE(review): Z is drawn with n=nmin here, whereas validation_scoring
    # draws Z with n=nmax -- confirm that the small test set is intended.
    Z1, Z2 = approach5(pa, ta, n=nmin)
    
    # Using the default.
    X1, X2 = approach1(Y1, Y2, n=nmax)
    old_scores_few[i, 0] = score(X1[:nmin], Z1, X2[:nmin], Z2)
    old_scores_many[i, 0] = score(X1, Z1, X2, Z2)
    # "new" score = 2*old - self, i.e. old + 1*(old - self).
    new_scores_few[i, 0] = 2*old_scores_few[i, 0] - score(X1[:nmin], Y1, X2[:nmin], Y2)
    new_scores_many[i, 0] = 2*old_scores_many[i, 0] - score(X1, Y1, X2, Y2)
    
    # Using approach 3 with different q values.
    for q in range(0, qmax+1):
        X1, X2 = approach3(Y1, Y2, q=q, n=nmax)
        old_scores_few[i, q+1] = score(X1[:nmin], Z1, X2[:nmin], Z2)
        old_scores_many[i, q+1] = score(X1, Z1, X2, Z2)
        new_scores_few[i, q+1] = 2*old_scores_few[i, q+1] - score(X1[:nmin], Y1, X2[:nmin], Y2)
        new_scores_many[i, q+1] = 2*old_scores_many[i, q+1] - score(X1, Y1, X2, Y2)

In [None]:
# Plot directly: the `boxplot` helper is only defined in a later cell and
# labels its ticks with the bandwidth list `hh`, which does not match the
# columns here (0 = bootstrap of the training set, then q = 0, 1, ...).
plt.boxplot(old_scores_few)
_ = plt.xticks(np.arange(1, old_scores_few.shape[1]+1),
               ["Training set"] + ["d={:d}".format(q) for q in range(old_scores_few.shape[1]-1)])

In [None]:
# Plot directly: the `boxplot` helper is only defined in a later cell and
# labels its ticks with the bandwidth list `hh`, which does not match the
# columns here (0 = bootstrap of the training set, then q = 0, 1, ...).
plt.boxplot(old_scores_many)
_ = plt.xticks(np.arange(1, old_scores_many.shape[1]+1),
               ["Training set"] + ["d={:d}".format(q) for q in range(old_scores_many.shape[1]-1)])

## Try with uniform distribution

In [None]:
def naive(x, n=None):
    """Bootstrap: draw `n` rows of `x` with replacement (default: len(x) rows)."""
    size = len(x) if n is None else n
    picked = np.random.choice(np.arange(len(x)), size)
    return x[picked, :]

def generate_new(x, n=None, h=.1):
    """Fit a KDE with bandwidth `h` on `x` and draw `n` samples (default: len(x))."""
    density = KDE(x)
    density.set_bandwidth(h)
    size = len(x) if n is None else n
    return density.sample(size)

def s(a, b):
    """Optimal-transport cost between sample sets `a` and `b`.

    The cost matrix is the pairwise distance from `distance.cdist(a, b)`;
    both weight vectors are passed to `ot.emd2` as empty lists.
    """
    pairwise = distance.cdist(a, b)
    return ot.emd2([], [], pairwise)

In [None]:
# Toy experiment with standard-normal data: compare the test score, the
# self score and the score against a larger reference sample for bootstrap
# resampling (column 0) and for KDE sampling at several bandwidths.
np.random.seed(1)
nmin, nmax = 5, 100
nrepeat = 500
hh = np.array([.01, 0.02, .04, .06, .08, .1, .15, .2, .3, .4])*2.5
# Reference sample standing in for the true distribution.  (A previous
# assignment of an equally spaced grid on [0, 1] was dead code: it was
# overwritten immediately and consumed no random numbers.)
real_numbers = np.atleast_2d(np.random.randn(1, 200)).T

test_scores = np.zeros((nrepeat, len(hh)+1))
self_scores = np.zeros_like(test_scores)
real_scores = np.zeros_like(test_scores)

for i in tqdm(range(nrepeat)):
    # y: training sample, z: small test sample.
    y = np.random.randn(1, nmin).T
    z = np.random.randn(1, nmin).T
    
    x = naive(y, n=nmax)
    test_scores[i, 0] = s(x, z)
    self_scores[i, 0] = s(x, y)
    real_scores[i, 0] = s(x, real_numbers)
    for j, h in enumerate(hh):
        x = generate_new(y, n=nmax, h=h)
        test_scores[i, j+1] = s(x, z)
        self_scores[i, j+1] = s(x, y)
        real_scores[i, j+1] = s(x, real_numbers)

In [None]:
def boxplot(s):
    """Box plot of score columns on a new 12x4 figure, labelled with their means.

    Column 0 is labelled "Training set"; column i+1 is labelled with the
    i-th bandwidth of the module-level list `hh`.
    """
    plt.subplots(1, 1, figsize=(12, 4))
    plt.boxplot(s)
    labels = ["Training set\n{:.3f}".format(np.mean(s[:, 0]))]
    for i, h in enumerate(hh):
        labels.append("h={:.3f}\n{:.3f}".format(h, np.mean(s[:, i+1])))
    positions = np.arange(1, len(hh)+2)
    _ = plt.xticks(positions, labels)
boxplot(real_scores)

In [None]:
# Scores of the generated samples against the small test sample z.
boxplot(test_scores)

In [None]:
# Ad-hoc score: test score plus 0.31 times the gap between test and self score.
# NOTE(review): 0.31 is hard-coded; presumably the best alpha from the sweep
# in a later cell -- confirm.
boxplot(test_scores + 0.31*(test_scores - self_scores))

In [None]:
# Column means over all repetitions: a = test, b = self, c = reference score.
a = np.mean(test_scores, axis=0)
b = np.mean(self_scores, axis=0)
c = np.mean(real_scores, axis=0)
# For every alpha in [0, 1], correlate the ad-hoc score a + alpha*(a-b)
# with the reference score across the columns.
alpha = np.linspace(0, 1, 101)
beta = np.zeros_like(alpha)
for i in range(len(alpha)):
    beta[i] = np.corrcoef(a + alpha[i]*(a-b), c)[0][1]
plt.plot(alpha, beta)
plt.xlabel(r"$\alpha$")
plt.ylabel("Correlation *true metric* and *ad-hoc method*")
# Cell output: the alpha with the highest correlation and that correlation.
alpha[np.argmax(beta)], np.max(beta)

In [None]:
# Use the alpha that maximises the correlation in the sweep instead of a
# hard-coded value, so the title always matches the plotted data.
best = np.argmax(beta)
alpha_star = alpha[best]
plt.plot(a+alpha_star*(a-b), c, '.')
plt.xlabel("Ad-hoc metric")
plt.ylabel("True metric")
plt.title(r"$\alpha^*={:.2f}$, Correlation: {:.4f}".format(alpha_star, beta[best]))

In [None]:
def g1(n):
    """Uniform generator: n samples from [0, 1) as an (n, 1) array."""
    return np.random.random_sample((n, 1))
def g2(n):
    """Normal generator, 1D: n standard-normal samples as an (n, 1) array."""
    return np.random.standard_normal((n, 1))
def g3(n):
    """Normal generator, 2D: n standard-normal samples as an (n, 2) array."""
    return np.random.standard_normal((n, 2))
# Fixed KDE with data points -1, 0.2 and 1 and bandwidth 0.2; shared by g4.
gkde = KDE([-1, .2, 1], bandwidth=.2)
def g4(n):
    """KDE generator: n samples drawn from the module-level `gkde`."""
    draws = gkde.sample(n)
    return draws
def g5(n):
    """Normal generator, 3D: n standard-normal samples as an (n, 3) array."""
    return np.random.standard_normal((n, 3))
def g6(n):
    """Normal generator, 4D: n standard-normal samples as an (n, 4) array."""
    return np.random.standard_normal((n, 4))

In [None]:
# Histogram of 1000 draws from the KDE-based generator g4.
plt.hist(g4(1000))

In [None]:
# Result record returned (and pickled) by find_alpha:
#   alpha            - penalty weight with the highest correlation
#   correlation      - that highest correlation
#   old_correlation  - correlation at alpha = 0 (plain test score)
#   *_scores         - raw score arrays, one row per repetition
TestResult = NamedTuple("TestResult", [("alpha", float),
                                       ("correlation", float),
                                       ("old_correlation", float),
                                       ("test_scores", np.ndarray),
                                       ("self_scores", np.ndarray),
                                       ("real_scores", np.ndarray)])                                       
# KDE bandwidths evaluated by find_alpha; this rebinds the `hh` of the
# earlier toy experiment, which used a factor 2.5 instead of 3.
hh = np.array([.01, 0.02, .04, .06, .08, .1, .15, .2, .3, .4])*3
def find_alpha(generator, nyz, nx=100, nreal=500, nrepeat=500, seed=0, overwrite=False):
    """Find the penalty weight alpha for which the ad-hoc score tracks the true score best.

    In every repetition a training sample y and a test sample z (both of size
    `nyz`) are drawn from `generator`.  New samples x are produced by bootstrap
    resampling (column 0) and by KDE sampling for every bandwidth in the
    module-level `hh` (columns 1..), and scored against z (test), y (self) and
    a large reference sample (real).  The column means are then used to find
    the alpha in [0, 1] that maximises the correlation between
    test + alpha*(test - self) and the real score.

    Parameters
    ----------
    generator : callable
        Called as generator(n); must return an (n, d) array.
    nyz : int
        Size of the training and test samples.
    nx : int
        Number of generated samples per method.
    nreal : int
        Size of the reference sample.
    nrepeat : int
        Number of repetitions.
    seed : int
        Seed for the global NumPy RNG; also part of the cache filename.
    overwrite : bool
        Recompute even if a cached pickle exists.

    Returns
    -------
    TestResult
    """
    filename = os.path.join('pickles', 
                            'result_{:s}_nx{:d}_ny{:d}_nreal{:d}_nrepeat{:d}_seed{:d}.p'
                            .format(generator.__name__, nx, nyz, nreal, nrepeat, seed))
    if not overwrite and os.path.exists(filename):
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        with open(filename, 'rb') as file:
            result = pickle.load(file)
        return result    
    
    # Seed the RNG.  This used to call np.random.rand(seed), which only draws
    # `seed` random numbers and leaves the generator unseeded.
    np.random.seed(seed)
    real_numbers = generator(nreal)
    test_scores = np.zeros((nrepeat, len(hh)+1))
    self_scores = np.zeros_like(test_scores)
    real_scores = np.zeros_like(test_scores)

    for i in tqdm(range(nrepeat), leave=False):
        y = generator(nyz)
        z = generator(nyz)

        # Column 0: bootstrap resampling of the training sample.
        x = naive(y, n=nx)
        test_scores[i, 0] = s(x, z)
        self_scores[i, 0] = s(x, y)
        real_scores[i, 0] = s(x, real_numbers)
        # Columns 1..: KDE sampling, one column per bandwidth.
        for j, h in enumerate(hh):
            x = generate_new(y, n=nx, h=h)
            test_scores[i, j+1] = s(x, z)
            self_scores[i, j+1] = s(x, y)
            real_scores[i, j+1] = s(x, real_numbers)
            
    a = np.mean(test_scores, axis=0)
    b = np.mean(self_scores, axis=0)
    c = np.mean(real_scores, axis=0)
    alpha = np.linspace(0, 1, 101)
    beta = np.zeros_like(alpha)
    for i in range(len(alpha)):
        beta[i] = np.corrcoef(a + alpha[i]*(a-b), c)[0][1]
    result = TestResult(alpha=alpha[np.argmax(beta)], 
                        correlation=np.max(beta), 
                        old_correlation=beta[0],
                        test_scores=test_scores, self_scores=self_scores, real_scores=real_scores)
    with open(filename, 'wb') as file:
        pickle.dump(result, file)
    return result

In [None]:
# Sweep the first four generators over several training/test sample sizes.
# The size variable is named `nyz` so that the notebook-global `n` (the
# profile grid size) is not overwritten; the last column is the correlation
# at alpha = 0, i.e. of the plain test score.
for i, g in enumerate([g1, g2, g3, g4], start=1):
    for nyz in [2, 5, 20]:
        r = find_alpha(g, nyz)
        print("Generator {:d}, n={:2d}, alpha={:.4f},".format(i, nyz, r.alpha),
              "corr={:.4f}, corr at a=0: {:.4f}".format(r.correlation, r.old_correlation))

In [None]:
# Same sweep with seed=1 to check the stability of the selected alpha.
# The size variable is named `nyz` so that the notebook-global `n` (the
# profile grid size) is not overwritten; the last column is the correlation
# at alpha = 0, i.e. of the plain test score.
for i, g in enumerate([g1, g2, g3, g4], start=1):
    for nyz in [2, 5, 20]:
        r = find_alpha(g, nyz, seed=1)
        print("Generator {:d}, n={:2d}, alpha={:.4f},".format(i, nyz, r.alpha),
              "corr={:.4f}, corr at a=0: {:.4f}".format(r.correlation, r.old_correlation))

In [None]:
# 3D normal generator with larger samples and fewer repetitions
# (find_alpha caches the result on disk).
res = find_alpha(g5, 100, nx=1000, nrepeat=100, nreal=5000)

In [None]:
def boxplots(result):
    """Draw three box plots for a find_alpha result.

    In order: the true metric (real scores), the old score (test scores) and
    the ad-hoc metric test + alpha*(test - self), each on its own figure.
    Note that this definition replaces the earlier `boxplots(scores)` helper.
    """
    adhoc_scores = result.test_scores + result.alpha*(result.test_scores - result.self_scores)
    old_title = "Old score (correlation={:.4f})".format(result.old_correlation)
    adhoc_title = r"Ad-hoc metric ($\alpha$={:.2f}, correlation={:.4f})".format(result.alpha, 
                                                                            result.correlation)
    boxplot(result.real_scores)
    plt.title("True metric")
    boxplot(result.test_scores)
    plt.title(old_title)
    boxplot(adhoc_scores)
    plt.title(adhoc_title)

In [None]:
# True metric, old score and ad-hoc metric for the 3D normal result.
boxplots(res)

In [None]:
# 4D normal generator with the same settings as the 3D run.
res2 = find_alpha(g6, 100, nx=1000, nrepeat=100, nreal=5000)

In [None]:
# Plot the 4D result computed in the previous cell.  This cell used to be an
# inline copy of boxplots() applied to `res`, so the 3D figures were drawn a
# second time and `res2` was never shown.
boxplots(res2)