In [1]:
from copy import deepcopy
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.special
import scipy.spatial.distance as dist
from sklearn.model_selection import train_test_split
import time
import pickle
from stats import Copula, CopulaPairs, CopulaOptions, KDE, GaussianMixture
from IPython.core.debugger import set_trace

## Generate the data

In [None]:
# Set parameters: the random seed, the number of samples, and the two arrays that are
# passed to GaussianMixture(MUS, SIGMAS) in the next cell.
SEED = 0
NDATA = 1000  # NOTE: NDATA is rebound by several later cells (1000, 200 and 500).
MUS = np.array([[0.5, 0.5], [-0.5, -1]])  # presumably one mean vector per mixture component
SIGMAS = np.array([[[0.5, -0.1], [-0.1, 0.25]],
                   [[0.5, 0.3], [0.3, 0.4]]])  # presumably one 2x2 covariance matrix per component
np.random.seed(SEED)  # seed NumPy's global random number generator

In [None]:
# Generate the data and obtain the PDF.
# MIXTURE.pdf() is unpacked into a pair of coordinate arrays and an array of PDF values;
# these are handed to plt.contour(X1PDF, X2PDF, YPDF) in the plotting cell.
MIXTURE = GaussianMixture(MUS, SIGMAS)
SAMPLES = MIXTURE.generate_samples(NDATA)
(X1PDF, X2PDF), YPDF = MIXTURE.pdf()

In [None]:
# Show the data and the true PDF.
# The colormap is passed by name: plt.cm.get_cmap() was deprecated in Matplotlib 3.7 and
# removed in 3.9, whereas a colormap name is accepted by hist2d in every version.
HIST2D = plt.hist2d(SAMPLES[:, 0], SAMPLES[:, 1], bins=10, cmap='BuGn_r')
plt.contour(X1PDF, X2PDF, YPDF)
# HIST2D[1] and HIST2D[2] are the x and y bin edges; limit the view to the histogram extent.
plt.xlim([np.min(HIST2D[1]), np.max(HIST2D[1])])
plt.ylim([np.min(HIST2D[2]), np.max(HIST2D[2])])
plt.title("Histogram of data with contour plot of real PDF")
plt.show()

## Map data to uniformly distributed data

In [None]:
# Construct the Kernel Density Estimations.
# One KDE is built per column of SAMPLES (i.e. per marginal), after which
# compute_bandwidth() is called on each of them.
KDES = [KDE(SAMPLES[:, i]) for i in range(SAMPLES.shape[1])]
for kde in KDES:
    kde.compute_bandwidth()

In [None]:
# Obtain the uniform data using the CDF of the KDE.
# Column i of SAMPLES is passed through the CDF of its own marginal KDE and stored in
# column i of USAMPLES, which has the same shape and dtype as SAMPLES.
USAMPLES = np.zeros_like(SAMPLES)
for i, kde in enumerate(KDES):
    USAMPLES[:, i] = kde.cdf(SAMPLES[:, i])

In [None]:
# 2-D histogram of the transformed data on the unit square.
# The colormap is passed by name: plt.cm.get_cmap() was deprecated in Matplotlib 3.7 and
# removed in 3.9, whereas a colormap name is accepted by hist2d in every version.
HIST2D = plt.hist2d(USAMPLES[:, 0], USAMPLES[:, 1], bins=10, cmap='BuGn_r',
                    range=((0, 1), (0, 1)))

## Construct the copula density

In [None]:
# First, convert the data, such that each marginal is normally distributed.
# sqrt(2) * erfinv(2*u - 1) is the inverse CDF of the standard normal distribution.
# NOTE(review): erfinv returns -inf/+inf for u = 0 or u = 1 — confirm that kde.cdf never
# returns exactly 0 or 1.
NSAMPLES = scipy.special.erfinv(USAMPLES*2-1)*np.sqrt(2)

In [None]:
# Construct the KDE of the "normally" distributed data.
# Unlike the marginal KDEs, this one is built on all columns of NSAMPLES at once.
CKDE = KDE(NSAMPLES)
CKDE.compute_bandwidth()

In [None]:
# Compute the log-likelihood of all data.
# The copula log-density is log f(z) - sum_i log phi(z_i), where f is the KDE of the
# normal-transformed data and -log phi(z) = log(2*pi)/2 + z**2/2 for the standard normal phi.
# The constant term is derived from NSAMPLES.size (samples times dimensions) instead of the
# global NDATA: NDATA*log(2*pi) is only correct for two-dimensional data, and NDATA is rebound
# by later cells, so re-running this cell out of order would silently use the wrong count.
SCORE_COPULA = (np.sum(np.log(CKDE.score_samples(NSAMPLES))) +
                NSAMPLES.size/2*np.log(2*np.pi) + 1/2*np.sum(NSAMPLES**2))
# Sum of the log-densities of every column under its own marginal KDE.
SCORE_MARGINALS = np.sum([np.sum(np.log(kde.score_samples(SAMPLES[:, i])))
                          for i, kde in enumerate(KDES)])
SCORE = SCORE_COPULA + SCORE_MARGINALS

In [None]:
# Compute the score if we would directly use the KDE.
# A single KDE is built on all columns of SAMPLES; the score is the sum of the logarithms
# of score_samples() evaluated on the same data.
KDE_DATA = KDE(SAMPLES)
KDE_DATA.compute_bandwidth()
SCORE_KDE = np.sum(np.log(KDE_DATA.score_samples(SAMPLES)))

In [None]:
# Print both summed log scores with two decimals so that they can be compared.
print("Score with copula: {:.2f}".format(SCORE))
print("Score with KDE:    {:.2f}".format(SCORE_KDE))

In [None]:
# Show that same result is obtained when using the Copula class.
# The sum is the last expression of the cell, so its value is shown as the cell output.
COPULA = Copula(SAMPLES)
np.sum(np.log(COPULA.pdf(SAMPLES)))

## Compute likelihood of newly generated data

In [None]:
# Compute the score if we would directly use the KDE.
# NOTE: NDATA and SAMPLES are rebound here to a fresh draw (seed SEED+1). KDE_DATA, KDES,
# CKDE and COPULA were constructed from the previous SAMPLES in earlier cells, so from this
# cell on SAMPLES acts as unseen data for those models.
NDATA = 1000
np.random.seed(SEED+1)
SAMPLES = MIXTURE.generate_samples(NDATA)
SCORE_KDE = np.sum(np.log(KDE_DATA.score_samples(SAMPLES)))

In [None]:
# Compute the score when using the copula.
# Map every column to (approximately) uniform data with the CDF of its marginal KDE.
USAMPLES = np.zeros_like(SAMPLES)
for i, kde in enumerate(KDES):
    USAMPLES[:, i] = kde.cdf(SAMPLES[:, i])
# sqrt(2) * erfinv(2*u - 1) is the inverse CDF of the standard normal distribution.
NSAMPLES = scipy.special.erfinv(USAMPLES*2-1)*np.sqrt(2)
# The copula log-density is log f(z) - sum_i log phi(z_i), with -log phi(z) = log(2*pi)/2 + z**2/2.
# The constant term is derived from NSAMPLES.size (samples times dimensions) instead of the
# global NDATA: NDATA*log(2*pi) is only correct for two-dimensional data, and NDATA is rebound
# by other cells, so it can be out of sync with the data that is actually scored.
SCORE_COPULA = (np.sum(np.log(CKDE.score_samples(NSAMPLES))) +
                NSAMPLES.size/2*np.log(2*np.pi) + 1/2*np.sum(NSAMPLES**2))
# Sum of the log-densities of every column under its own marginal KDE.
SCORE_MARGINALS = np.sum([np.sum(np.log(kde.score_samples(SAMPLES[:, i])))
                          for i, kde in enumerate(KDES)])
SCORE = SCORE_COPULA + SCORE_MARGINALS

In [None]:
# Print the manually computed copula score, the plain KDE score, and the score obtained
# from the Copula class, all evaluated on the current SAMPLES.
print("Score with copula: {:.2f}".format(SCORE))
print("Score with KDE:    {:.2f}".format(SCORE_KDE))
print("Score with COPULA: {:.2f}".format(np.sum(np.log(COPULA.pdf(SAMPLES)))))

## Test copula on three dimensional data

The first two columns of the data come from the same distribution as the 2-dimensional data. The third column contains data that is normally distributed with a mean of $\mu$ and a standard deviation of $\sigma$, where

$$\mu = (y+1)^2,$$
$$\sigma = \frac{1}{\sqrt{|y|}+1},$$

where $y$ corresponds to the second column.

In [None]:
# Number of samples per data set for this section; reseed NumPy's global RNG.
NDATA = 200
np.random.seed(SEED)

def dependent_data(data_in):
    """Draw one Gaussian value per entry of `data_in`, with moments that depend on that entry.

    For an entry y, the result is a standard normal draw (taken from NumPy's global
    random number generator) divided by sqrt(|y|) + 1 and shifted by (y + 1)**2.

    Parameters
    ----------
    data_in : numpy.ndarray
        One-dimensional array of values that determine the mean and the spread.

    Returns
    -------
    numpy.ndarray
        Array with len(data_in) entries.
    """
    noise = np.random.randn(len(data_in))
    # Both updates write into the noise array itself, exactly like `/=` and `+=` would.
    np.divide(noise, np.sqrt(np.abs(data_in)) + 1, out=noise)
    np.add(noise, (data_in + 1)**2, out=noise)
    return noise

def get_data(ndata):
    """Generate `ndata` samples from MIXTURE and append a dependent extra column.

    The extra column is produced by dependent_data() from column 1 of the mixture samples
    and is appended as the last column of the returned two-dimensional array.
    """
    base = MIXTURE.generate_samples(ndata)
    extra_column = dependent_data(base[:, 1]).reshape(-1, 1)
    return np.concatenate((base, extra_column), axis=1)
# Two independent draws from get_data: one to fit the models on, one to evaluate them on.
SAMPLES = get_data(NDATA)
TEST = get_data(NDATA)

In [None]:
def test_copula(samples, test, pairs):
    """Fit several density models on `samples`, print their summed log scores on `test`.

    Four models are compared: a single KDE on all columns, one independent KDE per column,
    a Copula on all columns, and a CopulaPairs model built from `pairs`. In addition, the
    copula of every pair is drawn as a filled contour plot on the unit square.

    Parameters
    ----------
    samples : numpy.ndarray
        Two-dimensional array (rows are samples, columns are variables) to fit the models on.
    test : numpy.ndarray
        Two-dimensional array with the same number of columns, used to evaluate the models.
    pairs : list of tuple
        Column index pairs that are passed to CopulaPairs.

    Returns
    -------
    tuple
        (KDE on all columns, list with one KDE per column, Copula, CopulaPairs).
    """
    # Use KDE and compute the scores.
    normal_kde = KDE(samples)
    normal_kde.compute_bandwidth()
    print("Score of regular KDE:       {:.2f}"
          .format(np.sum(np.log(normal_kde.score_samples(test)))))

    # Compute score if it is assumed that the data is independent.
    # One KDE per column: the number of columns is taken from the data, because this
    # function is also called with four- and eight-column arrays (a fixed count of three
    # would silently leave the remaining columns out of the score).
    kdes = [KDE(samples[:, i]) for i in range(samples.shape[1])]
    score_ind = 0
    for i, kde in enumerate(kdes):
        kde.compute_bandwidth()
        score_ind += np.sum(np.log(kde.score_samples(test[:, i])))
    print("Score assuming independent: {:.2f}".format(score_ind))

    # Create big copula.
    copula = Copula(samples, CopulaOptions())
    print("Score 3D copula:            {:.2f}".format(np.sum(np.log(copula.pdf(test)))))

    # Create copulas pairs and show the copula pairs.
    copula_pairs = CopulaPairs(samples, pairs)
    print("Score for pairs:            {:.2f}".format(np.sum(np.log(copula_pairs.pdf(test)))))
    copulas = copula_pairs.parms.copulas
    # Panels are at most 4 inches high, and the total figure width is capped at 16 inches.
    height = np.min((4, 16/len(copulas)))
    # squeeze=False always returns a 2-D array of axes, so a single pair (one subplot)
    # can be iterated over in the same way as several pairs.
    _, axes = plt.subplots(1, len(copulas),
                           figsize=(height*len(copulas), height), squeeze=False)
    x = np.linspace(0, 1, 51)
    xx, yy = np.meshgrid(x, x)
    # Stack the grid into shape (51, 51, 2): one (x, y) coordinate pair per grid point.
    xxyy = np.transpose(np.array([xx, yy]), [1, 2, 0])
    for c, ax in zip(copulas, axes.ravel()):
        ax.contourf(x, x, c.copula(xxyy))

    return normal_kde, kdes, copula, copula_pairs

In [None]:
# Run the comparison with the column pairs (0, 1) and (1, 2). The returned tuple is
# unpacked as: k = KDE on all columns, i = list of per-column KDEs, c = Copula, p = CopulaPairs.
k, i, c, p = test_copula(SAMPLES, TEST, [(0, 1), (1, 2)])

In [None]:
# Separate KDEs on the column pairs (0, 1) and (1, 2), each with its own computed bandwidth.
# The commented-out lines are an alternative that reuses the bandwidth of the KDE k instead.
k1 = KDE(SAMPLES[:, :2])
k2 = KDE(SAMPLES[:, 1:])
#k1.set_bandwidth(k.bandwidth)
#k2.set_bandwidth(k.bandwidth)
k1.compute_bandwidth()
k2.compute_bandwidth()

In [None]:
# Combine the two pairwise KDEs into one score per test sample: k1(x0, x1) * k2(x1, x2) / pb2.
# Both CDFs are evaluated with the coordinate that is not column 1 fixed at 1000.
# NOTE(review): pb1 is computed but never used.
# NOTE(review): pb2 is a CDF value, whereas a decomposition of the form
# p(x0, x1) * p(x1, x2) / p(x1) would divide by the marginal density of column 1 —
# confirm that dividing by the CDF is intended.
pb1 = k1.cdf(np.array([[1000, x] for x in TEST[:, 1]]))
pb2 = k2.cdf(np.array([[x, 1000] for x in TEST[:, 1]]))
score = k1.score_samples(TEST[:, :2]) * k2.score_samples(TEST[:, 1:]) / pb2
np.sum(np.log(score))

In [None]:
# Reference value: summed log score of the KDE k on TEST (shown as the cell output).
np.sum(np.log(k.score_samples(TEST)))

In [None]:
# Per-sample ratio between the pairwise score and the score of the KDE k.
score / k.score_samples(TEST)

PROBLEM: The above works well with `NDATA = 500`. However, for other values (e.g., `50` or `1000`), the copulas do not perform well. I think this is because, by chance, one test sample lies far away from the original samples, which results in a very low value when its marginal probability is calculated.

## Test copula on four dimensional data

The first three columns are generated in the same way as for the previous dataset. The fourth column is generated like the third column of the previous dataset, but now the dependence is on the first column instead of the second column.

In [None]:
# Number of samples per data set for this section; reseed NumPy's global RNG with SEED+4.
NDATA = 500
np.random.seed(SEED+4)

def get_data4(ndata):
    """Generate `ndata` samples with get_data() and append one more dependent column.

    The additional column is produced by dependent_data() from column 0 of the get_data()
    output and is appended as the last column of the returned two-dimensional array.
    """
    base = get_data(ndata)
    extra_column = dependent_data(base[:, 0]).reshape(-1, 1)
    return np.concatenate((base, extra_column), axis=1)
# Two independent draws from get_data4: one to fit the models on, one to evaluate them on.
# NOTE: this rebinds SAMPLES and TEST, which earlier cells used for other data sets.
SAMPLES = get_data4(NDATA)
TEST = get_data4(NDATA)

In [None]:
# Run the comparison with the column pairs (0, 1), (1, 2) and (0, 3); the returned objects
# are assigned to `_`, so only the printed scores and the figure are shown.
_ = test_copula(SAMPLES, TEST, [(0, 1), (1, 2), (0, 3)])

## Try it for data that is strongly correlated

In [None]:
def new_samples(s):
    """Return a rescaled, noisy copy of `s`; `s` itself is left untouched.

    Columns 0, 1, 2 and 3 of the copy are multiplied by -1, 2, 0.5 and -2 respectively
    (further columns are not rescaled). Afterwards, Gaussian noise with a standard
    deviation of 0.1, drawn from NumPy's global random number generator, is added to
    every entry.
    """
    scaled = s.copy()
    for column, factor in enumerate((-1, 2, 0.5, -2)):
        scaled[:, column] *= factor
    scaled += np.random.randn(*s.shape)*0.1
    return scaled
# Append the rescaled noisy copy as extra columns, so that column j+4 is strongly
# correlated with column j (eight columns in total for four-column input).
SAMPLES2 = np.concatenate((SAMPLES, new_samples(SAMPLES)), axis=1)
TEST2 = np.concatenate((TEST, new_samples(TEST)), axis=1)

In [None]:
# Run the comparison on the eight-column data. The returned fitted objects are assigned
# to `_` so that the cell shows only the printed scores and the figure instead of the
# repr of the returned tuple (same convention as the four-dimensional test cell).
_ = test_copula(SAMPLES2, TEST2, [(0, 1), (1, 2), (0, 3), (0, 4), (1, 5), (2, 6), (3, 7)])

## Try the copulas for the real data

In [None]:
# Open the dataset
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# 'pickles/df.p' when it comes from a trusted source.
with open(os.path.join('pickles', 'df.p'), 'rb') as f:
    dfs, scaling = pickle.load(f)
scaling = scaling.T  # [time vstart vend]
scaling = scaling[scaling[:, 2] > 0, :]  # Remove full stops
scaling[:, 1] = scaling[:, 1] - scaling[:, 2]  # Now it becomes: [time deltav vend] (less correlation)
scaling[:, 0] = scaling[:, 1] / scaling[:, 0]  # Now it becomes: [deceleration deltav vend] (better behaved)
# Standardize every column to zero mean and unit standard deviation.
std_scaling = np.std(scaling, axis=0)
mean_scaling = np.mean(scaling, axis=0)
scaling = (scaling - mean_scaling) / std_scaling

In [None]:
# Hold out 10% of the standardized data for testing (fixed random_state for reproducibility).
samples, test = train_test_split(scaling, test_size=0.1, random_state=0)
# The returned fitted objects are assigned to `_` so that the cell shows only the printed
# scores and the figure instead of the repr of the returned tuple.
_ = test_copula(samples, test, [(0, 1), (1, 2)])

In [None]:
# Same comparison with the column pairs (0, 1) and (0, 2). The returned fitted objects are
# assigned to `_` so that only the printed scores and the figure are shown.
_ = test_copula(samples, test, [(0, 1), (0, 2)])

In [None]:
# Same comparison with the column pairs (0, 2) and (1, 2). The returned fitted objects are
# assigned to `_` so that only the printed scores and the figure are shown.
_ = test_copula(samples, test, [(0, 2), (1, 2)])

In [None]:
# First column of the current SAMPLES array (the four-column synthetic data set when the
# notebook is run from top to bottom), used for the one-dimensional KDE experiments.
x = SAMPLES[:, 0]

In [None]:
# Fit a KDE on x and keep a deep copy (kold) as the baseline, taken before the bandwidth
# of k is modified. NOTE: this rebinds k, which previously held the KDE from test_copula.
k=KDE(x)
k.compute_bandwidth()
kold = deepcopy(k)

In [None]:
# Summed log score of k on the data it was built from, and on the first column of TEST.
print(np.sum(np.log(k.score_samples(x))))
print(np.sum(np.log(k.score_samples(TEST[:, 0]))))

In [None]:
# Derive a new bandwidth from the baseline one: minus the 95th percentile of
# k.data_helpers.mindists along axis 0, divided by its mean (so the factors average to 1),
# times the bandwidth of kold.
# NOTE(review): the meaning and sign convention of mindists are defined in the stats
# module — presumably this yields one bandwidth per data point; confirm.
tmp = -np.percentile(k.data_helpers.mindists, 95, axis=0)
k.set_bandwidth(tmp / np.mean(tmp) * kold.bandwidth)
h = k.bandwidth.copy()

In [None]:
# Independent copy of k with the modified bandwidth.
# NOTE: this rebinds k2, which an earlier cell used for KDE(SAMPLES[:, 1:]).
k2 = deepcopy(k)

In [None]:
# Call compute_bandwidth() on the copy, starting from the state copied from k.
# NOTE(review): whether this keeps the per-point scaling set above depends on the stats
# module — confirm.
k2.compute_bandwidth()

In [None]:
# Evaluate kold and k2 on an evenly spaced grid of 100 points in [-3, 3] and plot both curves.
# NOTE(review): this rebinds x from the sample data (SAMPLES[:, 0]) to the plotting grid;
# every later cell that reads x therefore sees the grid when the notebook is run in order —
# confirm that this is intended.
x = np.linspace(-3, 3, 100)
plt.plot(x, kold.score_samples(x), label="Fixed bandwidth")
plt.plot(x, k2.score_samples(x), label="Variable bandwidth")
plt.legend()
plt.xlim(np.min(x), np.max(x))

In [None]:
# Sweep a multiplicative factor on the bandwidth of k: 31 logarithmically spaced factors
# between 0.1 and 10. Column 0 of score holds the summed log score on x, column 1 the
# summed log score on the first column of TEST. The original bandwidth t is restored afterwards.
# NOTE: this rebinds score, and the loop variable rebinds i.
score = np.zeros((31, 2))
t = k.bandwidth.copy()
ss = np.logspace(-1, 1, 31)
for i, s in enumerate(ss):
    k.set_bandwidth(s*t)
    score[i, 0] = np.sum(np.log(k.score_samples(x)))
    score[i, 1] = np.sum(np.log(k.score_samples(TEST[:, 0])))
k.set_bandwidth(t)

In [None]:
# Plot the summed log score against the bandwidth scale factor for both data sets, and
# report the best test score together with the factor at which it is reached.
# The legend label is spelled correctly ("Sample") and the axes are labelled so that the
# figure can be read on its own.
plt.semilogx(ss, score[:, 0], 'b', label="Sample data")
plt.semilogx(ss, score[:, 1], 'g', label="Test data")
plt.xlabel("Bandwidth scale factor")
plt.ylabel("Sum of log score")
plt.legend()
print("Max test score {:.2f} at {:.3f}".format(np.max(score[:, 1]), ss[np.argmax(score[:, 1])]))

In [None]:
# Per-sample log score on the first TEST column: baseline kold (blue), k (green), and the
# difference k minus kold (red).
plt.plot(TEST[:, 0], np.log(kold.score_samples(TEST[:, 0])), 'b.')
plt.plot(TEST[:, 0], np.log(k.score_samples(TEST[:, 0])), 'g.')
plt.plot(TEST[:, 0], np.log(k.score_samples(TEST[:, 0]))-np.log(kold.score_samples(TEST[:, 0])), 'r.')

In [None]:
# Values of the first TEST column plotted against their sample index.
plt.plot(TEST[:, 0], '.')

In [None]:
# Evaluate k ("new") and kold ("old") on an evenly spaced grid of 100 points in [-3, 3]
# and plot both curves for comparison.
xpdf = np.linspace(-3, 3, 100)
y1 = k.score_samples(xpdf)
y2 = kold.score_samples(xpdf)
plt.plot(xpdf, y2, 'b', label="old")
plt.plot(xpdf, y1, 'g', label="new")
plt.legend()

In [None]:
# Summed log score on the first TEST column for the baseline (kold) and for k.
print("Test old", np.sum(np.log(kold.score_samples(TEST[:, 0]))))
print("Test new", np.sum(np.log(k.score_samples(TEST[:, 0]))))

In [None]:
# New KDE on the first 100 entries of x; this rebinds k.
k = KDE(x[:100])

In [None]:
# Set the variable_bandwidth flag on the constants object of the KDE.
# NOTE(review): presumably this switches the KDE to one bandwidth per data point — confirm
# in the stats module.
k.constants.variable_bandwidth = True

In [None]:
# Show the leave-one-out score of k, with include_const=True.
# NOTE(review): no compute_bandwidth() or set_bandwidth() call on this KDE is visible
# between its construction and this cell — confirm that a bandwidth is available here.
k.score_leave_one_out(include_const=True)

In [None]:
# Keep a copy of the current bandwidth of k; this rebinds h.
h = k.bandwidth.copy()

In [None]:
# Manual leave-one-out check: for every index i, a KDE is built on x without x[i], given
# the bandwidth of k with entry i removed, and evaluated at x[i]. The printed value is the
# sum of the logarithms of these scores.
# NOTE(review): k was built from x[:100] while the loop runs over len(x); np.delete(h, i)
# presumably requires k.bandwidth to hold one entry per element of x — confirm.
# NOTE(review): the loop rebinds the global h to the bandwidth with one entry removed, and
# the following sweep cell passes s*h to k.set_bandwidth — confirm that this is intended.
scores = np.zeros(len(x))
for i in range(len(x)):
    xtmp = x.copy()
    xtmp = np.delete(xtmp, i)
    h = k.bandwidth.copy()
    h = np.delete(h, i)
    ktmp = KDE(xtmp)
    ktmp.set_bandwidth(h)
    # The result for the single sample x[i] is stored in a scalar slot of scores.
    scores[i] = ktmp.score_samples(np.array([x[i]]))
print(np.sum(np.log(scores)))

In [None]:
# Sweep 100 logarithmically spaced scale factors between 0.1 and 10 on the bandwidth h and
# record the leave-one-out score of k for each factor.
# NOTE: this rebinds ss and scores, and leaves k at the last bandwidth of the sweep (10*h).
ss = np.logspace(-1, 1, 100)
scores = np.zeros(len(ss))
for i, s in enumerate(ss):
    k.set_bandwidth(s*h)
    scores[i] = k.score_leave_one_out(include_const=True)

In [None]:
# Leave-one-out score as a function of the bandwidth scale factor (logarithmic x axis).
plt.semilogx(ss, scores)

In [None]:
# Hand-written expression whose value is compared against k.score_leave_one_out().
# NOTE(review): mindists is not defined in any visible cell, so this raises a NameError on
# a fresh kernel — presumably k.data_helpers.mindists is meant; confirm.
(np.sum(np.log(np.sum(np.exp(mindists / k.bandwidth**2) / k.bandwidth, axis=1) - 1/k.bandwidth)) +
 k.constants.const_score)

In [None]:
# Leave-one-out score of k with default arguments, shown as the cell output.
k.score_leave_one_out()

In [None]:
# Hand-written leave-one-out expression with the sample count hard-coded as 200 (and 199
# remaining samples).
# NOTE(review): mindists is not defined in any visible cell — presumably
# k.data_helpers.mindists is meant; confirm.
# NOTE(review): 200 does not match the 100 entries k was last constructed with (x[:100]) —
# confirm which data set this check belongs to.
(np.sum(np.log((np.sum(np.exp(mindists / k.bandwidth**2), axis=0) - 1))) -
 200/2*np.log(2*np.pi) - 200*np.log(k.bandwidth) - 200*np.log(199))

In [None]:
# Show the const_score attribute stored on the constants object of k.
k.constants.const_score

In [None]:
# Hand-computed constant for 200 samples: -(200/2)*log(2*pi) - 200*log(199); presumably
# meant for comparison with k.constants.const_score.
-200/2*np.log(2*np.pi) - 200*np.log(199)

In [None]:
# The same constant written as -200*(1/2)*log(2*pi) - 200*log(199).
(-200*1/2*np.log(2*np.pi) - 200*np.log(199))

In [None]:
# Call set_n(200) on k.
# NOTE(review): presumably this sets the number of data points used by the KDE — confirm
# in the stats module.
k.set_n(200)

In [None]:
# Broadcasting check: the 1-D array is broadcast along the last axis, so column j of the
# 4x4 matrix of ones is divided by j + 1.
np.ones((4, 4)) / np.array([1, 2, 3, 4])