In [None]:
import os
import numpy as np
import pickle
from matplotlib import pyplot as plt
import pandas as pd
from stats import KDE
from ml_method import ErrorStatistics

# Applying method on artificial data

## Definition of the data

We apply the method to artificial data. The original data $(x, y)$ are drawn independently from two uniform distributions on the intervals $[0.5, 1.5]$ and $[0, 1]$, respectively.

The dependent data is defined as follows:
\begin{align}
x' &= 2x + y, \\
y' &= \frac{2}{x} + y.
\end{align}

In [None]:
# Draw the two independent uniform samples (fixed seed keeps the notebook reproducible)
np.random.seed(1)
xorig = 0.5 + np.random.rand(500)  # shifted into [0.5, 1.5)
yorig = np.random.rand(500)        # stays in [0, 1)

# Introduce the dependence: x' = 2x + y and y' = 2/x + y
x = yorig + 2*xorig
y = yorig + 2/xorig

# Scatter plot of the dependent sample
plt.plot(x, y, '.')
plt.xlabel("x'")
plt.ylabel("y'")
plt.title("Dependent data")

To apply our method, we need to know how to transform the data $(x', y')$ back to $(x, y)$, i.e., the inverse mapping. The inverse mapping is as follows:
\begin{align}
    x &= \frac{x' - y'}{4} + \sqrt{\frac{(y' - x')^2}{16} + 1}, \\
    y &= \frac{x' + y'}{2} - \sqrt{\frac{(y' - x')^2}{4} + 4}.
\end{align}

In [None]:
# Apply inverse
# Recover (x, y) from the dependent pair (x', y'); in this cell `x`/`y` hold x'/y'.
# Subtracting the forward equations gives x' - y' = 2x - 2/x, a quadratic in x whose
# positive root is taken below; y then follows from y = x' - 2x.
# NOTE: this rebinds xorig/yorig to the reconstructed values (replacing the sampled ones).
xorig = (x - y)/4 + np.sqrt((y - x)**2/16+1)
yorig = (x + y)/2 - np.sqrt((y - x)**2/4 +4)
plt.plot(xorig, yorig, '.')
plt.xlabel("x")
plt.ylabel("y")
plt.title("Original data")

In [None]:
# Score the dependent data with a KDE, once assuming dependent and once assuming independent
def get_kde_score(data):
    """Return the summed natural log of the KDE scores of `data` under its own fit.

    A `KDE` is built on `data`, its bandwidth is computed, and the values
    returned by `score_samples(data)` are log-transformed and summed.
    """
    estimator = KDE(data=data)
    estimator.compute_bandwidth()
    sample_scores = estimator.score_samples(data)
    return np.sum(np.log(sample_scores))
# Joint fit on the (n, 2) array of pairs versus the sum of the two 1-D fits
score_dependent = get_kde_score(np.transpose(np.array([x, y])))
score_independent = sum(get_kde_score(marginal) for marginal in (x, y))
print("Score when assuming dependent:   {:7.3f}".format(score_dependent))
print("Score when assuming independent: {:7.3f}".format(score_independent))

In [None]:
# Same comparison for the recovered original data (xorig, yorig):
# summed log KDE scores when assuming dependent (joint fit) and when assuming
# independent (sum of the two one-dimensional fits).
score_dependent = get_kde_score(np.array([xorig, yorig]).T)
score_independent = get_kde_score(xorig) + get_kde_score(yorig)
print("Score when assuming dependent:   {:7.3f}".format(score_dependent))
print("Score when assuming independent: {:7.3f}".format(score_independent))

In [None]:
# Jacobian factor of the inverse mapping (x', y') -> (x, y); here `x`/`y` hold x'/y'.
# Differentiating x = (x' - y')/4 + sqrt((x' - y')**2/16 + 1) with respect to x'
# gives (1 - u)/4, and since y = x' - 2x the 2x2 Jacobian determinant of the
# inverse mapping reduces to that same value.
u = (y - x) / np.sqrt((y - x)**2 + 16)
H = (1 - u) / 4

In [None]:
# Joint KDE fitted on the recovered (xorig, yorig) pairs. Each score is multiplied by
# the Jacobian factor H before taking logs (presumably the change-of-variables
# correction so the total refers to the dependent (x', y') data — TODO confirm).
kdexy = KDE(data=np.array([xorig, yorig]).T)
kdexy.compute_bandwidth()
scores = kdexy.score_samples(np.array([xorig, yorig]).T) * H
score_dependent = np.sum(np.log(scores))
score_dependent

In [None]:
# Independent model: one 1-D KDE per recovered coordinate
kdex = KDE(data=xorig)
kdey = KDE(data=yorig)
kdex.compute_bandwidth()
kdey.compute_bandwidth()
# Product of the two marginal scores, scaled by the Jacobian factor H, then log-summed
scores = H * (kdex.score_samples(xorig) * kdey.score_samples(yorig))
score_independent = np.sum(np.log(scores))
score_independent

## Loglikelihoods for different number of datapoints

In [None]:
n = 10
d = 2

def loglikelihoods(n, d):
    """Compare summed log KDE scores of standard-normal data under three groupings.

    Draws an (n, d) sample with `np.random.randn` and returns a 3-tuple:
      1. the score of one KDE fitted on all d columns jointly,
      2. the sum of the scores of d separate single-column KDEs,
      3. the score of column 0 alone plus the score of columns 1..d-1 jointly.
    """
    def _summed_log_score(block):
        # Fit a KDE on `block` and sum the log of its scores on that same block
        estimator = KDE(data=block)
        estimator.compute_bandwidth()
        return np.sum(np.log(estimator.score_samples(block)))

    samples = np.random.randn(n, d)
    joint_score = _summed_log_score(samples)
    # Column slices keep a trailing axis of length 1 so every KDE receives 2-D input
    per_column_score = sum(
        _summed_log_score(samples[:, col:col+1]) for col in range(d)
    )
    first_column_score = _summed_log_score(samples[:, 0:1])
    remaining_score = _summed_log_score(samples[:, 1:d])
    return joint_score, per_column_score, first_column_score + remaining_score
loglikelihoods(1000, 10)

In [None]:
# Display the bandwidth attribute of `kdex` (the 1-D KDE fitted on xorig earlier)
kdex.bandwidth

## Determining the dependence using data-driven technique

In [None]:
# NOTE(review): interactive documentation lookup; it produces a long help dump and
# nothing else in the notebook depends on it — consider removing from the final notebook.
help(pd.DataFrame)

In [None]:
# NOTE(review): `[x, y]` supplies two rows of len(x) values while three column names
# are requested, and no `z` variable is defined in any earlier cell. As written this
# constructor call does not look like it can succeed — confirm where `z` is meant to
# come from and build the frame column-wise.
df = pd.DataFrame([x, y], columns=["x", "y", "z"])
# Candidate regressors given as expression strings that refer to `df`
# (presumably evaluated by ErrorStatistics — TODO confirm).
features = ["1", "df['x']", "df['y']", "df['x']**2", "df['x']*df['y']", "df['y']**2"]
# Log file path for the feature selection run; the "log" directory is not created here.
logfile = os.path.join("log", "artificial_data.txt")
es = ErrorStatistics(1)
# The same feature list is registered for both the mu and the sigma part of the model.
es.set_mu_features(features)
es.set_sigma_features(features)
# `df` is passed as the data and the "z" column as the second argument (the target, presumably).
es.set_data(df, df["z"])
es.sequential_forward_selection(logfile=logfile)

In [None]:
# Reset every current parameter value to zero, then apply boolean masks with six
# entries each. True sits at positions 0, 1 and 3, which would correspond to the
# features "1", "df['x']" and "df['x']**2" if mask order follows the feature list
# (TODO confirm against ErrorStatistics). Finally re-run the optimizer.
es.set_parameters(np.zeros_like(es.get_current_values()))
es.set_mask(mask_x_mu=[[True], [True], [False], [True], [False], [False]],
            mask_x_sigma=[[[True]], [[True]], [[False]], [[True]], [[False]], [[False]]])
es.optimize(method="custom")

In [None]:
# Unpack the six current values of the model. a, b, c form the quadratic in x noted in
# the trailing comment; d, e, f are used in later cells as the coefficients of the
# denominator d + e*x + f*x**2.
# NOTE: this rebinds `d`, which earlier held the dimension count set next to loglikelihoods.
a, b, c, d, e, f = es.get_current_values()  # z = a + bx + cx^2

In [None]:
# Scatter of z against x, before z is transformed into znew
plt.plot(df['x'], df['z'], '.')

In [None]:
# Transform z: subtract the quadratic a + b*x + c*x**2 and divide by d + e*x + f*x**2,
# then plot the transformed values against x.
df["znew"] = (df['z']-a - b*df['x'] - c*df['x']**2) / (d + e*df['x'] + f*df['x']**2)
plt.plot(df['x'], df["znew"], '.')

In [None]:
# Summed log KDE score of the untransformed (x, z) pairs.
# NOTE(review): the constructor receives `.values` (an ndarray) while score_samples
# receives the DataFrame itself — confirm KDE handles both the same way.
kde_old = KDE(df[["x", "z"]].values)
kde_old.compute_bandwidth()
np.sum(np.log(kde_old.score_samples(df[["x", "z"]])))

In [None]:
# Summed log KDE score of the transformed (x, znew) pairs, rescaled per sample.
# Because znew = (z - ...)/(d + e*x + f*x**2), the derivative dznew/dz equals
# 1/(d + e*x + f*x**2); multiplying the scores by it is the change-of-variables factor
# (presumably so the total is comparable with the (x, z) score — TODO confirm).
# NOTE(review): no abs() is applied, so this relies on the denominator being positive.
kde_old = KDE(df[["x", "znew"]].values)
kde_old.compute_bandwidth()
determinant = 1 / (d + e*df['x'] + f*df['x']**2)
np.sum(np.log(kde_old.score_samples(df[["x", "znew"]])*determinant))

In [None]:
# Open the dataset
# NOTE(security): pickle.load can execute arbitrary code; only load pickles from a trusted source.
# The file handle is deliberately not named `f`: that name already holds a value unpacked
# from es.get_current_values() and would otherwise be overwritten by this cell.
with open(os.path.join('pickles', 'df.p'), 'rb') as pickle_file:
    dfs, scaling = pickle.load(pickle_file)
scaling = scaling.T  # [time vstart vend]
scaling = scaling[scaling[:, 2] > 0, :]  # Remove full stops
scaling_old = scaling.copy()  # Untransformed copy, kept for the comparisons in later cells
scaling[:, 1] = scaling[:, 1] - scaling[:, 2]  # Now it becomes: [time deltav vend] (less correlation)
scaling[:, 0] = scaling[:, 1] / scaling[:, 0]  # Now it becomes: [deceleration deltav vend] (better behaved)
#std_scaling = np.std(scaling, axis=0)
#mean_scaling = np.mean(scaling, axis=0)
#scaling = (scaling - mean_scaling) / std_scaling

In [None]:
# Summed log KDE score of the untransformed columns [time vstart vend].
# max_bw=2 is passed to compute_bandwidth (presumably an upper bound on the bandwidth — TODO confirm).
kde = KDE(data=scaling_old)
kde.compute_bandwidth(max_bw=2)
np.sum(np.log(kde.score_samples(scaling_old)))

In [None]:
# Same score with every column divided by its standard deviation before fitting.
# Dividing the scores by the product of the standard deviations is the
# change-of-variables factor that maps them back to the unscaled columns.
kde = KDE(data=scaling_old/np.std(scaling_old, axis=0))
kde.compute_bandwidth()
np.sum(np.log(kde.score_samples(scaling_old/np.std(scaling_old, axis=0)) / np.prod(np.std(scaling_old, axis=0))))

In [None]:
# KDE on the transformed columns [deceleration deltav vend].
# `determinant` is |vstart - vend| / time**2, the absolute Jacobian determinant of the
# mapping (time, vstart, vend) -> ((vstart - vend)/time, vstart - vend, vend).
# Multiplying by it expresses the scores with respect to the untransformed columns.
# max_bw=1 is passed to compute_bandwidth (presumably an upper bound — TODO confirm).
kde = KDE(data=scaling)
kde.compute_bandwidth(max_bw=1)
determinant = np.abs(scaling_old[:, 1] - scaling_old[:, 2]) / scaling_old[:, 0]**2
np.sum(np.log(kde.score_samples(scaling)*determinant))

In [None]:
# Transformed columns, additionally divided by their standard deviations before fitting.
# The scores are corrected by both factors: `determinant` (computed in the previous
# cell) for the column transformation and 1/prod(std) for the rescaling.
kde = KDE(data=scaling/np.std(scaling, axis=0))
kde.compute_bandwidth()
np.sum(np.log(kde.score_samples(scaling/np.std(scaling, axis=0)) * determinant / np.prod(np.std(scaling, axis=0))))

In [None]:
# Split model: column 0 (deceleration) gets its own 1-D KDE and columns 1: (deltav, vend)
# get a joint KDE; the product of the two scores treats the two groups as independent.
# `determinant` is reused from the earlier cell. This also rebinds `kdex` and `kdey`.
kdex = KDE(data=scaling[:, 0])
kdex.compute_bandwidth()
scorex = kdex.score_samples(scaling[:, 0])
kdey = KDE(data=scaling[:, 1:])
kdey.compute_bandwidth()
scorey = kdey.score_samples(scaling[:, 1:])
np.sum(np.log(scorex*scorey*determinant))

In [None]:
# Split model as in the previous cell, but each group is divided by its standard
# deviation(s) before fitting. The final division by prod(std(scaling, axis=0)) covers
# all three columns at once, i.e. the std of column 0 times the stds of columns 1:.
kdex = KDE(data=scaling[:, 0]/np.std(scaling[:, 0]))
kdex.compute_bandwidth()
scorex = kdex.score_samples(scaling[:, 0] / np.std(scaling[:, 0]))
kdey = KDE(data=scaling[:, 1:]/np.std(scaling[:, 1:], axis=0))
kdey.compute_bandwidth()
scorey = kdey.score_samples(scaling[:, 1:] / np.std(scaling[:, 1:], axis=0))
np.sum(np.log(scorex*scorey*determinant / np.prod(np.std(scaling, axis=0))))

In [None]:
# Scatter of column 1 (deltav) against column 2 (vend) of the transformed data
plt.plot(scaling[:, 1], scaling[:, 2], '.')