In [1]:
# This script simulates the data and saves them to CSV files

In [2]:
# import packages
import os
import numpy as np
import pandas as pd

## Simulation Formulations
$$ X_1, X_2, X_3 \overset{\text{i.i.d.}}{\sim} \mathrm{Unif}(-1, 1) $$


\begin{equation}
\mathcal{F}(\mathcal{G} | \mathbf{X} = x) = \pi_1(x) \cdot f_1(x) + \pi_2(x) \cdot f_2(x),
\label{eq:simulation}
\end{equation}
where
\begin{equation*}
\begin{split}
\pi_1(x) = \frac{1}{1 + \exp(x_3)}, &\ \pi_2(x) = \frac{\exp(x_3)}{1+\exp(x_3)}, \\
f_1(x) = \mathcal{N}\big(x_1 + \varepsilon, sd = |x_2|+0.5\big),&\ f_2(x) = \mathcal{N}\big(2x_2^2 + 2 + \varepsilon, sd = |x_1|+0.5\big),   
\end{split}
\end{equation*}
with independent random noise variable $\varepsilon \sim \mathcal{N}(0, \omega^2)$.


In [15]:
# simulate the data
## simulation settings
K = 2 ## number of mixture components
n_dist = 200 ## number of distributions
n_sample = 300 ## number of points in each distribution
omega_list = [0.1, 0.2, 0.5, 1, 2] ## choices of noise standard deviations
id_setting = 5 ## 1-based position of the chosen entry in omega_list
omega = omega_list[id_setting-1]
## covariates: every entry of X is a uniform draw rescaled from [0, 1) to [-1, 1)
np.random.seed(2020)
X = 2 * np.random.uniform(size=(n_dist, 3)) - 1
## responses: row i of Y holds the sorted sample drawn from the i-th mixture
Y = np.zeros((n_dist, n_sample))
row_idx = np.arange(n_sample)
for i, (x1, x2, x3) in enumerate(X):
    ## component means, standard deviations and mixing weights
    mu_true = [x1, 2 * x2**2 + 2]
    sig_true = [np.abs(x2) + 0.5, np.abs(x1) + 0.5]
    pi_1 = 1 / (1 + np.exp(x3))
    pi_true = [pi_1, 1 - pi_1]
    ## a single noise draw per distribution, added to both component means
    eps_noise = np.random.normal(loc=0, scale=omega, size=1)
    ## draw a candidate value from every component for every point (component 0 first)
    draws = []
    for k in range(K):
        draws.append(np.random.normal(loc=mu_true[k]+eps_noise,
                                      scale=sig_true[k],
                                      size=n_sample))
    var_gaussian = np.column_stack(draws) ## shape (n_sample, K)
    ## choose one component per point with probabilities pi_true and keep its candidate
    labels = np.random.choice(range(K), size=n_sample, replace=True, p=pi_true)
    Y[i] = np.sort(var_gaussian[row_idx, labels])


In [16]:
# K-fold cross validation
## give every row of Y a fold label in {0, ..., n_fold-1}: shuffle the row
## indices with a fixed seed and reduce them modulo n_fold, so that holding
## out one fold yields a 4:1 train/test split
n_fold = 5
np.random.seed(2020)
loc_cv = np.random.choice(Y.shape[0], size=Y.shape[0], replace=False) % n_fold

In [17]:
# save the simulated data
## one output folder per noise setting
target_path = '../../data/simulation/setting_' + str(id_setting)
## makedirs (unlike mkdir) also creates missing parent folders, and
## exist_ok=True makes it a no-op when the folder is already there
os.makedirs(target_path, exist_ok=True)
## index=False keeps the pandas row index out of the CSV files
pd.DataFrame(X).to_csv(target_path + '/dat_X.csv', index=False)
pd.DataFrame(Y).to_csv(target_path + '/dat_Y.csv', index=False)
pd.DataFrame(loc_cv).to_csv(target_path + '/dat_CV.csv', index=False)