# Проверяем корректность разделения выборки на тестовую и тренировочную

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm 
import itertools
from scipy import linalg
import matplotlib as mpl
from sklearn import mixture
from scipy.stats import norm, multivariate_normal, gaussian_kde, dirichlet, wishart
from scipy.integrate import quad, nquad
from scipy.special import gammaln, psi, digamma, gamma
from scipy.linalg import det, inv
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import rbf_kernel

In [2]:
from dcor import energy_distance

In [3]:
data = pd.read_excel('euro_dollar.xlsx')
data = data.sort_values(by = ['curs_euro'])
data.head()

data = data[data['curs_euro'] > -0.14]

list_of_euro = list(data['curs_euro'])
list_of_dollar = list(data['curs_dollar'])
n_samples = data['curs_euro'].size
n_samples

np_data_dollar = np.array(data['curs_dollar']).reshape(-1, 1)
np_data_euro = np.array(data['curs_euro']).reshape(-1, 1)
np_data_euro

array([[-0.08408749],
       [-0.07404701],
       [-0.0712909 ],
       ...,
       [ 0.10664213],
       [ 0.10718334],
       [ 0.10994489]])

In [4]:
data_XY = np.concatenate([np_data_euro, np_data_dollar], axis = 1)
data_XY

array([[-0.08408749, -0.07988462],
       [-0.07404701, -0.07152092],
       [-0.0712909 , -0.07147347],
       ...,
       [ 0.10664213,  0.09724502],
       [ 0.10718334,  0.11812002],
       [ 0.10994489,  0.11316388]])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    np_data_euro, 
    np_data_dollar, 
    test_size=0.2,  # Доля тестовой выборки (можно указать train_size)
    random_state=42,  # Для воспроизводимости
)

XY_train = np.concatenate([X_train, y_train], axis = 1)
XY_test = np.concatenate([X_test, y_test], axis = 1)

n_samples_train = X_train.shape[0]
n_samples_test = X_test.shape[0]

In [None]:
distance = energy_distance(XY_train, XY_test)
distance

1.0722828728127257e-05

## Пермутационный подход + энергитическое растояние

In [16]:
def permutation_test(sample1, sample2, n_permutations=1000):
    combined = np.vstack([sample1, sample2])
    obs_stat = energy_distance(sample1, sample2)

    n_x = len(sample1)
    
    perm_stats = []
    for _ in range(n_permutations):
        print('Current permutation:', _)
        np.random.shuffle(combined)
        perm1 = combined[:n_x]
        perm2 = combined[n_x:]
        perm_stat = energy_distance(perm1, perm2)
        perm_stats.append(perm_stat)
    
    p_value = (perm_stats >= obs_stat).mean()
    return p_value

In [17]:
permutation_test(XY_train, XY_test)

Current permutation: 0
Current permutation: 1
Current permutation: 2
Current permutation: 3
Current permutation: 4
Current permutation: 5
Current permutation: 6
Current permutation: 7
Current permutation: 8
Current permutation: 9
Current permutation: 10
Current permutation: 11
Current permutation: 12
Current permutation: 13
Current permutation: 14
Current permutation: 15
Current permutation: 16
Current permutation: 17
Current permutation: 18
Current permutation: 19
Current permutation: 20
Current permutation: 21
Current permutation: 22
Current permutation: 23
Current permutation: 24
Current permutation: 25
Current permutation: 26
Current permutation: 27
Current permutation: 28
Current permutation: 29
Current permutation: 30
Current permutation: 31
Current permutation: 32
Current permutation: 33
Current permutation: 34
Current permutation: 35
Current permutation: 36
Current permutation: 37
Current permutation: 38
Current permutation: 39
Current permutation: 40
Current permutation: 41
Cu

0.533

## Пермутационный подход + Maximum Mean Discrepancy (MMD)

In [23]:
def mmd_rbf(X, Y, gamma=1.0):
    """Вычисляет MMD² между X и Y с гауссовым ядром."""
    K_XX = rbf_kernel(X, X, gamma=gamma)
    K_YY = rbf_kernel(Y, Y, gamma=gamma)
    K_XY = rbf_kernel(X, Y, gamma=gamma)
    
    m = len(X)
    n = len(Y)
    
    mmd = K_XX.mean() + K_YY.mean() - 2 * K_XY.mean()
    return mmd

def mmd_test_with_pvalue(X, Y, gamma=1.0, n_permutations=1000):
    """Пермутационный тест для MMD²."""
    # Наблюдаемое MMD²
    mmd_observed = mmd_rbf(X, Y, gamma=gamma)
    
    # Объединённая выборка
    Z = np.vstack([X, Y])
    n_X = len(X)
    
    # Пермутационный тест
    perm_mmd = []
    for _ in range(n_permutations):
        print('Current permutation:', _)
        np.random.shuffle(Z)  # Перемешиваем
        X_perm = Z[:n_X]      # Первая псевдовыборка
        Y_perm = Z[n_X:]      # Вторая псевдовыборка
        perm_mmd.append(mmd_rbf(X_perm, Y_perm, gamma=gamma))
    
    # Вычисление p-value (с поправкой на устойчивость)
    p_value = (np.sum(perm_mmd >= mmd_observed) + 1) / (n_permutations + 1)
    
    return mmd_observed, p_value

In [24]:
mmd_test_with_pvalue(XY_train, XY_test)

Current permutation: 0
Current permutation: 1
Current permutation: 2
Current permutation: 3
Current permutation: 4
Current permutation: 5
Current permutation: 6
Current permutation: 7
Current permutation: 8
Current permutation: 9
Current permutation: 10
Current permutation: 11
Current permutation: 12
Current permutation: 13
Current permutation: 14
Current permutation: 15
Current permutation: 16
Current permutation: 17
Current permutation: 18
Current permutation: 19
Current permutation: 20
Current permutation: 21
Current permutation: 22
Current permutation: 23
Current permutation: 24
Current permutation: 25
Current permutation: 26
Current permutation: 27
Current permutation: 28
Current permutation: 29
Current permutation: 30
Current permutation: 31
Current permutation: 32
Current permutation: 33
Current permutation: 34
Current permutation: 35
Current permutation: 36
Current permutation: 37
Current permutation: 38
Current permutation: 39
Current permutation: 40
Current permutation: 41
Cu

(3.986392440591402e-07, 0.2967032967032967)