# Реализация ядерного сглажиания

In [1]:
import numpy as np
import pandas as pd
from scipy.integrate import quad
from scipy.optimize import minimize_scalar, minimize
from statsmodels.nonparametric.kernel_regression import KernelReg
from sklearn.model_selection import KFold

In [2]:
import numpy as np
from statsmodels.nonparametric.kernel_regression import KernelReg
from sklearn.model_selection import KFold
from scipy.optimize import minimize

In [3]:
def kernel_regression_silverman(x_train, y_train):
    """
    Обучает ядерную регрессию с шириной окна по правилу Сильвермана.
    
    Возвращает:
    KernelReg — обученная модель для предсказания в любых точках
    """
    x_train = np.asarray(x_train).reshape(-1, 1)
    h = 1.06 * np.std(x_train) * len(x_train)**(-1/5)
    return KernelReg(y_train, x_train, var_type='c', bw=[h])

def kernel_regression_cv_safe(x_train, y_train):
    x_train = np.asarray(x_train).reshape(-1, 1)
    h_silverman = 1.06 * np.std(x_train) * len(x_train)**(-1/5)
    
    kf = KFold(5)
    
    def cv_mse(log_h):
        h = np.exp(log_h)
        # Защита от экстремальных значений
        if h < 1e-6 or h > 100.0 or np.isnan(h) or np.isinf(h):
            return 1e10
        
        total_mse = 0.0
        try:
            for train_idx, val_idx in kf.split(x_train):
                kr = KernelReg(y_train[train_idx], x_train[train_idx], var_type='c', bw=[h])
                pred = kr.fit(x_train[val_idx])[0]
                total_mse += np.mean((pred - y_train[val_idx])**2)
            return total_mse / kf.n_splits
        except:
            return 1e10
    
    h_init = np.log(h_silverman)
    res = minimize(cv_mse, x0=h_init, method='L-BFGS-B')
    best_h = np.exp(res.x[0]) if res.success and not (np.isnan(res.x[0]) or np.isinf(res.x[0])) else h_silverman
    
    return KernelReg(y_train, x_train, var_type='c', bw=[best_h])

# Оценка

## Метрики

### Интегральные

In [4]:
def imse(g_hat, g_true, a=0, b=1, epsabs=1e-8):
    integrand = lambda x: (g_hat(x) - g_true(x)) ** 2
    integral, _ = quad(integrand, a, b, epsabs=epsabs, limit=1000)
    return integral / (b - a)

def imae(g_hat, g_true, a=0, b=1, epsabs=1e-8):
    integrand = lambda x: abs(g_hat(x) - g_true(x))
    integral, _ = quad(integrand, a, b, epsabs=epsabs, limit=1000)
    return integral / (b - a)

def maxerr(g_hat, g_true, a=0, b=1, xatol=1e-8):
    objective = lambda x: -abs(g_hat(x) - g_true(x))
    result = minimize_scalar(objective, bounds=(a, b), method='bounded', options={'xatol': xatol})
    if result.success:
        return -result.fun
    x_grid = np.linspace(a, b, 1000)
    return max(abs(g_hat(x) - g_true(x)) for x in x_grid)


### Дискретные

In [5]:
def imse_discrete(g_hat, g_true, a=0, b=1, n_points=1000):
    x_grid = np.linspace(a, b, n_points)
    errors = [(g_hat(x) - g_true(x)) ** 2 for x in x_grid]
    return np.mean(errors)

def imae_discrete(g_hat, g_true, a=0, b=1, n_points=1000):
    x_grid = np.linspace(a, b, n_points)
    errors = [abs(g_hat(x) - g_true(x)) for x in x_grid]
    return np.mean(errors)

def maxerr_discrete(g_hat, g_true, a=0, b=1, n_points=1000):
    x_grid = np.linspace(a, b, n_points)
    errors = [abs(g_hat(x) - g_true(x)) for x in x_grid]
    return max(errors)

## На полиномах

In [6]:
log_df = pd.read_csv('../datasets/synthetic/synthetic_datasets_with_coeffs/seed_log.csv')
results = []

for degree in range(1, 7):
    for noise_level in ['low', 'moderate', 'high']:
        df = pd.read_csv(f'../datasets/synthetic/synthetic_datasets_with_coeffs/noise_{noise_level}_deg{degree}.csv')
        metrics = {'silverman': {'imse': [], 'imae': [], 'maxerr': []},
                   'cv': {'imse': [], 'imae': [], 'maxerr': []}}
        
        for seed in df['seed'].unique()[:300]:
            subset = df[df['seed'] == seed]
            x_train = subset['x'].values
            y_train = subset['y_noisy'].values
            
            # Истинная регрессия
            coeffs = [subset[f'coeff_{i}'].iloc[0] for i in range(degree + 1)]
            g_true = lambda x, c=coeffs: sum(c[i] * x**i for i in range(len(c)))
            
            # Модели
            model_silverman = kernel_regression_silverman(x_train, y_train)
            model_cv = kernel_regression_cv_safe(x_train, y_train)
            
            # Callable для метрик
            g_silverman = lambda x, m=model_silverman: m.fit(np.array([[x]]))[0][0]
            g_cv = lambda x, m=model_cv: m.fit(np.array([[x]]))[0][0]
            
            # Расчёт метрик (интегральные + дискретные для валидации)
            for method, g_hat in [('silverman', g_silverman), ('cv', g_cv)]:
                try:
                    metrics[method]['imse'].append(imse(g_hat, g_true))
                    metrics[method]['imae'].append(imae(g_hat, g_true))
                    metrics[method]['maxerr'].append(maxerr(g_hat, g_true))
                except:
                    # Фолбэк на дискретные метрики при сбое интегрирования
                    metrics[method]['imse'].append(imse_discrete(g_hat, g_true))
                    metrics[method]['imae'].append(imae_discrete(g_hat, g_true))
                    metrics[method]['maxerr'].append(maxerr_discrete(g_hat, g_true))
        
        # Агрегация результатов
        for method in ['silverman', 'cv']:
            m = metrics[method]
            results.append({
                'degree': degree,
                'noise_level': noise_level,
                'method': method,
                'imse_mean': np.mean(m['imse']),
                'imse_sem': np.std(m['imse']) / np.sqrt(len(m['imse'])),  # ← SEM вместо σ
                'imae_mean': np.mean(m['imae']),
                'imae_sem': np.std(m['imae']) / np.sqrt(len(m['imae'])),
                'maxerr_mean': np.mean(m['maxerr']),
                'maxerr_sem': np.std(m['maxerr']) / np.sqrt(len(m['maxerr']))
            })

# Сохранение и вывод
results_df = pd.DataFrame(results)
results_df.to_csv('kernel_regression_polynomial_results.csv', index=False)

print("Результаты ядерной регрессии на полиномиальных данных")
print("=" * 85)
print(results_df.to_string(index=False, float_format='%.6f'))
print("=" * 85)
print(f"\nСохранено: kernel_regression_polynomial_results.csv")

  s = divide(1, s, where=large, out=s)
  df = fun(x1) - f0
  s = divide(1, s, where=large, out=s)
  return _core_matmul(x1, x2)
  s = divide(1, s, where=large, out=s)
  df = fun(x1) - f0
  s = divide(1, s, where=large, out=s)
  df = fun(x1) - f0
  s = divide(1, s, where=large, out=s)
  df = fun(x1) - f0
  s = divide(1, s, where=large, out=s)
  df = fun(x1) - f0
  s = divide(1, s, where=large, out=s)
  df = fun(x1) - f0
  s = divide(1, s, where=large, out=s)
  df = fun(x1) - f0
  s = divide(1, s, where=large, out=s)
  df = fun(x1) - f0
  s = divide(1, s, where=large, out=s)
  df = fun(x1) - f0
  s = divide(1, s, where=large, out=s)
  df = fun(x1) - f0
  the requested tolerance from being achieved.  The error may be 
  underestimated.
  integral, _ = quad(integrand, a, b, epsabs=epsabs, limit=1000)
  the requested tolerance from being achieved.  The error may be 
  underestimated.
  integral, _ = quad(integrand, a, b, epsabs=epsabs, limit=1000)
  s = divide(1, s, where=large, out=s)
  df

Результаты ядерной регрессии на полиномиальных данных
 degree noise_level    method  imse_mean  imse_sem  imae_mean  imae_sem  maxerr_mean  maxerr_sem
      1         low silverman   0.160349  0.015640   0.278645  0.007326     0.538294    0.035277
      1         low        cv   0.261252  0.130959   0.267028  0.010479     0.666592    0.148567
      1    moderate silverman   0.176007  0.029440   0.277048  0.009487     0.531820    0.031539
      1    moderate        cv   0.167202  0.034568   0.262182  0.011558     0.511096    0.029212
      1        high silverman   0.177551  0.019022   0.282003  0.007961     0.533536    0.036045
      1        high        cv   0.174046  0.026942   0.271029  0.009002     0.525461    0.027607
      2         low silverman   0.421302  0.029298   0.477747  0.017003     0.787618    0.031399
      2         low        cv   0.277729  0.016931   0.383344  0.009055     0.694364    0.024631
      2    moderate silverman   0.266158  0.019718   0.383053  0.012630  

In [7]:
import numpy as np
import pandas as pd
from scipy.integrate import quad
from scipy.optimize import minimize_scalar
from statsmodels.nonparametric.kernel_regression import KernelReg
from sklearn.model_selection import KFold

# Функции ядерной регрессии
def kernel_regression_silverman(x_train, y_train):
    x_train = np.asarray(x_train).reshape(-1, 1)
    h = 1.06 * np.std(x_train) * len(x_train)**(-1/5)
    return KernelReg(y_train, x_train, var_type='c', bw=[h])

def kernel_regression_cv_safe(x_train, y_train):
    x_train = np.asarray(x_train).reshape(-1, 1)
    h_silverman = 1.06 * np.std(x_train) * len(x_train)**(-1/5)
    
    kf = KFold(5)
    
    def cv_mse(log_h):
        h = np.exp(log_h)
        # Защита от экстремальных значений
        if h < 1e-6 or h > 100.0 or np.isnan(h) or np.isinf(h):
            return 1e10
        
        total_mse = 0.0
        try:
            for train_idx, val_idx in kf.split(x_train):
                kr = KernelReg(y_train[train_idx], x_train[train_idx], var_type='c', bw=[h])
                pred = kr.fit(x_train[val_idx])[0]
                total_mse += np.mean((pred - y_train[val_idx])**2)
            return total_mse / kf.n_splits
        except:
            return 1e10
    
    h_init = np.log(h_silverman)
    res = minimize(cv_mse, x0=h_init, method='L-BFGS-B')
    best_h = np.exp(res.x[0]) if res.success and not (np.isnan(res.x[0]) or np.isinf(res.x[0])) else h_silverman
    
    return KernelReg(y_train, x_train, var_type='c', bw=[best_h])

# Интегральные метрики
def imse(g_hat, g_true, a=0, b=1, epsabs=1e-6):
    integrand = lambda x: (g_hat(x) - g_true(x)) ** 2
    integral, _ = quad(integrand, a, b, epsabs=epsabs, limit=100)
    return integral / (b - a)

def imae(g_hat, g_true, a=0, b=1, epsabs=1e-6):
    integrand = lambda x: abs(g_hat(x) - g_true(x))
    integral, _ = quad(integrand, a, b, epsabs=epsabs, limit=100)
    return integral / (b - a)

def maxerr(g_hat, g_true, a=0, b=1, xatol=1e-6):
    objective = lambda x: -abs(g_hat(x) - g_true(x))
    result = minimize_scalar(objective, bounds=(a, b), method='bounded', options={'xatol': xatol})
    if result.success:
        return -result.fun
    x_grid = np.linspace(a, b, 500)
    return max(abs(g_hat(x) - g_true(x)) for x in x_grid)

# Дискретные метрики (для валидации)
def imse_discrete(g_hat, g_true, a=0, b=1, n_points=1000):
    x_grid = np.linspace(a, b, n_points)
    errors = [(g_hat(x) - g_true(x)) ** 2 for x in x_grid]
    return np.mean(errors)

def imae_discrete(g_hat, g_true, a=0, b=1, n_points=1000):
    x_grid = np.linspace(a, b, n_points)
    errors = [abs(g_hat(x) - g_true(x)) for x in x_grid]
    return np.mean(errors)

def maxerr_discrete(g_hat, g_true, a=0, b=1, n_points=1000):
    x_grid = np.linspace(a, b, n_points)
    errors = [abs(g_hat(x) - g_true(x)) for x in x_grid]
    return max(errors)

# Тестирование с параллельным расчётом метрик
log_df = pd.read_csv('../datasets/synthetic/synthetic_datasets_with_coeffs/seed_log.csv')
results_integral = []
results_discrete = []

for degree in range(1, 7):
    for noise_level in ['low', 'moderate', 'high']:
        df = pd.read_csv(f'../datasets/synthetic/synthetic_datasets_with_coeffs/noise_{noise_level}_deg{degree}.csv')
        
        # Контейнеры для метрик
        metrics_int = {'silverman': {'imse': [], 'imae': [], 'maxerr': []},
                       'cv': {'imse': [], 'imae': [], 'maxerr': []}}
        metrics_disc = {'silverman': {'imse': [], 'imae': [], 'maxerr': []},
                        'cv': {'imse': [], 'imae': [], 'maxerr': []}}
        
        for seed in df['seed'].unique()[:30]:
            subset = df[df['seed'] == seed]
            x_train = subset['x'].values
            y_train = subset['y_noisy'].values
            
            # Истинная регрессия из коэффициентов
            coeffs = [subset[f'coeff_{i}'].iloc[0] for i in range(degree + 1)]
            g_true = lambda x, c=coeffs: sum(c[i] * x**i for i in range(len(c)))
            
            # Обучение моделей
            model_silverman = kernel_regression_silverman(x_train, y_train)
            model_cv = kernel_regression_cv_safe(x_train, y_train)
            
            # Callable функции
            g_silverman = lambda x, m=model_silverman: m.fit(np.array([[x]]))[0][0]
            g_cv = lambda x, m=model_cv: m.fit(np.array([[x]]))[0][0]
            
            # Расчёт ОБЕИХ версий метрик
            for method, g_hat in [('silverman', g_silverman), ('cv', g_cv)]:
                # Интегральные метрики
                metrics_int[method]['imse'].append(imse(g_hat, g_true))
                metrics_int[method]['imae'].append(imae(g_hat, g_true))
                metrics_int[method]['maxerr'].append(maxerr(g_hat, g_true))
                
                # Дискретные метрики
                metrics_disc[method]['imse'].append(imse_discrete(g_hat, g_true))
                metrics_disc[method]['imae'].append(imae_discrete(g_hat, g_true))
                metrics_disc[method]['maxerr'].append(maxerr_discrete(g_hat, g_true))
        
        # Агрегация результатов для интегральных метрик
        for method in ['silverman', 'cv']:
            m = metrics_int[method]
            results_integral.append({
                'degree': degree,
                'noise_level': noise_level,
                'method': method,
                'imse_mean': np.mean(m['imse']),
                'imse_std': np.std(m['imse']),
                'imae_mean': np.mean(m['imae']),
                'imae_std': np.std(m['imae']),
                'maxerr_mean': np.mean(m['maxerr']),
                'maxerr_std': np.std(m['maxerr'])
            })
        
        # Агрегация результатов для дискретных метрик
        for method in ['silverman', 'cv']:
            m = metrics_disc[method]
            results_discrete.append({
                'degree': degree,
                'noise_level': noise_level,
                'method': method,
                'imse_mean': np.mean(m['imse']),
                'imse_std': np.std(m['imse']),
                'imae_mean': np.mean(m['imae']),
                'imae_std': np.std(m['imae']),
                'maxerr_mean': np.mean(m['maxerr']),
                'maxerr_std': np.std(m['maxerr'])
            })

# Сохранение результатов
df_integral = pd.DataFrame(results_integral)
df_discrete = pd.DataFrame(results_discrete)

df_integral.to_csv('kernel_regression_integral.csv', index=False)
df_discrete.to_csv('kernel_regression_discrete.csv', index=False)



  If increasing the limit yields no improvement it is advised to analyze 
  the integrand in order to determine the difficulties.  If the position of a 
  local difficulty can be determined (singularity, discontinuity) one will 
  probably gain from splitting up the interval and calling the integrator 
  on the subranges.  Perhaps a special-purpose integrator should be used.
  integral, _ = quad(integrand, a, b, epsabs=epsabs, limit=100)
  If increasing the limit yields no improvement it is advised to analyze 
  the integrand in order to determine the difficulties.  If the position of a 
  local difficulty can be determined (singularity, discontinuity) one will 
  probably gain from splitting up the interval and calling the integrator 
  on the subranges.  Perhaps a special-purpose integrator should be used.
  integral, _ = quad(integrand, a, b, epsabs=epsabs, limit=100)


In [8]:
# Вывод сравнения для нескольких комбинаций
print("СРАВНЕНИЕ ИНТЕГРАЛЬНЫХ И ДИСКРЕТНЫХ МЕТРИК")
print("=" * 95)
print(f"{'Степень':<8} {'Шум':<10} {'Метод':<12} {'IMSE (инт)':<15} {'IMSE (дискр)':<15} {'Отклонение %':<15}")
print("-" * 95)

for i in range(0, len(df_integral)):  # Выводим по 2 метода на комбинацию (степень+шум)
    row_int = df_integral.iloc[i]
    row_disc = df_discrete.iloc[i]
    rel_diff = abs(row_int['imse_mean'] - row_disc['imse_mean']) / row_disc['imse_mean'] * 100
    
    print(f"{row_int['degree']:<8} {row_int['noise_level']:<10} {row_int['method']:<12} "
          f"{row_int['imse_mean']:<15.6f} {row_disc['imse_mean']:<15.6f} {rel_diff:<15.2f}")

print("=" * 95)
print("\nСохранено:")
print("  kernel_regression_integral.csv   — интегральные метрики")
print("  kernel_regression_discrete.csv   — дискретные метрики")
print("\nРекомендация: если отклонение < 1%, интегральные метрики можно считать корректными.")

СРАВНЕНИЕ ИНТЕГРАЛЬНЫХ И ДИСКРЕТНЫХ МЕТРИК
Степень  Шум        Метод        IMSE (инт)      IMSE (дискр)    Отклонение %   
-----------------------------------------------------------------------------------------------
1        low        silverman    0.162036        0.162341        0.19           
1        low        cv           1.435334        1.434366        0.07           
1        moderate   silverman    0.155697        0.156020        0.21           
1        moderate   cv           0.133213        0.133569        0.27           
1        high       silverman    0.129291        0.129669        0.29           
1        high       cv           0.179930        0.181281        0.75           
2        low        silverman    0.391991        0.392342        0.09           
2        low        cv           0.235423        0.235919        0.21           
2        moderate   silverman    0.285258        0.285774        0.18           
2        moderate   cv           0.342520        0.

In [9]:
print("=" * 85)
print(df_integral.to_string(index=False, float_format='%.6f'))
print("=" * 85)

 degree noise_level    method  imse_mean  imse_std  imae_mean  imae_std  maxerr_mean  maxerr_std
      1         low silverman   0.162036  0.127692   0.303220  0.126002     0.622027    0.310575
      1         low        cv   1.435334  7.048368   0.342159  0.385613     1.976011    7.931115
      1    moderate silverman   0.155697  0.161374   0.291610  0.138797     0.504707    0.440628
      1    moderate        cv   0.133213  0.152265   0.261826  0.136334     0.447013    0.419772
      1        high silverman   0.129291  0.100534   0.263552  0.101587     0.434250    0.238610
      1        high        cv   0.179930  0.408481   0.262964  0.154412     0.422804    0.228981
      2         low silverman   0.391991  0.388919   0.483569  0.258437     0.822938    0.466523
      2         low        cv   0.235423  0.158412   0.372275  0.124581     0.639168    0.266430
      2    moderate silverman   0.285258  0.385275   0.359416  0.242443     0.557338    0.426036
      2    moderate        cv 