# Реализация ядерного сглаживания

In [1]:
import numpy as np
import pandas as pd
from scipy.integrate import quad
from scipy.optimize import minimize_scalar, minimize
from statsmodels.nonparametric.kernel_regression import KernelReg
from sklearn.model_selection import KFold

In [2]:
import numpy as np
from statsmodels.nonparametric.kernel_regression import KernelReg
from sklearn.model_selection import KFold
from scipy.optimize import minimize

In [8]:
import numpy as np
import pandas as pd
from scipy.integrate import quad
from scipy.optimize import minimize_scalar
from statsmodels.nonparametric.kernel_regression import KernelReg
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

def kernel_regression_silverman(x_train, y_train):
    """Build a kernel regression whose bandwidth comes from a rule of thumb.

    The bandwidth is ``1.06 * std(x) * n**(-1/5)``, where ``std`` is NumPy's
    default (population) standard deviation and ``n`` is the number of
    training points.

    Args:
        x_train: One-dimensional predictor values (any array-like).
        y_train: Response values aligned with ``x_train``.

    Returns:
        A ``(model, h)`` tuple: the ``KernelReg`` instance built on the
        training data and the bandwidth that was used.
    """
    features = np.asarray(x_train, dtype=np.float64).reshape(-1, 1)
    targets = np.asarray(y_train, dtype=np.float64)

    # After the reshape every sample is one row, so the row count is n.
    sample_count = features.shape[0]
    spread = np.std(features)
    h = 1.06 * spread * sample_count**(-1/5)

    return KernelReg(targets, features, var_type='c', bw=[h]), h

def kernel_regression_cv(x_train, y_train):
    """Build a kernel regression with a bandwidth chosen by 5-fold CV.

    The bandwidth is searched on a log scale: the optimizer works on
    ``log(h)`` inside ``[-6, 3]`` and minimizes the validation MSE averaged
    over the folds. If the optimizer does not report success, the bandwidth
    falls back to ``1.06 * std(x) * n**(-1/5)``.

    Args:
        x_train: One-dimensional predictor values (any array-like).
        y_train: Response values aligned with ``x_train``.

    Returns:
        A ``(model, best_h)`` tuple: the ``KernelReg`` instance built on the
        full training data and the selected bandwidth.
    """
    features = np.asarray(x_train, dtype=np.float64).reshape(-1, 1)
    targets = np.asarray(y_train, dtype=np.float64)

    # Fixed random_state: every candidate bandwidth sees the same folds.
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    def cv_mse(log_h):
        bandwidth = np.exp(log_h)

        mse_sum = 0.0
        for fit_idx, holdout_idx in splitter.split(features):
            fold_model = KernelReg(
                targets[fit_idx], features[fit_idx], var_type='c', bw=[bandwidth]
            )
            fold_pred = fold_model.fit(features[holdout_idx])[0]
            mse_sum += np.mean((fold_pred - targets[holdout_idx])**2)

        return mse_sum / splitter.n_splits

    # Search log-bandwidth within a fixed, bounded interval.
    res = minimize_scalar(cv_mse, bounds=(-6, 3), method='bounded')

    if not res.success:
        # Fall back to the rule-of-thumb bandwidth.
        sample_count = len(features)
        best_h = 1.06 * np.std(features) * sample_count**(-1/5)
    else:
        best_h = np.exp(res.x)

    return KernelReg(targets, features, var_type='c', bw=[best_h]), best_h

def kernel_regression_loocv(x_train, y_train):
    """Build a kernel regression with a leave-one-out CV bandwidth.

    The bandwidth is searched on a log scale: the optimizer works on
    ``log(h)`` inside ``[-6, 3]`` and minimizes the mean squared
    leave-one-out prediction error. If the optimizer does not report
    success, the bandwidth falls back to ``1.06 * std(x) * n**(-1/5)``.

    Args:
        x_train: One-dimensional predictor values (any array-like).
        y_train: Response values aligned with ``x_train``.

    Returns:
        A ``(model, best_h)`` tuple: the ``KernelReg`` instance built on the
        full training data and the selected bandwidth.
    """
    x_train = np.asarray(x_train, dtype=np.float64).reshape(-1, 1)
    y_train = np.asarray(y_train, dtype=np.float64)

    n = len(x_train)

    def loocv_mse(log_h):
        h = np.exp(log_h)
        total_error = 0.0

        # Leave each point out in turn, refit on the rest, predict it back.
        for i in range(n):
            x_held_out = x_train[i:i+1]
            y_held_out = y_train[i]

            x_rest = np.delete(x_train, i, axis=0)
            y_rest = np.delete(y_train, i)

            kr = KernelReg(y_rest, x_rest, var_type='c', bw=[h])
            # fit() returns an array of predictions; take the single value so
            # the objective stays a plain scalar rather than a 1-element
            # array (minimize_scalar expects a scalar-valued function).
            pred = float(kr.fit(x_held_out)[0][0])
            total_error += (pred - y_held_out) ** 2

        return total_error / n

    # Search log-bandwidth within a fixed, bounded interval.
    res = minimize_scalar(loocv_mse, bounds=(-6, 3), method='bounded')

    if res.success:
        best_h = np.exp(res.x)
    else:
        # Fall back to the rule-of-thumb bandwidth.
        best_h = 1.06 * np.std(x_train) * n**(-1/5)

    model = KernelReg(y_train, x_train, var_type='c', bw=[best_h])
    return model, best_h

def create_predictor(model):
    """Wrap a fitted model as a plain ``x -> prediction`` callable.

    Args:
        model: Object exposing ``fit(points)`` whose first return item is the
            array of predictions (as the models built in this file do).

    Returns:
        A function that accepts a scalar or array-like ``x``, reshapes it to
        a single float64 column and returns the model's predictions for it.
    """
    def predictor(x):
        column = np.asarray(x, dtype=np.float64).reshape(-1, 1)
        fitted = model.fit(column)
        # Only the first item of fit()'s result (the predictions) is used.
        return fitted[0]

    return predictor

def imse(g_hat, g_true, a=0, b=1):
    """Return the squared error between two functions, averaged over [a, b].

    Computes ``(1 / (b - a)) * integral_a^b (g_hat(x) - g_true(x))**2 dx``
    by numerical quadrature.

    Args:
        g_hat: Estimated function; called with a scalar ``x``. It may return
            a scalar or a one-element array (as predictors built by
            ``create_predictor`` do).
        g_true: Reference function with the same calling convention.
        a: Lower integration limit.
        b: Upper integration limit.

    Returns:
        The integral of the squared difference divided by ``b - a``.

    Raises:
        ValueError: If a function returns more than one value for a scalar
            input.
    """
    def squared_error(x):
        diff = (np.asarray(g_hat(x), dtype=np.float64)
                - np.asarray(g_true(x), dtype=np.float64))
        # quad needs a real scalar; .item() unwraps a one-element array
        # explicitly instead of relying on implicit array-to-float
        # conversion, which NumPy has deprecated for ndim > 0 arrays.
        return (diff ** 2).item()

    integral, _ = quad(squared_error, a, b, epsabs=1e-6, limit=200)
    return integral / (b - a)

def imae(g_hat, g_true, a=0, b=1):
    """Return the absolute error between two functions, averaged over [a, b].

    Computes ``(1 / (b - a)) * integral_a^b |g_hat(x) - g_true(x)| dx``
    by numerical quadrature.

    Args:
        g_hat: Estimated function; called with a scalar ``x``. It may return
            a scalar or a one-element array (as predictors built by
            ``create_predictor`` do).
        g_true: Reference function with the same calling convention.
        a: Lower integration limit.
        b: Upper integration limit.

    Returns:
        The integral of the absolute difference divided by ``b - a``.

    Raises:
        ValueError: If a function returns more than one value for a scalar
            input.
    """
    def absolute_error(x):
        diff = (np.asarray(g_hat(x), dtype=np.float64)
                - np.asarray(g_true(x), dtype=np.float64))
        # quad needs a real scalar; .item() unwraps a one-element array
        # explicitly instead of relying on implicit array-to-float
        # conversion, which NumPy has deprecated for ndim > 0 arrays.
        return np.abs(diff).item()

    integral, _ = quad(absolute_error, a, b, epsabs=1e-6, limit=200)
    return integral / (b - a)

def maxerr(g_hat, g_true, a=0, b=1, n_grid=200):
    """Return the largest absolute difference between two functions on [a, b].

    A bounded scalar optimizer only finds a local optimum, so running it
    over the whole interval can miss the true maximum when the error curve
    has several peaks. The error is therefore sampled on a uniform grid
    first, and the optimizer only refines the neighbourhood of the best
    grid point.

    Args:
        g_hat: Estimated function; called with a scalar ``x``. It may return
            a scalar or a one-element array (as predictors built by
            ``create_predictor`` do).
        g_true: Reference function with the same calling convention.
        a: Lower end of the interval.
        b: Upper end of the interval.
        n_grid: Number of grid points for the coarse scan (at least 2 are
            always used).

    Returns:
        The maximum absolute error found, as a float.

    Raises:
        ValueError: If a function returns more than one value for a scalar
            input.
    """
    def abs_error(x):
        diff = (np.asarray(g_hat(x), dtype=np.float64)
                - np.asarray(g_true(x), dtype=np.float64))
        # Unwrap one-element arrays so everything downstream is a scalar.
        return np.abs(diff).item()

    grid = np.linspace(a, b, max(int(n_grid), 2))
    grid_errors = [abs_error(x) for x in grid]
    peak = int(np.argmax(grid_errors))
    best = grid_errors[peak]

    # Refine between the grid neighbours of the coarse maximum; at an
    # interval end the bracket is one-sided.
    lo = grid[max(peak - 1, 0)]
    hi = grid[min(peak + 1, len(grid) - 1)]
    if hi > lo:
        result = minimize_scalar(
            lambda x: -abs_error(x), bounds=(lo, hi), method='bounded'
        )
        if result.success:
            best = max(best, -result.fun)

    return best

# Оценка

## На полиномах

In [9]:
def test_kernel_regression_single(seed, df_subset, degree):
    """Evaluate all three bandwidth selectors on one dataset.

    Args:
        seed: Identifier of the dataset; used only in failure messages.
        df_subset: DataFrame rows of a single dataset. Reads the columns
            ``x``, ``y_noisy`` and ``coeff_0`` .. ``coeff_<degree>``.
        degree: Degree of the true polynomial.

    Returns:
        A dict keyed by ``'silverman'``, ``'cv'`` and ``'loocv'``. A
        successful entry holds ``imse``, ``imae``, ``maxerr``, ``bandwidth``
        and ``success=True``; a failed one holds only ``success=False``.
    """
    x_train = df_subset['x'].values
    y_train = df_subset['y_noisy'].values

    # True polynomial coefficients are taken from the first row.
    true_coeffs = [df_subset[f'coeff_{k}'].iloc[0] for k in range(degree + 1)]

    def g_true(x, c=true_coeffs):
        return sum(c[i] * x**i for i in range(len(c)))

    # (result key, label used in the failure message, fitting function)
    selectors = (
        ('silverman', 'Silverman', kernel_regression_silverman),
        ('cv', 'CV', kernel_regression_cv),
        ('loocv', 'LOOCV', kernel_regression_loocv),
    )

    results = {}
    for key, label, fit_fn in selectors:
        try:
            model, bandwidth = fit_fn(x_train, y_train)
            g_hat = create_predictor(model)

            results[key] = {
                'imse': imse(g_hat, g_true),
                'imae': imae(g_hat, g_true),
                'maxerr': maxerr(g_hat, g_true),
                'bandwidth': bandwidth,
                'success': True
            }
        except Exception as e:
            # Best effort: one failing selector must not abort the others.
            print(f"{label} failed (seed {seed}): {e}")
            results[key] = {'success': False}

    return results

def test_kernel_regression_degree(degree, noise_level, n_seeds=300):
    """Run the benchmark for one polynomial degree and one noise level.

    Reads the matching synthetic CSV, evaluates every dataset (one per
    distinct ``seed`` value, at most ``n_seeds`` of them) and aggregates the
    metrics per bandwidth-selection method.

    Args:
        degree: Degree of the true polynomial; part of the CSV file name.
        noise_level: Noise label; part of the CSV file name.
        n_seeds: Maximum number of datasets to evaluate.

    Returns:
        A list with one dict per method that succeeded at least once. Each
        dict holds the mean and standard error of ``imse``, ``imae`` and
        ``maxerr``, the mean and standard deviation of the bandwidth, and
        ``n_success``. Standard errors use the sample standard deviation
        (``ddof=1``) and are NaN when only one run succeeded.
    """
    print(f"\nТестирование: степень={degree}, шум={noise_level}")

    df = pd.read_csv(f'../datasets/synthetic/synthetic_datasets_with_coeffs/noise_{noise_level}_deg{degree}.csv')
    seeds = df['seed'].unique()[:n_seeds]
    # The file may hold fewer datasets than requested; report the real total.
    n_total = len(seeds)

    methods = ('silverman', 'cv', 'loocv')
    all_results = []

    for i, seed in enumerate(seeds):
        if i % 50 == 0:
            print(f"  Прогресс: {i}/{n_total}")

        subset = df[df['seed'] == seed]
        results = test_kernel_regression_single(seed, subset, degree)

        for method in methods:
            outcome = results[method]
            if outcome.get('success', False):
                all_results.append({
                    'degree': degree,
                    'noise_level': noise_level,
                    'seed': seed,
                    'method': method,
                    'imse': outcome['imse'],
                    'imae': outcome['imae'],
                    'maxerr': outcome['maxerr'],
                    'bandwidth': outcome['bandwidth']
                })

    def sem(values):
        # Standard error of the mean from the sample standard deviation;
        # undefined (NaN) for fewer than two observations.
        if len(values) < 2:
            return float('nan')
        return np.std(values, ddof=1) / np.sqrt(len(values))

    # Aggregate per method.
    results_agg = []
    for method in methods:
        subset = [r for r in all_results if r['method'] == method]
        if not subset:
            continue

        imse_vals = [r['imse'] for r in subset]
        imae_vals = [r['imae'] for r in subset]
        maxerr_vals = [r['maxerr'] for r in subset]
        bw_vals = [r['bandwidth'] for r in subset]

        results_agg.append({
            'degree': degree,
            'noise_level': noise_level,
            'method': method,
            'imse_mean': np.mean(imse_vals),
            'imse_sem': sem(imse_vals),
            'imae_mean': np.mean(imae_vals),
            'imae_sem': sem(imae_vals),
            'maxerr_mean': np.mean(maxerr_vals),
            'maxerr_sem': sem(maxerr_vals),
            'bandwidth_mean': np.mean(bw_vals),
            'bandwidth_std': np.std(bw_vals),
            'n_success': len(subset)
        })

    return results_agg

# ============================================================================
# ОСНОВНОЙ КОД
# ============================================================================

if __name__ == "__main__":
    # Experiment grid: polynomial degrees 1..6 crossed with three noise labels.
    degrees = range(1, 7)
    noise_levels = ['low', 'moderate', 'high']
    n_seeds = 300

    all_results = []
    for degree in degrees:
        for noise_level in noise_levels:
            results = test_kernel_regression_degree(degree, noise_level, n_seeds)
            all_results += results

    # Persist the aggregated table.
    results_df = pd.DataFrame(all_results)
    results_df.to_csv('kernel_regression_results.csv', index=False)

    # Print the same table between two horizontal rules.
    rule = "="*85
    print("\n" + rule)
    print("РЕЗУЛЬТАТЫ ЯДЕРНОЙ РЕГРЕССИИ")
    print(rule)
    print(results_df.to_string(index=False, float_format='%.6f'))
    print(rule)


Тестирование: степень=1, шум=low
  Прогресс: 0/300
  Прогресс: 50/300
  Прогресс: 100/300
  Прогресс: 150/300
  Прогресс: 200/300
  Прогресс: 250/300

Тестирование: степень=1, шум=moderate
  Прогресс: 0/300
  Прогресс: 50/300
  Прогресс: 100/300
  Прогресс: 150/300
  Прогресс: 200/300
  Прогресс: 250/300

Тестирование: степень=1, шум=high
  Прогресс: 0/300
  Прогресс: 50/300
  Прогресс: 100/300
  Прогресс: 150/300
  Прогресс: 200/300
  Прогресс: 250/300

Тестирование: степень=2, шум=low
  Прогресс: 0/300
  Прогресс: 50/300
  Прогресс: 100/300
  Прогресс: 150/300
  Прогресс: 200/300
  Прогресс: 250/300

Тестирование: степень=2, шум=moderate
  Прогресс: 0/300
  Прогресс: 50/300
  Прогресс: 100/300
  Прогресс: 150/300
  Прогресс: 200/300
  Прогресс: 250/300

Тестирование: степень=2, шум=high
  Прогресс: 0/300
  Прогресс: 50/300
  Прогресс: 100/300
  Прогресс: 150/300
  Прогресс: 200/300
  Прогресс: 250/300

Тестирование: степень=3, шум=low
  Прогресс: 0/300
  Прогресс: 50/300
  Прогресс: