In [1]:
from utils import *
from sklearn import metrics
from sklearn.decomposition import PCA
import pandas as pd

# Experiment runner

In [2]:
def _fit_and_predict(kernel, data, alpha):
    """
    Fit a `GPR` model with `kernel` on the training split and predict the test split.

    `data[0]`, `data[1]` are used for fitting and `data[2]` for prediction.
    The optimized kernel is printed. Returns `(fitted_gp, predicted_mean)`.
    """
    gp = GPR(kernel=kernel, alpha=alpha, normalize_y=True,
        n_restarts_optimizer=9, random_state=29834057)
    gp.fit(data[0], data[1])
    mean = gp.predict(data[2])
    print(gp.kernel_)
    return gp, mean


def _fitted_hyperparams(gp, noise, param):
    """
    Read the optimized constant value, noise level and `param` from `gp.kernel_`.

    The kernels built in `real_experiment` are `Constant + K` or, with noise,
    `(Constant + K) + White`; in the noisy case every path into the inner sum
    therefore gains a leading `k1__`. The noise level is `None` without noise.
    """
    fitted = gp.kernel_.get_params()
    prefix = 'k1__' if noise else ''
    return (
        fitted[prefix + 'k1__constant_value'],
        fitted['k2__noise_level'] if noise else None,
        fitted[prefix + 'k2__' + param],
    )


def _regression_scores(y_true, y_pred):
    """Return `{'rmse': ..., 'r2': ...}` computed on the flattened inputs."""
    y_true, y_pred = y_true.ravel(), y_pred.ravel()
    return {
        # RMSE is taken as sqrt(MSE): the `squared=False` flag of
        # `mean_squared_error` was deprecated in scikit-learn 1.4 and
        # removed in 1.6, while this form works on every version.
        'rmse' : np.sqrt(metrics.mean_squared_error(y_true, y_pred)),
        'r2' : metrics.r2_score(y_true, y_pred)
    }


def real_experiment(data, depths, alpha=1e-5):
    """
    Fit GP regressors with NTK (one per depth), Laplace and Gaussian kernels
    on a single dataset and collect hyperparameters, predictions and scores.

    Parameters
    ----------
    data : sequence
        `[X_train, y_train, X_test, y_test, norm : bool, noise : bool, name : str]`.
        `norm` is only recorded in the output; `noise` adds a `WhiteKernel`
        term to every kernel.
    depths : iterable of int
        Depths passed to `NTK`; one model is fitted per depth.
    alpha : float
        Forwarded to every `GPR` as `alpha`.

    Returns
    -------
    dict
        `kernel`  : fitted hyperparameters per model (`ntk_<depth>`, `lap`, `gaus`)
        `metrics` : test `rmse` and `r2` per model (same keys as `kernel`)
        `dataset` : `name`, `norm`, `noise` and the `test` split `[X_test, y_test]`
        `means`   : tuple of flattened test predictions, NTK models first
                    (in `depths` order), then Laplace, then Gaussian
    """
    norm, noise, name = data[-3], data[-2], data[-1]

    print(f'\n{name} :\nnorm  = {norm}\nnoise = {noise}\ndepth = {depths}')

    exp_data = {'kernel' : {}, 'metrics' : {}}

    #########################
    # Neural tangent Kernel #
    #########################

    means_n = []
    for depth in depths:
        ntk = (
            ConstantKernel(constant_value=1) +
            NTK(depth=depth, bias=0.1)
        )
        if noise:
            ntk = ntk + WhiteKernel(noise_level=0.1)

        gp_n, mean_n = _fit_and_predict(ntk, data, alpha)
        const_val_n, noise_lvl_n, bias = _fitted_hyperparams(gp_n, noise, 'bias')

        means_n.append(mean_n.ravel())

        exp_data['kernel'][f'ntk_{depth}'] = {
                'C' : const_val_n,
                'W' : noise_lvl_n,
                'depth' : depth,
                'bias' : bias
            }
        exp_data['metrics'][f'ntk_{depth}'] = _regression_scores(data[3], mean_n)

    #########################
    #   Lap + Gaus Kernel   #
    #########################

    # Matern with nu=1/2 is the Laplace kernel, nu=inf the Gaussian one.
    # Insertion order (lap, then gaus) fixes the order of fits and outputs.
    matern_nu = {'lap' : 1/2, 'gaus' : np.inf}

    means_m = []
    for key, nu in matern_nu.items():
        kernel = (
            ConstantKernel(constant_value=1) +
            Matern(nu=nu, length_scale=1)
        )
        if noise:
            kernel = kernel + WhiteKernel(noise_level=0.1)

        gp_m, mean_m = _fit_and_predict(kernel, data, alpha)
        const_val_m, noise_lvl_m, ell = _fitted_hyperparams(gp_m, noise, 'length_scale')

        means_m.append(mean_m.ravel())

        exp_data['kernel'][key] = {
                'C' : const_val_m,
                'W' : noise_lvl_m,
                'ell' : ell
            }
        exp_data['metrics'][key] = _regression_scores(data[3], mean_m)

    #########################
    #         Data          #
    #########################

    exp_data['dataset'] = {
        'name' : name,
        'norm' : norm,
        'noise': noise,
        'test' : [data[2], data[3]]
    }

    exp_data['means'] = (*means_n, *means_m)

    return exp_data

# Datasets

In [3]:
# Registry of experiment inputs. Every entry appended below is a list laid out as
# [X_train, y_train, X_test, y_test, norm : bool, noise : bool, name : str]
datasets = []

In [4]:
# Concrete dataset: every column but the last is a feature, the last is the target.
concrete = pd.read_csv('./real_world_data/concrete.csv', header=0)

names_c = concrete.columns

X = concrete.drop(names_c[-1], axis=1)
X = X.to_numpy()
y = concrete[names_c[-1]]
y = y.to_numpy()

# Standardize each feature column to zero mean and unit standard deviation.
# dtype=float is forced because np.zeros_like would otherwise inherit X's
# dtype, and an all-integer feature matrix would silently truncate the z-scores.
# NOTE(review): the mean/std are computed on the full data before the
# train/test split, so test rows influence the scaling — confirm this is intended.
X_t = np.zeros_like(X, dtype=float)
for i in range(0, X.shape[1]):
    X_t[:,i] = (X[:,i] - np.mean(X[:,i])) / np.std(X[:,i])

y_t = y

# Variant 1 (norm=False): standardized features, registered without and with noise.
X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.25, random_state=13450978)

datasets.append([X_train, y_train, X_test, y_test, False, False, 'Concrete'])
datasets.append([X_train, y_train, X_test, y_test, False, True, 'Concrete'])

# Variant 2 (norm=True): features additionally passed through normalize(..., axis=1)
# (presumably sklearn's row-wise normalization — verify in utils). The same
# random_state is reused for the split.
X_train, X_test, y_train, y_test = train_test_split(normalize(X_t, axis=1), y_t, test_size=0.25, random_state=13450978)

datasets.append([X_train, y_train, X_test, y_test, True, False, 'Concrete'])
datasets.append([X_train, y_train, X_test, y_test, True, True, 'Concrete'])

In [6]:
# Forest fires dataset: 'area' is the target, the remaining columns are features.
# Earlier variants of this cell dropped only ['month', 'day'] or
# ['month', 'day', 'X', 'Y']; the current selection also removes the
# FFMC, DMC, DC and ISI columns.
forest_fires = pd.read_csv('./real_world_data/forestfires.csv', header=0)
forest_fires = forest_fires.drop(
    ['month', 'day', 'X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI'], axis=1)

names_f = forest_fires.columns

X = forest_fires.drop('area', axis=1)
X = X.to_numpy()
y = forest_fires['area']
y = y.to_numpy()

# Standardize each feature column to zero mean and unit standard deviation.
# dtype=float is forced because np.zeros_like would otherwise inherit X's
# dtype, and an all-integer feature matrix would silently truncate the z-scores.
# NOTE(review): the mean/std are computed on the full data before the
# train/test split, so test rows influence the scaling — confirm this is intended.
X_t = np.zeros_like(X, dtype=float)
for i in range(0, X.shape[1]):
    X_t[:,i] = (X[:,i] - np.mean(X[:,i])) / np.std(X[:,i])

# The target is modelled on a log(1 + area) scale.
y_t = np.log(y + 1)

# Variant 1 (norm=False): standardized features, registered without and with noise.
X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.25, random_state=13450978)

datasets.append([X_train, y_train, X_test, y_test, False, False, 'Fires'])
datasets.append([X_train, y_train, X_test, y_test, False, True, 'Fires'])

# Variant 2 (norm=True): features additionally passed through normalize(..., axis=1)
# (presumably sklearn's row-wise normalization — verify in utils). The same
# random_state is reused for the split.
X_train, X_test, y_train, y_test = train_test_split(normalize(X_t, axis=1), y_t, test_size=0.25, random_state=13450978)

datasets.append([X_train, y_train, X_test, y_test, True, False, 'Fires'])
datasets.append([X_train, y_train, X_test, y_test, True, True, 'Fires'])

# Results

In [7]:
# NTK depths evaluated in the experiments.
depths = (2, 3, 10)

# Row layout of the result tables: dataset name x metric x noise flag.
arrays = [
    ["Concrete", "Fires"],
    ['rmse', 'r2'],
    [False, True],  # Noise
]
index = pd.MultiIndex.from_product(arrays, names=['Dataset', 'Metric', 'Noise'])

# One column per fitted model.
result_columns = ['NTK D=2', 'NTK D=3', 'NTK D=10', 'Laplace', 'Gaussian']

# Two empty tables with identical layout; a later cell writes the runs with
# norm=False into df_rd and the runs with norm=True into df_sd.
df_rd, df_sd = (
    pd.DataFrame(index=index, columns=result_columns) for _ in range(2)
)

df_rd

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NTK D=2,NTK D=3,NTK D=10,Laplace,Gaussian
Dataset,Metric,Noise,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Concrete,rmse,False,,,,,
Concrete,rmse,True,,,,,
Concrete,r2,False,,,,,
Concrete,r2,True,,,,,
Fires,rmse,False,,,,,
Fires,rmse,True,,,,,
Fires,r2,False,,,,,
Fires,r2,True,,,,,


In [8]:
%%capture
experiment_outputs = []
depths = (2, 3, 10)
for data in datasets:
    experiment_outputs.append(real_experiment(data, depths))

In [11]:
# Persist the experiment results under the name 'exp_real'.
# save_data is not defined in this notebook (presumably provided by utils — verify).
save_data(experiment_outputs, 'exp_real')

In [27]:
# Replace the in-memory results with whatever is stored under 'exp_real', so
# the cells below can run without refitting the models.
# load_data is not defined in this notebook (presumably provided by utils — verify).
experiment_outputs = load_data('exp_real')

In [10]:
# Inspect the fitted kernel hyperparameters of one run. Index 7 is the eighth
# entry appended to `datasets`, i.e. 'Fires' with norm=True and noise=True
# (this relies on the results being in the same order as `datasets`).
experiment_outputs[7]['kernel']

{'ntk_2': {'C': 0.07497901611203577,
  'W': 0.937067664649129,
  'depth': 2,
  'bias': 192.7234185412923},
 'ntk_3': {'C': 0.0749314253607406,
  'W': 0.9203341969346668,
  'depth': 3,
  'bias': 183.83011869157457},
 'ntk_10': {'C': 0.07488368380737231,
  'W': 0.8356832622141922,
  'depth': 10,
  'bias': 214.87291938448763},
 'lap': {'C': 9.999999999999997e-06,
  'W': 0.9980877866913521,
  'ell': 286.49267417827105},
 'gaus': {'C': 9.999999999999997e-06,
  'W': 1.0009083894197603,
  'ell': 24.22180068419968}}

In [12]:
depths = (2,3,10)

# Result-table column -> key of the model inside exp['metrics'].
model_keys = {f'NTK D={depth}' : f'ntk_{depth}' for depth in depths}
model_keys['Laplace'] = 'lap'
model_keys['Gaussian'] = 'gaus'

for exp in experiment_outputs:
    name = exp['dataset']['name']
    noise = exp['dataset']['noise']

    # Runs with norm=True are collected in df_sd, all others in df_rd.
    table = df_sd if exp['dataset']['norm'] else df_rd

    for column, model in model_keys.items():
        for metric in ('rmse', 'r2'):
            # A single .loc assignment writes into the frame itself. The
            # chained form df[column][row] = value assigns to an intermediate
            # Series and is not propagated under pandas Copy-on-Write.
            table.loc[(name, metric, noise), column] = exp['metrics'][model][metric]

In [None]:
# save_data(experiment_outputs, 'exp_real')

In [13]:
# Test-set RMSE / R^2 for the runs whose dataset flag `norm` is False.
df_rd

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NTK D=2,NTK D=3,NTK D=10,Laplace,Gaussian
Dataset,Metric,Noise,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Concrete,rmse,False,4.541733,4.583389,5.188233,5.130875,15.485885
Concrete,rmse,True,4.956185,5.061394,5.647737,5.320974,5.305764
Concrete,r2,False,0.919441,0.917956,0.894874,0.897185,0.063421
Concrete,r2,True,0.904067,0.899951,0.875428,0.889426,0.890057
Fires,rmse,False,1.81501,1.811645,1.788682,1.623822,1.623822
Fires,rmse,True,1.586203,1.584849,1.592349,1.600751,1.601554
Fires,r2,False,-0.30422,-0.29939,-0.266658,-0.043927,-0.043927
Fires,r2,True,0.003883,0.005582,-0.003851,-0.014473,-0.015491


In [14]:
# Test-set RMSE / R^2 for the runs whose dataset flag `norm` is True.
df_sd

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NTK D=2,NTK D=3,NTK D=10,Laplace,Gaussian
Dataset,Metric,Noise,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Concrete,rmse,False,4.780826,4.801791,5.241618,5.075074,15.485835
Concrete,rmse,True,5.35711,5.290564,5.571019,5.436794,5.18693
Concrete,r2,False,0.910736,0.909951,0.892699,0.899409,0.063427
Concrete,r2,True,0.887919,0.890686,0.878789,0.88456,0.894927
Fires,rmse,False,1.764876,1.762058,1.754268,1.623822,1.623822
Fires,rmse,True,1.589066,1.586511,1.592348,1.600026,1.600483
Fires,r2,False,-0.233167,-0.22923,-0.218386,-0.043927,-0.043927
Fires,r2,True,0.000283,0.003495,-0.00385,-0.013555,-0.014134
