# modules

In [172]:
import numpy as np
import pandas as pd

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

import matplotlib.pyplot as plt

# data

In [37]:
# Read the surrogate dataset (path is relative to the notebook's directory).
df = pd.read_csv(
    '../../../dataset/surrogate.csv',
)

# process

In [40]:
# Labels of every object-dtype column; these get one-hot encoded next.
cat_cols = (
    df
    .select_dtypes(include=['object'])
    .columns
)

In [41]:
# One-hot encode the object-dtype columns collected in `cat_cols`.
# Note: this rebinds `df`, so the cell is not meant to be run twice in a row.
df = pd.get_dummies(data=df, columns=cat_cols)

# split

In [42]:
# Targets: `y` is the test accuracy (what the surrogate predicts),
# `y_` is the train accuracy (only summarised below).
y_ = df['train_accuracy']
y = df['test_accuracy']

# Features: every remaining column once both accuracy columns are removed.
X = df.drop(['train_accuracy', 'test_accuracy'], axis=1)

In [44]:
# (mean, std) of the test accuracy followed by (mean, std) of the train accuracy.
tuple(stat for target in (y, y_) for stat in (target.mean(), target.std()))

(np.float64(71.8639393939394),
 5.25439514798917,
 np.float64(73.40521212121213),
 6.075126705250557)

In [207]:
# Hold out 10% of the rows for validation; the seed keeps the split fixed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.1,
)

In [200]:
# Optional: fit and "validate" on the complete dataset instead of the split.
# Leaving this unconditional would silently replace the hold-out split created
# by train_test_split whenever the notebook is run top to bottom, so every
# validation metric further down would be computed on training rows.
# Flip the flag only for a deliberate final fit on all data.
FIT_ON_FULL_DATA = False

if FIT_ON_FULL_DATA:
    X_train = X
    X_val = X
    y_train = y
    y_val = y

In [201]:
# Drop features whose training-set variance does not exceed 0.01.
# The selector is fitted on the training rows only and then applied to both sets.
selector = VarianceThreshold(threshold=0.01).fit(X_train)
X_train, X_val = (selector.transform(part) for part in (X_train, X_val))

In [208]:
# Standardise features using statistics learned from the training rows only,
# then apply the same transform to the validation rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [218]:
# Keep as many principal components as needed to explain 96% of the variance.
# Components are learned from the training rows; validation rows are projected
# onto the same components.
pca = PCA(n_components=0.96)
X_train, X_val = pca.fit_transform(X_train), pca.transform(X_val)

In [219]:
# Shapes of the transformed training and validation matrices.
tuple(part.shape for part in (X_train, X_val))

((29, 21), (4, 21))

# model

In [220]:
# Kernel: constant factor times an RBF with a single length scale.
# Both hyperparameters start at the given values and are tuned within the
# given bounds during fit().
kernel = C(5.0, (1e-3, 1e3)) * RBF(length_scale=0.5, length_scale_bounds=(1e-3, 1e3))

# The hyperparameter optimizer is restarted 200 times from randomly drawn
# starting points. Without a fixed random_state those draws change on every
# run, so the fitted kernel and all downstream numbers would not be
# reproducible; the seed matches the one used for the train/validation split.
gpr = GaussianProcessRegressor(
    kernel=kernel,
    alpha=1e-3,
    normalize_y=True,
    n_restarts_optimizer=200,
    random_state=42,
)

gpr.fit(X_train, y_train)
print("Fitted kernel:", gpr.kernel_)

Fitted kernel: 0.996**2 * RBF(length_scale=5.27)


In [221]:
# Predictive mean and standard deviation for every validation row.
y_pred_val, y_std_val = gpr.predict(X_val, return_std=True)

# Mean squared error against the true targets; reported below as its square root.
mse = mean_squared_error(
    y_true=y_val,
    y_pred=y_pred_val,
)
print(f"\nValidation RMSE: {np.sqrt(mse):.4f}\n")


Validation RMSE: 3.5607



In [213]:
# Show the model's predictions for the validation rows as the cell output.
y_pred_val

array([73.4406133 , 72.67081498, 74.01383729, 71.03440903])

In [139]:
# Lookup table: two-sided confidence level -> z multiplier applied to the
# predictive standard deviation when building an interval.
confidence_z = dict([
    (0.50, 0.674),    # 50%
    (0.60, 0.841),    # 60%
    (0.70, 1.036),    # 70%
    (0.80, 1.282),    # 80%
    (0.85, 1.440),    # 85%
    (0.90, 1.645),    # 90%
    (0.95, 1.960),    # 95%
    (0.98, 2.326),    # 98%
    (0.99, 2.576),    # 99%
    (0.999, 3.291),   # 99.9%
    (0.9999, 3.891),  # 99.99%
])

In [222]:
# Report, for each validation row, the true value, the predicted mean and an
# interval of +/- z predictive standard deviations around that mean.
confidence = 0.6
z = confidence_z[confidence]  # must be one of the levels tabulated in confidence_z
print(f"Confidence level: {confidence*100:.1f}%")

# A single batched predict() call replaces one call per row: the per-row
# means and standard deviations are the same, without repeating the call
# overhead for every sample.
val_means, val_stds = gpr.predict(X_val, return_std=True)

for y_true, mean, std in zip(y_val, val_means, val_stds):
    ci = z * std  # half-width of the interval
    print(f"Val Sample:  True = {y_true:.3f}, Predicted = {mean:.3f}, CI ≈ [{mean - ci:.3f}, {mean + ci:.3f}]")

Confidence level: 60.0%
Val Sample:  True = 75.290, Predicted = 73.661, CI ≈ [69.867, 77.456]
Val Sample:  True = 77.850, Predicted = 72.484, CI ≈ [68.696, 76.273]
Val Sample:  True = 78.230, Predicted = 74.103, CI ≈ [70.045, 78.160]
Val Sample:  True = 72.420, Predicted = 70.924, CI ≈ [67.183, 74.666]


In [112]:
# Size, mean and standard deviation of the test-accuracy target.
(
    y.shape,
    y.mean(),
    y.std(),
)

((33,), np.float64(71.8639393939394), 5.25439514798917)

In [None]:


# ----- 5. Inference: Predict on new (unlabeled) architectures -----
# Simulate 10 new architectures.
# The candidates must have exactly as many columns as the matrix the model was
# fitted on; a hard-coded width of 5 makes predict() fail with a shape error.
# The generator is seeded so the simulated candidates are the same on every run.
# NOTE(review): uniform [0, 1) points are only placeholders. Real candidates
# presumably need the same encoding / selection / scaling / PCA steps as the
# training data before being passed to the model - confirm.
rng = np.random.default_rng(42)
X_new = rng.random((10, gpr.n_features_in_))

y_new_pred, y_new_std = gpr.predict(X_new, return_std=True)

# Select best by predicted accuracy
best_idx = np.argmax(y_new_pred)
print(f"\nBest predicted architecture: Index {best_idx}, Predicted accuracy = {y_new_pred[best_idx]:.3f}")

# Optionally: Use UCB (exploration + exploitation)
# kappa weights the predictive uncertainty against the predicted mean.
kappa = 1.96
ucb = y_new_pred + kappa * y_new_std
best_ucb_idx = np.argmax(ucb)
print(f"Best UCB architecture: Index {best_ucb_idx}, UCB score = {ucb[best_ucb_idx]:.3f}")

# ----- 6. (Optional) Plot prediction intervals -----
# Error bars reuse kappa (1.96) so the plotted 95% band and the UCB score
# always use the same multiplier.
fig, ax = plt.subplots(figsize=(10, 5))
ax.errorbar(range(len(y_new_pred)), y_new_pred, yerr=kappa * y_new_std, fmt='o', capsize=4)
ax.set_title("Predicted Accuracies with 95% Confidence Intervals")
ax.set_xlabel("Candidate Index")
ax.set_ylabel("Predicted Accuracy")
ax.grid(True)
plt.show()
