In [1]:
# Install the `ucimlrepo` package; later cells import `fetch_ucirepo` from it.
# NOTE(review): a bare `pip ...` line presumably relies on IPython automagic
# (i.e. it is treated as `%pip ...`) — confirm, and consider pinning a version.
pip install ucimlrepo




# Regression Datasets

In [2]:
import numpy as np
import pandas as pd
import time
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [3]:
# Define activation and utility functions
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    The direct formula overflows inside ``np.exp(-x)`` for large negative
    ``x`` (NumPy emits a RuntimeWarning). The mathematically identical form
    ``0.5 * (1 + tanh(x / 2))`` saturates cleanly to 0 and 1 instead.

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Pre-activation values.

    Returns
    -------
    Same shape as ``x``, with values in ``[0, 1]``.
    """
    return 0.5 * (1.0 + np.tanh(0.5 * x))

def get_all_H(X, W_list, b_list):
    """Forward pass: return the activations of every hidden layer.

    The first layer is ``sigmoid(X @ W_list[0] + b_list[0])``. Each later
    layer ``l`` receives the raw inputs concatenated column-wise with the
    previous layer's activations: ``sigmoid([X | H_{l-1}] @ W_list[l] + b_list[l])``.

    Returns a list with one activation matrix per entry of ``W_list``.
    """
    activations = [sigmoid(X @ W_list[0] + b_list[0])]
    for depth in range(1, len(W_list)):
        previous = activations[-1]
        # Skip connection: deeper layers always see the original features too.
        layer_input = np.concatenate([X, previous], axis=1)
        pre_activation = layer_input @ W_list[depth] + b_list[depth]
        activations.append(sigmoid(pre_activation))
    return activations

def get_D_deep(X, H_list, l):
    """Build the design matrix ``[X | H_list[l]]`` for layer ``l``.

    The raw inputs and the selected layer's activations are joined
    column-wise, so both must have the same number of rows.
    """
    hidden = H_list[l]
    design = np.concatenate((X, hidden), axis=1)
    return design

def init_fit(X0, y0, in_dim, hid_dim, L, lam, rng):
    """Draw the random hidden layers and fit every layer's ridge readout.

    Parameters
    ----------
    X0, y0 : initial batch of inputs and regression targets.
    in_dim : number of input features (columns of ``X0``).
    hid_dim : number of hidden units in every layer.
    L : number of hidden layers.
    lam : ridge penalty added to the diagonal of ``D.T @ D``.
    rng : generator providing ``standard_normal`` (weights are drawn from it).

    Returns
    -------
    (beta_list, P_list, W_list, b_list) : per-layer readout weights, inverse
    regularised Gram matrices, hidden weights and hidden biases.
    """
    W_list, b_list = [], []
    for layer in range(L):
        # Layer 0 sees only the inputs; deeper layers see [X | previous H].
        fan_in = in_dim if layer == 0 else in_dim + hid_dim
        W_list.append(rng.standard_normal((fan_in, hid_dim)))
        b_list.append(rng.standard_normal((hid_dim,)))

    H_list = get_all_H(X0, W_list, b_list)

    beta_list, P_list = [], []
    for layer in range(L):
        D = get_D_deep(X0, H_list, layer)
        gram = D.T @ D + lam * np.eye(D.shape[1])
        P = np.linalg.inv(gram)
        P_list.append(P)
        beta_list.append(P @ D.T @ y0)
    return beta_list, P_list, W_list, b_list

def partial_fit(Xk, yk, beta_list, P_list, W_list, b_list, L):
    """Recursive least-squares update of every layer's readout with one batch.

    For each layer ``l`` the design matrix ``Dk = [Xk | H_l(Xk)]`` is built and
    the stored state is updated as

        P_new    = P - P Dk^T (I + Dk P Dk^T)^-1 Dk P
        beta_new = beta + P_new Dk^T (yk - Dk beta)

    Parameters
    ----------
    Xk : 2-D array of batch inputs (k rows).
    yk : targets for the batch (k rows).
    beta_list, P_list : per-layer readouts and inverse Gram matrices;
        entries ``0 .. L-1`` are replaced in place.
    W_list, b_list : fixed random hidden weights and biases (read only).
    L : number of layers to update.

    Returns
    -------
    None. ``beta_list`` and ``P_list`` are mutated.
    """
    H_list = get_all_H(Xk, W_list, b_list)
    for l in range(L):
        Dk = get_D_deep(Xk, H_list, l)
        P = P_list[l]
        beta = beta_list[l]

        # Group the products so that only thin (d, k) / (k, d) factors are
        # multiplied. Evaluating `P @ Dk.T @ inv @ Dk @ P` strictly left to
        # right ends with a (d, d) @ (d, d) product, which is cubic in the
        # feature width d and dominates the cost of every update.
        P_Dt = P @ Dk.T                                # (d, k)
        Dk_P = Dk @ P                                  # (k, d)
        innovation = np.eye(Dk.shape[0]) + Dk @ P_Dt   # (k, k)
        # Solve the small (k, k) system rather than forming its inverse.
        P_new = P - P_Dt @ np.linalg.solve(innovation, Dk_P)

        residual = yk - Dk @ beta
        beta_new = beta + P_new @ (Dk.T @ residual)

        P_list[l] = P_new
        beta_list[l] = beta_new

def predict_average(X, beta_list, W_list, b_list, L):
    """Predict with every layer's readout and return their element-wise mean.

    Layer ``l`` predicts ``[X | H_l] @ beta_list[l]``; the ``L`` predictions
    are averaged along the layer axis.
    """
    H_list = get_all_H(X, W_list, b_list)
    layer_outputs = [
        get_D_deep(X, H_list, idx) @ beta_list[idx]
        for idx in range(L)
    ]
    return np.mean(layer_outputs, axis=0)


## Airfoil_Noise


In [4]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Step 1: Fetch dataset from UCI repo by ID (291 = Airfoil Self Noise)
airfoil_self_noise = fetch_ucirepo(id=291)

# Step 2: Extract features and target as NumPy arrays.
# The target is cast to float32 and flattened to a 1-D vector.
X = airfoil_self_noise.data.features.to_numpy()
y = airfoil_self_noise.data.targets.to_numpy().astype(np.float32).ravel()

# Step 3: Preview the first 10 feature rows and report the dataset size.
# X is already a bare ndarray here, so the preview shows positional column labels.
print("\nFeature sample:")
print(pd.DataFrame(X).head(10))
print(f"\nDataset dimensions: {X.shape[0]} rows × {X.shape[1]} columns")

# Step 4: 70:30 train-test split (fixed random_state for a repeatable partition)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Confirm dimensions
print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set : {X_test.shape[0]} samples")



Feature sample:
        0    1       2     3         4
0   800.0  0.0  0.3048  71.3  0.002663
1  1000.0  0.0  0.3048  71.3  0.002663
2  1250.0  0.0  0.3048  71.3  0.002663
3  1600.0  0.0  0.3048  71.3  0.002663
4  2000.0  0.0  0.3048  71.3  0.002663
5  2500.0  0.0  0.3048  71.3  0.002663
6  3150.0  0.0  0.3048  71.3  0.002663
7  4000.0  0.0  0.3048  71.3  0.002663
8  5000.0  0.0  0.3048  71.3  0.002663
9  6300.0  0.0  0.3048  71.3  0.002663

Dataset dimensions: 1503 rows × 5 columns

Train set: 1052 samples
Test set : 451 samples


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# Step 1: Standardization (scaler is fit on the training split, then applied
# to the test split). X_train / X_test are overwritten by their scaled versions.
# NOTE(review): the scaler sees the whole training split before the K-fold
# loop below, so every validation fold contributes to the scaling statistics;
# the cross-validated scores may be slightly optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 2: Hyperparameter search space (8 * 3 * 10 * 6 = 1440 configurations)
lam_powers = list(range(-6, 13, 2))                         # exponents x = -6, -4, ..., 12
lam_values = [1 / (2 ** x) for x in lam_powers]             # regularization lam = 2^(-x): 64 down to 2^-12
L_values = list(range(2, 10))                               # number of hidden layers: 2..9
N_values = [256, 512, 1024]                                 # number of hidden units per layer
batch_sizes = [20, 22, 24, 26, 28, 30]                      # online batch sizes

# Step 3: Setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# One generator is shared by every configuration and fold, so the random
# weights a given model receives depend on the order the grid is visited in.
rng = np.random.default_rng(42)

best_rmse = float("inf")
best_config = None
results = []

# Step 4: Grid search with 5-fold cross-validation
for L in L_values:
    for N in N_values:
        for lam in lam_values:
            for batch_size in batch_sizes:
                fold_rmse = []
                for train_idx, val_idx in kf.split(X_train):
                    X_fold_train, X_val = X_train[train_idx], X_train[val_idx]
                    y_fold_train, y_val = y_train[train_idx], y_train[val_idx]

                    # Initial batch: first half of the fold's training rows
                    init_size = len(X_fold_train) // 2
                    X0, y0 = X_fold_train[:init_size], y_fold_train[:init_size]

                    # in_dim is read from the global raw matrix X defined in the
                    # loading cell, not from X_train.
                    beta_list, P_list, W_list, b_list = init_fit(
                        X0, y0, X.shape[1], N, L, lam, rng
                    )

                    # Online update: feed the remaining rows in chunks of
                    # batch_size (the last chunk may be shorter)
                    for i in range(init_size, len(X_fold_train), batch_size):
                        Xk = X_fold_train[i:i+batch_size]
                        yk = y_fold_train[i:i+batch_size]
                        partial_fit(Xk, yk, beta_list, P_list, W_list, b_list, L)

                    # Validation prediction (mean of the per-layer outputs)
                    pred_val = predict_average(X_val, beta_list, W_list, b_list, L)
                    rmse = np.sqrt(mean_squared_error(y_val, pred_val))
                    fold_rmse.append(rmse)

                # Aggregate the fold scores for this configuration
                mean_rmse = np.mean(fold_rmse)
                std_rmse = np.std(fold_rmse)
                results.append((L, N, lam, batch_size, mean_rmse, std_rmse))

                # Keep the configuration with the lowest mean validation RMSE
                if mean_rmse < best_rmse:
                    best_rmse = mean_rmse
                    best_config = (L, N, lam, batch_size, std_rmse)

# Step 5: Report the winning configuration
print(f"\n✅ Best RMSE = {best_rmse:.4f} ± {best_config[4]:.4f}")
print(f"    at L={best_config[0]}, N={best_config[1]}, λ={best_config[2]:.2e}, batch_size={best_config[3]}")



✅ Best RMSE = 2.5073 ± 0.2197
    at L=2, N=1024, λ=2.50e-01, batch_size=22


In [6]:
# Unpack best hyperparameters (the trailing CV standard deviation is not needed)
L, N, lam, batch_size, _ = best_config

# Evaluate over multiple runs for a stable estimate (e.g. 5 trials)
num_trials = 5
rmse_list = []

# One seeded generator for all trials: each trial still draws fresh random
# hidden weights, but the reported mean ± std is reproducible on re-run.
eval_rng = np.random.default_rng(42)

# The initial batch is the same for every trial, so build it once.
init_size = len(X_train) // 2
X0, y0 = X_train[:init_size], y_train[:init_size]

for trial in range(num_trials):
    # Re-initialize the model; in_dim comes from the data actually being fit.
    beta_list, P_list, W_list, b_list = init_fit(
        X0, y0, X_train.shape[1], N, L, lam, eval_rng
    )

    # Online training with the remaining train set
    for i in range(init_size, len(X_train), batch_size):
        Xk = X_train[i:i+batch_size]
        yk = y_train[i:i+batch_size]
        partial_fit(Xk, yk, beta_list, P_list, W_list, b_list, L)

    # Evaluate on X_test
    pred_test = predict_average(X_test, beta_list, W_list, b_list, L)
    rmse = np.sqrt(mean_squared_error(y_test, pred_test))
    rmse_list.append(rmse)

# Final results
mean_rmse = np.mean(rmse_list)
std_rmse = np.std(rmse_list)

print(f"\n📊 Final Test RMSE = {mean_rmse:.4f} ± {std_rmse:.4f}")



📊 Final Test RMSE = 2.4445 ± 0.0354


## Daily Demand

In [20]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Step 1: Fetch dataset from UCI repo by ID 60 (the original author labelled
# this ID "Daily Demand Forecasting Orders").
# NOTE(review): the output recorded for this cell (345 rows × 5 columns) does
# not look like Daily Demand Forecasting Orders; id=60 appears to resolve to a
# different UCI dataset (possibly Liver Disorders). Confirm the intended
# dataset ID before trusting this section's title or its reported RMSE.
daily_demand = fetch_ucirepo(id=60)

# Step 2: Extract features and target as NumPy arrays.
# The target is cast to float32 and flattened to a 1-D vector.
X = daily_demand.data.features.to_numpy()
y = daily_demand.data.targets.to_numpy().astype(np.float32).ravel()

# Step 3: Preview the first 10 feature rows and report the dataset size.
# X is already a bare ndarray here, so the preview shows positional column labels.
print("\nFeature sample:")
print(pd.DataFrame(X).head(10))
print(f"\nDataset dimensions: {X.shape[0]} rows × {X.shape[1]} columns")

# Step 4: 70:30 train-test split (fixed random_state for a repeatable partition)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Confirm dimensions
print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set : {X_test.shape[0]} samples")



Feature sample:
    0   1   2   3   4
0  85  92  45  27  31
1  85  64  59  32  23
2  86  54  33  16  54
3  91  78  34  24  36
4  87  70  12  28  10
5  98  55  13  17  17
6  88  62  20  17   9
7  88  67  21  11  11
8  92  54  22  20   7
9  90  60  25  19   5

Dataset dimensions: 345 rows × 5 columns

Train set: 241 samples
Test set : 104 samples


In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# Step 1: Standardization (scaler is fit on the training split, then applied
# to the test split). X_train / X_test are overwritten by their scaled versions.
# NOTE(review): the scaler sees the whole training split before the K-fold
# loop below, so every validation fold contributes to the scaling statistics;
# the cross-validated scores may be slightly optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 2: Hyperparameter search space (8 * 3 * 10 * 6 = 1440 configurations)
lam_powers = list(range(-6, 13, 2))                         # exponents x = -6, -4, ..., 12
lam_values = [1 / (2 ** x) for x in lam_powers]             # regularization lam = 2^(-x): 64 down to 2^-12
L_values = list(range(2, 10))                               # number of hidden layers: 2..9
N_values = [256, 512, 1024]                                 # number of hidden units per layer
batch_sizes = [20, 22, 24, 26, 28, 30]                      # online batch sizes

# Step 3: Setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# One generator is shared by every configuration and fold, so the random
# weights a given model receives depend on the order the grid is visited in.
rng = np.random.default_rng(42)

best_rmse = float("inf")
best_config = None
results = []

# Step 4: Grid search with 5-fold cross-validation
for L in L_values:
    for N in N_values:
        for lam in lam_values:
            for batch_size in batch_sizes:
                fold_rmse = []
                for train_idx, val_idx in kf.split(X_train):
                    X_fold_train, X_val = X_train[train_idx], X_train[val_idx]
                    y_fold_train, y_val = y_train[train_idx], y_train[val_idx]

                    # Initial batch: first half of the fold's training rows
                    init_size = len(X_fold_train) // 2
                    X0, y0 = X_fold_train[:init_size], y_fold_train[:init_size]

                    # in_dim is read from the global raw matrix X defined in the
                    # loading cell, not from X_train.
                    beta_list, P_list, W_list, b_list = init_fit(
                        X0, y0, X.shape[1], N, L, lam, rng
                    )

                    # Online update: feed the remaining rows in chunks of
                    # batch_size (the last chunk may be shorter)
                    for i in range(init_size, len(X_fold_train), batch_size):
                        Xk = X_fold_train[i:i+batch_size]
                        yk = y_fold_train[i:i+batch_size]
                        partial_fit(Xk, yk, beta_list, P_list, W_list, b_list, L)

                    # Validation prediction (mean of the per-layer outputs)
                    pred_val = predict_average(X_val, beta_list, W_list, b_list, L)
                    rmse = np.sqrt(mean_squared_error(y_val, pred_val))
                    fold_rmse.append(rmse)

                # Aggregate the fold scores for this configuration
                mean_rmse = np.mean(fold_rmse)
                std_rmse = np.std(fold_rmse)
                results.append((L, N, lam, batch_size, mean_rmse, std_rmse))

                # Keep the configuration with the lowest mean validation RMSE
                if mean_rmse < best_rmse:
                    best_rmse = mean_rmse
                    best_config = (L, N, lam, batch_size, std_rmse)

# Step 5: Report the winning configuration
print(f"\n✅ Best RMSE = {best_rmse:.4f} ± {best_config[4]:.4f}")
print(f"    at L={best_config[0]}, N={best_config[1]}, λ={best_config[2]:.2e}, batch_size={best_config[3]}")



✅ Best RMSE = 3.0378 ± 0.3824
    at L=7, N=512, λ=6.40e+01, batch_size=20


In [22]:
# Unpack best hyperparameters (the trailing CV standard deviation is not needed)
L, N, lam, batch_size, _ = best_config

# Evaluate over multiple runs for a stable estimate (e.g. 5 trials)
num_trials = 5
rmse_list = []

# One seeded generator for all trials: each trial still draws fresh random
# hidden weights, but the reported mean ± std is reproducible on re-run.
eval_rng = np.random.default_rng(42)

# The initial batch is the same for every trial, so build it once.
init_size = len(X_train) // 2
X0, y0 = X_train[:init_size], y_train[:init_size]

for trial in range(num_trials):
    # Re-initialize the model; in_dim comes from the data actually being fit.
    beta_list, P_list, W_list, b_list = init_fit(
        X0, y0, X_train.shape[1], N, L, lam, eval_rng
    )

    # Online training with the remaining train set
    for i in range(init_size, len(X_train), batch_size):
        Xk = X_train[i:i+batch_size]
        yk = y_train[i:i+batch_size]
        partial_fit(Xk, yk, beta_list, P_list, W_list, b_list, L)

    # Evaluate on X_test
    pred_test = predict_average(X_test, beta_list, W_list, b_list, L)
    rmse = np.sqrt(mean_squared_error(y_test, pred_test))
    rmse_list.append(rmse)

# Final results
mean_rmse = np.mean(rmse_list)
std_rmse = np.std(rmse_list)

print(f"\n📊 Final Test RMSE = {mean_rmse:.4f} ± {std_rmse:.4f}")



📊 Final Test RMSE = 2.8548 ± 0.0226


# Classification

In [34]:
# Activation function
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    The direct formula overflows inside ``np.exp(-x)`` for large negative
    ``x`` (NumPy emits a RuntimeWarning). The mathematically identical form
    ``0.5 * (1 + tanh(x / 2))`` saturates cleanly to 0 and 1 instead.

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Pre-activation values.

    Returns
    -------
    Same shape as ``x``, with values in ``[0, 1]``.
    """
    return 0.5 * (1.0 + np.tanh(0.5 * x))

# Forward pass through all L hidden layers
def get_all_H(X, W_list, b_list):
    """Forward pass: return the activations of every hidden layer.

    The first layer is ``sigmoid(X @ W_list[0] + b_list[0])``. Each later
    layer ``l`` receives the raw inputs concatenated column-wise with the
    previous layer's activations: ``sigmoid([X | H_{l-1}] @ W_list[l] + b_list[l])``.

    Returns a list with one activation matrix per entry of ``W_list``.
    """
    activations = [sigmoid(X @ W_list[0] + b_list[0])]
    for depth in range(1, len(W_list)):
        previous = activations[-1]
        # Skip connection: deeper layers always see the original features too.
        layer_input = np.concatenate([X, previous], axis=1)
        pre_activation = layer_input @ W_list[depth] + b_list[depth]
        activations.append(sigmoid(pre_activation))
    return activations

# Construct deep feature matrix D for each layer
def get_D_deep(X, H_list, l):
    """Build the design matrix ``[X | H_list[l]]`` for layer ``l``.

    The raw inputs and the selected layer's activations are joined
    column-wise, so both must have the same number of rows.
    """
    hidden = H_list[l]
    design = np.concatenate((X, hidden), axis=1)
    return design

# One-hot encode y for classification
def one_hot_encode(y, num_classes):
    """Map integer class labels to one-hot rows.

    Each label selects the matching row of a ``num_classes x num_classes``
    identity matrix, so the result has one trailing axis of length
    ``num_classes`` appended to the shape of ``y``.
    """
    identity = np.eye(num_classes)
    encoded = identity[y]
    return encoded

# Initial training for classification
def init_fit(X0, y0, in_dim, hid_dim, L, lam, rng, num_classes):
    """Draw the random hidden layers and fit every layer's ridge readout.

    Parameters
    ----------
    X0 : initial batch of inputs.
    y0 : integer class labels for the initial batch (one-hot encoded here).
    in_dim : number of input features (columns of ``X0``).
    hid_dim : number of hidden units in every layer.
    L : number of hidden layers.
    lam : ridge penalty added to the diagonal of ``D.T @ D``.
    rng : generator providing ``standard_normal`` (weights are drawn from it).
    num_classes : width of the one-hot target matrix.

    Returns
    -------
    (beta_list, P_list, W_list, b_list) : per-layer readout weights, inverse
    regularised Gram matrices, hidden weights and hidden biases.
    """
    W_list, b_list = [], []
    for layer in range(L):
        # Layer 0 sees only the inputs; deeper layers see [X | previous H].
        fan_in = in_dim if layer == 0 else in_dim + hid_dim
        W_list.append(rng.standard_normal((fan_in, hid_dim)))
        b_list.append(rng.standard_normal((hid_dim,)))

    H_list = get_all_H(X0, W_list, b_list)
    targets = one_hot_encode(y0, num_classes)

    beta_list, P_list = [], []
    for layer in range(L):
        D = get_D_deep(X0, H_list, layer)
        gram = D.T @ D + lam * np.eye(D.shape[1])
        P = np.linalg.inv(gram)
        P_list.append(P)
        beta_list.append(P @ D.T @ targets)

    return beta_list, P_list, W_list, b_list

# Online update with new batch for classification
def partial_fit(Xk, yk, beta_list, P_list, W_list, b_list, L, num_classes):
    """Recursive least-squares update of every layer's readout with one batch.

    Labels are one-hot encoded to ``Yk``; then, for each layer ``l``, the
    design matrix ``Dk = [Xk | H_l(Xk)]`` is built and the stored state is
    updated as

        P_new    = P - P Dk^T (I + Dk P Dk^T)^-1 Dk P
        beta_new = beta + P_new Dk^T (Yk - Dk beta)

    Parameters
    ----------
    Xk : 2-D array of batch inputs (k rows).
    yk : integer class labels for the batch (k entries).
    beta_list, P_list : per-layer readouts and inverse Gram matrices;
        entries ``0 .. L-1`` are replaced in place.
    W_list, b_list : fixed random hidden weights and biases (read only).
    L : number of layers to update.
    num_classes : width of the one-hot target matrix.

    Returns
    -------
    None. ``beta_list`` and ``P_list`` are mutated.
    """
    H_list = get_all_H(Xk, W_list, b_list)
    Yk = one_hot_encode(yk, num_classes)

    for l in range(L):
        Dk = get_D_deep(Xk, H_list, l)
        P = P_list[l]
        beta = beta_list[l]

        # Group the products so that only thin (d, k) / (k, d) factors are
        # multiplied. Evaluating `P @ Dk.T @ inv @ Dk @ P` strictly left to
        # right ends with a (d, d) @ (d, d) product, which is cubic in the
        # feature width d and dominates the cost of every update.
        P_Dt = P @ Dk.T                                # (d, k)
        Dk_P = Dk @ P                                  # (k, d)
        innovation = np.eye(Dk.shape[0]) + Dk @ P_Dt   # (k, k)
        # Solve the small (k, k) system rather than forming its inverse.
        P_new = P - P_Dt @ np.linalg.solve(innovation, Dk_P)

        residual = Yk - Dk @ beta
        beta_new = beta + P_new @ (Dk.T @ residual)

        P_list[l] = P_new
        beta_list[l] = beta_new

# Predict using majority vote from L layers
def predict_majority(X, beta_list, W_list, b_list, L):
    """Classify each sample by majority vote over the ``L`` layer readouts.

    Every layer predicts ``argmax([X | H_l] @ beta_list[l])`` per sample; the
    class with the most votes wins. Ties go to the smallest class index,
    because ``argmax`` returns the first maximum of the vote counts.
    """
    H_list = get_all_H(X, W_list, b_list)

    # Row l holds layer l's predicted class for every sample: shape (L, n_samples)
    layer_votes = np.array([
        np.argmax(get_D_deep(X, H_list, idx) @ beta_list[idx], axis=1)
        for idx in range(L)
    ])

    n_samples = layer_votes.shape[1]
    winners = [
        np.bincount(layer_votes[:, col]).argmax()
        for col in range(n_samples)
    ]
    return np.array(winners)


## Iris

In [35]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the Iris dataset bundled with scikit-learn
data = load_iris()
X = data.data   # feature matrix, shape (150, 4)
y = data.target # integer class labels: 0, 1, 2

# Preview the first 10 rows (with feature names) and report the dataset size
print("\nFeature sample:")
print(pd.DataFrame(X, columns=data.feature_names).head(10))
print(f"\nDataset dimensions: {X.shape[0]} rows × {X.shape[1]} columns")

# 70:30 train-test split, stratified on the label, with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Confirm split sizes
print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set : {X_test.shape[0]} samples")



Feature sample:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
5                5.4               3.9                1.7               0.4
6                4.6               3.4                1.4               0.3
7                5.0               3.4                1.5               0.2
8                4.4               2.9                1.4               0.2
9                4.9               3.1                1.5               0.1

Dataset dimensions: 150 rows × 4 columns

Train set: 105 samples
Test set : 45 samples


In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

# Step 1: Standardize input features (scaler is fit on the training split,
# then applied to the test split). X_train / X_test are overwritten.
# NOTE(review): the scaler sees the whole training split before the K-fold
# loop below, so every validation fold contributes to the scaling statistics;
# the cross-validated scores may be slightly optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 2: Hyperparameter space (8 * 3 * 10 * 6 = 1440 configurations)
# lam = 2^(-x) for x = -6, -4, ..., 12, i.e. 64 down to 2^-12
lam_powers = list(range(-6, 13, 2))
lam_values = [1 / (2 ** x) for x in lam_powers]
# 2..9 hidden layers, each with N hidden units
L_values = list(range(2, 10))
N_values = [256, 512, 1024]
# Chunk sizes for the online updates
batch_sizes = [20, 22, 24, 26, 28, 30]

# Step 3: Setup
# Plain (unstratified) shuffled 5-fold split of the training data
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# One generator is shared by every configuration and fold, so the random
# weights a given model receives depend on the order the grid is visited in.
rng = np.random.default_rng(42)
# Number of distinct labels present in the training split
num_classes = len(np.unique(y_train))

best_acc = -1
best_config = None
results = []

# Step 4: Grid search with 5-fold cross-validation
for L in L_values:
    for N in N_values:
        for lam in lam_values:
            for batch_size in batch_sizes:
                fold_acc = []

                for train_idx, val_idx in kf.split(X_train):
                    X_fold_train, X_val = X_train[train_idx], X_train[val_idx]
                    y_fold_train, y_val = y_train[train_idx], y_train[val_idx]

                    # Initial batch: first half of the fold's training rows
                    init_size = len(X_fold_train) // 2
                    X0, y0 = X_fold_train[:init_size], y_fold_train[:init_size]

                    # in_dim is read from the global raw matrix X defined in the
                    # loading cell, not from X_train.
                    beta_list, P_list, W_list, b_list = init_fit(
                        X0, y0, X.shape[1], N, L, lam, rng, num_classes
                    )

                    # Online update: feed the remaining rows in chunks of
                    # batch_size (the last chunk may be shorter)
                    for i in range(init_size, len(X_fold_train), batch_size):
                        Xk = X_fold_train[i:i+batch_size]
                        yk = y_fold_train[i:i+batch_size]
                        partial_fit(Xk, yk, beta_list, P_list, W_list, b_list, L, num_classes)

                    # Validation accuracy of the majority-vote prediction
                    pred_val = predict_majority(X_val, beta_list, W_list, b_list, L)
                    acc = np.mean(pred_val == y_val)
                    fold_acc.append(acc)

                # Aggregate the fold scores for this configuration
                mean_acc = np.mean(fold_acc)
                std_acc = np.std(fold_acc)
                results.append((L, N, lam, batch_size, mean_acc, std_acc))

                # Keep the first configuration that reaches the highest mean accuracy
                if mean_acc > best_acc:
                    best_acc = mean_acc
                    best_config = (L, N, lam, batch_size, std_acc)

# Step 5: Report the winning configuration
print(f"\n✅ Best Accuracy = {best_acc:.4f} ± {best_config[4]:.4f}")
print(f"    at L={best_config[0]}, N={best_config[1]}, λ={best_config[2]:.2e}, batch_size={best_config[3]}")



✅ Best Accuracy = 0.9905 ± 0.0190
    at L=6, N=512, λ=2.44e-04, batch_size=28


In [37]:
# Unpack best hyperparameters (the trailing CV standard deviation is not needed)
L, N, lam, batch_size, _ = best_config

# Number of evaluation runs (for statistical confidence)
num_trials = 5
acc_list = []

# One seeded generator for all trials: each trial still draws fresh random
# hidden weights, but the reported mean ± std is reproducible on re-run.
eval_rng = np.random.default_rng(42)

# Initial batch: first 50% of X_train (identical for every trial, so built once)
init_size = len(X_train) // 2
X0, y0 = X_train[:init_size], y_train[:init_size]

for trial in range(num_trials):
    # Train initial model; in_dim comes from the data actually being fit.
    beta_list, P_list, W_list, b_list = init_fit(
        X0, y0, X_train.shape[1], N, L, lam, eval_rng, num_classes
    )

    # Online learning with remaining batches
    for i in range(init_size, len(X_train), batch_size):
        Xk = X_train[i:i+batch_size]
        yk = y_train[i:i+batch_size]
        partial_fit(Xk, yk, beta_list, P_list, W_list, b_list, L, num_classes)

    # Predict and evaluate on test set
    pred_test = predict_majority(X_test, beta_list, W_list, b_list, L)
    acc = np.mean(pred_test == y_test)
    acc_list.append(acc)

# Final averaged accuracy and standard deviation
mean_acc = np.mean(acc_list)
std_acc = np.std(acc_list)

print(f"\n📊 Final Test Accuracy = {mean_acc:.4f} ± {std_acc:.4f}")



📊 Final Test Accuracy = 0.9156 ± 0.0089


## Breast Cancer

In [40]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the Breast Cancer dataset bundled with scikit-learn
data = load_breast_cancer()
X = data.data    # feature matrix
y = data.target  # 0 = malignant, 1 = benign

# Preview the first 10 rows (with feature names) and report the dataset size
print("\nFeature sample:")
print(pd.DataFrame(X, columns=data.feature_names).head(10))
print(f"\nDataset dimensions: {X.shape[0]} rows × {X.shape[1]} columns")

# 70:30 train-test split, stratified on the label, with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Confirm split sizes
print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set : {X_test.shape[0]} samples")



Feature sample:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   
5        12.45         15.70           82.57      477.1          0.12780   
6        18.25         19.98          119.60     1040.0          0.09463   
7        13.71         20.83           90.20      577.9          0.11890   
8        13.00         21.82           87.50      519.8          0.12730   
9        12.46         24.04           83.97      475.9          0.11860   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760         0.30010              0.14710         0.2419

In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

# Step 1: Standardize input features (scaler is fit on the training split,
# then applied to the test split). X_train / X_test are overwritten.
# NOTE(review): the scaler sees the whole training split before the K-fold
# loop below, so every validation fold contributes to the scaling statistics;
# the cross-validated scores may be slightly optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 2: Hyperparameter space (8 * 3 * 10 * 6 = 1440 configurations)
# lam = 2^(-x) for x = -6, -4, ..., 12, i.e. 64 down to 2^-12
lam_powers = list(range(-6, 13, 2))
lam_values = [1 / (2 ** x) for x in lam_powers]
# 2..9 hidden layers, each with N hidden units
L_values = list(range(2, 10))
N_values = [256, 512, 1024]
# Chunk sizes for the online updates
batch_sizes = [20, 22, 24, 26, 28, 30]

# Step 3: Setup
# Plain (unstratified) shuffled 5-fold split of the training data
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# One generator is shared by every configuration and fold, so the random
# weights a given model receives depend on the order the grid is visited in.
rng = np.random.default_rng(42)
# Number of distinct labels present in the training split
num_classes = len(np.unique(y_train))

best_acc = -1
best_config = None
results = []

# Step 4: Grid search with 5-fold cross-validation
for L in L_values:
    for N in N_values:
        for lam in lam_values:
            for batch_size in batch_sizes:
                fold_acc = []

                for train_idx, val_idx in kf.split(X_train):
                    X_fold_train, X_val = X_train[train_idx], X_train[val_idx]
                    y_fold_train, y_val = y_train[train_idx], y_train[val_idx]

                    # Initial batch: first half of the fold's training rows
                    init_size = len(X_fold_train) // 2
                    X0, y0 = X_fold_train[:init_size], y_fold_train[:init_size]

                    # in_dim is read from the global raw matrix X defined in the
                    # loading cell, not from X_train.
                    beta_list, P_list, W_list, b_list = init_fit(
                        X0, y0, X.shape[1], N, L, lam, rng, num_classes
                    )

                    # Online update: feed the remaining rows in chunks of
                    # batch_size (the last chunk may be shorter)
                    for i in range(init_size, len(X_fold_train), batch_size):
                        Xk = X_fold_train[i:i+batch_size]
                        yk = y_fold_train[i:i+batch_size]
                        partial_fit(Xk, yk, beta_list, P_list, W_list, b_list, L, num_classes)

                    # Validation accuracy of the majority-vote prediction
                    pred_val = predict_majority(X_val, beta_list, W_list, b_list, L)
                    acc = np.mean(pred_val == y_val)
                    fold_acc.append(acc)

                # Aggregate the fold scores for this configuration
                mean_acc = np.mean(fold_acc)
                std_acc = np.std(fold_acc)
                results.append((L, N, lam, batch_size, mean_acc, std_acc))

                # Keep the first configuration that reaches the highest mean accuracy
                if mean_acc > best_acc:
                    best_acc = mean_acc
                    best_config = (L, N, lam, batch_size, std_acc)

# Step 5: Report the winning configuration
print(f"\n✅ Best Accuracy = {best_acc:.4f} ± {best_config[4]:.4f}")
print(f"    at L={best_config[0]}, N={best_config[1]}, λ={best_config[2]:.2e}, batch_size={best_config[3]}")



✅ Best Accuracy = 0.9799 ± 0.0170
    at L=4, N=1024, λ=4.00e+00, batch_size=26


In [42]:
# Unpack best hyperparameters (the trailing CV standard deviation is not needed)
L, N, lam, batch_size, _ = best_config

# Number of evaluation runs (for statistical confidence)
num_trials = 5
acc_list = []

# One seeded generator for all trials: each trial still draws fresh random
# hidden weights, but the reported mean ± std is reproducible on re-run.
eval_rng = np.random.default_rng(42)

# Initial batch: first 50% of X_train (identical for every trial, so built once)
init_size = len(X_train) // 2
X0, y0 = X_train[:init_size], y_train[:init_size]

for trial in range(num_trials):
    # Train initial model; in_dim comes from the data actually being fit.
    beta_list, P_list, W_list, b_list = init_fit(
        X0, y0, X_train.shape[1], N, L, lam, eval_rng, num_classes
    )

    # Online learning with remaining batches
    for i in range(init_size, len(X_train), batch_size):
        Xk = X_train[i:i+batch_size]
        yk = y_train[i:i+batch_size]
        partial_fit(Xk, yk, beta_list, P_list, W_list, b_list, L, num_classes)

    # Predict and evaluate on test set
    pred_test = predict_majority(X_test, beta_list, W_list, b_list, L)
    acc = np.mean(pred_test == y_test)
    acc_list.append(acc)

# Final averaged accuracy and standard deviation
mean_acc = np.mean(acc_list)
std_acc = np.std(acc_list)

print(f"\n📊 Final Test Accuracy = {mean_acc:.4f} ± {std_acc:.4f}")



📊 Final Test Accuracy = 0.9731 ± 0.0047
