🍁 Preprocessing

In [5]:
from keras.datasets import mnist
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pickle
import numpy as np

# Import
(x_train, y_train), (_, _) = mnist.load_data()
print(f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}")

# 1. Flatten
x_train_flat = x_train.reshape(x_train.shape[0], -1)
print(f"x_train_flat shape: {x_train_flat.shape}, y_train shape: {y_train.shape}")

# 2. Subsets
subset_size = 500
x_train_subset = x_train_flat[:subset_size]
y_train_subset = y_train[:subset_size]
print(f"x_train_subset shape: {x_train_subset.shape}")

# 3. PCA
n_components = 10
pca = PCA(n_components=n_components)
x_train_pca = pca.fit_transform(x_train_subset)
print(f"x_train_pca shape: {x_train_pca.shape}")
variance = np.sum(pca.explained_variance_ratio_)
print(f"variance = {variance}")  # Verify how much variance is retained

# # 4. Normalize
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler(feature_range=(0, 1))
# x_train_norm = scaler.fit_transform(x_train_pca)
# print(f"x_train_norm shape: {x_train_norm.shape}")
# x_train_norm.shape[0]

# 4. Binarize
threshold_value = 0
x_train_norm = (x_train_pca > threshold_value).astype(int)
print(f"x_train_binary shape: {x_train_norm.shape}")
print(x_train_norm)

x_train shape: (60000, 28, 28), y_train shape: (60000,)
x_train_flat shape: (60000, 784), y_train shape: (60000,)
x_train_subset shape: (500, 784)
x_train_pca shape: (500, 10)
variance = 0.5235613405019189
x_train_binary shape: (500, 10)
[[1 1 0 ... 1 1 1]
 [1 1 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 1 ... 1 1 0]
 [0 0 1 ... 0 0 0]
 [0 1 1 ... 1 0 0]]


ssssss

In [6]:
def bisection_loop(x_train_norm, y_binary, uL, uH, precision, n_components):
    optimal_alpha, optimal_beta, optimal_theta = None, None, None
    z_values = []

    while uH - uL > precision:
        z = (uL + uH) / 2
        z_values.append(z)

        # Feasibility Check
        feasible, alpha_coefficients, beta_coefficients, theta = check_feasibility_and_compute_coefficients(
            z, x_train_norm, y_binary, n_components
        )

        # Update bounds based on feasibility
        if feasible:
            uH = z
            optimal_alpha, optimal_beta, optimal_theta = alpha_coefficients, beta_coefficients, theta
        else:
            uL = z

    return uH, optimal_alpha, optimal_beta, optimal_theta, z_values

In [7]:
uL = 0
uH = 10
precision = 1e-3
# n_components = 2
uH, optimal_alpha, optimal_beta, optimal_theta, z_values = bisection_loop(
    x_train_norm, y_binary, uL, uH, precision, n_components
)
print(f"Final z: {uH}, Optimal Coefficients: {optimal_alpha}, {optimal_beta}")


NameError: name 'y_binary' is not defined

In [4]:
import numpy as np
import time
import pickle
import os
from sklearn.metrics import accuracy_score
from keras.datasets import mnist
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pickle
import numpy as np

# Directory containing saved models
models_dir = "/home/ajay2425/rclass/models_grb1/"

# Load Test Data
(_, _), (x_test, y_test) = mnist.load_data()

# Preprocess Test Data
x_test_flat = x_test.reshape(x_test.shape[0], -1)  # Flatten
x_test_subset = x_test_flat[:1000]  # Test on a smaller subset
y_test_subset = y_test[:1000]

# Apply PCA (same as training)
n_components = 10
pca = PCA(n_components=n_components)
x_test_pca = pca.transform(x_test_subset)  # Use the same PCA model from training

# Binarize
threshold_value = 0
x_train_norm = (x_train_pca > threshold_value).astype(int)
print(f"x_train_norm shape: {x_train_norm.shape}")

# # Normalize
# x_test_norm = (x_test_pca > 0).astype(int)

# Accuracy, Time, and Computations
overall_start_time = time.time()  # Track total time
accuracies = []
computation_counts = []

# Test each digit classifier
for digit in range(10):
    print(f"Testing classifier for digit {digit}...")

    # Load the model
    model_path = f"{models_dir}/classifier_{digit}.pkl"
    if not os.path.exists(model_path):
        print(f"Model for digit {digit} not found! Skipping...")
        continue

    with open(model_path, "rb") as file:
        model_data = pickle.load(file)

    alpha = model_data["alpha"]
    beta = model_data["beta"]
    theta = model_data["theta"]
    multi_indices = model_data["multi_indices"]

    # Start timer for this classifier
    start_time = time.time()

    # Test predictions using the rational function
    num_coefficients = len(alpha)
    n = x_test_norm.shape[1]  # Number of features
    d = model_data["degree"]  # Degree of the rational function

    # Compute G and H matrices for the test data
    G, H, _ = construct_G_H_matrices(x_test_norm, n, d)

    # Predict for each test sample
    y_pred_binary = []
    for i in range(len(x_test_norm)):
        G_x = np.dot(G[i], alpha)  # αᵀG(xᵢ)
        H_x = np.dot(H[i], beta)   # βᵀH(xᵢ)
        rational_value = G_x / H_x if H_x != 0 else float("inf")
        y_pred_binary.append(1 if rational_value < theta else 0)

    # End timer
    end_time = time.time()

    # Convert binary predictions back to digit predictions
    y_pred = [digit if pred == 1 else -1 for pred in y_pred_binary]

    # Filter only relevant samples
    relevant_indices = np.where((y_test_subset == digit) | (np.array(y_pred) == digit))[0]
    y_true_filtered = y_test_subset[relevant_indices]
    y_pred_filtered = np.array(y_pred)[relevant_indices]

    # Compute accuracy
    accuracy = accuracy_score(y_true_filtered, y_pred_filtered)
    accuracies.append(accuracy)

    # Estimate computations (e.g., matrix multiplications)
    computations = len(x_test_norm) * num_coefficients * 2  # G_x and H_x calculations
    computation_counts.append(computations)

    print(f"Digit {digit}: Accuracy = {accuracy * 100:.2f}%, Time = {end_time - start_time:.2f}s, Computations = {computations}")

# Calculate total testing time
overall_end_time = time.time()
total_time = overall_end_time - overall_start_time

# Summary
print(f"\nOverall Testing Time: {total_time:.2f}s")
print(f"Average Accuracy: {np.mean(accuracies) * 100:.2f}%")
print(f"Total Computations: {np.sum(computation_counts):,}")

# Save results for comparison
results = {
    "accuracies": accuracies,
    "computation_counts": computation_counts,
    "total_time": total_time
}
# with open(f"{models_dir}/test_results.json", "w") as file:
#     json.dump(results, file, indent=4)

AttributeError: 'PCA' object has no attribute 'components_'