In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier


def do_inner_fold(getModel, inner_cv, X_train_outer, y_train_outer, lambdas):
        # ---------------------------------------
    # Inner CV for Logistic Regression
    # ---------------------------------------
    
    best_lambda = None
    best_logreg_score = np.inf  # lower is better since we use error rate
    for lam in lambdas:
        val_errors = []
        # Tune on inner folds
        for train_inner_idx, test_inner_idx in inner_cv.split(X_train_outer, y_train_outer):
            X_train, y_train = X_train_outer[train_inner_idx], y_train_outer[train_inner_idx]
            X_test, y_test = X_train_outer[test_inner_idx], y_train_outer[test_inner_idx]
            
            print(X_train.shape)
            # Note: C = 1/lam
            model = getModel(lam)
            model.fit(X_train, y_train)
            y_test_pred = model.predict(X_test)
            # Classification error = misclassified observations / N_test = 1 - accuracy
            test_error = calculate_errors(y_test, y_test_pred)
            overfitting_error = calculate_errors(y_train, model.predict(X_train)) # degugging purpose
            print(f"(overfitting check) Validation error for lambda {lam}: training {overfitting_error:.4f}, test  {test_error:.4f}") # degugging purpose
            val_errors.append(test_error)
        mean_val_error = np.mean(val_errors)
        if mean_val_error < best_logreg_score:
            best_logreg_score = mean_val_error
            best_lambda = lam

    # Retrain logistic regression on full outer training set with best lambda
    best_model = getModel(best_lambda)
    best_model.fit(X_train_outer, y_train_outer)
    
    return best_lambda, best_model

In [9]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

def calculate_errors(y_true, y_pred):
    """
    Calculate classification error.
    :param y_true: True labels
    :param y_pred: Predicted labels
    :return: Classification error (1 - accuracy)
    """
    misclassified = np.sum(y_true != y_pred)
    COUNT = len(y_true)
    inaccuracy = (misclassified / COUNT)
    return inaccuracy

def do_logistic_regression(inner_cv, X_train_outer, y_train_outer, X_test_outer, lambdas):
        # ---------------------------------------
    # Inner CV for Logistic Regression
    # ---------------------------------------
    
    best_lambda = None
    best_logreg_score = np.inf  # lower is better since we use error rate
    for lam in lambdas:
        val_errors = []
        # Tune on inner folds
        for inner_train_idx, val_idx in inner_cv.split(X_train_outer, y_train_outer):
            X_train_inner, y_train_inner = X_train_outer[inner_train_idx], y_train_outer[inner_train_idx]
            X_val, y_val = X_train_outer[val_idx], y_train_outer[val_idx]
            
            # Note: C = 1/lam
            model = LogisticRegression(C=1/lam, penalty='l2', solver='liblinear', max_iter=1000)
            model.fit(X_train_inner, y_train_inner)
            y_val_pred = model.predict(X_val)
            # Classification error = misclassified observations / N_test = 1 - accuracy
            val_error = calculate_errors(y_val, y_val_pred)
            val_errors.append(val_error)
        mean_val_error = np.mean(val_errors)
        if mean_val_error < best_logreg_score:
            best_logreg_score = mean_val_error
            best_lambda = lam

    # Retrain logistic regression on full outer training set with best lambda
    best_logreg = LogisticRegression(C=1/best_lambda, penalty='l2', solver='liblinear', max_iter=1000)
    best_logreg.fit(X_train_outer, y_train_outer)
    y_test_pred_logreg = best_logreg.predict(X_test_outer)
    
    return best_lambda, y_test_pred_logreg

def do_ann(inner_cv, X_train_outer, y_train_outer, X_test_outer, hidden_units_list):
    # ---------------------------------------
    # Inner CV for ANN (MLPClassifier)
    # ---------------------------------------
    best_h = None
    best_ann_score = np.inf
    for h in hidden_units_list:
        val_errors = []
        for inner_train_idx, val_idx in inner_cv.split(X_train_outer, y_train_outer):
            X_train_inner, y_train_inner = X_train_outer[inner_train_idx], y_train_outer[inner_train_idx]
            X_val, y_val = X_train_outer[val_idx], y_train_outer[val_idx]
            
            # Use early stopping to help convergence and reduce iterations.
            model = MLPClassifier(hidden_layer_sizes=(h,), max_iter=3000,
                                  early_stopping=True, n_iter_no_change=10)
            model.fit(X_train_inner, y_train_inner)
            y_val_pred = model.predict(X_val)
            val_error = calculate_errors(y_val, y_val_pred)
            val_errors.append(val_error)
        mean_val_error = np.mean(val_errors)
        if mean_val_error < best_ann_score:
            best_ann_score = mean_val_error
            best_h = h

    # Retrain ANN on full outer training set with best h
    best_ann = MLPClassifier(hidden_layer_sizes=(best_h,), max_iter=3000,n_iter_no_change=10)
    best_ann.fit(X_train_outer, y_train_outer)
    y_test_pred_ann = best_ann.predict(X_test_outer)
    return best_h, y_test_pred_ann

def calculate_baseline_predictions(y_train_outer, y_test_outer):
    # ---------------------------------------
    # Baseline: Predict the majority class
    # ---------------------------------------
    majority_class = np.bincount(y_train_outer).argmax()
    baseline_preds = np.full_like(y_test_outer, majority_class)
    return baseline_preds


In [10]:

from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
def get_parameters_and_target():
    # -------------------------
    # Load and preprocess data
    # -------------------------
    # Change the filename/path as needed
    df = pd.read_excel(".\\datasets\\concrete\\Concrete_Data.xls")

    # Binning the compressive strength into 6 categories
    strength_col = 'Concrete compressive strength(MPa, megapascals) '
    # Use KBinsDiscretizer to create 6 bins based on quantiles
    kbin = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
    df['target'] = kbin.fit_transform(df[[strength_col]]).astype(int)

    # Separate features and target; drop the original target column
    X = df.drop(columns=[strength_col, 'target']).values
    y = df['target'].values

    # Normalize features: each column gets zero mean and unit variance.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    return X, y

X,y = get_parameters_and_target()

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

def do_two_layer_cv(X, y):

    # -------------------------------
    # Hyperparameter grids to search
    # -------------------------------
    # For logistic regression, we use lambda (λ) values and note that scikit-learn's C = 1/λ.
    lambdas = np.logspace(-20, -10, 10)  # 10 values between 10^-4 and 10^2

    # For ANN, use the number of hidden units as the complexity controlling parameter.
    hidden_units_list = [(16,)] #[(16,), (32,), (4,4), (8,8)]  # Number of hidden units # , (4, 4), (8, 8), (16, 16), (32, 32)

    # -------------------------------------------
    # Outer cross-validation: same splits for all
    # -------------------------------------------
    K_outer = 10  # outer folds
    K_inner = 10  # inner folds for hyperparameter tuning

    outer_cv = StratifiedKFold(n_splits=K_outer, shuffle=True, random_state=42)

    # This list will store: (Fold, best_lambda, logistic_error, best_h, ann_error, baseline_error)
    results = []
    target_predictions = np.empty((len(y), 4), dtype=int)

    print(f"{'Fold':<5}{'Best λ':<10}{'LogReg Err':<12}{'Best h':<10}{'ANN Err':<12}{'Baseline Err':<15}")
    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), 1):
        # Outer training and test sets
        X_train_outer, y_train_outer = X[train_idx], y[train_idx]
        X_test_outer, y_test_outer = X[test_idx], y[test_idx]
        
        inner_cv = StratifiedKFold(n_splits=K_inner, shuffle=True, random_state=fold)

        getLogisticRegModel = lambda lam: LogisticRegression(C=1/lam, penalty='l2', solver='liblinear', max_iter=1000)
        best_lambda, best_logreg_model = do_inner_fold(getLogisticRegModel, inner_cv, X_train_outer, y_train_outer, lambdas)
        y_test_pred_logreg = best_logreg_model.predict(X_test_outer)
        logreg_error = calculate_errors(y_test_outer, y_test_pred_logreg)
        
        getAnnModel = lambda h: MLPClassifier(hidden_layer_sizes=h, max_iter=3000, n_iter_no_change=10)
        best_h, best_ann_model = do_inner_fold(getAnnModel, inner_cv, X_train_outer, y_train_outer, hidden_units_list)
        y_test_pred_ann = best_ann_model.predict(X_test_outer)
        ann_error = calculate_errors(y_test_outer, y_test_pred_ann)
        
        baseline_preds = calculate_baseline_predictions(y_train_outer, y_test_outer)
        baseline_error = calculate_errors(y_test_outer, baseline_preds)
        
        # Store predictions for the current fold
        target_predictions[test_idx, 0] = y_test_outer
        target_predictions[test_idx, 1] = y_test_pred_ann
        target_predictions[test_idx, 2] = y_test_pred_logreg
        target_predictions[test_idx, 3] = baseline_preds
        
        results.append((fold, best_lambda, logreg_error, best_h, ann_error, baseline_error))
        print(f"{fold:<5}{best_lambda:<10.4f}{logreg_error:<12.4f}{best_h!s:<10}{ann_error:<12.4f}{baseline_error:<15.4f}")

    # Save results to CSV (optional)
    df_results = pd.DataFrame(results, columns=['Fold', 'Best Lambda', 'LogReg Error', 
                                                'Best Hidden Units', 'ANN Error', 'Baseline Error'])
    df_results.to_csv("combined_model_errors.csv", index=False)
    
    df_target_predictions = pd.DataFrame(target_predictions, columns=['original', 'ann_pred', 'logistic_reg_pred', 'baseline_pred'])
    
    return df_results, df_target_predictions
    
df_results, df_target_predictions = do_two_layer_cv(X, y)

Fold Best λ    LogReg Err  Best h    ANN Err     Baseline Err   
(overfitting check) Validation error for lambda 1e-20: training 0.3010, test  0.3011
(overfitting check) Validation error for lambda 1e-20: training 0.3201, test  0.2581
(overfitting check) Validation error for lambda 1e-20: training 0.2938, test  0.4409
(overfitting check) Validation error for lambda 1e-20: training 0.2938, test  0.3226
(overfitting check) Validation error for lambda 1e-20: training 0.3022, test  0.3226
(overfitting check) Validation error for lambda 1e-20: training 0.3058, test  0.2473
(overfitting check) Validation error for lambda 1e-20: training 0.3034, test  0.2581
(overfitting check) Validation error for lambda 1e-20: training 0.3138, test  0.2935
(overfitting check) Validation error for lambda 1e-20: training 0.3054, test  0.3478
(overfitting check) Validation error for lambda 1e-20: training 0.2886, test  0.3696
(overfitting check) Validation error for lambda 1.2915496650148828e-19: training 0.30

In [15]:
df_results

Unnamed: 0,Fold,Best Lambda,LogReg Error,Best Hidden Units,ANN Error,Baseline Error
0,1,1e-20,0.271845,"(16,)",0.184466,0.669903
1,2,1e-20,0.242718,"(16,)",0.135922,0.669903
2,3,1e-20,0.330097,"(16,)",0.174757,0.669903
3,4,1e-20,0.349515,"(16,)",0.213592,0.669903
4,5,1e-20,0.330097,"(16,)",0.165049,0.669903
5,6,1e-20,0.349515,"(16,)",0.213592,0.669903
6,7,1e-20,0.242718,"(16,)",0.135922,0.660194
7,8,1e-20,0.31068,"(16,)",0.203883,0.660194
8,9,1e-20,0.378641,"(16,)",0.242718,0.660194
9,10,1e-20,0.271845,"(16,)",0.165049,0.660194


In [13]:
import numpy as np
from scipy.stats import binom, beta

def do_test(y_true, y_pred_1, y_pred_2):
    # Determine correctness for each classifier:
    correct_A = (y_true == y_pred_1)
    correct_B = (y_true == y_pred_2)

    # Compute discordant counts:
    # n12: A correct, B wrong
    n12 = np.sum(correct_A & (~correct_B))
    # n21: A wrong, B correct
    n21 = np.sum((~correct_A) & correct_B)

    # Total number of discordant pairs:
    N = n12 + n21

    print("n12 (A correct, B wrong):", n12)
    print("n21 (A wrong, B correct):", n21)
    print("Total discordant pairs, N:", N)

    # Check that we have enough discordant pairs to compute a meaningful interval.
    if N < 5:
        print("Warning: n12+n21 < 5; confidence intervals may be unreliable.")

    # 1. Estimate the difference in accuracy:
    theta_hat = (n12 - n21) / N
    print("Estimated difference in accuracy, θ̂ =", theta_hat)

    # 2. Compute the p-value using the binomial distribution.
    # Let m = min(n12, n21)
    m = min(n12, n21)
    # p-value: p = 2 * BinomCDF(m; N, 0.5)
    p_value = 2 * binom.cdf(m, N, 0.5)
    # Ensure p_value does not exceed 1.
    p_value = min(p_value, 1.0)
    print("p-value =", p_value)

    # 3. Compute a confidence interval for θ.
    # We use a Beta distribution with parameters:
    f = n12 + 1
    g = n21 + 1
    alpha = 0.05  # for a 95% confidence interval

    # Compute lower and upper quantiles from the Beta distribution.
    # Note: beta.ppf gives the quantile for a given cumulative probability.
    theta_lower = 2 * beta.ppf(alpha / 2, f, g) - 1
    theta_upper = 2 * beta.ppf(1 - alpha / 2, f, g) - 1

    print("95% Confidence interval for θ: [{:.4f}, {:.4f}]".format(theta_lower, theta_upper))

    # Interpretation:
    if p_value < alpha:
        print("The difference between classifiers is statistically significant.")
    else:
        print("There is no statistically significant difference between the classifiers.")
        

# Suppose these are your test-set results:
# y_true: true labels (binary or multi-class; here correctness is determined by comparison)
# y_pred_A: predictions from classifier A
# y_pred_B: predictions from classifier B

# For demonstration, we create some example arrays:
# (In practice, replace these with your actual prediction arrays.)
y_true = df_target_predictions['original'].values  # True labels
y_pred_A = df_target_predictions['ann_pred'].values  # Predictions from Model 1 (e.g., ANN)
y_pred_B =  df_target_predictions['logistic_reg_pred'].values  # Predictions from Model 2 (e.g., Logistic Regression)
y_pred_C = df_target_predictions['baseline_pred'].values  # Predictions from Model 3 (e.g., Baseline)

do_test(y_true, y_pred_A, y_pred_B)
print("")
do_test(y_true, y_pred_A, y_pred_C)
print("")
do_test(y_true, y_pred_B, y_pred_C)


n12 (A correct, B wrong): 192
n21 (A wrong, B correct): 64
Total discordant pairs, N: 256
Estimated difference in accuracy, θ̂ = 0.5
p-value = 4.897101647677739e-16
95% Confidence interval for θ: [0.3869, 0.5981]
The difference between classifiers is statistically significant.

n12 (A correct, B wrong): 545
n21 (A wrong, B correct): 48
Total discordant pairs, N: 593
Estimated difference in accuracy, θ̂ = 0.8381112984822934
p-value = 9.856762388978446e-108
95% Confidence interval for θ: [0.7886, 0.8767]
The difference between classifiers is statistically significant.

n12 (A correct, B wrong): 410
n21 (A wrong, B correct): 41
Total discordant pairs, N: 451
Estimated difference in accuracy, θ̂ = 0.8181818181818182
p-value = 1.1599122830870454e-77
95% Confidence interval for θ: [0.7579, 0.8644]
The difference between classifiers is statistically significant.


In [14]:
from sklearn.model_selection import train_test_split
def train_final_best_logreg_model(X, y, df_results: pd.DataFrame):
    # Retrain the best logistic regression model on the entire dataset
    # Split the entire dataset into training and testing sets
    X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X, y, test_size=0.2)

    # Retrain the best logistic regression model on the full training set
    best_lambda = df_results['Best Lambda'].median()  # Get the best lambda from the results
    best_logreg_full = LogisticRegression(C=1/best_lambda, penalty='l2', solver='liblinear', max_iter=1000)
    best_logreg_full.fit(X_train_full, y_train_full)
    y_test_pred_logreg_full = best_logreg_full.predict(X_test_full)
    logreg_error_full = calculate_errors(y_test_full, y_test_pred_logreg_full)

    # Print the results
    print(f"Logistic Regression Error (Full Dataset): {logreg_error_full:.4f}")

train_final_best_logreg_model(X, y, df_results)

Logistic Regression Error (Full Dataset): 0.3155
