In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


# Read the Excel file
data = pd.read_excel(".\\datasets\\concrete\\Concrete_Data.xls")

In [4]:
# Rename columns to be more concise
data.columns = ['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer', 'Coarse Aggregate', 'Fine Aggregate', 'Age', 'Concrete Compressive Strength']

target = data.columns[-1]

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load dataset
df = data.copy()  # adjust path if needed

# Binary classification: define "high strength" as strength > median
median_strength = df['Concrete Compressive Strength'].median()
df['target'] = (df['Concrete Compressive Strength'] > median_strength).astype(int)

# Features and target
X = df.drop(columns=['Concrete Compressive Strength', 'target']).values
y = df['target'].values

# Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Regularization strengths to try
lambdas = np.logspace(-4, 2, 10)

# Outer CV
K1 = 10
K2 = 10
outer_cv = StratifiedKFold(n_splits=K1, shuffle=True, random_state=1)

results = []

print(f"{'Fold':<5}{'Best λ':<10}{'Test Error':<12}{'Baseline Error':<15}")
for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), 1):
    X_train_outer, y_train_outer = X[train_idx], y[train_idx]
    X_test_outer, y_test_outer = X[test_idx], y[test_idx]

    inner_cv = StratifiedKFold(n_splits=K2, shuffle=True, random_state=fold)

    best_lambda = None
    best_score = np.inf
    for lam in lambdas:
        val_errors = []
        for inner_train_idx, val_idx in inner_cv.split(X_train_outer, y_train_outer):
            X_train_inner, y_train_inner = X_train_outer[inner_train_idx], y_train_outer[inner_train_idx]
            X_val = X_train_outer[val_idx]
            y_val = y_train_outer[val_idx]

            model = LogisticRegression(C=1/lam, penalty='l2', solver='liblinear')
            model.fit(X_train_inner, y_train_inner)
            y_val_pred = model.predict(X_val)
            val_error = 1 - accuracy_score(y_val, y_val_pred)
            val_errors.append(val_error)

        mean_val_error = np.mean(val_errors)
        if mean_val_error < best_score:
            best_score = mean_val_error
            best_lambda = lam

    # Retrain on outer training with best lambda, evaluate on outer test
    final_model = LogisticRegression(C=1/best_lambda, penalty='l2', solver='liblinear')
    final_model.fit(X_train_outer, y_train_outer)
    y_test_pred = final_model.predict(X_test_outer)
    test_error = 1 - accuracy_score(y_test_outer, y_test_pred)

    # Baseline: predict most frequent class in training
    majority_class = np.bincount(y_train_outer).argmax()
    baseline_preds = np.full_like(y_test_outer, majority_class)
    baseline_error = 1 - accuracy_score(y_test_outer, baseline_preds)

    results.append((fold, best_lambda, test_error, baseline_error))
    print(f"{fold:<5}{best_lambda:<10.4f}{test_error:<12.4f}{baseline_error:<15.4f}")

# Optionally save results
df_results = pd.DataFrame(results, columns=['Fold', 'Best Lambda', 'Test Error', 'Baseline Error'])
df_results.to_csv("logistic_regression_results.csv", index=False)


Fold Best λ    Test Error  Baseline Error 
1    0.0001    0.1845      0.5049         
2    0.0001    0.1553      0.5049         
3    0.0464    0.1553      0.5049         
4    0.0001    0.1942      0.5049         
5    0.0001    0.1553      0.5049         
6    0.0001    0.1262      0.5049         
7    0.0001    0.1456      0.5049         
8    0.0464    0.1456      0.5049         
9    0.0001    0.1359      0.5049         
10   0.0464    0.1553      0.5049         


In [11]:
np.min(X), np.max(X)

(-2.8023333336920273, 5.057676786792999)

In [20]:
def calculate_errors(y_true, y_pred):
    """
    Calculate classification error.
    :param y_true: True labels
    :param y_pred: Predicted labels
    :return: Classification error (1 - accuracy)
    """
    misclassified = np.sum(y_true != y_pred)
    COUNT = len(y_true)
    inaccuracy = (misclassified / COUNT)
    return inaccuracy

def do_logistic_regression(inner_cv, X_train_outer, y_train_outer, X_test_outer, y_test_outer, lambdas):
        # ---------------------------------------
    # Inner CV for Logistic Regression
    # ---------------------------------------
    
    best_lambda = None
    best_logreg_score = np.inf  # lower is better since we use error rate
    for lam in lambdas:
        val_errors = []
        # Tune on inner folds
        for inner_train_idx, val_idx in inner_cv.split(X_train_outer, y_train_outer):
            X_train_inner, y_train_inner = X_train_outer[inner_train_idx], y_train_outer[inner_train_idx]
            X_val, y_val = X_train_outer[val_idx], y_train_outer[val_idx]
            
            # Note: C = 1/lam
            model = LogisticRegression(C=1/lam, penalty='l2', solver='liblinear', max_iter=1000)
            model.fit(X_train_inner, y_train_inner)
            y_val_pred = model.predict(X_val)
            # Classification error = misclassified observations / N_test = 1 - accuracy
            val_error = calculate_errors(y_val, y_val_pred)
            val_errors.append(val_error)
        mean_val_error = np.mean(val_errors)
        if mean_val_error < best_logreg_score:
            best_logreg_score = mean_val_error
            best_lambda = lam

    # Retrain logistic regression on full outer training set with best lambda
    best_logreg = LogisticRegression(C=1/best_lambda, penalty='l2', solver='liblinear', max_iter=1000)
    best_logreg.fit(X_train_outer, y_train_outer)
    y_test_pred_logreg = best_logreg.predict(X_test_outer)
    logreg_error = calculate_errors(y_test_outer, y_test_pred_logreg)
    return best_lambda, logreg_error, y_test_pred_logreg

def do_ann(inner_cv, X_train_outer, y_train_outer, X_test_outer, y_test_outer, hidden_units_list):
    # ---------------------------------------
    # Inner CV for ANN (MLPClassifier)
    # ---------------------------------------
    best_h = None
    best_ann_score = np.inf
    for h in hidden_units_list:
        val_errors = []
        for inner_train_idx, val_idx in inner_cv.split(X_train_outer, y_train_outer):
            X_train_inner, y_train_inner = X_train_outer[inner_train_idx], y_train_outer[inner_train_idx]
            X_val, y_val = X_train_outer[val_idx], y_train_outer[val_idx]
            
            # Use early stopping to help convergence and reduce iterations.
            model = MLPClassifier(hidden_layer_sizes=(h,), max_iter=3000,
                                  early_stopping=True, n_iter_no_change=10,
                                  random_state=fold)
            model.fit(X_train_inner, y_train_inner)
            y_val_pred = model.predict(X_val)
            val_error = calculate_errors(y_val, y_val_pred)
            val_errors.append(val_error)
        mean_val_error = np.mean(val_errors)
        if mean_val_error < best_ann_score:
            best_ann_score = mean_val_error
            best_h = h

    # Retrain ANN on full outer training set with best h
    best_ann = MLPClassifier(hidden_layer_sizes=(best_h,), max_iter=3000,n_iter_no_change=10,
                             random_state=fold)
    best_ann.fit(X_train_outer, y_train_outer)
    y_test_pred_ann = best_ann.predict(X_test_outer)
    ann_error = calculate_errors(y_test_outer, y_test_pred_ann)
    return best_h, ann_error, y_test_pred_ann

def calculate_baseline_error(y_train_outer, y_test_outer):
    # ---------------------------------------
    # Baseline: Predict the majority class
    # ---------------------------------------
    majority_class = np.bincount(y_train_outer).argmax()
    baseline_preds = np.full_like(y_test_outer, majority_class)
    baseline_error = calculate_errors(y_test_outer, baseline_preds)
    return baseline_error, baseline_preds


In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler

# -------------------------
# Load and preprocess data
# -------------------------
# Change the filename/path as needed
df = pd.read_excel(".\\datasets\\concrete\\Concrete_Data.xls")

# Binning the compressive strength into 6 categories
strength_col = 'Concrete compressive strength(MPa, megapascals) '
# Use KBinsDiscretizer to create 6 bins based on quantiles
kbin = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
df['target'] = kbin.fit_transform(df[[strength_col]]).astype(int)

# Separate features and target; drop the original target column
X = df.drop(columns=[strength_col, 'target']).values
y = df['target'].values

# Normalize features: each column gets zero mean and unit variance.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# -------------------------------
# Hyperparameter grids to search
# -------------------------------
# For logistic regression, we use lambda (λ) values and note that scikit-learn's C = 1/λ.
lambdas = np.logspace(-10, 2, 10)  # 10 values between 10^-4 and 10^2

# For ANN, use the number of hidden units as the complexity controlling parameter.
hidden_units_list = [64]  # Number of hidden units

# -------------------------------------------
# Outer cross-validation: same splits for all
# -------------------------------------------
K_outer = 10  # outer folds
K_inner = 10  # inner folds for hyperparameter tuning

outer_cv = StratifiedKFold(n_splits=K_outer, shuffle=True, random_state=42)

# This list will store: (Fold, best_lambda, logistic_error, best_h, ann_error, baseline_error)
results = []

print(f"{'Fold':<5}{'Best λ':<10}{'LogReg Err':<12}{'Best h':<10}{'ANN Err':<12}{'Baseline Err':<15}")
for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), 1):
    # Outer training and test sets
    X_train_outer, y_train_outer = X[train_idx], y[train_idx]
    X_test_outer, y_test_outer = X[test_idx], y[test_idx]
    
    inner_cv = StratifiedKFold(n_splits=K_inner, shuffle=True, random_state=fold)

    best_lambda, logreg_error, y_test_pred_logreg = do_logistic_regression(inner_cv, X_train_outer, y_train_outer, X_test_outer, y_test_outer, lambdas)
    best_h, ann_error, y_test_pred_ann = do_ann(inner_cv, X_train_outer, y_train_outer, X_test_outer, y_test_outer, hidden_units_list)
    baseline_error, baseline_preds = calculate_baseline_error(y_train_outer, y_test_outer)
    
    results.append((fold, best_lambda, logreg_error, best_h, ann_error, baseline_error))
    print(f"{fold:<5}{best_lambda:<10.4f}{logreg_error:<12.4f}{best_h:<10}{ann_error:<12.4f}{baseline_error:<15.4f}")

# Save results to CSV (optional)
df_results = pd.DataFrame(results, columns=['Fold', 'Best Lambda', 'LogReg Error', 
                                              'Best Hidden Units', 'ANN Error', 'Baseline Error'])
df_results.to_csv("combined_model_errors.csv", index=False)


Fold Best λ    LogReg Err  Best h    ANN Err     Baseline Err   
1    0.0100    0.2718      64        0.1359      0.6699         
2    0.0000    0.2427      64        0.0971      0.6699         
3    0.0000    0.3301      64        0.1748      0.6699         
4    0.0000    0.3495      64        0.1748      0.6699         
5    0.0000    0.3301      64        0.1650      0.6699         
6    0.0000    0.3495      64        0.1748      0.6699         
7    0.0000    0.2427      64        0.1068      0.6602         
8    0.0000    0.3107      64        0.2233      0.6602         
9    0.0000    0.3786      64        0.2039      0.6602         
10   0.0000    0.2718      64        0.1359      0.6602         


In [18]:
import numpy as np
from scipy.stats import binom, beta

# Suppose these are your test-set results:
# y_true: true labels (binary or multi-class; here correctness is determined by comparison)
# y_pred_A: predictions from classifier A
# y_pred_B: predictions from classifier B

# For demonstration, we create some example arrays:
# (In practice, replace these with your actual prediction arrays.)
y_true = y_test_outer          # True labels
y_pred_A = y_test_pred_ann   # Predictions from Model 1 (e.g., logistic regression)
y_pred_B =  y_test_pred_logreg  # Predictions from Model 2 (e.g., ANN)
y_pred_C = baseline_preds   # Predictions from Model 2 (e.g., ANN)

def do_test(y_true, y_pred_1, y_pred_2):
    # Determine correctness for each classifier:
    correct_A = (y_true == y_pred_1)
    correct_B = (y_true == y_pred_2)

    # Compute discordant counts:
    # n12: A correct, B wrong
    n12 = np.sum(correct_A & (~correct_B))
    # n21: A wrong, B correct
    n21 = np.sum((~correct_A) & correct_B)

    # Total number of discordant pairs:
    N = n12 + n21

    print("n12 (A correct, B wrong):", n12)
    print("n21 (A wrong, B correct):", n21)
    print("Total discordant pairs, N:", N)

    # Check that we have enough discordant pairs to compute a meaningful interval.
    if N < 5:
        print("Warning: n12+n21 < 5; confidence intervals may be unreliable.")

    # 1. Estimate the difference in accuracy:
    theta_hat = (n12 - n21) / N
    print("Estimated difference in accuracy, θ̂ =", theta_hat)

    # 2. Compute the p-value using the binomial distribution.
    # Let m = min(n12, n21)
    m = min(n12, n21)
    # p-value: p = 2 * BinomCDF(m; N, 0.5)
    p_value = 2 * binom.cdf(m, N, 0.5)
    # Ensure p_value does not exceed 1.
    p_value = min(p_value, 1.0)
    print("p-value =", p_value)

    # 3. Compute a confidence interval for θ.
    # We use a Beta distribution with parameters:
    f = n12 + 1
    g = n21 + 1
    alpha = 0.05  # for a 95% confidence interval

    # Compute lower and upper quantiles from the Beta distribution.
    # Note: beta.ppf gives the quantile for a given cumulative probability.
    theta_lower = 2 * beta.ppf(alpha / 2, f, g) - 1
    theta_upper = 2 * beta.ppf(1 - alpha / 2, f, g) - 1

    print("95% Confidence interval for θ: [{:.4f}, {:.4f}]".format(theta_lower, theta_upper))

    # Interpretation:
    if p_value < alpha:
        print("The difference between classifiers is statistically significant.")
    else:
        print("There is no statistically significant difference between the classifiers.")
        
do_test(y_true, y_pred_A, y_pred_B)
print("")
do_test(y_true, y_pred_A, y_pred_C)
print("")
do_test(y_true, y_pred_B, y_pred_C)


n12 (A correct, B wrong): 19
n21 (A wrong, B correct): 5
Total discordant pairs, N: 24
Estimated difference in accuracy, θ̂ = 0.5833333333333334
p-value = 0.006610751152038574
95% Confidence interval for θ: [0.1859, 0.8129]
The difference between classifiers is statistically significant.

n12 (A correct, B wrong): 56
n21 (A wrong, B correct): 2
Total discordant pairs, N: 58
Estimated difference in accuracy, θ̂ = 0.9310344827586207
p-value = 1.1879386363489175e-14
95% Confidence interval for θ: [0.7657, 0.9788]
The difference between classifiers is statistically significant.

n12 (A correct, B wrong): 43
n21 (A wrong, B correct): 3
Total discordant pairs, N: 46
Estimated difference in accuracy, θ̂ = 0.8695652173913043
p-value = 4.6219383875722997e-10
95% Confidence interval for θ: [0.6492, 0.9526]
The difference between classifiers is statistically significant.


In [93]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


the_best_lambda = 0.000001 # I am not sure what the best lambda is, so I will use a small value for now.
model = LogisticRegression(C=1/the_best_lambda, penalty='l2', solver='liblinear', max_iter=1000)
model.fit(X_train, y_train)

# Use the model to predict on the test set
y_test_pred = model.predict(X_test)

# Generate a classification report
print("Error rate:" , calculate_errors(y_test, y_test_pred))

Error rate: 0.2766990291262136
