# Now let us find the RMSE and R Squared for each number of selected features k and select the optimal k

In [19]:
import pickle
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load the per-dataset, per-k model predictions.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
with open("Data/model_predictions.pkl", "rb") as f:
    model_predictions = pickle.load(f)

summary = []

for dataset_name, k_data in model_predictions.items():
    # Validation scores of Linear Regression for this dataset, keyed by k.
    val_rmse_scores = {}
    val_r2_scores = {}

    for k, models in k_data.items():
        # Skip k values with no Linear Regression entry or a recorded failure.
        if "LinearRegression" not in models or "error" in models["LinearRegression"]:
            continue

        lr_result = models["LinearRegression"]
        try:
            y_val = lr_result["y_val"]
            y_val_pred = lr_result["y_val_pred"]
            # The test arrays are not scored in this loop, but reading them here
            # means a k without test predictions is skipped now instead of
            # raising KeyError when the best k is evaluated further down.
            y_test = lr_result["y_test"]
            y_test_pred = lr_result["y_test_pred"]

            # Compute validation metrics
            rmse_val = mean_squared_error(y_val, y_val_pred) ** 0.5
            r2_val = r2_score(y_val, y_val_pred)
        except Exception as e:
            # Report what was skipped and why, instead of an anonymous marker.
            print(f"Skipping {dataset_name} at k={k}: {type(e).__name__}: {e}")
            continue

        val_rmse_scores[k] = rmse_val
        val_r2_scores[k] = r2_val

    if len(val_rmse_scores) == 0:
        continue  # Skip if nothing was computed

    # k is chosen on the validation split only; test data is used for reporting.
    best_k_rmse = min(val_rmse_scores, key=val_rmse_scores.get)
    best_k_r2 = max(val_r2_scores, key=val_r2_scores.get)

    # Test RMSE at the k with the lowest validation RMSE.
    test_rmse = mean_squared_error(
        k_data[best_k_rmse]["LinearRegression"]["y_test"],
        k_data[best_k_rmse]["LinearRegression"]["y_test_pred"]
    ) ** 0.5

    # Test R² at the k with the highest validation R².
    test_r2 = r2_score(
        k_data[best_k_r2]["LinearRegression"]["y_test"],
        k_data[best_k_r2]["LinearRegression"]["y_test_pred"]
    )

    summary.append({
        "Dataset": dataset_name,
        "Best_k_RMSE": best_k_rmse,
        "Val_RMSE": val_rmse_scores[best_k_rmse],
        "Test_RMSE": test_rmse,
        "Best_k_R2": best_k_r2,
        "Val_R2": val_r2_scores[best_k_r2],
        "Test_R2": test_r2
    })

# Convert to DataFrame. Naming the columns keeps the schema even when every
# dataset was skipped, so later column-based steps do not fail on an empty frame.
df_summary = pd.DataFrame(
    summary,
    columns=[
        "Dataset", "Best_k_RMSE", "Val_RMSE", "Test_RMSE",
        "Best_k_R2", "Val_R2", "Test_R2",
    ],
)

Finding the RMSE and R Squared for Dynamically Selected K

In [20]:
# Load the predictions of the models trained on the dynamically selected k.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
with open("Data/dynamic_model_predictions.pkl", "rb") as f:
    dynamic_model_predictions = pickle.load(f)

# One record per dataset with the Linear Regression test RMSE and R².
summary_dynamic_lr = []

for dataset_name, model_data in dynamic_model_predictions.items():
    lr_data = model_data.get("LinearRegression", {})

    if "error" in lr_data:
        # Say which dataset is dropped and what was recorded for it.
        print(f"⚠️ Skipping {dataset_name}: LinearRegression recorded an error: {lr_data['error']}")
        continue

    try:
        y_test = lr_data["y_test"]
        y_test_pred = lr_data["y_test_pred"]

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score(y_test, y_test_pred)

        summary_dynamic_lr.append({
            "Dataset": dataset_name,
            "Selected_k": model_data.get("k", "N/A"),
            "Test_RMSE": test_rmse,
            "Test_R2": test_r2
        })
    except Exception as e:
        # A missing "LinearRegression" entry also lands here (KeyError on "y_test").
        print(f"⚠️ Error in {dataset_name}: {e}")
        continue

# Convert to DataFrame. Naming the columns keeps the schema even when every
# dataset was skipped, so the later rename/merge does not fail on an empty frame.
df_dynamic_lr_summary = pd.DataFrame(
    summary_dynamic_lr,
    columns=["Dataset", "Selected_k", "Test_RMSE", "Test_R2"],
)

Merging the two summaries on `Dataset`

In [21]:
# The validation scores are not part of the comparison table, so leave
# 'Val_RMSE' and 'Val_R2' out of the fixed-k summary.
df_summary_trimmed = df_summary.drop(['Val_RMSE', 'Val_R2'], axis=1)

# Give the dynamic-selection columns a "_Dynamic" suffix so they stay
# distinguishable from the fixed-k columns after the merge.
df_dynamic_renamed = df_dynamic_lr_summary.rename(
    columns=dict(
        Selected_k="Selected_k_Dynamic",
        Test_RMSE="Test_RMSE_Dynamic",
        Test_R2="Test_R2_Dynamic",
    )
)

# Default (inner) merge: only datasets present in both summaries are kept.
final_df = df_summary_trimmed.merge(df_dynamic_renamed, on='Dataset')

Comparing results

In [22]:
# Flag, per dataset, whether the dynamic selection beats the fixed-k result:
# strictly lower test RMSE, strictly higher test R². Ties count as "No".
final_df["Better_RMSE_Dynamic"] = (
    final_df["Test_RMSE_Dynamic"]
    .lt(final_df["Test_RMSE"])
    .map({True: "Yes", False: "No"})
)
final_df["Better_R2_Dynamic"] = (
    final_df["Test_R2_Dynamic"]
    .gt(final_df["Test_R2"])
    .map({True: "Yes", False: "No"})
)


In [23]:
# Display the comparison table as this cell's output.
final_df

Unnamed: 0,Dataset,Best_k_RMSE,Test_RMSE,Best_k_R2,Test_R2,Selected_k_Dynamic,Test_RMSE_Dynamic,Test_R2_Dynamic,Better_RMSE_Dynamic,Better_R2_Dynamic
0,fri_c1_500_50,8,0.883819,8,0.247109,18,0.883592,0.247496,Yes,Yes
1,fri_c3_1000_50,23,0.834926,23,0.193367,19,0.857734,0.148695,No,No
2,fri_c4_500_50,28,0.840683,28,0.167965,21,0.816967,0.214247,Yes,Yes
3,fri_c4_1000_50,29,0.852417,29,0.181952,20,0.84911,0.188287,Yes,Yes
4,fri_c2_1000_25,16,0.92219,16,0.294176,11,0.928713,0.284155,No,No
5,fri_c1_1000_25,22,0.87878,22,0.235004,9,0.893102,0.209865,No,No
6,fri_c3_1000_25,18,0.874033,18,0.289077,12,0.86679,0.300811,Yes,Yes
7,BodyFat,2,0.869759,2,0.986881,6,0.969631,0.983695,No,No
8,Forest_Fires,6,123.111111,6,-0.000422,6,123.111111,-0.000422,No,No
9,Quakes,2,0.187251,2,-0.007823,1,0.187092,-0.00611,Yes,Yes


Comparing Differences

In [24]:
from scipy.stats import wilcoxon

# Paired, two-sided Wilcoxon signed-rank test on the per-dataset test RMSE of
# the fixed-k selection versus the dynamic selection.
# NOTE(review): the recorded output of this cell (statistic 36.0) is larger than
# the maximum a two-sided statistic can reach for the 10 datasets in the table
# (55 / 2 = 27.5), so it was presumably produced from a different final_df.
# Re-run after Restart & Run All to confirm.
df = final_df
stat, p = wilcoxon(
    df["Test_RMSE"],
    df["Test_RMSE_Dynamic"],
)

print(f"Wilcoxon statistic: {stat}")
print(f"P-value: {p}")

# Interpretation at the 5% significance level
print(
    "✅ Significant difference in RMSE between methods (p < 0.05)"
    if p < 0.05
    else "❌ No significant difference in RMSE between methods (p ≥ 0.05)"
)


Wilcoxon statistic: 36.0
P-value: 0.017582795345746867
✅ Significant difference in RMSE between methods (p < 0.05)


Checking which performed better

In [28]:
from scipy.stats import wilcoxon

# One-sided paired test on d = Test_RMSE - Test_RMSE_Dynamic.
# H0: the paired differences are centred on zero.
# H1: the paired differences lie below zero, i.e. Test_RMSE tends to be
#     smaller than Test_RMSE_Dynamic.

stat, p = wilcoxon(
    final_df["Test_RMSE"],
    final_df["Test_RMSE_Dynamic"],
    alternative='less',
)
print(f"📉 One-tailed Wilcoxon (Test_RMSE < Test_RMSE_Dynamic): p = {p:.4f}")



📉 One-tailed Wilcoxon (Test_RMSE < Test_RMSE_Dynamic): p = 0.0088


# Let us now compare with the full feature set

In [29]:
import pickle
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load the per-dataset, per-k model predictions.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
with open("Data/model_predictions.pkl", "rb") as f:
    model_predictions = pickle.load(f)

summary = []

for dataset_name, k_data in model_predictions.items():
    if not k_data:
        # No k was evaluated for this dataset; max() below would raise
        # ValueError, so skip it like the other cells skip unusable datasets.
        print(f"Skipping {dataset_name}: no k entries found")
        continue

    # The largest k is used as the full-feature baseline.
    # Assumes every key is an int or a string of an int - TODO confirm.
    max_k = max(k_data, key=int)

    if "LinearRegression" not in k_data[max_k] or "error" in k_data[max_k]["LinearRegression"]:
        continue

    try:
        y_test = k_data[max_k]["LinearRegression"]["y_test"]
        y_test_pred = k_data[max_k]["LinearRegression"]["y_test_pred"]

        # Compute metrics
        test_rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
        test_r2 = r2_score(y_test, y_test_pred)

        summary.append({
            "Dataset": dataset_name,
            "Full_k": max_k,
            "Full_Test_RMSE": test_rmse,
            "Full_Test_R2": test_r2
        })

    except Exception as e:
        print(f"Error processing {dataset_name} at k={max_k}: {e}")
        continue

# Convert to DataFrame. Naming the columns keeps the schema even when every
# dataset was skipped, so the later merge on 'Dataset' does not fail.
df_full_features = pd.DataFrame(
    summary,
    columns=["Dataset", "Full_k", "Full_Test_RMSE", "Full_Test_R2"],
)


Finding the RMSE and R Squared for the dynamically selected k

In [30]:
# Load the predictions of the models trained on the dynamically selected k.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
with open("Data/dynamic_model_predictions.pkl", "rb") as f:
    dynamic_model_predictions = pickle.load(f)

# One record per dataset with the Linear Regression test RMSE and R².
summary_dynamic_lr = []

for dataset_name, model_data in dynamic_model_predictions.items():
    lr_data = model_data.get("LinearRegression", {})

    if "error" in lr_data:
        # Say which dataset is dropped and what was recorded for it.
        print(f"⚠️ Skipping {dataset_name}: LinearRegression recorded an error: {lr_data['error']}")
        continue

    try:
        y_test = lr_data["y_test"]
        y_test_pred = lr_data["y_test_pred"]

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score(y_test, y_test_pred)

        summary_dynamic_lr.append({
            "Dataset": dataset_name,
            "Selected_k": model_data.get("k", "N/A"),
            "Test_RMSE": test_rmse,
            "Test_R2": test_r2
        })
    except Exception as e:
        # A missing "LinearRegression" entry also lands here (KeyError on "y_test").
        print(f"⚠️ Error in {dataset_name}: {e}")
        continue

# Convert to DataFrame. Naming the columns keeps the schema even when every
# dataset was skipped, so the later rename/merge does not fail on an empty frame.
df_dynamic_lr_summary = pd.DataFrame(
    summary_dynamic_lr,
    columns=["Dataset", "Selected_k", "Test_RMSE", "Test_R2"],
)

Merging the two summaries on `Dataset`

In [31]:
# The full-feature summary has no validation columns to drop; this is only an
# alias (not a copy) under the name the merge below uses.
df_summary_trimmed = df_full_features

# Give the dynamic-selection columns a "_Dynamic" suffix so they stay
# distinguishable from the full-feature columns after the merge.
df_dynamic_renamed = df_dynamic_lr_summary.rename(
    columns=dict(
        Selected_k="Selected_k_Dynamic",
        Test_RMSE="Test_RMSE_Dynamic",
        Test_R2="Test_R2_Dynamic",
    )
)

# Default (inner) merge: only datasets present in both summaries are kept.
final_df = df_summary_trimmed.merge(df_dynamic_renamed, on='Dataset')

Comparing the results

In [32]:
# Flag, per dataset, whether the dynamic selection beats the full feature set:
# strictly lower test RMSE, strictly higher test R². Ties count as "No".
final_df["Better_RMSE_Dynamic"] = (
    final_df["Test_RMSE_Dynamic"]
    .lt(final_df["Full_Test_RMSE"])
    .map({True: "Yes", False: "No"})
)
final_df["Better_R2_Dynamic"] = (
    final_df["Test_R2_Dynamic"]
    .gt(final_df["Full_Test_R2"])
    .map({True: "Yes", False: "No"})
)


Testing whether the difference is significant (two-sided Wilcoxon signed-rank test)

In [38]:
from scipy.stats import wilcoxon

# Paired test on d = Full_Test_RMSE - Test_RMSE_Dynamic.
# H0: the paired differences are centred on zero.
# H1: the paired differences are not centred on zero (two-sided: either
#     method may be the better one).

stat, p = wilcoxon(
    final_df["Full_Test_RMSE"],
    final_df["Test_RMSE_Dynamic"],
    alternative='two-sided',
)
print(f"🔍 Two-tailed Wilcoxon test: Is there any significant difference? p-value = {p:.4f}")


🔍 Two-tailed Wilcoxon test: Is there any significant difference? p-value = 0.2455


In [39]:
# Display the comparison table as this cell's output.
final_df

Unnamed: 0,Dataset,Full_k,Full_Test_RMSE,Full_Test_R2,Selected_k_Dynamic,Test_RMSE_Dynamic,Test_R2_Dynamic,Better_RMSE_Dynamic,Better_R2_Dynamic
0,fri_c1_500_50,50,0.939282,0.149651,18,0.883592,0.247496,Yes,Yes
1,fri_c3_1000_50,50,0.831584,0.199813,19,0.857734,0.148695,No,No
2,fri_c4_500_50,50,0.950181,-0.062894,21,0.816967,0.214247,Yes,Yes
3,fri_c4_1000_50,50,0.841395,0.20297,20,0.84911,0.188287,No,No
4,fri_c2_1000_25,25,0.932532,0.278256,11,0.928713,0.284155,Yes,Yes
5,fri_c1_1000_25,25,0.877412,0.237384,9,0.893102,0.209865,No,No
6,fri_c3_1000_25,25,0.879234,0.280591,12,0.86679,0.300811,Yes,Yes
7,BodyFat,14,0.979403,0.983365,6,0.969631,0.983695,Yes,Yes
8,Forest_Fires,27,121.706344,0.022278,6,123.111111,-0.000422,No,No
9,Quakes,3,0.187683,-0.012481,1,0.187092,-0.00611,Yes,Yes


# FCBF TESTS

Getting the coil-20 Dataset

In [13]:
from sklearn.datasets import fetch_openml
from FCBF_Regression import FCBF_MI
# Fetch OpenML dataset 46783 (COIL-20, ~1000 features) as an (X, y) pair;
# as_frame=False asks for arrays rather than a DataFrame.
X, y = fetch_openml(
    data_id=46783,
    return_X_y=True,
    as_frame=False,
    parser="auto",
)

# Report the dimensions of what was loaded.
print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)


Feature matrix shape: (1440, 1024)
Target vector shape: (1440,)


Doing FS and fitting Linear Regression

In [15]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 1. Every column is flagged as continuous for FCBF (no discretization needed)
discrete_flags = [False for _ in range(X.shape[1])]

# 2. Hold out 20% of the rows as the test set (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Baseline: Linear Regression trained on every feature
lr_all = LinearRegression().fit(X_train, y_train)
y_pred_all = lr_all.predict(X_test)
rmse_all = mean_squared_error(y_test, y_pred_all) ** 0.5

# 4. Run FCBF on the training split only, so the test rows do not
#    influence which features are kept
fcbf = FCBF_MI(delta=0.0, discrete_features=discrete_flags)
fcbf.fit(X_train, y_train)

# 5. Keep the same selected columns in both splits
X_train_selected, X_test_selected = (
    split[:, fcbf.selected_features_] for split in (X_train, X_test)
)

# 6. Linear Regression trained on the selected features only
lr_fcbf = LinearRegression().fit(X_train_selected, y_train)
y_pred_fcbf = lr_fcbf.predict(X_test_selected)
rmse_fcbf = mean_squared_error(y_test, y_pred_fcbf) ** 0.5

# 7. Report both test errors and the chosen feature indices
print("Test RMSE without feature selection:", round(rmse_all, 4))
print("Test RMSE with FCBF feature selection:", round(rmse_fcbf, 4))
print("Selected feature indices:", fcbf.selected_features_)


Test RMSE without feature selection: 18.0959
Test RMSE with FCBF feature selection: 2.785
Selected feature indices: [425, 422, 456, 445, 521, 427, 388, 361, 449, 358, 428, 451, 518, 391, 510, 485, 514, 508, 585, 379, 491, 487, 548, 547, 394, 614, 605, 409, 429, 536, 524, 578, 431, 647, 545, 174, 525, 490, 466, 280, 777, 612, 296, 587, 846, 232, 261, 176, 815, 355, 332, 713, 398, 848, 911, 635, 780, 760, 470, 480, 344, 677, 136, 315, 841, 601, 299, 382, 402, 630, 790, 407, 618, 701, 710, 112, 544, 202, 641, 249, 620, 554, 266, 172, 246, 643, 474, 236, 903, 844, 369, 621, 697, 882, 497, 215, 238, 284, 731, 416, 824, 559, 109, 212, 909, 782, 726, 464, 291, 333, 179, 533, 341, 237, 302, 148, 435, 939, 787, 146, 511, 594, 785, 271, 558, 684, 277, 335, 9, 372, 322, 368, 820, 338, 806, 747, 706, 740, 43, 867, 1001, 885, 655, 795, 531, 194, 240, 243, 218, 945, 654, 151, 71, 718, 717, 592, 596, 69, 274, 82, 723, 752, 933, 47, 657, 974, 307, 607, 640, 889, 85, 948, 221, 919, 383, 798, 319, 56, 9

Doing Decision Tree Algorithm

In [16]:
from sklearn.tree import DecisionTreeRegressor

# Reuses X_train / X_test / y_train / y_test and the FCBF-selected matrices
# created in the Linear Regression cell; rmse_all, rmse_fcbf and the prediction
# arrays are overwritten with the Decision Tree results.

# 1. Fit Decision Tree on all features.
#    random_state is fixed because the tree permutes candidate features at
#    each split, so an unseeded fit is not repeatable between runs.
dt_all = DecisionTreeRegressor(random_state=42)
dt_all.fit(X_train, y_train)
y_pred_all = dt_all.predict(X_test)
rmse_all = mean_squared_error(y_test, y_pred_all)**0.5

# 2. Fit Decision Tree on the FCBF-selected features (same seed as above)
dt_fcbf = DecisionTreeRegressor(random_state=42)
dt_fcbf.fit(X_train_selected, y_train)
y_pred_fcbf = dt_fcbf.predict(X_test_selected)
rmse_fcbf = mean_squared_error(y_test, y_pred_fcbf)**0.5

# 3. Output results
print("Test RMSE without feature selection:", round(rmse_all, 4))
print("Test RMSE with FCBF feature selection:", round(rmse_fcbf, 4))

Test RMSE without feature selection: 1.881
Test RMSE with FCBF feature selection: 2.4467


Doing Neural Network Algorithm

In [None]:
from sklearn.neural_network import MLPRegressor

# Reuses the train/test split and the FCBF-selected matrices from the Linear
# Regression cell; rmse_all, rmse_fcbf and the prediction arrays are
# overwritten with the MLP results.

# MLP (one hidden layer of 100 units, seeded) trained on every feature
mlp_all = MLPRegressor(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred_all = mlp_all.predict(X_test)
rmse_all = mean_squared_error(y_test, y_pred_all) ** 0.5

# Same architecture and seed, trained on the FCBF-selected features only
mlp_fcbf = MLPRegressor(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
).fit(X_train_selected, y_train)
y_pred_fcbf = mlp_fcbf.predict(X_test_selected)
rmse_fcbf = mean_squared_error(y_test, y_pred_fcbf) ** 0.5

# Report both test errors
print("Test RMSE without feature selection (MLP):", round(rmse_all, 4))
print("Test RMSE with FCBF feature selection (MLP):", round(rmse_fcbf, 4))

Test RMSE without feature selection (MLP): 8.2727
Test RMSE with FCBF feature selection (MLP): 10.1339


## Doing the same tests with the dynamic method

In [1]:
# from sklearn.datasets import fetch_openml
# from FCBF_Regression import FCBF_MI
# # Load dataset ID 46783 (COIL‑20, ~1000 features)
# X, y = fetch_openml(data_id=46783, return_X_y=True, as_frame=False, parser="auto")

# print("Feature matrix shape:", X.shape)
# print("Target vector shape:", y.shape)


In [2]:
# import pandas as pd
# X = pd.DataFrame(X)


In [3]:
# import numpy as np
# import pandas as pd
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import train_test_split

# # 1. Mark all features as continuous (no discretization needed)
# discrete_flags = [False] * X.shape[1]

# # 2. Train-test split (80-20)
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )

# # 3. Fit Linear Regression on all features
# lr_all = LinearRegression()
# lr_all.fit(X_train, y_train)
# y_pred_all = lr_all.predict(X_test)
# rmse_all = mean_squared_error(y_test, y_pred_all)**0.5

# # 4. Apply dynamic MRMR feature selection
# from mrmr_dynamic_selection import MRMR

# # Run dynamic MRMR
# selector = MRMR(
#     method="MID",
#     regression=True,
#     random_state=42
# )

# selector.fit(X_train, y_train)

# # Transform the datasets
# X_train_sel = selector.transform(X_train)
# X_test_sel = selector.transform(X_test)

# # 5. Fit Linear Regression on selected features
# lr_mrmr = LinearRegression()
# lr_mrmr.fit(X_train_sel, y_train)
# y_pred_mrmr = lr_mrmr.predict(X_test_sel)
# rmse_mrmr = mean_squared_error(y_test, y_pred_mrmr)**0.5

# # 7. Output results
# print("Test RMSE without feature selection:", round(rmse_all, 4))
# print("Test RMSE with mrmr feature selection:", round(rmse_mrmr, 4))
