Now let us compute the validation RMSE and R² for each candidate number of features k, and select the optimal k for each dataset

In [2]:
import pickle
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Fixed-k baseline: for every dataset, score the LinearRegression predictions
# at each candidate k on the validation split, pick the best k per metric, and
# report the test-split metric at that k.

# Load model predictions.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project's own pipeline.
with open("Data/model_predictions.pkl", "rb") as f:
    model_predictions = pickle.load(f)

# One record per dataset; turned into a DataFrame once at the end.
summary = []

for dataset_name, k_data in model_predictions.items():
    # Validation metrics keyed by k for this dataset.
    val_rmse_scores = {}
    val_r2_scores = {}

    for k, models in k_data.items():
        # Skip k values with no LinearRegression entry or with a recorded error.
        if "LinearRegression" not in models or "error" in models["LinearRegression"]:
            continue

        try:
            y_val = models["LinearRegression"]["y_val"]
            y_val_pred = models["LinearRegression"]["y_val_pred"]
            # The test arrays are not scored here, but reading them inside the
            # try ensures a k that lacks them is skipped now instead of being
            # selected as "best" and failing in the test-metric lookup below.
            y_test = models["LinearRegression"]["y_test"]
            y_test_pred = models["LinearRegression"]["y_test_pred"]

            # Compute validation metrics
            rmse_val = mean_squared_error(y_val, y_val_pred) ** 0.5
            r2_val = r2_score(y_val, y_val_pred)

            val_rmse_scores[k] = rmse_val
            val_r2_scores[k] = r2_val

        except Exception as e:
            # Best-effort: keep processing the other k values, but say exactly
            # what was skipped and why instead of an anonymous debug message.
            print(f"Skipping {dataset_name} (k={k}): {type(e).__name__}: {e}")
            continue

    if len(val_rmse_scores) == 0:
        continue  # Skip if nothing was computed

    # Get k with best RMSE (lowest) and best R² (highest) on validation
    best_k_rmse = min(val_rmse_scores, key=val_rmse_scores.get)
    best_k_r2 = max(val_r2_scores, key=val_r2_scores.get)

    # Compute test RMSE and R² at the best k for each metric; the two metrics
    # may select different k values, so each is looked up separately.
    test_rmse = mean_squared_error(
        model_predictions[dataset_name][best_k_rmse]["LinearRegression"]["y_test"],
        model_predictions[dataset_name][best_k_rmse]["LinearRegression"]["y_test_pred"]
    ) ** 0.5

    test_r2 = r2_score(
        model_predictions[dataset_name][best_k_r2]["LinearRegression"]["y_test"],
        model_predictions[dataset_name][best_k_r2]["LinearRegression"]["y_test_pred"]
    )

    summary.append({
        "Dataset": dataset_name,
        "Best_k_RMSE": best_k_rmse,
        "Val_RMSE": val_rmse_scores[best_k_rmse],
        "Test_RMSE": test_rmse,
        "Best_k_R2": best_k_r2,
        "Val_R2": val_r2_scores[best_k_r2],
        "Test_R2": test_r2
    })

# Convert to DataFrame
df_summary = pd.DataFrame(summary)

Finding the RMSE and R Squared for Dynamically Selected K

In [8]:
# Dynamic-k method: one LinearRegression result per dataset (k already chosen
# upstream); compute its test RMSE and R².

# Load dynamic model predictions.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project's own pipeline.
with open("Data/dynamic_model_predictions.pkl", "rb") as f:
    dynamic_model_predictions = pickle.load(f)

# Extract RMSE and R² for Linear Regression
summary_dynamic_lr = []

for dataset_name, model_data in dynamic_model_predictions.items():
    # A missing entry becomes {} and is reported by the except branch below
    # (KeyError on "y_test").
    lr_data = model_data.get("LinearRegression", {})

    if "error" in lr_data:
        # Report which dataset is excluded and the error recorded for it,
        # instead of an anonymous debug message.
        print(f"Skipping {dataset_name}: LinearRegression reported an error: {lr_data['error']}")
        continue

    try:
        y_test = lr_data["y_test"]
        y_test_pred = lr_data["y_test_pred"]

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score(y_test, y_test_pred)

        summary_dynamic_lr.append({
            "Dataset": dataset_name,
            # "N/A" when the pickle did not record the selected k.
            "Selected_k": model_data.get("k", "N/A"),
            "Test_RMSE": test_rmse,
            "Test_R2": test_r2
        })
    except Exception as e:
        # Best-effort: keep going with the remaining datasets.
        print(f"⚠️ Error in {dataset_name}: {e}")
        continue

# Convert to DataFrame
df_dynamic_lr_summary = pd.DataFrame(summary_dynamic_lr)

Merging the fixed-k and dynamic-k summaries on Dataset

In [14]:
# Build one comparison table with a row per dataset: test metrics of the
# fixed-k baseline next to the test metrics of the dynamic-k method.

# The validation metrics were only needed to choose k; leave them out here.
validation_columns = ['Val_RMSE', 'Val_R2']
df_summary_trimmed = df_summary.drop(columns=validation_columns)

# Suffix the dynamic columns so they do not collide with the baseline's
# identically named columns after the merge.
dynamic_column_names = {
    "Selected_k": "Selected_k_Dynamic",
    "Test_RMSE": "Test_RMSE_Dynamic",
    "Test_R2": "Test_R2_Dynamic",
}
df_dynamic_renamed = df_dynamic_lr_summary.rename(columns=dynamic_column_names)

# Inner join (the pandas default): only datasets present in both summaries
# are kept.
final_df = df_summary_trimmed.merge(df_dynamic_renamed, how="inner", on='Dataset')

Comparing results

In [16]:
# Flag, per dataset, whether the dynamic method beat the fixed-k baseline on
# the test split: lower RMSE is better, higher R² is better. The boolean
# comparison is converted straight to a "Yes"/"No" label.
yes_no_labels = {True: "Yes", False: "No"}

final_df["Better_RMSE_Dynamic"] = (
    final_df["Test_RMSE_Dynamic"] < final_df["Test_RMSE"]
).map(yes_no_labels)

final_df["Better_R2_Dynamic"] = (
    final_df["Test_R2_Dynamic"] > final_df["Test_R2"]
).map(yes_no_labels)


Comparing Differences

In [22]:
from scipy.stats import wilcoxon
# Paired, two-sided Wilcoxon signed-rank test on the per-dataset test RMSE of
# the fixed-k baseline versus the dynamic-k method.
df = final_df  # alias of final_df (not a copy)

# Run the test
stat, p = wilcoxon(df["Test_RMSE"], df["Test_RMSE_Dynamic"])

print(f"Wilcoxon statistic: {stat}")
print(f"P-value: {p}")

# Interpretation at the 5% significance level. The two-sided test only says
# whether the methods differ, not which one is better.
if p < 0.05:
    print("✅ Significant difference in RMSE between methods (p < 0.05)")
else:
    print("❌ No significant difference in RMSE between methods (p ≥ 0.05)")


Wilcoxon statistic: 36.0
P-value: 0.017582795345746867
✅ Significant difference in RMSE between methods (p < 0.05)


In [23]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon

# Assume final_df already exists and contains:
# - 'Test_RMSE' (baseline)
# - 'Test_RMSE_Dynamic' (dynamic method)

# Step 1: Compute the paired RMSE difference (baseline - dynamic).
# A positive value means the dynamic method had the lower (better) RMSE.
diff = final_df["Test_RMSE"] - final_df["Test_RMSE_Dynamic"]

# Step 2: Print basic summary statistics
print("🔎 Mean RMSE Difference:", diff.mean())
print("🔎 Median RMSE Difference:", diff.median())

# Step 3: Run Wilcoxon signed-rank test (two-sided)
stat, p = wilcoxon(final_df["Test_RMSE"], final_df["Test_RMSE_Dynamic"])
print(f"📊 Wilcoxon Test Statistic = {stat:.4f}, P-value = {p:.4f}")

# Step 4: Interpret result.
# The direction is taken from the signed-rank sums, the quantity the Wilcoxon
# statistic is built on, not from the mean difference: the mean can be
# dominated by a few datasets with very large RMSE and point the other way
# from what the rank-based test detected. Zero differences are dropped and
# tied magnitudes get average ranks, matching wilcoxon's default handling.
nonzero_diff = diff[diff != 0]
abs_ranks = nonzero_diff.abs().rank()
rank_sum_dynamic_better = abs_ranks[nonzero_diff > 0].sum()
rank_sum_dynamic_worse = abs_ranks[nonzero_diff < 0].sum()

if p < 0.05:
    if rank_sum_dynamic_better > rank_sum_dynamic_worse:
        print("✅ Dynamic method performs significantly better (lower RMSE)")
    else:
        print("❌ Dynamic method performs significantly worse (higher RMSE)")
else:
    print("⚠️ No significant difference between baseline and dynamic method")


🔎 Mean RMSE Difference: -4.5720714901065875
🔎 Median RMSE Difference: -0.020679636980247634
📊 Wilcoxon Test Statistic = 36.0000, P-value = 0.0176
❌ Dynamic method performs significantly worse (higher RMSE)
