Getting the RMSE

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# --- Load files ---
with open('Data/Model_Predictions_Exhaustive.pkl', 'rb') as f:
    preds_all = pickle.load(f)

with open('Data/Validation_Data.pkl', 'rb') as f:
    data_all = pickle.load(f)

# Choose which split to evaluate: 'train', 'val', or 'test'
SPLIT = 'val'

pred_key_map = {
    'train': 'y_train_broken',
    'val':   'y_val_broken',
    'test':  'y_test',
}

true_key_map = {
    'train': 'Training_Dependent_Broken',
    'val':   'Validation_Dependent_Broken',
    'test':  'Testing_Dependent_Full',
}

pred_key = pred_key_map[SPLIT]
true_key = true_key_map[SPLIT]

# --- Aggregate RMSEs and average across folds ---
avg_rmse = {}   # nested dict: dataset -> k -> model -> avg_rmse
rows = []       # flat records for a tidy DataFrame

for dataset_name, folds in preds_all.items():
    # Collect per-(k, model) RMSEs across folds
    per_k_model_rmse = {}  # k -> model -> list[rmse]
    
    for fold_name, k_dict in folds.items():
        y_true = np.asarray(data_all[dataset_name][fold_name][true_key]).ravel()
        
        for k, model_preds in k_dict.items():
            for model_name, pred_bundle in model_preds.items():
                y_pred = np.asarray(pred_bundle[pred_key]).ravel()
                rmse = mean_squared_error(y_true, y_pred)**0.5
                
                per_k_model_rmse \
                    .setdefault(k, {}) \
                    .setdefault(model_name, []) \
                    .append(rmse)
    
    # Compute averages
    avg_rmse[dataset_name] = {}
    for k, model_dict in per_k_model_rmse.items():
        avg_rmse[dataset_name][k] = {}
        for model_name, rmses in model_dict.items():
            mean_rmse = float(np.mean(rmses))
            avg_rmse[dataset_name][k][model_name] = mean_rmse
            # for tidy table
            rows.append({
                'dataset': dataset_name,
                'k': int(k) if isinstance(k, str) and k.isdigit() else k,
                'model': model_name,
                'avg_rmse': mean_rmse
            })

# --- Make a tidy DataFrame and (optionally) save it ---
df_avg_rmse = pd.DataFrame(rows)
df_avg_rmse = df_avg_rmse.sort_values(['dataset', 'k', 'avg_rmse', 'model']).reset_index(drop=True)

# Optional: save artifacts
with open('Data/Validation_RMSE_Scores.pkl', 'wb') as f:
    pickle.dump(avg_rmse, f)

# df_avg_rmse.to_csv('Data/Avg_RMSE_By_Dataset_K_Model.csv', index=False)

# print(df_avg_rmse.head(20))


Finding the best k value for each model

In [2]:
# --- Create dictionary: dataset -> model -> k_with_min_avg_rmse ---
best_k_by_model = {}

for dataset_name, k_dict in avg_rmse.items():
    best_k_by_model[dataset_name] = {}
    
    # Invert the nesting so we can loop over models easily
    model_rmse_map = {}  # model -> {k: rmse}
    for k, model_dict in k_dict.items():
        for model_name, rmse_val in model_dict.items():
            model_rmse_map.setdefault(model_name, {})[k] = rmse_val
    
    # Now pick min RMSE k for each model
    for model_name, k_rmse_dict in model_rmse_map.items():
        best_k = min(k_rmse_dict, key=lambda kk: k_rmse_dict[kk])
        best_k_by_model[dataset_name][model_name] = best_k

# Optional: save
with open('Data/Best_K_By_Model.pkl', 'wb') as f:
    pickle.dump(best_k_by_model, f)


Open the Data

In [3]:
import pickle

# Load feature rankings
with open('Data/_Reordered.pkl', 'rb') as f:
    Feature_Ranking_By_MRMR = pickle.load(f)

# Load feature rankings
with open('Data/Best_K_By_Model.pkl', 'rb') as f:
    Best_K_By_Model = pickle.load(f)

with open('Data/Validation_Data.pkl', 'rb') as f:
    data_all = pickle.load(f)


Now lets us compare the full RMSE with the best K RMSE

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import (
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
    ExtraTreesRegressor
)
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
import lightgbm as lgb
import xgboost as xgb

# -----------------------------
# Models
# -----------------------------
regression_models = {
    "Linear Regression": LinearRegression(),
    "KNN Regression": KNeighborsRegressor(),
    "SVM Regression": SVR(),
    "Random Forest Regression": RandomForestRegressor(random_state=42),
    "AdaBoost Regression": AdaBoostRegressor(random_state=42),
    "MLP Regression": MLPRegressor(random_state=42),
    "Decision Tree Regression": DecisionTreeRegressor(random_state=42),
    "Extremely Randomized Trees Regression": ExtraTreesRegressor(random_state=42),
    "Gradient Boosting Regression": GradientBoostingRegressor(random_state=42),
    "LightGBM Regression": lgb.LGBMRegressor(random_state=42, verbose=-1),
    "XGBoost Regression": xgb.XGBRegressor(random_state=42)
}

# -----------------------------
# Assumptions:
# - data_all[dataset][fold] has:
#   'Training_Independent_Full', 'Training_Dependent_Full',
#   'Testing_Independent_Full',  'Testing_Dependent_Full'
# - Best_K_By_Model[dataset][fold][model_name] -> int
# - Feature_Ranking_By_MRMR[dataset][fold][k] -> list of top-k feature names
# -----------------------------

predictions = {}

for dataset_name, folds in data_all.items():
    ds_results = {}

    for fold_name, blobs in folds.items():
        # Unpack full splits
        X_tr_full = blobs["Training_Independent_Full"]
        y_tr_full = blobs["Training_Dependent_Full"]
        X_te_full = blobs["Testing_Independent_Full"]
        y_te_full = blobs["Testing_Dependent_Full"]

        # Ensure column names are strings (safeguard)
        X_tr_full.columns = X_tr_full.columns.astype(str)
        X_te_full.columns = X_te_full.columns.astype(str)

        fold_results = {}

        for model_name, base_model in regression_models.items():
            # ---- FULL FEATURES ----
            model_full = clone(base_model)
            model_full.fit(X_tr_full, y_tr_full)
            y_test_full_pred = model_full.predict(X_te_full)

            # ---- BEST-K FEATURES ----
            best_k = None
            y_test_bestk_pred = None
            used_features_bestk = None

            # get best k (if present)
            if dataset_name in Best_K_By_Model:
                best_k = Best_K_By_Model[dataset_name].get(model_name, None)

            if isinstance(best_k, int) and best_k > 0:
                # exact top-k features from your MRMR dict
                rank_map = Feature_Ranking_By_MRMR.get(dataset_name, {}).get(fold_name, {})
                topk_feats = rank_map.get(best_k, None)

                if topk_feats:
                    # keep only features present in this split
                    topk_feats = [f for f in topk_feats if f in X_tr_full.columns]
                    if len(topk_feats) > 0:
                        model_k = clone(base_model)
                        model_k.fit(X_tr_full[topk_feats], y_tr_full)
                        y_test_bestk_pred = model_k.predict(X_te_full[topk_feats])
                        used_features_bestk = topk_feats

            # store per-model results (simple dict)
            fold_results[model_name] = {
                "y_test": np.asarray(y_te_full),
                "y_test_full_pred": np.asarray(y_test_full_pred),
                "y_test_bestk_pred": None if y_test_bestk_pred is None else np.asarray(y_test_bestk_pred),
                "best_k": best_k,
                "used_features_bestk": used_features_bestk
            }

        # attach fold results
        ds_results[fold_name] = fold_results

    # attach dataset results
    predictions[dataset_name] = ds_results

# `predictions` structure:
# predictions[dataset][fold][model] = {
#   "y_test", "y_test_full_pred", "y_test_bestk_pred",
#   "best_k", "used_features_bestk"
# }


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


Saving the predictions

In [19]:
import pickle

# Save all_data as Data.pkl
with open('Data/Final_Predictions.pkl', 'wb') as f:
    pickle.dump(predictions, f)
