# Fitting Models for Exhaustive Evaluations

Import libraries

In [1]:
import pickle
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor,
    ExtraTreesRegressor
)
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.base import clone
import time

Fit models and get predictions

In [None]:
import warnings

# Suppress every warning for the rest of the session so the progress output
# printed by the loop below stays readable. The filter is global, not scoped
# to this cell.
warnings.filterwarnings("ignore")

# Load feature rankings. The loop below reads them as
# {dataset_name: {fold_name: {k: top_k_features}}}.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('Data/Feature_Ranking_By_MRMR_Reordered.pkl', 'rb') as f:
    Feature_Ranking_By_MRMR = pickle.load(f)

# Load dataset, indexed below as Data[dataset_name][fold_name][split_key].
with open('Data/Validation_Data.pkl', 'rb') as f:
    Data = pickle.load(f)

# Define regression models. These are unfitted templates: every fit below
# works on a fresh clone, so no state leaks between feature subsets or folds.
regression_models = {
    "Linear Regression": LinearRegression(),
    "KNN Regression": KNeighborsRegressor(),
    "SVM Regression": SVR(),
    "Random Forest Regression": RandomForestRegressor(random_state=42),
    "AdaBoost Regression": AdaBoostRegressor(random_state=42),
    "MLP Regression": MLPRegressor(random_state=42),
    "Decision Tree Regression": DecisionTreeRegressor(random_state=42),
    "Extremely Randomized Trees Regression": ExtraTreesRegressor(random_state=42),
    "Gradient Boosting Regression": GradientBoostingRegressor(random_state=42),
    "LightGBM Regression": lgb.LGBMRegressor(random_state=42, verbose=-1),
    "XGBoost Regression": xgb.XGBRegressor(random_state=42),
}

# Final results dictionary, filled as
# results[dataset_name][fold_name][k][model_name] -> predictions and fit time.
results = {}

# Main loop: dataset -> fold -> feature subset -> model
for dataset_name, folds in Feature_Ranking_By_MRMR.items():
    print(dataset_name)
    results[dataset_name] = {}

    for fold_name, rankings in folds.items():
        results[dataset_name][fold_name] = {}

        # Get the fold data
        fold_data = Data[dataset_name][fold_name]

        # Only the 'Broken' training target is needed: models are fitted on
        # that split and this cell stores predictions, not scores.
        y_train_broken = fold_data['Training_Dependent_Broken']

        for k, top_k_features in rankings.items():
            results[dataset_name][fold_name][k] = {}

            # Subset X by top-k features
            X_train_broken = fold_data['Training_Independent_Broken'][top_k_features]
            X_val_broken = fold_data['Validation_Independent_Broken'][top_k_features]
            X_test = fold_data['Testing_Independent_Full'][top_k_features]

            for model_name, model in regression_models.items():
                model_instance = clone(model)

                # Time the fit only (prediction is excluded). perf_counter is
                # monotonic and high-resolution, so the interval cannot be
                # distorted by system clock adjustments.
                time_start = time.perf_counter()
                model_instance.fit(X_train_broken, y_train_broken)
                time_elapsed = time.perf_counter() - time_start

                # Store predictions. The 'y_*' keys hold model predictions
                # for the corresponding split, not the true targets.
                results[dataset_name][fold_name][k][model_name] = {
                    'y_train_broken': model_instance.predict(X_train_broken),
                    'y_val_broken': model_instance.predict(X_val_broken),
                    'y_test': model_instance.predict(X_test),
                    'time': time_elapsed,
                }


In [7]:
import pickle

# Write the `results` dictionary to Data/Model_Predictions_Exhaustive.pkl
with open('Data/Model_Predictions_Exhaustive.pkl', 'wb') as predictions_file:
    pickle.dump(results, file=predictions_file)
