# Initialization. Load previous state. Load modules

In [2]:
import dill
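# Restore the entire session (uncomment to resume from a previously saved run).
# NOTE(review): the file named here is 'PTRMS_tun_class_all.db', whereas the last cell of
# this notebook saves 'PTRMS_tun_reg_all.db' - confirm which session file is intended.
#dill.load_session('PTRMS_tun_class_all.db')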

In [3]:
#check last result on disk
#dir()

In [4]:
import json

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.svm import SVR

In [5]:
def predict_reg_by_label(IDs, y, X, regressor, param_grid=None, cv=5, verbose=True):
    """
    Leave-one-label-out prediction with an optional per-fold grid search.

    For every distinct label in ``IDs`` the samples carrying that label are
    used as the test set and all remaining samples as the training set. The
    out-of-fold predictions of all labels are assembled into one array that is
    aligned with ``y``.

    Args:
        IDs: Label of every sample. A pandas Series, or any 1-D sequence/array.
        y: Target values (pandas Series, numpy array or sequence).
        X: Features (pandas DataFrame, numpy array or nested sequence).
        regressor: A scikit-learn compatible regressor *class* (not an
            instance); it is instantiated once per label.
        param_grid: If ``cv > 1``: dictionary mapping parameter names to lists
            of candidate values, handed to GridSearchCV. Otherwise: dictionary
            of keyword arguments passed directly to ``regressor``. ``None`` is
            treated as an empty dictionary.
        cv: Number of folds for the inner GridSearchCV. Any value that is not
            greater than 1 disables the search and fits ``regressor(**param_grid)``.
        verbose: If True, prints per-label and overall metrics (and is also
            forwarded to GridSearchCV).

    Returns:
        predictions: Float numpy array shaped like ``y`` holding the test-set
            prediction of every sample.
        final_arv: Overall mean squared error divided by ``np.var(y)``.
        best_params_per_label: Dictionary with the parameters used for each label.

    Raises:
        ValueError: If ``IDs``, ``y`` and ``X`` differ in length, or if fewer
            than two distinct labels are present (the training set would be empty).
    """
    # A None default avoids sharing one mutable dict between calls.
    if param_grid is None:
        param_grid = {}

    # Accept plain sequences as well as pandas/numpy containers.
    id_series = IDs if isinstance(IDs, pd.Series) else pd.Series(np.asarray(IDs))
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = len(y)
    if len(id_series) != n_samples or len(X) != n_samples:
        raise ValueError(
            f"IDs, y and X must have the same length "
            f"(got {len(id_series)}, {n_samples} and {len(X)})."
        )

    unique_labels = id_series.unique()
    if len(unique_labels) < 2:
        raise ValueError(
            "At least two distinct labels are required so that every test set "
            "leaves a non-empty training set."
        )

    predictions = np.zeros_like(y, dtype=float)
    best_params_per_label = {}

    for label in unique_labels:
        # Plain boolean array: rows are selected by position, independent of
        # whatever index the label Series carries.
        mask = np.asarray(id_series == label, dtype=bool)
        X_train, X_test = X[~mask], X[mask]
        y_train, y_test = y[~mask], y[mask]

        if cv > 1:
            # Use GridSearchCV to find the best parameters on the training part only
            grid_search = GridSearchCV(
                estimator=regressor(),
                param_grid=param_grid,
                scoring='neg_mean_squared_error',
                cv=cv,
                verbose=verbose
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Store the best parameters
            best_params_per_label[label] = grid_search.best_params_

        else:
            # No search: param_grid holds the final keyword arguments.
            best_model = regressor(**param_grid)
            best_model.fit(X_train, y_train)
            # Store a copy so the entries do not alias the caller's dict.
            best_params_per_label[label] = dict(param_grid)

        # Predict on the held-out label
        predictions[mask] = best_model.predict(X_test)

        if verbose:
            print(f"Label: {label}, Best Params: {best_params_per_label[label]}")
            print(f"Test Set MSE for label {label}: {round(mean_squared_error(y_test, predictions[mask]), 4)}")

    # Final metrics over the assembled out-of-fold predictions
    final_mse = mean_squared_error(y, predictions)
    final_arv = final_mse / np.var(y)

    if verbose:
        print(f"\nOverall Mean Squared Error: {round(final_mse, 4)}")
        print(f"Overall Adjusted Residual Variance (ARV): {round(final_arv, 4)}")

    return predictions, final_arv, best_params_per_label

# Load datasets from JSON file

In [6]:
# Load data from JSON.
# The encoding is stated explicitly so that decoding does not depend on the
# platform's default locale (JSON text is UTF-8).
# The training cell below reads the keys "IDs", "y" and "X" from each dataset entry.
with open('data_reg.json', 'r', encoding='utf-8') as json_file:
    loaded_datasets = json.load(json_file)


# Train methods

## All datasets

In [36]:
datasets = ["Gum1b","Gum2b","Gum3b","Gum1s","Gum2s","Gum3s","Noc_S","Noc_O","Noc_3T"]

# Dictionary to store results: results[dataset][model_name] -> tuple returned by predict_reg_by_label
results = {dataset: {} for dataset in datasets}

# One seed shared by every model so that re-running the notebook reproduces
# the same numbers (random forests and sub-sampled boosting are stochastic).
RANDOM_STATE = 42

# Define model parameters.
# "cv" > 1  -> "param_grid" maps each parameter to a LIST of candidates (grid search).
# "cv" <= 1 -> "param_grid" holds the final keyword arguments for the regressor.
# NOTE(review): the "gpu" flag is not read by any cell visible here.
models = {
    "RF": {
        "regressor": RandomForestRegressor,
        "param_grid": {
            'n_estimators': 1000,
            'max_features': 'sqrt',
            'random_state': RANDOM_STATE
        },
        "cv": 0,
        "gpu": False
    },
    "XGBoost_full": {
        "regressor": XGBRegressor,
        "param_grid": {
            'device': ['cpu'],
            'n_estimators': [1000], 
            'eta': [0.25,0.10,0.05], 
            'max_depth': [1,3,5], 
            'subsample': [1,0.8,0.6], 
            'colsample_bytree': [1,0.75,0.5],
            'random_state': [RANDOM_STATE]
        },
        "cv": 3,
        "gpu": False
    },
    "XGBoost_red": {
        "regressor": XGBRegressor,
        "param_grid": {
            'device': ['cpu'],
            'n_estimators': [1000], 
            'eta': [0.12], 
            'max_depth': [1,3,5], 
            'subsample': [1,0.8,0.6], 
            'colsample_bytree': [0.75],
            'random_state': [RANDOM_STATE]
        },
        "cv": 3,
        "gpu": False
    },
    "XGBoost_fix": {
        "regressor": XGBRegressor,
        "param_grid": {
            'device': 'cpu',
            'n_estimators': 1000, 
            'eta': 0.12, 
            'max_depth': 3, 
            'subsample': 0.8, 
            'colsample_bytree': 0.75,
            'random_state': RANDOM_STATE
        },
        "cv": 0,
        "gpu": False
    }
}

# Evaluate every configured model on every dataset
for dataset in datasets:
    this_data = loaded_datasets[dataset]

    # Rebuild typed containers from the plain JSON structures
    IDs, y, X = (
        pd.Series(this_data["IDs"]),
        np.array(this_data["y"]),
        pd.DataFrame(this_data["X"]),
    )

    for model_name, model_info in models.items():
        # predict is the tuple (predictions, arv, best_params_per_label)
        predict = predict_reg_by_label(
            IDs,
            y,
            X,
            model_info["regressor"],
            param_grid=model_info["param_grid"],
            cv=model_info["cv"],
            verbose=False,
        )
        results[dataset].update({model_name: predict})
        # Progress line: the second tuple element is the overall ARV
        print(f"{dataset}-{model_name}: {round(predict[1],4)}")

Peperoncini-RF: 0.2708
Peperoncini-XGBoost_full: 0.3333
Peperoncini-XGBoost_red: 0.2917
Peperoncini-XGBoost_fix: 0.2917
Lab-RF: 0.0784
Lab-XGBoost_full: 0.0882
Lab-XGBoost_red: 0.1765
Lab-XGBoost_fix: 0.098


In [9]:
# Column widths: keep the original minimum widths (12 / 15) but grow them when a
# dataset or model name is longer, so the columns stay aligned.
name_width = max([12] + [len(dataset) for dataset in datasets])
col_width = max([15] + [len(model_name) for model_name in models])

# Print header
header = "Dataset".ljust(name_width) + " | " + " | ".join(model_name.ljust(col_width) for model_name in models)
print(header)

# Print separator line (exactly as wide as the header)
print("-" * len(header))

# Print results: element [1] of each stored tuple is the overall ARV
for dataset in datasets:
    result_line = f"{dataset.ljust(name_width)} | " + " | ".join(
        f"{round(results[dataset][model_name][1],4):<{col_width}}" for model_name in models
    )
    print(result_line)


Dataset      | RF              | XGBoost_full    | XGBoost_red     | XGBoost_fix    
---------------------------------------------------------------------------------------
Gum1b        | 0.2844          | 0.3555          | 0.5014          | 0.2657         
Gum2b        | 0.4159          | 0.3569          | 0.5474          | 0.3409         
Gum3b        | 0.4584          | 0.5848          | 0.672           | 0.4587         
Gum1s        | 0.6529          | 0.6054          | 0.747           | 0.5327         
Gum2s        | 0.6174          | 0.3374          | 0.5625          | 0.4907         
Gum3s        | 0.8709          | 1.0655          | 1.0466          | 0.9396         
Noc_S        | 0.1537          | 0.2269          | 0.3576          | 0.1606         
Noc_O        | 0.2906          | 0.325           | 0.3657          | 0.3143         
Noc_3T       | 0.7602          | 0.6928          | 0.8467          | 0.3288         


In [10]:
# Persist the whole interpreter session to disk so the results can be restored later
dill.dump_session(filename='PTRMS_tun_reg_all.db')