# Initialization. Load previous state. Load modules

In [1]:
import dill
# Restore the entire session
#dill.load_session('PTRMS_tun_class_all.db')

In [2]:
#check last result on disk
#dir()

In [1]:
import json

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor


In [2]:
def predict_reg_by_label(IDs, y, X, regressor, param_grid=None, cv=5, verbose=True, random_state=None):
    """
    Leave-one-label-out prediction: for every unique label in IDs, fit a model
    on all samples with a different label and predict the samples of that label.

    How the model for each label is obtained depends on ``cv``:

    * ``cv > 1``      -- ``GridSearchCV`` over ``param_grid`` with ``cv`` folds,
                         scored by negative mean squared error; the best
                         estimator is used for prediction.
    * ``cv == 0``     -- no search: ``regressor(**param_grid)`` is fitted on the
                         whole training part.
    * ``0 < cv < 1``  -- no search: a fraction ``cv`` of the training part is held
                         out and passed to ``fit`` as ``eval_set`` (the regressor's
                         ``fit`` must therefore accept ``eval_set`` and ``verbose``
                         keyword arguments, as XGBoost's scikit-learn wrapper does).

    Args:
        IDs: A pandas Series containing one group label per sample.
        y: A pandas Series or numpy array containing target values.
        X: A pandas DataFrame or numpy array containing features.
        regressor: A regressor class (not an instance) with a scikit-learn
            style ``fit``/``predict`` interface.
        param_grid: For ``cv > 1`` a GridSearchCV parameter grid; otherwise a
            dictionary of keyword arguments for the regressor constructor.
            ``None`` (the default) means an empty dictionary.
        cv: Selects the fitting mode, see above.
        verbose: If True, prints progress and metrics.
        random_state: Forwarded to ``train_test_split`` in the hold-out mode so
            the split can be made reproducible. ``None`` keeps the split
            unseeded.

    Returns:
        predictions: Array shaped like y holding the out-of-label prediction
            for every sample.
        final_arv: Overall mean squared error divided by the variance of y.
        best_params_per_label: Dictionary with the parameters used for each label.

    Raises:
        ValueError: If IDs, y and X differ in length, or if ``cv`` is not
            0, a fraction in (0, 1), or a number greater than 1.
    """
    # A fresh dict per call: a mutable default would be shared between calls.
    if param_grid is None:
        param_grid = {}

    # Fail early with a clear message instead of deep inside scikit-learn
    # (e.g. cv=1 would otherwise request a hold-out split with train_size=0.0).
    if not (cv > 1 or cv == 0 or 0 < cv < 1):
        raise ValueError(
            f"cv must be 0, a fraction in (0, 1), or a number greater than 1; got {cv!r}"
        )
    if not (len(IDs) == len(y) == len(X)):
        raise ValueError(
            f"IDs, y and X must have the same length; got {len(IDs)}, {len(y)} and {len(X)}"
        )

    unique_labels = IDs.unique()
    predictions = np.zeros_like(y, dtype=float)
    best_params_per_label = {}

    # Ensure X and y are numpy arrays for compatibility
    if not isinstance(X, np.ndarray):
        X = X.to_numpy()
    if not isinstance(y, np.ndarray):
        y = y.to_numpy()

    for label in unique_labels:
        # Plain boolean array so numpy indexing never depends on the Series index
        mask = np.asarray(IDs == label)
        X_train, X_test = X[~mask], X[mask]
        y_train, y_test = y[~mask], y[mask]

        if cv > 1:
            # Use GridSearchCV to find the best parameters
            grid_search = GridSearchCV(
                estimator=regressor(),
                param_grid=param_grid,
                scoring='neg_mean_squared_error',
                cv=cv,
                verbose=int(verbose)
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Store the best parameters
            best_params_per_label[label] = grid_search.best_params_

        elif cv == 0:
            # Fixed parameters, fitted on the complete training part
            best_model = regressor(**param_grid)
            best_model.fit(X_train, y_train)
            # Copy so the returned entries do not alias the caller's dict
            best_params_per_label[label] = dict(param_grid)

        else:
            # Fixed parameters, with a fraction `cv` of the training part held
            # out as an evaluation set for the regressor's fit()
            X_lear, X_val, y_lear, y_val = train_test_split(
                X_train, y_train, train_size=(1.0 - cv), random_state=random_state
            )
            best_model = regressor(**param_grid)
            best_model.fit(X_lear, y_lear, eval_set=[(X_val, y_val)], verbose=False)
            best_params_per_label[label] = dict(param_grid)

        # Predict on the test set
        predictions[mask] = best_model.predict(X_test)

        if verbose:
            print(f"Label: {label}, Best Params: {best_params_per_label[label]}")
            print(f"Test Set MSE for label {label}: {round(mean_squared_error(y_test, predictions[mask]), 4)}")

    # Final metrics
    final_mse = mean_squared_error(y, predictions)
    final_arv = final_mse / np.var(y)

    if verbose:
        print(f"\nOverall Mean Squared Error: {round(final_mse, 4)}")
        print(f"Overall Adjusted Residual Variance (ARV): {round(final_arv, 4)}")

    return predictions, final_arv, best_params_per_label

# Load datasets from JSON file

In [3]:
# Load every dataset from the JSON file into a dict keyed by dataset name.
# JSON text is UTF-8; an explicit encoding keeps the read independent of the
# platform's locale default.
with open('data_reg.json', 'r', encoding='utf-8') as json_file:
    loaded_datasets = json.load(json_file)


# Train methods

## All datasets

In [4]:
datasets = ["Gum1b","Gum2b","Gum3b","Gum1s","Gum2s","Gum3s","Noc_S","Noc_O","Noc_3T"]

# One summary row per dataset is accumulated here
data_info = []

for dataset in datasets:
    this_data = loaded_datasets[dataset]

    IDs = pd.Series(this_data["IDs"])
    y = np.array(this_data["y"])
    X = pd.DataFrame(this_data["X"])

    # Rows of X are reported as samples, columns as peaks, and the number of
    # distinct IDs as batches
    n_samples, n_peaks = X.shape
    n_batches = np.unique(IDs).size

    # Last column is the samples-to-peaks ratio, rounded to two decimals
    data_info.append([dataset, n_samples, n_batches, n_peaks, round(n_samples / n_peaks, 2)])

# Assemble the collected rows into a table
df_info = pd.DataFrame(
    data_info,
    columns=["Dataset", "Samples", "Batchs", "Peaks", "S/P"],
)

# Show the table as plain text
print(df_info)

# Same table rendered as LaTeX source
latex_table = df_info.to_latex(
    index=False,
    caption="Dataset Information",
    label="tab:data_info",
)

print(latex_table)


  Dataset  Samples  Batchs  Peaks   S/P
0   Gum1b      267       3    167  1.60
1   Gum2b      267       3    167  1.60
2   Gum3b      267       3    167  1.60
3   Gum1s      267      27    167  1.60
4   Gum2s      267      27    167  1.60
5   Gum3s      267      27    167  1.60
6   Noc_S       72      24    380  0.19
7   Noc_O       72       3    380  0.19
8  Noc_3T       60      20    383  0.16
\begin{table}
\caption{Dataset Information}
\label{tab:data_info}
\begin{tabular}{lrrrr}
\toprule
Dataset & Samples & Batchs & Peaks & S/P \\
\midrule
Gum1b & 267 & 3 & 167 & 1.600000 \\
Gum2b & 267 & 3 & 167 & 1.600000 \\
Gum3b & 267 & 3 & 167 & 1.600000 \\
Gum1s & 267 & 27 & 167 & 1.600000 \\
Gum2s & 267 & 27 & 167 & 1.600000 \\
Gum3s & 267 & 27 & 167 & 1.600000 \\
Noc_S & 72 & 24 & 380 & 0.190000 \\
Noc_O & 72 & 3 & 380 & 0.190000 \\
Noc_3T & 60 & 20 & 383 & 0.160000 \\
\bottomrule
\end{tabular}
\end{table}



In [6]:
datasets = ["Gum1b","Gum2b","Gum3b","Gum1s","Gum2s","Gum3s","Noc_S","Noc_O","Noc_3T"]

# results[dataset][model_name] holds the tuple returned by predict_reg_by_label
results = {}
for dataset in datasets:
    results[dataset] = {}

# Hyper-parameters shared by all three XGBoost configurations
xgb_fixed_params = {
    'device': 'cpu',
    'n_estimators': 1000,
    'eta': 0.12,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.75,
}

# Grid-search variant: every shared value becomes a one-element list,
# and only the number of estimators is actually searched
xgb_search_grid = {key: [value] for key, value in xgb_fixed_params.items()}
xgb_search_grid['n_estimators'] = [125, 250, 500, 1000]

# "cv" selects the fitting mode of predict_reg_by_label:
#   3    -> 3-fold grid search
#   0.25 -> fixed parameters, 25% of the training data held out as eval set
#   0    -> fixed parameters, no validation data
models = {
    "XGBoost_full": {
        "regressor": XGBRegressor,
        "param_grid": xgb_search_grid,
        "cv": 3,
    },
    "XGBoost_025": {
        "regressor": XGBRegressor,
        "param_grid": {'early_stopping_rounds': 5, **xgb_fixed_params},
        "cv": 0.25,
    },
    "XGBoost_fix": {
        "regressor": XGBRegressor,
        "param_grid": dict(xgb_fixed_params),
        "cv": 0,
    },
}

# Evaluate every model configuration on every dataset
for dataset in datasets:
    this_data = loaded_datasets[dataset]

    IDs = pd.Series(this_data["IDs"])
    y = np.array(this_data["y"])
    X = pd.DataFrame(this_data["X"])

    for model_name, model_info in models.items():
        predict = predict_reg_by_label(
            IDs,
            y,
            X,
            model_info["regressor"],
            param_grid=model_info["param_grid"],
            cv=model_info["cv"],
            verbose=False,
        )
        results[dataset][model_name] = predict
        # Element 1 of the returned tuple is the overall ARV
        print(f"{dataset}-{model_name}: {round(predict[1],4)}")

Gum1b-XGBoost_full: 0.3695
Gum1b-XGBoost_025: 0.4127
Gum1b-XGBoost_fix: 0.3695
Gum2b-XGBoost_full: 0.4436
Gum2b-XGBoost_025: 0.3929
Gum2b-XGBoost_fix: 0.4436
Gum3b-XGBoost_full: 0.5852
Gum3b-XGBoost_025: 0.5244
Gum3b-XGBoost_fix: 0.5852
Gum1s-XGBoost_full: 0.6097
Gum1s-XGBoost_025: 0.7244
Gum1s-XGBoost_fix: 0.6097
Gum2s-XGBoost_full: 0.6082
Gum2s-XGBoost_025: 0.5386
Gum2s-XGBoost_fix: 0.6082
Gum3s-XGBoost_full: 0.9541
Gum3s-XGBoost_025: 0.9756
Gum3s-XGBoost_fix: 0.9541
Noc_S-XGBoost_full: 0.271
Noc_S-XGBoost_025: 0.2942
Noc_S-XGBoost_fix: 0.271
Noc_O-XGBoost_full: 0.3195
Noc_O-XGBoost_025: 0.3456
Noc_O-XGBoost_fix: 0.3195
Noc_3T-XGBoost_full: 0.5013
Noc_3T-XGBoost_025: 0.577
Noc_3T-XGBoost_fix: 0.5013


In [7]:
# Header row: dataset column followed by one fixed-width column per model
header_cells = [model_name.ljust(15) for model_name in models]
print("Dataset".ljust(12) + " | " + " | ".join(header_cells))

# Horizontal rule under the header
rule_width = 12 + 3 + len(models) * 18
print("-" * rule_width)

# One row per dataset; element 1 of each stored result tuple is the ARV
for dataset in datasets:
    arv_cells = [
        f"{round(results[dataset][model_name][1],4):<15}"
        for model_name in models
    ]
    result_line = f"{dataset.ljust(12)} | " + " | ".join(arv_cells)
    print(result_line)


Dataset      | XGBoost_full    | XGBoost_025     | XGBoost_fix    
---------------------------------------------------------------------
Gum1b        | 0.3695          | 0.4127          | 0.3695         
Gum2b        | 0.4436          | 0.3929          | 0.4436         
Gum3b        | 0.5852          | 0.5244          | 0.5852         
Gum1s        | 0.6097          | 0.7244          | 0.6097         
Gum2s        | 0.6082          | 0.5386          | 0.6082         
Gum3s        | 0.9541          | 0.9756          | 0.9541         
Noc_S        | 0.271           | 0.2942          | 0.271          
Noc_O        | 0.3195          | 0.3456          | 0.3195         
Noc_3T       | 0.5013          | 0.577           | 0.5013         


In [8]:
# Persist the whole interpreter session to disk so it can be restored later
# with dill.load_session
session_file = 'PTRMS_epocs_reg_all.db'
dill.dump_session(session_file)