# Initialization. Load previous state. Load modules

In [1]:
import dill
# Restore the entire session
#dill.load_session('PTRMS_tun_class_all.db')

In [2]:
#check last result on disk
#dir()

In [1]:
import json

import pandas as pd
import numpy as np
#import cupy as cp

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier


In [2]:
def predict_class_by_label(IDs, y, X, classifier, param_grid=None, cv=5, gpu=False, verbose=True, random_state=None):
    """
    Leave-one-group-out prediction of class labels.

    For every unique value in ``IDs`` the samples carrying that value form the
    test set and all remaining samples form the training set. A model is
    trained on the training set (how depends on ``cv``) and used to predict
    the held-out group, so that every sample is predicted exactly once.

    Args:
        IDs: Group label of every sample (pandas Series or any 1-D array-like).
        y: Target class of every sample (pandas Series, numpy array or
            sequence). Numeric and string labels are both accepted.
        X: Feature matrix (pandas DataFrame or numpy array), one row per sample.
        classifier: A scikit-learn compatible classifier *class* (not an
            instance); it is instantiated inside the function.
        param_grid: Interpretation depends on ``cv``:
            * ``cv > 1``: dictionary of parameter lists for GridSearchCV.
            * otherwise: dictionary of keyword arguments passed directly to
              ``classifier``.
            Defaults to an empty dictionary.
        cv: Selects the training mode:
            * ``cv > 1``: GridSearchCV with ``cv`` folds on the training set.
            * ``cv == 0``: a single model is fitted on the whole training set.
            * ``0 < cv < 1``: fraction of the training set held out as a
              validation set, passed to ``fit`` as ``eval_set`` (the
              classifier's ``fit`` must accept ``eval_set`` and ``verbose``,
              as XGBoost's does).
        gpu: If True, move the feature matrices to the GPU with cupy before
            training (requires cupy to be installed).
        verbose: If True, prints progress and metrics.
        random_state: Seed forwarded to ``train_test_split`` in the hold-out
            mode so that the validation split is reproducible. ``None`` keeps
            the unseeded behaviour.

    Returns:
        A tuple ``(predictions, final_error, best_params_per_label)``:
            predictions: numpy array with the out-of-group prediction for
                every sample, aligned with ``y``.
            final_error: overall classification error (1 - accuracy).
            best_params_per_label: dictionary mapping each group label to the
                parameters used for the model that predicted it.

    Raises:
        ValueError: If ``cv`` is negative or equal to 1, or if ``IDs``, ``y``
            and ``X`` do not have the same number of samples.
        ImportError: If ``gpu`` is True and cupy is not installed.
    """
    # Avoid a shared mutable default argument
    if param_grid is None:
        param_grid = {}

    # cv == 1 or cv < 0 would reach the hold-out branch with an invalid train_size
    if cv < 0 or cv == 1:
        raise ValueError(
            "cv must be 0 (single fit), a fraction in (0, 1) (hold-out validation) "
            f"or a number of folds > 1 (grid search); got {cv!r}"
        )

    if gpu:
        # Imported lazily so that CPU-only environments do not need cupy
        try:
            import cupy as cp
        except ImportError as exc:
            raise ImportError("gpu=True requires the 'cupy' package to be installed") from exc

    # Positional index so the boolean masks line up with the numpy arrays below
    ids = pd.Series(IDs).reset_index(drop=True)
    X = np.asarray(X)
    y = np.asarray(y)

    if not (len(ids) == len(y) == len(X)):
        raise ValueError(
            f"IDs, y and X must have the same length; got {len(ids)}, {len(y)} and {len(X)}"
        )

    unique_labels = ids.unique()
    best_params_per_label = {}

    # Same dtype as y so that non-numeric class labels are supported as well
    predictions = np.zeros_like(y)

    for label in unique_labels:
        mask = (ids == label).to_numpy()

        X_train, X_test = X[~mask], X[mask]
        if gpu:
            X_train, X_test = cp.array(X_train), cp.array(X_test)
        y_train, y_test = y[~mask], y[mask]

        if cv > 1:
            # Use GridSearchCV to find the best parameters
            grid_search = GridSearchCV(
                estimator=classifier(),
                param_grid=param_grid,
                scoring='accuracy',
                cv=cv,
                verbose=int(verbose)
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Store the best parameters
            best_params_per_label[label] = grid_search.best_params_

        elif cv == 0:
            # Fixed parameters, trained on the full training set
            best_model = classifier(**param_grid)
            best_model.fit(X_train, y_train)
            # Store a copy so the per-label entries do not alias each other
            best_params_per_label[label] = dict(param_grid)

        else:
            # Hold out a fraction `cv` of the training set as validation set
            X_lear, X_val, y_lear, y_val = train_test_split(
                X_train,
                y_train,
                stratify=y_train,
                train_size=(1.0 - cv),
                random_state=random_state
            )
            best_model = classifier(**param_grid)
            best_model.fit(X_lear, y_lear, eval_set=[(X_val, y_val)], verbose=False)
            best_params_per_label[label] = dict(param_grid)

        # Predict on the test set
        predictions[mask] = best_model.predict(X_test)

        if verbose:
            print(f"Label: {label}, Best Params: {best_params_per_label[label]}")
            print(f"Test Set error for label {label}: {round(1.0-accuracy_score(y_test, predictions[mask]), 4)}")

    # Final metrics
    final_error = 1.0-accuracy_score(y, predictions)

    if verbose:
        print(f"\nOverall Classification Error: {round(final_error, 4)}")

    return predictions, final_error, best_params_per_label


# Load datasets from a JSON file

In [3]:
# Load all datasets from the JSON file into a dictionary keyed by dataset name.
# The encoding is given explicitly so the file is read the same way on every
# platform instead of depending on the locale's default encoding.
with open('data_classif.json', 'r', encoding='utf-8') as json_file:
    loaded_datasets = json.load(json_file)


# Train methods

## All datasets

In [11]:
datasets = ["Tea", "Gum2", "Gum3", "Cafe", "Ham", "Pesce", "Spinaci", "Peperoncini", "Funghi13", "Funghi20", "Funghi21","Lab"]

# List that collects one summary row per dataset
data_info = []

for dataset in datasets:
    this_data = loaded_datasets[dataset]

    IDs = pd.Series(this_data["IDs"])
    y = np.array(this_data["y"])
    X = pd.DataFrame(this_data["X"])

    # Shape of X and number of distinct values in y and IDs
    X_shape = X.shape
    unique_y = len(np.unique(y))
    unique_IDs = IDs.nunique(dropna=False)

    # Samples-per-feature ratio; undefined when a dataset has no feature columns
    ratio = round(X_shape[0] / X_shape[1], 2) if X_shape[1] else float("nan")

    # Append the row for this dataset
    data_info.append([dataset, X_shape[0], unique_IDs, X_shape[1], unique_y, ratio])

# Build a DataFrame with the results
df_info = pd.DataFrame(data_info, columns=["Dataset", "Samples", "Batchs", "Peaks", "Classes", "S/P"])

# Show the table
print(df_info)

# float_format keeps the two-decimal rounding in the LaTeX output
# (without it the ratio is rendered with six decimals, e.g. 2.830000)
latex_table = df_info.to_latex(
    index=False,
    caption="Dataset Information",
    label="tab:data_info",
    float_format="%.2f",
)

print(latex_table)


        Dataset  Samples  Batchs  Peaks  Classes   S/P
0           Tea      456      21    161        4  2.83
1          Gum2      267      27    167        2  1.60
2          Gum3      267      27    167        2  1.60
3          Cafe       36      12    563        6  0.06
4           Ham       54      18    427        3  0.13
5         Pesce      104      32    259        3  0.40
6       Spinaci       72      24    333        2  0.22
7   Peperoncini       96      32    253        2  0.38
8      Funghi13       54      19    125        6  0.43
9      Funghi20      396      65    402       12  0.99
10     Funghi21      593      50    383        6  1.55
11          Lab      102      20    798        2  0.13
\begin{table}
\caption{Dataset Information}
\label{tab:data_info}
\begin{tabular}{lrrrrr}
\toprule
Dataset & Samples & Batchs & Peaks & Classes & S/P \\
\midrule
Tea & 456 & 21 & 161 & 4 & 2.830000 \\
Gum2 & 267 & 27 & 167 & 2 & 1.600000 \\
Gum3 & 267 & 27 & 167 & 2 & 1.600000 \\
Cafe

In [30]:
datasets = ["Tea", "Gum2", "Gum3", "Cafe", "Ham", "Pesce", "Spinaci", "Peperoncini", "Funghi13", "Funghi20", "Funghi21","Lab"]

# results[dataset][model_name] -> (predictions, error, best_params) tuple
results = dict((dataset, {}) for dataset in datasets)

# Model configurations. The keys of every entry match the keyword arguments of
# predict_class_by_label, so an entry can be unpacked straight into the call.
models = dict(
    # Grid search over the number of trees with 3-fold cross-validation
    XGBoost_full=dict(
        classifier=XGBClassifier,
        param_grid=dict(
            device=['cpu'],
            n_estimators=[125, 250, 500, 1000],
            eta=[0.12],
            max_depth=[3],
            subsample=[0.8],
            colsample_bytree=[0.75],
        ),
        cv=3,
        gpu=False,
    ),
    # Early stopping on a validation set holding 25% of the training data
    XGBoost_025=dict(
        classifier=XGBClassifier,
        param_grid=dict(
            early_stopping_rounds=5,
            device='cpu',
            n_estimators=1000,
            eta=0.12,
            max_depth=3,
            subsample=0.8,
            colsample_bytree=0.75,
        ),
        cv=0.25,
        gpu=False,
    ),
    # Fixed hyper-parameters, single fit on the full training set
    XGBoost_fix=dict(
        classifier=XGBClassifier,
        param_grid=dict(
            device='cpu',
            n_estimators=1000,
            eta=0.12,
            max_depth=3,
            subsample=0.8,
            colsample_bytree=0.75,
        ),
        cv=0,
        gpu=False,
    ),
)

# Evaluate every model configuration on every dataset
for dataset in datasets:
    this_data = loaded_datasets[dataset]

    X = pd.DataFrame(this_data["X"])
    y = np.array(this_data["y"])
    IDs = pd.Series(this_data["IDs"])

    for model_name, model_info in models.items():
        # model_info supplies classifier, param_grid, cv and gpu
        predict = predict_class_by_label(IDs=IDs, y=y, X=X, verbose=False, **model_info)
        results[dataset][model_name] = predict

        # predict[1] is the overall classification error
        print(f"{dataset}-{model_name}: {round(predict[1],4)}")

Tea-XGBoost_full: 0.5044
Tea-XGBoost_025: 0.5088
Tea-XGBoost_fix: 0.5219


KeyboardInterrupt: 

In [55]:
# Print header
header = "Dataset".ljust(12) + " | " + " | ".join(model_name.ljust(15) for model_name in models)
print(header)

# Print separator line, exactly as wide as the header
print("-" * len(header))

# Print results: one row per dataset, one column per model
for dataset in datasets:
    cells = []
    for model_name in models:
        # A run may be missing (e.g. the experiment loop was interrupted);
        # show a placeholder instead of failing with a KeyError
        outcome = results.get(dataset, {}).get(model_name)
        value = "n/a" if outcome is None else round(outcome[1], 4)
        cells.append(f"{value:<15}")
    result_line = f"{dataset.ljust(12)} | " + " | ".join(cells)
    print(result_line)


Dataset      | RF              | XGBoost_full    | XGBoost_red     | XGBoost_fix    
---------------------------------------------------------------------------------------
Peperoncini  | 0.2708          | 0.3333          | 0.2917          | 0.2917         
Lab          | 0.0784          | 0.0882          | 0.1765          | 0.098          


In [7]:
# Persist the whole interpreter session to disk so it can be restored later
dill.dump_session(filename='PTRMS_nepocs_class_all.db')