In [12]:
# Step 1: Import packages
import numpy as np
import scipy
import sympy as sp
import torch
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_pinball_loss
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import friedmanchisquare, wilcoxon
import statsmodels.api as sm
from statsmodels.regression.quantile_regression import QuantReg
import pmlb
from pmlb import fetch_data, regression_dataset_names
import random

# Step 2: Import pysr AFTER Julia dependencies are configured
from pysr import PySRRegressor


In [13]:
def get_feature_type(data_column):
  """
  This function determines the feature type (categorical or binary) for a NumPy array column.

  **Note:** This function assumes the data doesn't contain missing values.
          If your data might have missing values, you'll need to handle them
          before using this function (e.g., impute missing values or remove rows).

  Args:
      data_column: A NumPy array representing the data column.

  Returns:
      A string indicating the feature type: "categorical" or "binary".
  """

  # Check for distinct values and data type
  unique_values = np.unique(data_column)
  num_unique_values = len(unique_values)

  # Categorical data has a limited number of distinct values (adjust threshold as needed)
  if num_unique_values <= 10:  # Adjust threshold based on your data and analysis goals
    return "categorical"

  # Binary data has only two distinct values
  if num_unique_values == 2:
    return "binary"

  # If there are more than 10 distinct values and not binary, assume numerical
  # (This might need further refinement depending on your domain knowledge)
  return "numerical"  # Consider a different label for non-categoric

def get_categorical_features(X):
  """
  This function identifies the indices of categorical features in a NumPy array representing a dataset.

  **Note:** This function assumes the data doesn't contain missing values.
          If your data might have missing values, you'll need to handle them
          before using this function (e.g., impute missing values or remove rows).

  Args:
      X: A 2D NumPy array representing the dataset (n_samples x n_features).

  Returns:
      A list of integers representing the indices of categorical features in X.
  """

  categorical_features = []
  for i, col in enumerate(X.T):  # Enumerate to get column index (i)
    unique_values = np.unique(col)
    num_unique_values = len(unique_values)
    data_type = col.dtype

    # Categorical data has a limited number of distinct values (adjust threshold as needed)
    if num_unique_values <= 10:  # Adjust threshold based on your data and analysis goals
      categorical_features.append(i)

  return categorical_features


def create_dummy_variables(X, categorical_features):
    """
    This function creates dummy variables for categorical features in a dataset.

    Args:
        X: A 2D NumPy array representing the dataset (n_samples x n_features).
        categorical_features: A list of integers representing the indices of categorical features in X.

    Returns:
        A 2D NumPy array representing the dataset with dummy variables for categorical features.
    """
    if categorical_features == []:
        return X
    else:
        # Select categorical features from the data
        X_categorical = X[:, categorical_features]

        # Create one-hot encoder
        encoder = OneHotEncoder(sparse_output=False)

        # Fit the encoder on the categorical features
        encoder.fit(X_categorical)

        # Transform the categorical features into dummy variables
        X_dummy_categorical = encoder.transform(X_categorical)

        # Get the original non-categorical features (assuming they are numerical)
        X_numerical = np.delete(X, categorical_features, axis=1)  # Delete categorical feature columns

        # Combine the dummy variables and numerical features
        X_with_dummies = np.concatenate([X_numerical, X_dummy_categorical], axis=1)

        return X_with_dummies


In [8]:
regression_dataset_namestry = regression_dataset_names[2:3]   #remove [] for complete PMLB
print(regression_dataset_namestry)

['1029_LEV']


SQR BENCHMARK **90TH** QUANTILE

In [None]:
# Set seed for reproducibility
SEED = 42  # Change as needed

# Set NumPy seed
np.random.seed(SEED)

# Set Python random seed
random.seed(SEED)

# Set PyTorch seed (if using)
torch.manual_seed(SEED)

# Set Optuna seed
optuna.logging.set_verbosity(optuna.logging.WARNING)  # Reduce logging clutter
optuna_seed = SEED

# Global quantile setting (EXCEPT FOR PYSR, THIS NEEDS MANUAL ADJUSTMENT)
QUANTILE = 0.9 #(CHANGE PYSR QUANTILE MANUALLY)

# Function to calculate pinball loss
def pinball_loss(y_true, y_pred, tau=QUANTILE):
    residuals = y_true - y_pred
    loss = np.where(residuals >= 0, tau * residuals, (1 - tau) * -residuals)
    return np.mean(loss)

# Function to calculate normalized pinball loss using the global dataset range
def normalized_pinball_loss(y_true, y_pred, global_min, global_max, tau=QUANTILE):
    range_y = global_max - global_min
    loss = pinball_loss(y_true, y_pred, tau)
    return loss / range_y if range_y != 0 else 0  # Avoid division by zero

# Function to calculate absolute coverage error
def absolute_coverage_error(y_true, y_pred, tau=QUANTILE):
    coverage = np.mean(y_pred >= y_true)
    return np.abs(coverage - tau)

# Function to calculate expression complexity
def calculate_expression_complexity(expression, complexity_of_operators):
    try:
        expr = sp.sympify(expression)
    except sp.SympifyError:
        raise ValueError("Invalid expression")

    complexity = 0
    for atom in sp.preorder_traversal(expr):
        if isinstance(atom, sp.Symbol):  # Variables (e.g., x1, x2)
            complexity += 1
        elif isinstance(atom, (int, float, sp.Integer, sp.Float)):  # Constants
            complexity += 1
        elif atom in complexity_of_operators:  # Operators
            complexity += complexity_of_operators[atom]
    return complexity

# LightGBM objective function for Optuna
def objective_lgb(trial, train_X, train_y, val_X, val_y):
    params = {
        'objective': 'quantile',
        'alpha': QUANTILE,
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'random_state': SEED,
        'bagging_seed': SEED,
        'feature_fraction_seed': SEED,
        'data_random_seed': SEED,
    }

    if params['min_child_samples'] >= params['num_leaves']:
        raise optuna.exceptions.TrialPruned()

    model = lgb.LGBMRegressor(**params)
    model.fit(train_X, train_y)
    y_pred = model.predict(val_X)
    return pinball_loss(val_y, y_pred, tau=QUANTILE)

# Quantile regression objective function for Optuna
def objective_linear(trial, train_X, train_y, val_X, val_y, tau=QUANTILE):
    max_iter = trial.suggest_int('max_iter', 1000, 5000)
    model = QuantReg(train_y, train_X)
    results = model.fit(q=tau, max_iter=max_iter)
    y_pred = results.predict(val_X)
    return pinball_loss(val_y, y_pred, tau)

# Quantile Decision Tree Regressor
class QuantileDecisionTreeRegressor:
    def __init__(self, quantile=QUANTILE, min_samples_leaf=5, random_state=SEED):
        self.quantile = quantile
        self.min_samples_leaf = min_samples_leaf
        self.tree = DecisionTreeRegressor(min_samples_leaf=min_samples_leaf, random_state=random_state)

    def fit(self, X, y):
        self.tree.fit(X, y)
        self._add_quantile_info(X, y)

    def _add_quantile_info(self, X, y):
        leaf_indices = self.tree.apply(X)
        unique_leaves = np.unique(leaf_indices)
        self.quantile_values = {}
        for leaf in unique_leaves:
            leaf_y = y[leaf_indices == leaf]
            self.quantile_values[leaf] = np.percentile(leaf_y, self.quantile * 100)

    def predict(self, X):
        leaf_indices = self.tree.apply(X)
        predictions = np.array([self.quantile_values[leaf] for leaf in leaf_indices])
        return predictions

# Optuna Objective Function for Decision Tree
def objective_tree(trial, train_X, train_y, val_X, val_y):
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 50)

    model = QuantileDecisionTreeRegressor(quantile=QUANTILE, min_samples_leaf=min_samples_leaf)
    model.fit(train_X, train_y)
    y_pred = model.predict(val_X)

    return pinball_loss(val_y, y_pred, tau=QUANTILE)

# Complexity parameters
binary_operators = ["+", "*", "/", "-"]
unary_operators = ["exp", "sin", "cos", "log", "square"]
complexity_of_operators = {
    "+": 1,
    "-": 1,
    "*": 1,
    "/": 2,
    "exp": 4,
    "sin": 3,
    "cos": 3,
    "log": 3,
    "square": 2,
}

# results90 storage
results90 = {
    "SQR": {"losses": [], "coverage": [], "complexity": []},
    "LightGBM": {"losses": [], "coverage": []},
    "DecisionTree": {"losses": [], "coverage": [], "complexity": []},
    "LinearQuantile": {"losses": [], "coverage": [], "complexity": []},
}

def process_fold_scores(model_name, fold_scores):
    for metric, scores in fold_scores.items():
        results90[model_name][metric].extend(scores)


# Iterate over datasets
for regression_dataset in regression_dataset_namestry:
    try:
        print(regression_dataset)
        X1, y = fetch_data(regression_dataset, return_X_y=True)
        global_min, global_max = np.min(y), np.max(y)  # Global range for determ. normalization

        X = create_dummy_variables(X1, get_categorical_features(X1))

        kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

        fold_scores_sqr = {"losses": [], "coverage": [], "complexity": []}
        fold_scores_lgb = {"losses": [], "coverage": []}
        fold_scores_tree = {"losses": [], "coverage": [], "complexity": []}
        fold_scores_linear = {"losses": [], "coverage": [], "complexity": []}

        for train_index, test_index in kf.split(X):
            train_X, test_X = X[train_index], X[test_index]
            train_y, test_y = y[train_index], y[test_index]

            # Symbolic Quantile Regression
            modelq = PySRRegressor(
                niterations=100, #imrpove for better results90
                binary_operators=binary_operators,
                unary_operators=unary_operators,
                complexity_of_operators=complexity_of_operators,
                elementwise_loss="pinball_loss(y_true, y_pred) = max.(0.1 * (y_true - y_pred), (0.1 - 1) * (y_true - y_pred))", #DONT FORGET TO CHANGE WHEN CHANGING QUANTILE (JULIA SYNTAX)
                temp_equation_file=True,
                random_state=SEED
            )

            modelq.fit(train_X, train_y)
            y_pred_symbolic = modelq.predict(test_X)

            # Metrics for SQR
            fold_scores_sqr["losses"].append(normalized_pinball_loss(test_y, y_pred_symbolic, global_min, global_max))
            fold_scores_sqr["coverage"].append(absolute_coverage_error(test_y, y_pred_symbolic))
            fold_scores_sqr["complexity"].append(calculate_expression_complexity(modelq.sympy(), complexity_of_operators))

            # LightGBM Quantile Regression
            study_lgb = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
            study_lgb.optimize(lambda trial: objective_lgb(trial, train_X, train_y, test_X, test_y), n_trials=10)

            best_params_lgb = study_lgb.best_params
            model_lgb = lgb.LGBMRegressor(objective='quantile', alpha=QUANTILE, **best_params_lgb)
            model_lgb.fit(train_X, train_y)
            y_pred_lgb = model_lgb.predict(test_X)

            # Metrics for LightGBM
            fold_scores_lgb["losses"].append(normalized_pinball_loss(test_y, y_pred_lgb, global_min, global_max))
            fold_scores_lgb["coverage"].append(absolute_coverage_error(test_y, y_pred_lgb))

            # Inside the main loop for dataset processing (NEW)
            study_tree = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
            study_tree.optimize(lambda trial: objective_tree(trial, train_X, train_y, test_X, test_y), n_trials=10)

            best_params_tree = study_tree.best_params  # Get best hyperparameter

            # Train the best Decision Tree model with optimized min_samples_leaf
            model_tree = QuantileDecisionTreeRegressor(quantile=QUANTILE, min_samples_leaf=best_params_tree['min_samples_leaf'])
            model_tree.fit(train_X, train_y)
            y_pred_tree = model_tree.predict(test_X)

            # Metrics for Decision Tree (NEW complexity calculation)
            fold_scores_tree["losses"].append(normalized_pinball_loss(test_y, y_pred_tree, global_min, global_max))
            fold_scores_tree["coverage"].append(absolute_coverage_error(test_y, y_pred_tree))
            fold_scores_tree["complexity"].append(model_tree.tree.tree_.node_count)  # NEW: Store tree complexity

            # Linear Quantile Regression
            study_linear = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
            study_linear.optimize(lambda trial: objective_linear(trial, train_X, train_y, test_X, test_y), n_trials=10)

            best_params_linear = study_linear.best_params
            model_linear = QuantReg(train_y, train_X).fit(q=QUANTILE, max_iter=best_params_linear['max_iter'])
            y_pred_linear = model_linear.predict(test_X)

            # Metrics for Linear Quantile Regression
            fold_scores_linear["losses"].append(normalized_pinball_loss(test_y, y_pred_linear, global_min, global_max))
            fold_scores_linear["coverage"].append(absolute_coverage_error(test_y, y_pred_linear))
            fold_scores_linear["complexity"].append(X.shape[1])

        process_fold_scores("SQR", fold_scores_sqr)
        process_fold_scores("LightGBM", fold_scores_lgb)
        process_fold_scores("DecisionTree", fold_scores_tree)
        process_fold_scores("LinearQuantile", fold_scores_linear)
    except Exception as e:
        print(f"Error processing {regression_dataset}: {e}")

# Display results90
print(results90)


1029_LEV


[ Info: Started!



Expressions evaluated per second: 1.850e+05
Progress: 993 / 3100 total iterations (32.032%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           1.511e-01  1.594e+01  y = 3
3           1.376e-01  4.676e-02  y = 3.0001 - x₆
5           1.311e-01  2.419e-02  y = (3.0001 - x₆) - x₅
7           1.152e-01  6.491e-02  y = ((x₄ + x₉) + x₈) + 2.0007
11          1.083e-01  1.531e-02  y = x₄ + (((x₁₉ * x₁₄) + (x₈ + x₉)) + 2.0012)
13          1.055e-01  1.322e-02  y = x₉ + (((x₈ + ((x₁₉ + x₃) * x₉)) + x₄) + 2)
20          1.042e-01  1.793e-03  y = (x₄ + x₉) + ((x₈ + (x₉ * (x₃ + x₁₉))) + (cos(x₁₀ * x₁₆...
                                      ) * 2.004))
22          1.031e-01  5.072e-03  y = (x₄ + x₉) + (x₈ + ((x₉ * (x₃ + x₁₉)) + (cos((x₁₃ - x₁₀...
                                      ) * x₁₆) * 

[ Info: Final population:
[ Info: Results saved to:


───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           1.511e-01  1.594e+01  y = 3
3           1.376e-01  4.679e-02  y = 3 - x₆
5           1.311e-01  2.419e-02  y = (3 - x₆) - x₅
7           1.151e-01  6.507e-02  y = ((x₉ + x₄) + 2) + x₈
9           1.149e-01  1.076e-03  y = (((x₁₄ + 2) + x₈) + x₉) + x₁₃
11          1.083e-01  2.971e-02  y = 2 + (x₄ + (x₈ + (x₉ + (x₁₄ * x₁₉))))
13          1.051e-01  1.455e-02  y = (x₈ + (x₉ + 2.0003)) + ((x₉ + x₄) * (x₁₄ + x₁₃))
15          1.035e-01  7.831e-03  y = (((x₉ * x₃) + x₉) + (x₄ + 2.0002)) + ((x₁₄ * x₁₉) + x₈...
                                      )
17          1.035e-01  6.586e-06  y = (((x₄ + 2.0001) + x₉) + ((x₉ * x₃) + ((x₁₄ - x₈) * x₁₉...
                                      ))) + x₈
19          1.028e-01  3.670e-03  y = (x₄ + x₈) + ((((x₁₄ - x₈) * x₁₄) * x₁₉) + ((x₉ * (x₃ +...
                                       x₉)) + 2))
20    

[ Info: Started!


  - /var/folders/v0/2pg4g4d55fjcq1qrmyjmjp680000gp/T/tmpyv2n_dbm/20250219_163839_5tzjjB/hall_of_fame.csv

Expressions evaluated per second: 1.940e+05
Progress: 1041 / 3100 total iterations (33.581%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           1.483e-01  1.594e+01  y = 3
3           1.357e-01  4.438e-02  y = 3.0005 - x₆
5           1.312e-01  1.689e-02  y = (3.0005 - x₆) - x₅
7           1.080e-01  9.700e-02  y = (x₈ + 2.0004) + (x₉ + x₄)
17          1.080e-01  1.706e-05  y = ((x₄ - ((x₅ - (0.29201 - (x₆ + -1.944))) + 0.52964)) +...
                                       1.7698) - (x₇ + 0.47611)
───────────────────────────────────────────────────────────────────────────────────────────────────
══════════════════════════════════════════════════════════════════════════════════════

[ Info: Final population:
[ Info: Results saved to:


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] Start training from score 3.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] Start training from score 3.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, num

[ Info: Started!


  - /var/folders/v0/2pg4g4d55fjcq1qrmyjmjp680000gp/T/tmpxvmnzxlv/20250219_163854_rNT7nV/hall_of_fame.csv

Expressions evaluated per second: 1.990e+05
Progress: 1003 / 3100 total iterations (32.355%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           1.496e-01  1.594e+01  y = 3
3           1.328e-01  5.964e-02  y = 2.9997 - x₆
5           1.287e-01  1.579e-02  y = (3.0029 - x₆) - x₁
7           1.111e-01  7.322e-02  y = (x₈ + (x₄ + x₉)) + 2.0003
9           1.096e-01  7.057e-03  y = x₈ + (((x₄ * x₁₄) + x₉) + 2.0016)
11          1.026e-01  3.280e-02  y = ((x₉ + 2.0001) + x₈) + ((x₄ + x₁₉) * x₁₄)
───────────────────────────────────────────────────────────────────────────────────────────────────
══════════════════════════════════════════════════════════════════════════════════════════════

[ Info: Final population:
[ Info: Results saved to:


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] Start training from score 3.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000086 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] Start training from score 3.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000014 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, num

[ Info: Started!


  - /var/folders/v0/2pg4g4d55fjcq1qrmyjmjp680000gp/T/tmph237kt11/20250219_163909_p4we9O/hall_of_fame.csv

Expressions evaluated per second: 2.130e+05
Progress: 1106 / 3100 total iterations (35.677%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           1.473e-01  1.594e+01  y = 3
3           1.345e-01  4.519e-02  y = 3.0004 - x₆
5           1.284e-01  2.329e-02  y = (3.0005 - x₆) - x₅
7           1.135e-01  6.165e-02  y = ((x₉ + x₄) + x₈) - -2.0002
9           1.119e-01  7.052e-03  y = (x₈ + x₉) + ((x₁₄ * x₄) - -2.0035)
11          1.064e-01  2.507e-02  y = x₈ + ((x₉ + (x₁₄ * (x₄ + x₁₉))) - -2.0035)
13          1.064e-01  1.424e-04  y = 0.9988 * ((x₉ + ((x₁₄ * (x₁₉ + x₄)) + x₈)) - -2.0059)
15          1.058e-01  2.757e-03  y = (x₈ + (x₉ + ((((0.99493 - x₈) * x₁₉) + x₄) * x₁₄))) - ...
   

[ Info: Final population:
[ Info: Results saved to:


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000129 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] Start training from score 3.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000014 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] Start training from score 3.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000012 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40
[L

[ Info: Started!


  - /var/folders/v0/2pg4g4d55fjcq1qrmyjmjp680000gp/T/tmpg92a8gmw/20250219_163923_Xsn29c/hall_of_fame.csv

Expressions evaluated per second: 2.120e+05
Progress: 1168 / 3100 total iterations (37.677%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           1.462e-01  1.594e+01  y = 3
3           1.315e-01  5.315e-02  y = 3 - x₆
5           1.281e-01  1.300e-02  y = (3 - x₆) - x₁
7           1.201e-01  3.237e-02  y = ((x₄ + 3.0032) - x₇) - x₆
9           1.090e-01  4.868e-02  y = (((3.0032 - x₇) + x₄) - x₆) - x₅
11          1.088e-01  8.847e-04  y = (((x₄ + 0.015833) - x₆) - x₅) + (2.9843 - x₇)
13          1.050e-01  1.768e-02  y = (((3.0152 - x₇) - (x₆ - (x₃ * x₉))) - x₅) + x₄
17          1.028e-01  5.241e-03  y = x₄ + ((((2.9834 - x₇) + 0.44194) - (x₆ - (x₁₈ * x₁₃)))...
                    

[ Info: Final population:
[ Info: Results saved to:


───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           1.462e-01  1.594e+01  y = 3
3           1.315e-01  5.315e-02  y = 3 - x₆
5           1.281e-01  1.300e-02  y = (3 - x₆) - x₁
7           1.199e-01  3.324e-02  y = 3.0001 - (x₇ - (x₄ - x₆))
9           1.088e-01  4.870e-02  y = (((x₄ - x₅) - x₆) + 3.0001) - x₇
11          1.088e-01  2.691e-05  y = ((x₄ + 2.9672) - x₇) - (x₆ - (0.032847 - x₅))
13          1.026e-01  2.890e-02  y = ((x₄ + (3.0003 - x₇)) - (x₆ - (x₁₈ * x₁₃))) - x₅
15          1.026e-01  5.433e-05  y = ((x₄ + ((2.6495 - x₇) - (x₆ - (x₁₈ * x₁₃)))) - x₅) - -...
                                      0.35068
17          9.939e-02  1.609e-02  y = (x₄ + (((2.6495 - x₇) - x₅) - (x₆ - (x₁₈ * (x₁₃ + x₅))...
                                      ))) - -0.35068
19          9.788e-02  7.611e-03  y = (((x₄ - x₇) + (2.6495 - x₅)) - (x₆ - (((x₁₄ - x₁₅) - x...
                           

SQR BENCHMARK **50TH** QUANTILE

In [14]:
# Set seed for reproducibility
SEED = 42  # Change as needed

# Set NumPy seed
np.random.seed(SEED)

# Set Python random seed
random.seed(SEED)

# Set PyTorch seed (if using)
torch.manual_seed(SEED)

# Set Optuna seed
optuna.logging.set_verbosity(optuna.logging.WARNING)  # Reduce logging clutter
optuna_seed = SEED

# Global quantile setting (EXCEPT FOR PYSR, THIS NEEDS MANUAL ADJUSTMENT)
QUANTILE = 0.5 #(CHANGE PYSR QUANTILE MANUALLY)

# Function to calculate pinball loss
def pinball_loss(y_true, y_pred, tau=QUANTILE):
    residuals = y_true - y_pred
    loss = np.where(residuals >= 0, tau * residuals, (1 - tau) * -residuals)
    return np.mean(loss)

# Function to calculate normalized pinball loss using the global dataset range
def normalized_pinball_loss(y_true, y_pred, global_min, global_max, tau=QUANTILE):
    range_y = global_max - global_min
    loss = pinball_loss(y_true, y_pred, tau)
    return loss / range_y if range_y != 0 else 0  # Avoid division by zero

# Function to calculate absolute coverage error
def absolute_coverage_error(y_true, y_pred, tau=QUANTILE):
    coverage = np.mean(y_pred >= y_true)
    return np.abs(coverage - tau)

# Function to calculate expression complexity
def calculate_expression_complexity(expression, complexity_of_operators):
    try:
        expr = sp.sympify(expression)
    except sp.SympifyError:
        raise ValueError("Invalid expression")

    complexity = 0
    for atom in sp.preorder_traversal(expr):
        if isinstance(atom, sp.Symbol):  # Variables (e.g., x1, x2)
            complexity += 1
        elif isinstance(atom, (int, float, sp.Integer, sp.Float)):  # Constants
            complexity += 1
        elif atom in complexity_of_operators:  # Operators
            complexity += complexity_of_operators[atom]
    return complexity

# LightGBM objective function for Optuna
def objective_lgb(trial, train_X, train_y, val_X, val_y):
    params = {
        'objective': 'quantile',
        'alpha': QUANTILE,
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'random_state': SEED,
        'bagging_seed': SEED,
        'feature_fraction_seed': SEED,
        'data_random_seed': SEED,
    }

    if params['min_child_samples'] >= params['num_leaves']:
        raise optuna.exceptions.TrialPruned()

    model = lgb.LGBMRegressor(**params)
    model.fit(train_X, train_y)
    y_pred = model.predict(val_X)
    return pinball_loss(val_y, y_pred, tau=QUANTILE)

# Quantile regression objective function for Optuna
def objective_linear(trial, train_X, train_y, val_X, val_y, tau=QUANTILE):
    max_iter = trial.suggest_int('max_iter', 1000, 5000)
    model = QuantReg(train_y, train_X)
    results50 = model.fit(q=tau, max_iter=max_iter)
    y_pred = results50.predict(val_X)
    return pinball_loss(val_y, y_pred, tau)

# Quantile Decision Tree Regressor
class QuantileDecisionTreeRegressor:
    def __init__(self, quantile=QUANTILE, min_samples_leaf=5, random_state=SEED):
        self.quantile = quantile
        self.min_samples_leaf = min_samples_leaf
        self.tree = DecisionTreeRegressor(min_samples_leaf=min_samples_leaf, random_state=random_state)

    def fit(self, X, y):
        self.tree.fit(X, y)
        self._add_quantile_info(X, y)

    def _add_quantile_info(self, X, y):
        leaf_indices = self.tree.apply(X)
        unique_leaves = np.unique(leaf_indices)
        self.quantile_values = {}
        for leaf in unique_leaves:
            leaf_y = y[leaf_indices == leaf]
            self.quantile_values[leaf] = np.percentile(leaf_y, self.quantile * 100)

    def predict(self, X):
        leaf_indices = self.tree.apply(X)
        predictions = np.array([self.quantile_values[leaf] for leaf in leaf_indices])
        return predictions

# Optuna Objective Function for Decision Tree
def objective_tree(trial, train_X, train_y, val_X, val_y):
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 50)

    model = QuantileDecisionTreeRegressor(quantile=QUANTILE, min_samples_leaf=min_samples_leaf)
    model.fit(train_X, train_y)
    y_pred = model.predict(val_X)

    return pinball_loss(val_y, y_pred, tau=QUANTILE)

# Complexity parameters
binary_operators = ["+", "*", "/", "-"]
unary_operators = ["exp", "sin", "cos", "log", "square"]
complexity_of_operators = {
    "+": 1,
    "-": 1,
    "*": 1,
    "/": 2,
    "exp": 4,
    "sin": 3,
    "cos": 3,
    "log": 3,
    "square": 2,
}

# results50 storage
results50 = {
    "SQR": {"losses": [], "coverage": [], "complexity": []},
    "LightGBM": {"losses": [], "coverage": []},
    "DecisionTree": {"losses": [], "coverage": [], "complexity": []},
    "LinearQuantile": {"losses": [], "coverage": [], "complexity": []},
}

def process_fold_scores(model_name, fold_scores):
    for metric, scores in fold_scores.items():
        results50[model_name][metric].extend(scores)


# Iterate over datasets
for regression_dataset in regression_dataset_namestry:
    try:
        print(regression_dataset)
        X1, y = fetch_data(regression_dataset, return_X_y=True)
        global_min, global_max = np.min(y), np.max(y)  # Global range for determ. normalization

        X = create_dummy_variables(X1, get_categorical_features(X1))

        kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

        fold_scores_sqr = {"losses": [], "coverage": [], "complexity": []}
        fold_scores_lgb = {"losses": [], "coverage": []}
        fold_scores_tree = {"losses": [], "coverage": [], "complexity": []}
        fold_scores_linear = {"losses": [], "coverage": [], "complexity": []}

        for train_index, test_index in kf.split(X):
            train_X, test_X = X[train_index], X[test_index]
            train_y, test_y = y[train_index], y[test_index]

            # Symbolic Quantile Regression
            modelq = PySRRegressor(
                niterations=100, #imrpove for better results50
                binary_operators=binary_operators,
                unary_operators=unary_operators,
                complexity_of_operators=complexity_of_operators,
                elementwise_loss="pinball_loss(y_true, y_pred) = max.(0.5 * (y_true - y_pred), (0.5 - 1) * (y_true - y_pred))", #DONT FORGET TO CHANGE WHEN CHANGING QUANTILE (JULIA SYNTAX)
                temp_equation_file=True,
                random_state=SEED
            )

            modelq.fit(train_X, train_y)
            y_pred_symbolic = modelq.predict(test_X)

            # Metrics for SQR
            fold_scores_sqr["losses"].append(normalized_pinball_loss(test_y, y_pred_symbolic, global_min, global_max))
            fold_scores_sqr["coverage"].append(absolute_coverage_error(test_y, y_pred_symbolic))
            fold_scores_sqr["complexity"].append(calculate_expression_complexity(modelq.sympy(), complexity_of_operators))

            # LightGBM Quantile Regression
            study_lgb = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
            study_lgb.optimize(lambda trial: objective_lgb(trial, train_X, train_y, test_X, test_y), n_trials=10)

            best_params_lgb = study_lgb.best_params
            model_lgb = lgb.LGBMRegressor(objective='quantile', alpha=QUANTILE, **best_params_lgb)
            model_lgb.fit(train_X, train_y)
            y_pred_lgb = model_lgb.predict(test_X)

            # Metrics for LightGBM
            fold_scores_lgb["losses"].append(normalized_pinball_loss(test_y, y_pred_lgb, global_min, global_max))
            fold_scores_lgb["coverage"].append(absolute_coverage_error(test_y, y_pred_lgb))

            # Inside the main loop for dataset processing (NEW)
            study_tree = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
            study_tree.optimize(lambda trial: objective_tree(trial, train_X, train_y, test_X, test_y), n_trials=10)

            best_params_tree = study_tree.best_params  # Get best hyperparameter

            # Train the best Decision Tree model with optimized min_samples_leaf
            model_tree = QuantileDecisionTreeRegressor(quantile=QUANTILE, min_samples_leaf=best_params_tree['min_samples_leaf'])
            model_tree.fit(train_X, train_y)
            y_pred_tree = model_tree.predict(test_X)

            # Metrics for Decision Tree (NEW complexity calculation)
            fold_scores_tree["losses"].append(normalized_pinball_loss(test_y, y_pred_tree, global_min, global_max))
            fold_scores_tree["coverage"].append(absolute_coverage_error(test_y, y_pred_tree))
            fold_scores_tree["complexity"].append(model_tree.tree.tree_.node_count)  # NEW: Store tree complexity

            # Linear Quantile Regression
            study_linear = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
            study_linear.optimize(lambda trial: objective_linear(trial, train_X, train_y, test_X, test_y), n_trials=10)

            best_params_linear = study_linear.best_params
            model_linear = QuantReg(train_y, train_X).fit(q=QUANTILE, max_iter=best_params_linear['max_iter'])
            y_pred_linear = model_linear.predict(test_X)

            # Metrics for Linear Quantile Regression
            fold_scores_linear["losses"].append(normalized_pinball_loss(test_y, y_pred_linear, global_min, global_max))
            fold_scores_linear["coverage"].append(absolute_coverage_error(test_y, y_pred_linear))
            fold_scores_linear["complexity"].append(X.shape[1])

        process_fold_scores("SQR", fold_scores_sqr)
        process_fold_scores("LightGBM", fold_scores_lgb)
        process_fold_scores("DecisionTree", fold_scores_tree)
        process_fold_scores("LinearQuantile", fold_scores_linear)
    except Exception as e:
        print(f"Error processing {regression_dataset}: {e}")

# Display results50
print(results50)


1029_LEV


[ Info: Started!



Expressions evaluated per second: 2.030e+05
Progress: 1102 / 3100 total iterations (35.548%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           3.656e-01  1.594e+01  y = 2
3           3.169e-01  7.155e-02  y = 2 - x₅
5           2.894e-01  4.533e-02  y = (2.0002 - x₆) - x₅
7           2.745e-01  2.650e-02  y = (2.0004 - (x₅ + x₆)) + x₃
9           2.744e-01  1.819e-04  y = (x₉ + x₈) + (x₃ + (x₇ + 1))
11          2.612e-01  2.451e-02  y = ((x₉ + x₈) + (x₃ + (x₇ + 1))) + x₄
13          2.500e-01  2.201e-02  y = (x₇ + x₉) + (x₄ + (((x₃ + x₈) - x₁₅) + 1))
15          2.438e-01  1.266e-02  y = (((x₃ + (x₄ + x₉)) + ((x₇ + x₈) - x₁₅)) + 1) + x₉
19          2.425e-01  1.285e-03  y = (((x₇ + (x₈ + x₄)) - x₁₅) + 1) + (((x₁₅ - x₆) * x₃) + ...
                                      (x₃ + x₉))
21 

[ Info: Final population:
[ Info: Results saved to:
[ Info: Started!


  - /var/folders/v0/2pg4g4d55fjcq1qrmyjmjp680000gp/T/tmplsfabtqc/20250219_171922_tWTMyf/hall_of_fame.csv

Expressions evaluated per second: 1.950e+05
Progress: 1121 / 3100 total iterations (36.161%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           3.613e-01  1.594e+01  y = 2
3           3.187e-01  6.258e-02  y = 2 - x₅
5           2.994e-01  3.135e-02  y = (2 - x₆) - x₅
7           2.800e-01  3.345e-02  y = ((x₃ + 2) - x₆) - x₅
9           2.637e-01  2.989e-02  y = ((x₃ - x₆) + (x₄ - x₅)) + 2
11          2.569e-01  1.321e-02  y = (x₃ - x₆) + (((x₄ - x₅) - x₁₅) + 2)
13          2.475e-01  1.859e-02  y = (((x₃ - x₆) + (2 + (x₄ - x₅))) - x₁₅) + x₉
15          2.425e-01  1.020e-02  y = (((x₁₅ * x₉) + ((x₄ + (x₃ - x₆)) - x₅)) + 2) - x₁₅
───────────────────────────────────────────────────

[ Info: Final population:
[ Info: Results saved to:


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000014 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, num

[ Info: Started!


  - /var/folders/v0/2pg4g4d55fjcq1qrmyjmjp680000gp/T/tmps6e5eqf3/20250219_171935_cDnqHs/hall_of_fame.csv

Expressions evaluated per second: 2.140e+05
Progress: 1097 / 3100 total iterations (35.387%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           3.569e-01  1.594e+01  y = 2
3           3.169e-01  5.944e-02  y = 2 - x₅
5           2.938e-01  3.787e-02  y = (2 - x₅) - x₆
7           2.756e-01  3.186e-02  y = ((x₉ - x₅) - x₆) + 2
9           2.613e-01  2.677e-02  y = (x₉ - x₅) + ((x₄ - x₆) + 2)
11          2.613e-01  5.811e-06  y = (((x₉ - x₅) + 1.0872) + (x₄ - x₆)) + 0.9128
15          2.563e-01  4.831e-03  y = (((x₉ - x₅) + 1.0872) + ((x₄ - (x₀ * x₅)) - x₆)) + 0.9...
                                      128
───────────────────────────────────────────────────────────────────────────

[ Info: Final population:
[ Info: Results saved to:


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000014 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] Start training from score 2.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 800, num

[ Info: Started!


  - /var/folders/v0/2pg4g4d55fjcq1qrmyjmjp680000gp/T/tmp_gcb3t2m/20250219_171949_we9OXV/hall_of_fame.csv

Expressions evaluated per second: 1.980e+05
Progress: 1089 / 3100 total iterations (35.129%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           3.562e-01  1.594e+01  y = 2
3           3.119e-01  6.651e-02  y = 2 - x₅
5           2.957e-01  2.658e-02  y = (2.0006 - x₆) - x₅
7           2.745e-01  3.719e-02  y = ((2.0006 - x₆) - x₅) + x₃
11          2.625e-01  1.119e-02  y = (x₉ + x₈) + ((x₃ + (x₄ + x₇)) + 0.99996)
13          2.606e-01  3.599e-03  y = ((x₉ + (x₄ + (x₇ + x₈))) + (x₉ + 1)) - x₀
15          2.581e-01  4.819e-03  y = ((((x₄ + ((x₇ + x₈) + x₈)) + x₉) + x₉) + 1) - x₀
22          2.548e-01  1.861e-03  y = x₉ + ((x₇ + exp(x₈)) + (x₉ + ((x₄ - x₀) / exp(x₁₉))))
24          2

[ Info: Final population:
[ Info: Results saved to:




[ Info: Started!


  - /var/folders/v0/2pg4g4d55fjcq1qrmyjmjp680000gp/T/tmp1a5383og/20250219_172002_rNT7nV/hall_of_fame.csv

Expressions evaluated per second: 1.890e+05
Progress: 1053 / 3100 total iterations (33.968%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           3.525e-01  1.594e+01  y = 2
3           3.131e-01  5.922e-02  y = 2 - x₅
5           2.919e-01  3.514e-02  y = (2 - x₆) - x₅
7           2.750e-01  2.978e-02  y = ((2 - x₆) - x₅) + x₄
9           2.594e-01  2.925e-02  y = (((2 + x₄) - x₆) - x₅) + x₃
11          2.506e-01  1.716e-02  y = ((x₃ + 2) + (x₄ - (x₆ + x₁₅))) - x₅
13          2.412e-01  1.906e-02  y = (x₃ + (x₄ - (x₆ + (x₀ * x₁₅)))) + (2 - x₅)
17          2.375e-01  3.916e-03  y = ((x₄ + (x₃ + 2)) - (x₅ + (x₁₅ * x₅))) - (x₆ + (x₇ * x₁...
                                      ₅))
19

[ Info: Final population:
[ Info: Results saved to:


───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           3.525e-01  1.594e+01  y = 2
3           3.131e-01  5.922e-02  y = 2 - x₅
5           2.919e-01  3.514e-02  y = (2 - x₆) - x₅
7           2.750e-01  2.978e-02  y = ((2 - x₆) - x₅) + x₄
9           2.594e-01  2.925e-02  y = (((2 + x₄) - x₆) - x₅) + x₃
11          2.506e-01  1.716e-02  y = ((x₃ + 2) + (x₄ - (x₆ + x₁₅))) - x₅
13          2.412e-01  1.906e-02  y = (x₃ + (x₄ - (x₆ + (x₀ * x₁₅)))) + (2 - x₅)
15          2.375e-01  7.833e-03  y = (x₃ + (2 + (x₄ - x₅))) - (x₆ + (x₁₅ * (x₅ + x₇)))
17          2.338e-01  7.958e-03  y = ((x₄ + (x₃ - ((x₁₅ * x₅) + x₅))) + 2) - ((x₁₀ * x₀) + ...
                                      x₆)
19          2.319e-01  4.027e-03  y = x₄ + (((x₃ - ((x₇ + x₅) * (x₁₅ + x₅))) + 2) - (x₆ + (x...
                                      ₃ * x₇)))
21          2.313e-01  1.349e-03  y = (((x₃ + x₄) - ((x₅ + x₁₅) * x₅))



{'SQR': {'losses': [np.float64(0.0775), np.float64(0.075625), np.float64(0.0775), np.float64(0.0825), np.float64(0.081250004375)], 'coverage': [np.float64(0.19999999999999996), np.float64(0.265), np.float64(0.25), np.float64(0.25), np.float64(0.175)], 'complexity': [3, 3, 3, 3, 3]}, 'LightGBM': {'losses': [np.float64(0.0676338824688943), np.float64(0.0641968982293419), np.float64(0.06480703525507796), np.float64(0.06753785037244704), np.float64(0.06619738926178854)], 'coverage': [np.float64(0.030000000000000027), np.float64(0.04500000000000004), np.float64(0.040000000000000036), np.float64(0.09499999999999997), np.float64(0.050000000000000044)]}, 'DecisionTree': {'losses': [np.float64(0.05875), np.float64(0.0490625), np.float64(0.04375), np.float64(0.0490625), np.float64(0.0484375)], 'coverage': [np.float64(0.235), np.float64(0.32499999999999996), np.float64(0.33999999999999997), np.float64(0.275), np.float64(0.32999999999999996)], 'complexity': [179, 183, 175, 117, 175]}, 'LinearQuant