In [1]:
import numpy as np
from scipy.stats import norm

In [3]:
output = [{'learning_rate': np.float64(0.18391682407855314),
  'max_depth': 11,
  'min_child_weight': np.float64(14.072124623114522),
  'n_estimators': 131},
 {'learning_rate': np.float64(0.17463505150156303),
  'max_depth': 11,
  'min_child_weight': np.float64(10.973032817555122),
  'n_estimators': 118}]

In [None]:
mean = 19
output_points = 10
range_proportion = 0.1

def normal_distr_parameters(params_centers, 
                            range_from_mean: float = 0.1, 
                            n_params: int = 5)->list:
    """
    Generate parameter values for each key in a dictionary (or list of dictionaries)
    using an approximation of a normal distribution centered around the median value.
    Generate parameter values around each mean with a normal distribution.
    Approximate a normal distribution using a range based on a fraction of the new mean 
            (median of input parameter values).
    Get data points within distance of 3 std from the new data point mean.
    Integer values are rounded and kept as int type.

    :param best_params: A dictionary or a list of dictionaries with numeric values
                        (e.g. RandomizedSearchCV best parameter output for 
                            multi-target model with RegressorChain.)
    :param range_from_mean: Fraction of the new mean to define range from which
                            standard deviation is estimated (±range = mean * fraction).
    :param n_params: Number of percentile-based values to sample from the normal 
                     distribution.
    :return: A dictionary where each key maps to a list of unique sampled values
             (as int or float), representing approximate normal distribution samples.
    """
    percentiles = np.linspace(0,1, n_params)[1:-1]
    if isinstance(params_centers, dict):
        dict_list = [params_centers]
    elif isinstance(params_centers, list) and all(isinstance(item, dict) for item in params_centers):
        dict_list = params_centers
    new_params = dict.fromkeys(dict_list[0].keys())
    for key in new_params.keys():
        new_params[key] = np.median([dict_list[param][key] for param in range(len(dict_list))])
    
    for key, mean in new_params.items():
        range_factoor = mean * range_from_mean
        std_approx = (range_factoor * 2) / 6
        
        data_points = np.append(
            norm.ppf(percentiles, loc=mean, scale=std_approx), 
            [mean - range_factoor, mean, mean + range_factoor]
            )
        if isinstance(dict_list[0][key], int):
            new_params[key] = list(set(
                int(np.floor(point)) if point <= mean \
                else int(np.ceil(point)) for point in data_points))
        else:
            # new_params[key] = list(set(float(point) for point in data_points))
            new_params[key] = list(set(data_points))
    
    return new_params

normal_distr_parameters(params_centers=output, n_params=4)


{'learning_rate': [np.float64(0.18127622418087472),
  np.float64(0.18391682407855314),
  np.float64(0.18655742397623157),
  np.float64(0.16552514167069782),
  np.float64(0.20230850648640847)],
 'max_depth': [9, 10, 11, 12, 13],
 'min_child_weight': [np.float64(12.66491216080307),
  np.float64(13.870083015305749),
  np.float64(14.274166230923296),
  np.float64(15.479337085425975),
  np.float64(14.072124623114522)],
 'n_estimators': [129, 131, 133, 145, 117]}

In [None]:
%time
filename_rs = "random_search_xgb.pkl"
model_path = Path(PATH_TO_MODEL + filename_rs)


def param_search():
    return None


if model_path.is_file():
    xgb_pipe_random = joblib.load(filename=model_path)
else:
    random_search_xgb = RandomizedSearchCV(
        estimator= XGBRegressor(),
        param_distributions = dict(
            learning_rate=uniform(loc=0.5, scale=3),  # uniform distribution within the interval 
            n_estimators=randint(low=50, high=300),  # random int within the interval
            max_depth=randint(low=2, high=20),
            min_child_weight=uniform(loc=0.5, scale=3)
        ), 
        n_iter=1,   # default = 10
        verbose=2,
        n_jobs=-1,
        scoring='r2', 
    )

    #TODO: reasess which model to use for applying hyper-param search
    xgb_pipe_random = Pipeline(
        steps=[
            ("engineer_features", engineer_max_features),
            ("xgb_regressor", RegressorChain(random_search_xgb))
            ])

    xgb_pipe_random.fit(X_train, y_train)
    joblib.dump(value=xgb_pipe_random, filename=model_path)

y_pred = xgb_pipe_random.predict(X_test)

print(f"Evaluation table for XGBoost model with randomized hyperparameter search.")
print(evaluation_tabel(
    predictions=y_pred, 
    original_dataset_correlations=sorted_corr_80
    ))

CPU times: total: 0 ns
Wall time: 13.8 μs


NameError: name 'Path' is not defined