In [1]:
import numpy as np
from scipy.stats import norm

In [7]:
# Scratch values for experimenting with the parameter-grid helper.
# NOTE(review): none of these three names is read by the code visible in this
# part of the notebook (the function takes its own arguments) — confirm whether
# they are still needed.
mean = 19
output_points = 10
range_proportion = 0.1

def normal_distr_parameters(means: list,
                            range_from_mean: float = 0.1,
                            n_params: int = 5) -> list:
    """
    Generate candidate parameter values spread around each mean.

    For every mean the half-width of the interval is ``|mean * range_from_mean|``
    and is treated as three standard deviations of a normal distribution
    centred on the mean. The candidates are the interior quantiles of that
    distribution plus three anchor points: ``mean - half_width``, ``mean`` and
    ``mean + half_width``.

    :param means: List of mean values to base distributions on.
    :param range_from_mean: Fraction of mean to define ±range and estimate
                                standard deviation. Only its magnitude is used.
    :param n_params: Number of evenly spaced probabilities taken on [0, 1].
                     The two end points (0 and 1) are dropped, so
                     ``n_params - 2`` quantiles are sampled. With ``n_params``
                     of 0, 1 or 2 only the anchor points are returned.
    :return: One sorted, de-duplicated list of values per mean. Integer means
             (``int`` or NumPy integer) give ``int`` values rounded away from
             the mean; any other mean gives ``float`` values.
    """
    # Drop the 0 and 1 end points: their quantiles are -inf and +inf.
    percentiles = np.linspace(0, 1, n_params)[1:-1]
    result_list = []
    for mean in means:
        # abs() keeps the half-width non-negative for negative means (or a
        # negative fraction); scipy returns NaN when scale is not positive.
        half_width = abs(mean * range_from_mean)
        # The half-width spans 3 standard deviations on each side of the mean.
        std_approx = half_width / 3

        if std_approx > 0:
            quantiles = norm.ppf(percentiles, loc=mean, scale=std_approx)
        else:
            # Degenerate spread (mean == 0 or range_from_mean == 0): every
            # quantile collapses onto the mean itself.
            quantiles = np.full(percentiles.shape, mean, dtype=float)

        data_points = np.append(
            quantiles,
            [mean - half_width, mean, mean + half_width]
            )

        if isinstance(mean, (int, np.integer)):
            # Round away from the mean so the integer grid never shrinks the
            # requested range; the set removes duplicates created by rounding.
            param_list = {
                int(np.floor(point)) if point <= mean else int(np.ceil(point))
                for point in data_points
            }
        else:
            param_list = {float(point) for point in data_points}

        result_list.append(sorted(param_list))

    return result_list

# With n_params=3 the only interior percentile is 0.5 (the mean itself), so each
# list holds just the lower bound, the mean and the upper bound.
normal_distr_parameters(means=[19, 1.9], n_params=3)


[[17, 19, 21], [1.71, 1.9, 2.09]]

In [14]:
# Even-length input: the median is the average of the two middle values, (19 + 26) / 2.
np.median([18,19, 26, 190])

np.float64(22.5)

In [None]:
%%time
# Cell magic (must stay on the first line): times the whole cell. A bare `%time`
# line magic would only time an empty statement.
# File name under which the fitted randomized-search pipeline is cached.
filename_rs = "random_search_xgb.pkl"
# NOTE(review): plain string concatenation — presumably PATH_TO_MODEL is a str
# that already ends with a path separator; confirm where it is defined.
model_path = Path(PATH_TO_MODEL + filename_rs)


def param_search() -> None:
    """
    Stub: takes no arguments and always returns ``None``.

    Presumably reserved for the hyper-parameter search logic — TODO confirm.
    """
    return


if model_path.is_file():
    # Reuse the previously fitted pipeline instead of re-running the search.
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files that were produced by this notebook.
    xgb_pipe_random = joblib.load(filename=model_path)
else:
    random_search_xgb = RandomizedSearchCV(
        estimator=XGBRegressor(),
        param_distributions=dict(
            # Assuming `uniform`/`randint` are the scipy.stats distributions:
            # uniform(loc, scale) samples from [loc, loc + scale] and
            # randint(low, high) samples integers from [low, high).
            # XGBoost documents learning_rate (eta) as valid on [0, 1], so the
            # search is kept inside that range: [0.01, 0.5].
            learning_rate=uniform(loc=0.01, scale=0.49),
            n_estimators=randint(low=50, high=300),
            max_depth=randint(low=2, high=20),
            min_child_weight=uniform(loc=0.5, scale=3),  # [0.5, 3.5]
        ),
        n_iter=1,   # default = 10
        verbose=2,
        n_jobs=-1,
        scoring='r2',
        random_state=42,  # fixed seed so the sampled candidates are reproducible
    )

    # TODO: reassess which model to use for applying hyper-param search
    # The search object is the base estimator of the chain, so it is fitted
    # as part of the chain rather than once around the whole pipeline.
    xgb_pipe_random = Pipeline(
        steps=[
            ("engineer_features", engineer_max_features),
            ("xgb_regressor", RegressorChain(random_search_xgb))
            ])

    xgb_pipe_random.fit(X_train, y_train)
    # Make sure the target folder exists so a finished fit is not lost when
    # the dump fails on a missing directory.
    model_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(value=xgb_pipe_random, filename=model_path)

y_pred = xgb_pipe_random.predict(X_test)

print("Evaluation table for XGBoost model with randomized hyperparameter search.")
print(evaluation_tabel(
    predictions=y_pred,
    original_dataset_correlations=sorted_corr_80
    ))