[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/StatMixedML/LightGBMLSS/blob/master/examples/simulation_example_NegativeBinomial.ipynb)

# Imports

In [None]:
from lightgbmlss.model import *
from lightgbmlss.distributions.NegativeBinomial import *

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Data

In [2]:
def custom_transform(y, constr_val):
    """Map a real-valued target onto non-negative integer counts in [0, constr_val].

    The absolute value of ``y`` is rescaled so that its largest entry equals
    ``constr_val`` and the result is truncated towards zero to integers.

    Parameters
    ----------
    y : array-like
        Real-valued target values.
    constr_val : int or float
        Upper bound of the transformed target.

    Returns
    -------
    Integer-valued array of the same shape as ``y``. An empty input yields an
    empty integer array; an all-zero input yields all zeros (instead of the
    NaN-to-int garbage a 0/0 division would produce).
    """
    # Fold negative values onto the positive axis (absolute value, not a log transform)
    abs_y = np.abs(y)

    # np.max raises on empty input, so short-circuit that case
    if abs_y.size == 0:
        return abs_y.astype(int)

    peak = np.max(abs_y)
    # Every entry is zero: nothing to rescale, and dividing by the peak would give NaN
    if peak == 0:
        return np.zeros_like(abs_y, dtype=int)

    # Scale so the largest value maps to constr_val, then truncate to integers
    constrained_y = constr_val * abs_y / peak
    return constrained_y.astype(int)

# Simulate a regression problem: 5000 rows, 10 features, only 2 of them informative
X, y = make_regression(
    n_samples=5000,
    n_features=10,
    n_informative=2,
    random_state=123,
)

# Turn the continuous target into integer counts between 0 and 50
y = custom_transform(y, constr_val=50)

# Hold out 20% of the rows for testing (fixed random_state keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Wrap the training split in a LightGBM Dataset
dtrain = lgb.Dataset(X_train, label=y_train)

# Distribution Selection

In [3]:
# Build the LightGBMLSS model around a NegativeBinomial distribution.
# The arguments select the response function applied to each distributional
# parameter, the optional Gradient/Hessian stabilization, and the loss function.
# See ?NegativeBinomial for more information.
lgblss = LightGBMLSS(
    NegativeBinomial(stabilization="None",              # Gradient/Hessian stabilization. Options are "None", "MAD", "L2".
                     response_fn_total_count="relu",    # Function to transform the total_count-parameter, e.g., "exp", "softplus" or "relu".
                     response_fn_probs="sigmoid",       # Function to transform the probs-parameter, e.g., "sigmoid".
                     loss_fn="nll"                      # Loss function. Options are "nll" (negative log-likelihood) or "crps" (continuous ranked probability score).
                    )
)

# Hyper-Parameter Optimization

In [4]:
# Any LightGBM hyperparameter can be tuned, where the structure of the parameter dictionary needs to be as follows:

    # Float/Int sample_type
        # {"param_name": ["sample_type", {"low": low, "high": high, "log": log}]}
            # sample_type: str, Type of sampling, e.g., "float" or "int"
            # low: int or float, Lower endpoint of the range of suggested values
            # high: int or float, Upper endpoint of the range of suggested values
            # log: bool, Flag to sample the value from the log domain or not
        # Example: {"eta": ["float", {"low": 1e-5, "high": 1, "log": True}]}

    # Categorical sample_type
        # {"param_name": ["sample_type", ["choice1", "choice2", "choice3", "..."]]}
            # sample_type: str, Type of sampling, here "categorical"
            # choice1, choice2, choice3, ...: str, Possible choices for the parameter
        # Example: {"boosting": ["categorical", ["gbdt", "dart"]]}

    # For parameters without tunable choice (this is needed, e.g., if device_type = "gpu" and gpu_device_id needs to be specified)
        # {"param_name": ["none", [value]]},
            # param_name: str, Name of the parameter
            # value: int, Value of the parameter
        # Example: {"gpu_device_id": ["none", [0]]}

# Search space for the tuner: each entry maps a LightGBM parameter name to
# [sample_type, specification]. Entries whose "low" equals "high" are held constant.
param_dict = {
    "eta":                      ["float", {"low": 1e-5,   "high": 1,     "log": True}],
    "max_depth":                ["int",   {"low": 1,      "high": 10,    "log": False}],
    "num_leaves":               ["int",   {"low": 255,    "high": 255,   "log": False}],  # set to constant for this example
    "min_data_in_leaf":         ["int",   {"low": 20,     "high": 20,    "log": False}],  # set to constant for this example
    "min_gain_to_split":        ["float", {"low": 1e-8,   "high": 40,    "log": False}],
    "min_sum_hessian_in_leaf":  ["float", {"low": 1e-8,   "high": 500,   "log": True}],
    "subsample":                ["float", {"low": 0.2,    "high": 1.0,   "log": False}],
    "feature_fraction":         ["float", {"low": 0.2,    "high": 1.0,   "log": False}],
    "boosting":                 ["categorical", ["gbdt"]],  # single choice, i.e. effectively fixed
}

# Run the cross-validated hyperparameter search; the result is stored in opt_param.
# NOTE(review): with hp_seed=None the search itself is presumably unseeded, and the
# study is limited by wall-clock time (max_minutes), so the trials may differ between runs - confirm.
np.random.seed(123)
opt_param = lgblss.hyper_opt(param_dict,
                             dtrain,
                             num_boost_round=100,        # Number of boosting iterations.
                             nfold=5,                    # Number of cv-folds.
                             early_stopping_rounds=20,   # Number of early-stopping rounds
                             max_minutes=5,              # Time budget in minutes, i.e., stop study after the given number of minutes.
                             n_trials=None,              # The number of trials. If this argument is set to None, there is no limitation on the number of trials.
                             silence=False,              # Controls the verbosity of the trial, i.e., user can silence the outputs of the trial.
                             seed=123,                   # Seed used to generate cv-folds.
                             hp_seed=None                # Seed for random number generator used in the Bayesian hyperparameter search.
                             )

[32m[I 2023-05-23 17:29:42,931][0m A new study created in memory with name: LightGBMLSS Hyper-Parameter Optimization[0m


   0%|          | 00:00/05:00

[32m[I 2023-05-23 17:29:45,843][0m Trial 0 finished with value: 2362.6454823420363 and parameters: {'eta': 0.2237494745725795, 'max_depth': 8, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 21.718229204152497, 'min_sum_hessian_in_leaf': 0.41610733461582017, 'subsample': 0.5489136781162185, 'feature_fraction': 0.6795070894088597, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.[0m
[32m[I 2023-05-23 17:29:51,154][0m Trial 1 finished with value: 2391.3637035325364 and parameters: {'eta': 0.02993587048561261, 'max_depth': 10, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 8.375534018622261, 'min_sum_hessian_in_leaf': 0.0007753281443770188, 'subsample': 0.7819790850797737, 'feature_fraction': 0.2656157036043193, 'boosting': 'gbdt'}. Best is trial 0 with value: 2362.6454823420363.[0m
[32m[I 2023-05-23 17:29:57,721][0m Trial 2 finished with value: 2623.1299437982134 and parameters: {'eta': 0.0003193057644574507, 'max_depth': 5, 'nu

# Model Training

In [5]:
np.random.seed(123)

# Work on a copy so the dictionary returned by hyper_opt stays untouched
opt_params = opt_param.copy()
# The tuned number of boosting rounds is passed to train() separately, so take it out of the params
n_rounds = opt_params.pop("opt_rounds")

# Fit the final model on the training data with the tuned hyperparameters
lgblss.train(opt_params, dtrain, num_boost_round=n_rounds)

<lightgbm.basic.Booster at 0x1c5cf3aa940>

# Prediction

In [7]:
# Fix torch's random number generator so the results are repeatable
torch.manual_seed(123)

n_samples = 1000            # number of draws from the predicted distribution
quant_sel = [0.05, 0.95]    # quantile levels to compute from the predicted distribution

# Draw samples from the predicted distribution
pred_samples = lgblss.predict(X_test, pred_type="samples", n_samples=n_samples, seed=123)

# Quantiles of the predicted distribution at the selected levels
pred_quantiles = lgblss.predict(X_test, pred_type="quantiles", n_samples=n_samples, quantiles=quant_sel)

# Predicted distributional parameters
pred_params = lgblss.predict(X_test, pred_type="parameters")

In [8]:
# Preview the first rows of the sampled predictions
pred_samples.head()

Unnamed: 0,y_sample0,y_sample1,y_sample2,y_sample3,y_sample4,y_sample5,y_sample6,y_sample7,y_sample8,y_sample9,...,y_sample990,y_sample991,y_sample992,y_sample993,y_sample994,y_sample995,y_sample996,y_sample997,y_sample998,y_sample999
0,33,28,20,27,23,45,28,28,52,32,...,52,50,10,26,27,49,10,39,47,38
1,25,24,19,30,15,42,20,36,25,31,...,10,44,14,62,13,25,17,19,27,17
2,13,16,21,14,28,18,18,29,13,8,...,8,11,31,16,40,16,9,15,12,18
3,20,16,17,1,5,10,10,7,10,9,...,22,13,19,5,5,23,13,12,24,9
4,5,10,12,2,6,9,6,8,5,8,...,6,1,10,8,7,3,2,4,2,5


In [9]:
# Preview the first rows of the predicted quantiles (one column per level in quant_sel)
pred_quantiles.head()

Unnamed: 0,quant_0.05,quant_0.95
0,13,58
1,9,47
2,6,36
3,3,24
4,1,15


In [10]:
# Preview the first rows of the predicted distributional parameters
pred_params.head()

Unnamed: 0,total_count,probs
0,6.795483,0.826296
1,6.390824,0.797698
2,5.651881,0.767255
3,5.114201,0.689337
4,4.382565,0.614175
