In [28]:
import os
import pprint
import random

import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import sklearn
import xgboost as xgb
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, RandomizedSearchCV

sklearn.set_config(enable_metadata_routing=True)

%matplotlib qt

In [29]:
VERSION = "v1d"
FIELD_MODEL = "T89"
MODEL_TYPE = "LOWER_BAND"
pdata_folder = os.path.abspath(r"./../processed_data/chorus_neural_network/")
STAGE_4_folder = os.path.join(pdata_folder, "STAGE_4", VERSION)
CONJUNCTIONS_REFS = np.load(
    file=os.path.join(STAGE_4_folder, f"MODEL_READY_DATA_{VERSION}_{FIELD_MODEL}_{MODEL_TYPE}.npz")
)

TRAINING_FEATURES = CONJUNCTIONS_REFS["FEATURES"]
TRAINING_LABELS = np.log(CONJUNCTIONS_REFS["LABELS"].flatten())
TRAINING_LABELS_EXP = np.exp(TRAINING_LABELS)

TRAINING_MLT = CONJUNCTIONS_REFS["TRAINING_MLT"]
MEAN_L = CONJUNCTIONS_REFS["MEAN_L"]
STD_L = CONJUNCTIONS_REFS["STD_L"]

VALIDATION_FEATURES = CONJUNCTIONS_REFS["VALIDATION_FEATURES"]
VALIDATION_LABELS = np.log(CONJUNCTIONS_REFS["VALIDATION_LABELS"].flatten())


kf = KFold(n_splits=5)

# BINS = CONJUNCTIONS_REFS["BINS"]

CONJUNCTIONS_REFS.close()

print(f"Training set shape: {TRAINING_FEATURES.shape, TRAINING_LABELS.shape}")
print(f"Validation set shape: {VALIDATION_FEATURES.shape, VALIDATION_LABELS.shape}")

Training set shape: ((452850, 5), (452850,))
Validation set shape: ((7756, 5), (7756,))


In [3]:
# Find weights

MEAN_CHORUS = np.nanmean(TRAINING_LABELS)
STD_CHORUS = np.nanstd(TRAINING_LABELS)

print(MEAN_CHORUS, STD_CHORUS)

SAMPLE_WEIGHTS_FROM_CHORUS_DISTRIBUTION = (
    np.abs((TRAINING_LABELS - MEAN_CHORUS) / STD_CHORUS) / 2.0 + 1.0
)
SAMPLE_WEIGHTS_FROM_CHORUS_DISTRIBUTION[SAMPLE_WEIGHTS_FROM_CHORUS_DISTRIBUTION > 5] = 5

order_of_labels = np.argsort(TRAINING_LABELS.flatten())
plt.ylabel("Weights of dataset")
plt.xlabel("Chorus Amplitude (pT)")
plt.plot(
    TRAINING_LABELS[order_of_labels],
    SAMPLE_WEIGHTS_FROM_CHORUS_DISTRIBUTION[order_of_labels].flatten(),
    label="SAMPLE_WEIGHTS",
)

plt.legend()
plt.show()

1.0703959725845127 1.0329194219048508


In [33]:
training_set = xgb.DMatrix(
    TRAINING_FEATURES, TRAINING_LABELS
)

In [5]:
# Define the hyperparameter grid

param_dist = {
    "max_depth": stats.randint(5, 8),
    "learning_rate": stats.uniform(0.001, 0.05),
    "subsample": stats.uniform(0.3, (1 - 0.3)),
    "reg_alpha": stats.uniform(150, 300),
    "reg_lambda": stats.uniform(0, 1000),
    "gamma": stats.uniform(0, 3),
    "max_delta_step": stats.uniform(0, 3),
}


def weighted_mse(y_true, y_pred, sample_weight):
    return np.average((y_true - y_pred) ** 2, weights=sample_weight)


scorer = make_scorer(weighted_mse, greater_is_better=False).set_score_request(sample_weight=True)


# Create the XGBoost model object
xgb_model = xgb.XGBRegressor(n_estimators=700, device="gpu").set_fit_request(sample_weight=True)

# Create the GridSearchCV object
model = RandomizedSearchCV(xgb_model, param_dist, cv=5, n_iter=50, scoring=scorer)

# Fit the GridSearchCV object to the training data
model.fit(TRAINING_FEATURES, TRAINING_LABELS, sample_weight=SAMPLE_WEIGHTS_FROM_CHORUS_DISTRIBUTION)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", model.best_params_)
print("Best score: ", model.best_score_)

print(type(model.best_params_))
print(model.best_params_)

df = pd.DataFrame([model.best_params_])
df["Best Score"] = model.best_score_

df.to_csv("Best_Parameters.csv", mode="a", header=False, index=False)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Best set of hyperparameters:  {'gamma': np.float64(2.52642620528523), 'learning_rate': np.float64(0.009873939071281684), 'max_delta_step': np.float64(0.26606958445607887), 'max_depth': 7, 'reg_alpha': np.float64(209.55231297361803), 'reg_lambda': np.float64(164.4528254424732), 'subsample': np.float64(0.313401298026189)}
Best score:  -1.1051387280767997
<class 'dict'>
{'gamma': np.float64(2.52642620528523), 'learning_rate': np.float64(0.009873939071281684), 'max_delta_step': np.float64(0.26606958445607887), 'max_depth': 7, 'reg_alpha': np.float64(209.55231297361803), 'reg_lambda': np.float64(164.4528254424732), 'subsample': np.float64(0.313401298026189)}


In [53]:
params = model.best_params_.copy()
params["device"] = "gpu"
params["tree_method"] = "hist"
params["alpha"] = params["reg_alpha"]
params["lambda"] = params["reg_lambda"]
params["gamma"] = 0

params["learning_rate"] = 1.0
del params["reg_lambda"]
del params["reg_alpha"]

pprint.pprint(params)

n = 1000

# lr_scheduler = xgb.callback.LearningRateScheduler(custom_learning_rate)

results = xgb.cv(
    params=params,
    dtrain=training_set,
    num_boost_round=n,
    verbose_eval=1,
    seed=random.randint(0, int(1e5)),
    shuffle=True,
    metrics=["rmse"],
    folds=kf,
    # callbacks=[lr_scheduler]
)

# model.save_model(f"./../processed_data_chorus_neural_network/TRAINED_MODELS/Weighted_L2/XG_BOOSTED_REGRESSION_MSE_WEIGHTED_LINEAR_WEIGHTING.model")'''

{'alpha': np.float64(209.55231297361803),
 'device': 'gpu',
 'gamma': 0,
 'lambda': np.float64(164.4528254424732),
 'learning_rate': 1.0,
 'max_delta_step': np.float64(0.26606958445607887),
 'max_depth': 7,
 'subsample': np.float64(0.313401298026189),
 'tree_method': 'hist'}
[0]	train-rmse:0.95581+0.00960	test-rmse:0.96792+0.04423
[1]	train-rmse:0.94253+0.01005	test-rmse:0.95847+0.04343
[2]	train-rmse:0.93574+0.01009	test-rmse:0.95519+0.04388
[3]	train-rmse:0.93314+0.01065	test-rmse:0.95496+0.04294
[4]	train-rmse:0.93061+0.01058	test-rmse:0.95503+0.04253
[5]	train-rmse:0.92836+0.01050	test-rmse:0.95465+0.04260
[6]	train-rmse:0.92663+0.01033	test-rmse:0.95396+0.04285
[7]	train-rmse:0.92514+0.01036	test-rmse:0.95380+0.04309
[8]	train-rmse:0.92361+0.01027	test-rmse:0.95401+0.04324
[9]	train-rmse:0.92254+0.01021	test-rmse:0.95408+0.04264
[10]	train-rmse:0.92128+0.01023	test-rmse:0.95386+0.04288
[11]	train-rmse:0.92024+0.01027	test-rmse:0.95407+0.04299
[12]	train-rmse:0.91910+0.01038	test-r

In [35]:
print(results)
plt.plot(results["train-rmse-mean"], label="Training RMSE")
plt.plot(results["test-rmse-mean"], label="Test RMSE")

     train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0           1.031274        0.009868        1.033624       0.040906
1           1.030013        0.009847        1.032419       0.040872
2           1.028762        0.009835        1.031234       0.040837
3           1.027545        0.009830        1.030073       0.040784
4           1.026331        0.009842        1.028945       0.040732
..               ...             ...             ...            ...
771         0.921444        0.010292        0.945708       0.043216
772         0.921427        0.010294        0.945708       0.043216
773         0.921413        0.010294        0.945709       0.043213
774         0.921403        0.010292        0.945706       0.043212
775         0.921394        0.010292        0.945703       0.043214

[776 rows x 4 columns]


[<matplotlib.lines.Line2D at 0x26503890e30>]

In [54]:
evallist = [(training_set, 'train')]


model_xgb = xgb.train(params=params, dtrain=training_set, num_boost_round=10000, verbose_eval=1, evals=evallist)

[0]	train-rmse:0.95391
[1]	train-rmse:0.94476
[2]	train-rmse:0.93877
[3]	train-rmse:0.93447
[4]	train-rmse:0.93301
[5]	train-rmse:0.93126
[6]	train-rmse:0.92992
[7]	train-rmse:0.92908
[8]	train-rmse:0.92776
[9]	train-rmse:0.92658
[10]	train-rmse:0.92499
[11]	train-rmse:0.92374
[12]	train-rmse:0.92254
[13]	train-rmse:0.92173
[14]	train-rmse:0.92057
[15]	train-rmse:0.91987
[16]	train-rmse:0.91899
[17]	train-rmse:0.91860
[18]	train-rmse:0.91804
[19]	train-rmse:0.91715
[20]	train-rmse:0.91639
[21]	train-rmse:0.91540
[22]	train-rmse:0.91487
[23]	train-rmse:0.91413
[24]	train-rmse:0.91358
[25]	train-rmse:0.91313
[26]	train-rmse:0.91259
[27]	train-rmse:0.91202
[28]	train-rmse:0.91153
[29]	train-rmse:0.91115
[30]	train-rmse:0.91069
[31]	train-rmse:0.91040
[32]	train-rmse:0.90980
[33]	train-rmse:0.90948
[34]	train-rmse:0.90926
[35]	train-rmse:0.90895
[36]	train-rmse:0.90852
[37]	train-rmse:0.90830
[38]	train-rmse:0.90799
[39]	train-rmse:0.90751
[40]	train-rmse:0.90717
[41]	train-rmse:0.90701
[4

In [None]:
model.save_model(
    os.path.join(
        pdata_folder, "TRAINED_MODELS", "WEIGHTED_CHORUS_WITH_MEDIAN_INSTEAD_OF_MEAN.model"
    )
)

In [55]:
# model = xgb.Booster({'nthread': 4})  # init model
# model.load_model(f"./../processed_data_chorus_neural_network/TRAINED_MODELS/XG_BOOSTED_REGRESSION.model")  # load model data
training_pred = np.exp(model_xgb.predict(training_set))

In [56]:
plt.scatter(TRAINING_LABELS_EXP, training_pred, s=0.8, c=(TRAINING_FEATURES[:, 0] * STD_L) + MEAN_L)
plt.plot(np.logspace(-2, 4), np.logspace(-2, 4), color="black")
plt.colorbar()
plt.grid()

plt.xlim(1e-1, 1e3)
plt.ylim(1e-1, 1e3)
plt.xlabel("RBSP OBSERVED CHORUS")
plt.ylabel("MODEL PREDICTED CHORUS")
plt.title("TRAINING SET")
plt.xscale("log")
plt.yscale("log")

In [None]:
plt.scatter(
    TRAINING_FEATURES[:, 0],
    np.abs((training_pred.flatten() - TRAINING_LABELS.flatten()) / TRAINING_LABELS.flatten()) * 100,
    s=0.8,
    c=TRAINING_LABELS,
    norm=matplotlib.colors.LogNorm(),
)
plt.colorbar()

plt.grid()
plt.yscale("log")

In [None]:
plt.scatter(VALIDATION_LABELS, validation_pred, s=0.8, c=VALIDATION_FEATURES[:, 0])
plt.plot(np.logspace(-2, 4), np.logspace(-2, 4), color="black")
plt.grid()
plt.colorbar()
plt.xlim(1e-1, 1e3)
plt.ylim(1e-1, 1e3)
plt.xscale("log")
plt.yscale("log")

In [None]:
print(model.get_score(importance_type="gain"))
print(model.get_score(importance_type="weight"))

In [None]:
fig, ax = plt.subplots(2, 2, subplot_kw=dict(projection="polar"))

ax[0][0].set_xlim(xmin=0, xmax=2 * np.pi)
ax[0][0].set_ylim(ymin=0, ymax=7)

ax[0][1].set_xlim(xmin=0, xmax=2 * np.pi)
ax[0][1].set_ylim(ymin=0, ymax=7)

ax[1][0].set_xlim(xmin=0, xmax=2 * np.pi)
ax[1][0].set_ylim(ymin=0, ymax=7)

ax[1][1].set_xlim(xmin=0, xmax=2 * np.pi)
ax[1][1].set_ylim(ymin=0, ymax=7)

radius_of_points_training = (
    TRAINING_FEATURES[:, 0] * 1.345 + 4.1
)  # <---------------------------------------------
angles_of_points_training = np.arctan2(TRAINING_FEATURES[:, 1], TRAINING_FEATURES[:, 2])

radius_of_points_validation = (
    VALIDATION_FEATURES[:, 0] * 1.345 + 4.1
)  # <---------------------------------------------------------
angles_of_points_validation = np.arctan2(VALIDATION_FEATURES[:, 1], VALIDATION_FEATURES[:, 2])

rbins = np.linspace(0, 8, 30)
abins = np.linspace(-1 * np.pi, np.pi, 60)

A, R = np.meshgrid(abins, rbins)

average_chorus_pred_training = np.zeros_like(A)
average_chorus_pred_validation = np.zeros_like(A)
average_chorus_real_training = np.zeros_like(A)
average_chorus_real_validation = np.zeros_like(A)

for r in range(len(rbins) - 1):
    for a in range(len(abins) - 1):

        average_chorus_pred_training[r, a] += np.nanmean(
            training_pred[
                (rbins[r] <= radius_of_points_training)
                & (radius_of_points_training < rbins[r + 1])
                & (abins[a] < angles_of_points_training)
                & (angles_of_points_training < abins[a + 1])
            ]
        )
        average_chorus_real_training[r, a] += np.nanmean(
            TRAINING_LABELS[
                (rbins[r] <= radius_of_points_training)
                & (radius_of_points_training < rbins[r + 1])
                & (abins[a] < angles_of_points_training)
                & (angles_of_points_training < abins[a + 1])
            ]
        )
        average_chorus_pred_validation[r, a] += np.nanmean(
            validation_pred[
                (rbins[r] <= radius_of_points_validation)
                & (radius_of_points_validation < rbins[r + 1])
                & (abins[a] < angles_of_points_validation)
                & (angles_of_points_validation < abins[a + 1])
            ]
        )
        average_chorus_real_validation[r, a] += np.nanmean(
            VALIDATION_LABELS[
                (rbins[r] <= radius_of_points_validation)
                & (radius_of_points_validation < rbins[r + 1])
                & (abins[a] < angles_of_points_validation)
                & (angles_of_points_validation < abins[a + 1])
            ]
        )


pc = ax[0][0].pcolormesh(
    A, R, average_chorus_pred_training, norm=matplotlib.colors.LogNorm(vmin=1, vmax=100)
)
plt.colorbar(pc)

ax[0][0].set_xticklabels(["MLT 0", "", "MLT 6", "", "MLT 12", "", "MLT 18"])
ax[0][1].set_xticklabels(["MLT 0", "", "MLT 6", "", "MLT 12", "", "MLT 18"])
ax[1][0].set_xticklabels(["MLT 0", "", "MLT 6", "", "MLT 12", "", "MLT 18"])
ax[1][1].set_xticklabels(["MLT 0", "", "MLT 6", "", "MLT 12", "", "MLT 18"])


pc = ax[0][1].pcolormesh(
    A, R, average_chorus_pred_validation, norm=matplotlib.colors.LogNorm(vmin=1, vmax=100)
)
plt.colorbar(pc)

pc = ax[1][0].pcolormesh(
    A, R, average_chorus_real_training, norm=matplotlib.colors.LogNorm(vmin=1, vmax=100)
)
plt.colorbar(pc)

pc = ax[1][1].pcolormesh(
    A, R, average_chorus_real_validation, norm=matplotlib.colors.LogNorm(vmin=1, vmax=100)
)
plt.colorbar(pc)

ax[0][0].set_title("Predicted on Training Set")
ax[0][1].set_title("Predicted on Validation Set")
ax[1][0].set_title("Labels of Training Set")
ax[1][1].set_title("Labels of Validation Set")

plt.tight_layout()
plt.show()

In [None]:
fig, axs = plt.subplots(2, 1)