In [1]:
import xgboost as xgb
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.colors
import random
import scipy.stats as stats
import scipy.optimize


%matplotlib qt

In [2]:
# Dataset version tag; it selects both the STAGE_4 sub-directory and the
# archive file name below.
version = "v5a"

# Read every array that is needed while the archive is open. Using a context
# manager guarantees the file handle is released even if one of the key
# lookups raises (previously the explicit close() would have been skipped).
with np.load(f"./../processed_data_chorus_neural_network/STAGE_4/{version}/MODEL_READY_DATA_{version}.npz") as CONJUNCTIONS_REFS:

    TRAINING_FEATURES = CONJUNCTIONS_REFS["FEATURES"]
    TRAINING_LABELS = CONJUNCTIONS_REFS["LABELS"]
    TRAINING_MLT = CONJUNCTIONS_REFS["TRAINING_MLT"]

    VALIDATION_FEATURES = CONJUNCTIONS_REFS["VALIDATION_FEATURES"]
    VALIDATION_LABELS = CONJUNCTIONS_REFS["VALIDATION_LABELS"]

    #BINS = CONJUNCTIONS_REFS["BINS"]

# Quick sanity check on the (features, labels) shapes of each split.
print(f"Training set shape: {TRAINING_FEATURES.shape, TRAINING_LABELS.shape}")
print(f"Validation set shape: {VALIDATION_FEATURES.shape, VALIDATION_LABELS.shape}")

Training set shape: ((127456, 6), (127456, 1))
Validation set shape: ((3284, 6), (3284, 1))


In [3]:
# Derive a per-sample weight from how densely populated each chorus-amplitude
# range is in the training labels.
# NOTE(review): none of the cells visible here pass these weights to XGBoost;
# confirm whether they are meant to be used for training.

# Amplitude range of the training labels, ignoring NaNs.
MIN_CHORUS = np.nanmin(TRAINING_LABELS)
MAX_CHORUS = np.nanmax(TRAINING_LABELS)

print(MIN_CHORUS, MAX_CHORUS)

# 30 logarithmically spaced bin edges spanning the observed amplitude range.
CHORUS_WEIGHTING_BINS = np.logspace(
    start=np.log10(MIN_CHORUS),
    stop=np.log10(MAX_CHORUS),
    num=30,
    base=10,
)
# density=True: the histogram values are a probability density, not counts.
BINNED_CHORUS_HISTOGRAM = np.histogram(
    TRAINING_LABELS,
    bins=CHORUS_WEIGHTING_BINS,
    density=True,
)

# Density per bin, and the arithmetic midpoint of each pair of adjacent edges.
Y_POINTS, _chorus_bin_edges = BINNED_CHORUS_HISTOGRAM
X_POINTS = (_chorus_bin_edges[:-1] + _chorus_bin_edges[1:]) / 2.0

# Interpolate log(density) at every label value, then take
# weight = log10(1 / density).
_log_density_at_labels = np.interp(
    x=TRAINING_LABELS.flatten(),
    xp=X_POINTS,
    fp=np.log(Y_POINTS),
)
SAMPLE_WEIGHTS_FROM_CHORUS_DISTRIBUTION = np.log10(1 / np.exp(_log_density_at_labels))

# Indices that sort the labels in ascending order, so the weight curve can be
# drawn as a line.
order_of_labels = np.argsort(TRAINING_LABELS, axis=None)

print(SAMPLE_WEIGHTS_FROM_CHORUS_DISTRIBUTION)

# Draw the binned label density together with the weight assigned to each
# label value.
# NOTE(review): the y-axis label says "Percentage", but the plotted histogram
# is a density (density=True) and the weight curve shares the same axis.
plt.plot(X_POINTS, Y_POINTS, label="Data")
plt.plot(
    TRAINING_LABELS[order_of_labels],
    SAMPLE_WEIGHTS_FROM_CHORUS_DISTRIBUTION[order_of_labels].flatten(),
    label="SAMPLE_WEIGHTS",
)

plt.xlabel("Chorus Amplitude (pT)")
plt.ylabel("Percentage of dataset")

plt.legend()
plt.show()

0.20202284213155508 396.82318419218063
[1.05008292 0.30878761 1.35738169 ... 0.56917765 0.19059114 0.27939985]


In [27]:
# Wrap each split's feature and label arrays in an XGBoost DMatrix.
# NOTE(review): SAMPLE_WEIGHTS_FROM_CHORUS_DISTRIBUTION is not supplied here,
# so both matrices are unweighted; confirm that this is intended.
training_set = xgb.DMatrix(data=TRAINING_FEATURES, label=TRAINING_LABELS)
validation_set = xgb.DMatrix(data=VALIDATION_FEATURES, label=VALIDATION_LABELS)

In [28]:
# Define custom learning rate schedule
def custom_learning_rate(current_iter, base_learning_rate=1.0, decay_factor=0.5, step_size=20):
    """Return a step-decayed learning rate for the given boosting iteration.

    The rate starts at ``base_learning_rate`` and is multiplied by
    ``decay_factor`` once every ``step_size`` iterations. With the default
    arguments this reproduces the original schedule: 1.0 for iterations
    0-19, 0.5 for 20-39, 0.25 for 40-59, and so on.

    Parameters
    ----------
    current_iter : int
        Zero-based boosting iteration. This is the only argument supplied
        when the function is used as a single-argument callback.
    base_learning_rate : float, optional
        Learning rate used during the first ``step_size`` iterations.
    decay_factor : float, optional
        Multiplier applied at the start of every new block of iterations.
    step_size : int, optional
        Number of iterations that share the same learning rate.

    Returns
    -------
    numpy.float64
        Learning rate to use for ``current_iter``.

    Raises
    ------
    ValueError
        If ``step_size`` is not strictly positive.
    """
    if step_size <= 0:
        raise ValueError("step_size must be a positive number of iterations")

    # Number of completed decay steps; floor keeps the rate piecewise constant.
    completed_steps = np.floor(current_iter / step_size)
    lr = base_learning_rate * np.power(decay_factor, completed_steps)

    # Echo the rate so it appears next to each evaluation line in the log.
    print(f"Learning Rate: {lr}")
    return lr

In [29]:
# Hyper-parameters handed to xgb.train.
# NOTE(review): "learning_rate" looks to be superseded by the
# LearningRateScheduler callback below -- the training log reports 1.0 rather
# than the 2.0 configured here. Confirm which value is intended.
params = {
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "learning_rate": 2.0,
    "max_depth": 15,
    "objective": "reg:squarederror",
    "subsample": 0.5,
    "gamma": 1.0,
    "lambda": 1000.0,
    "tree_method": "hist",
    "device": "cuda",
    "nthread": 10,
}

# Number of boosting rounds.
n = 200

# Data sets whose metric is reported after every round, paired with the name
# used in the log output.
evals = list(zip((training_set, validation_set), ("train", "validation")))

# Callback that asks custom_learning_rate for the rate of each round.
lr_scheduler = xgb.callback.LearningRateScheduler(custom_learning_rate)

# Disabled cross-validation run, kept as an inert string literal.
'''results = xgb.cv(
   params = params,
   nfold = 5,
   dtrain = training_set,
   num_boost_round = n,
   verbose_eval=1,
   seed = random.randint(0, int(1e5)),
   early_stopping_rounds=100,
   shuffle = True,
   metrics = ["mape", "rmse"],
   callbacks=[lr_scheduler])'''

# Train on the training set, printing the metric for both sets every round.
model = xgb.train(
    params,
    training_set,
    num_boost_round=n,
    evals=evals,
    verbose_eval=1,
    callbacks=[lr_scheduler],
)



#model.save_model(f"./../processed_data_chorus_neural_network/TRAINED_MODELS/Weighted_L2/XG_BOOSTED_REGRESSION_MSE_WEIGHTED_LINEAR_WEIGHTING.model")


Learning Rate: 1.0
[0]	train-rmse:8.65442	validation-rmse:9.57779
Learning Rate: 1.0
[1]	train-rmse:8.34594	validation-rmse:9.35235
Learning Rate: 1.0
[2]	train-rmse:8.23671	validation-rmse:9.43479
Learning Rate: 1.0
[3]	train-rmse:8.14659	validation-rmse:9.49502
Learning Rate: 1.0
[4]	train-rmse:8.09703	validation-rmse:9.51391
Learning Rate: 1.0
[5]	train-rmse:8.06096	validation-rmse:9.50110
Learning Rate: 1.0
[6]	train-rmse:8.04117	validation-rmse:9.53754
Learning Rate: 1.0
[7]	train-rmse:8.02137	validation-rmse:9.55373
Learning Rate: 1.0
[8]	train-rmse:8.00199	validation-rmse:9.55672
Learning Rate: 1.0
[9]	train-rmse:7.97389	validation-rmse:9.61413
Learning Rate: 1.0
[10]	train-rmse:7.94661	validation-rmse:9.62616
Learning Rate: 1.0
[11]	train-rmse:7.92687	validation-rmse:9.65545
Learning Rate: 1.0
[12]	train-rmse:7.90208	validation-rmse:9.65999
Learning Rate: 1.0
[13]	train-rmse:7.88776	validation-rmse:9.67143
Learning Rate: 1.0
[14]	train-rmse:7.86466	validation-rmse:9.64886
Learn

In [None]:
# Write the trained booster to disk at a fixed path.
_trained_model_path = "./../processed_data_chorus_neural_network/TRAINED_MODELS/Weighted_L2/XG_BOOSTED_REGRESSION_MSE_WEIGHTED_ON_AMPLITUDE_AND_L_AT_AGU.model"
model.save_model(_trained_model_path)


In [None]:
#Cross fold validation results:

# `results` is only created by the xgb.cv call, which is currently disabled in
# the training cell. Skip the plot when it is missing so that a fresh
# "Restart & Run All" does not stop here with a NameError.
if "results" in globals():
    plt.plot(results["train-rmse-mean"], label="train-rmse-mean")
    plt.plot(results["test-rmse-mean"], label="test-rmse-mean")
    plt.legend()
else:
    print("Cross-validation results unavailable: run xgb.cv to define `results` first.")

In [30]:
# Optional: restore a previously saved booster instead of the in-memory model.
#model = xgb.Booster({'nthread': 4})  # init model
#model.load_model(f"./../processed_data_chorus_neural_network/TRAINED_MODELS/XG_BOOSTED_REGRESSION.model")  # load model data

# Model predictions for every sample of each split.
training_pred = model.predict(training_set)
validation_pred = model.predict(validation_set)

In [31]:
# Mean and standard deviation of feature column 0, which colours the points.
_training_feature_0 = TRAINING_FEATURES[:, 0]
print(np.nanmean(_training_feature_0))
print(np.std(_training_feature_0))


# Observed vs. predicted amplitude for the training set.
_training_scatter = plt.scatter(TRAINING_LABELS, training_pred, s=0.8, c=_training_feature_0)

# Black 1:1 reference line (perfect prediction).
_one_to_one = np.logspace(-2, 4)
plt.plot(_one_to_one, _one_to_one, color="black")
plt.grid()

# NOTE(review): the colour values are feature column 0 exactly as stored
# (mean ~0, std ~1 according to the prints), while the bar is labelled
# L-shell -- presumably a standardised L; confirm.
cbar = plt.colorbar(_training_scatter)
cbar.ax.set_ylabel('L - Shell\n', rotation=270, loc="center", labelpad=1.0)

plt.xlabel("RBSP OBSERVED CHORUS")
plt.ylabel("MODEL PREDICTED CHORUS")
plt.title("TRAINING SET")

# Fix the visible range first, then switch both axes to log scale.
plt.xlim(1e-1, 1e3)
plt.ylim(1e-1, 1e3)
plt.xscale("log")
plt.yscale("log")

2.604550324403079e-16
0.9999999999999999


In [None]:
# Range of feature column 0 in the training set.
print(np.min(TRAINING_FEATURES[:, 0]))
print(np.max(TRAINING_FEATURES[:, 0]))

# Residuals between prediction and label, and their mean square (NaNs ignored).
_training_residual = training_pred.flatten() - TRAINING_LABELS.flatten()
print(np.nanmean(_training_residual**2))

# Absolute percentage error of each training sample against feature column 0,
# coloured by the label value on a logarithmic colour scale.
_training_percent_error = np.abs(_training_residual / TRAINING_LABELS.flatten()) * 100
plt.scatter(
    TRAINING_FEATURES[:, 0],
    _training_percent_error,
    s=0.8,
    c=TRAINING_LABELS,
    norm=matplotlib.colors.LogNorm(),
)
plt.colorbar()

plt.grid()
plt.yscale("log")

In [None]:
# Observed vs. predicted amplitude for the validation set, coloured by
# feature column 0.
_validation_scatter = plt.scatter(
    VALIDATION_LABELS,
    validation_pred,
    s=0.8,
    c=VALIDATION_FEATURES[:, 0],
)

# Black 1:1 reference line (perfect prediction).
_one_to_one_validation = np.logspace(-2, 4)
plt.plot(_one_to_one_validation, _one_to_one_validation, color="black")
plt.grid()
plt.colorbar(_validation_scatter)

# Fix the visible range first, then switch both axes to log scale.
plt.xlim(1e-1, 1e3)
plt.ylim(1e-1, 1e3)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Print the booster's feature scores, first by gain and then by weight.
for _importance_type in ("gain", "weight"):
    print(model.get_score(importance_type=_importance_type))

In [None]:
def _polar_bin_means(radii, angles, values, r_edges, a_edges):
    """Average ``values`` inside every (radius, angle) bin.

    Parameters
    ----------
    radii, angles : 1-D arrays
        Polar coordinates of the samples (same length).
    values : array
        One value per sample; flattened before use. NaN values are ignored,
        matching the behaviour of ``np.nanmean``.
    r_edges, a_edges : 1-D arrays
        Monotonically increasing bin edges.

    Returns
    -------
    ndarray of shape ``(len(r_edges) - 1, len(a_edges) - 1)``
        Mean value per bin; NaN where a bin holds no (non-NaN) sample.
    """
    n_r, n_a = len(r_edges) - 1, len(a_edges) - 1
    values = np.asarray(values, dtype=float).ravel()

    # Index of the half-open bin [edge[i], edge[i + 1]) each sample falls in;
    # samples outside the edges (or NaN) get an out-of-range index.
    r_idx = np.searchsorted(r_edges, radii, side="right") - 1
    a_idx = np.searchsorted(a_edges, angles, side="right") - 1

    # Close the last bin on the right so samples sitting exactly on the final
    # edge (e.g. an angle of exactly +pi) are not dropped.
    r_idx[radii == r_edges[-1]] = n_r - 1
    a_idx[angles == a_edges[-1]] = n_a - 1

    keep = (
        (r_idx >= 0) & (r_idx < n_r)
        & (a_idx >= 0) & (a_idx < n_a)
        & ~np.isnan(values)
    )
    flat_idx = r_idx[keep] * n_a + a_idx[keep]

    # One pass over the samples: per-bin sum and count, then mean.
    sums = np.bincount(flat_idx, weights=values[keep], minlength=n_r * n_a)
    counts = np.bincount(flat_idx, minlength=n_r * n_a)

    means = np.full(n_r * n_a, np.nan)
    populated = counts > 0
    means[populated] = sums[populated] / counts[populated]
    return means.reshape(n_r, n_a)


fig, ax = plt.subplots(2, 2, subplot_kw=dict(projection="polar"))

# Linear map applied to feature column 0 to obtain the plotted radius.
# NOTE(review): presumably the std / mean used to standardise L-shell -- confirm
# against the preprocessing stage.
_RADIUS_SCALE = 1.345
_RADIUS_OFFSET = 4.1

# Polar coordinates of every sample: radius from feature column 0, angle from
# arctan2 of feature columns 1 and 2.
radius_of_points_training = TRAINING_FEATURES[:, 0] * _RADIUS_SCALE + _RADIUS_OFFSET
angles_of_points_training = np.arctan2(TRAINING_FEATURES[:, 1], TRAINING_FEATURES[:, 2])

radius_of_points_validation = VALIDATION_FEATURES[:, 0] * _RADIUS_SCALE + _RADIUS_OFFSET
angles_of_points_validation = np.arctan2(VALIDATION_FEATURES[:, 1], VALIDATION_FEATURES[:, 2])

# Bin EDGES; there is one fewer bin than edges along each dimension.
rbins = np.linspace(0, 8, 30)
abins = np.linspace(-1 * np.pi, np.pi, 60)

A, R = np.meshgrid(abins, rbins)

# Per-bin averages with shape (len(rbins) - 1, len(abins) - 1), so that
# pcolormesh draws each value in the cell bounded by its own edges rather than
# shifting cells by half a bin or carrying an unused zero row/column.
average_chorus_pred_training = _polar_bin_means(
    radius_of_points_training, angles_of_points_training, training_pred, rbins, abins)
average_chorus_real_training = _polar_bin_means(
    radius_of_points_training, angles_of_points_training, TRAINING_LABELS, rbins, abins)
average_chorus_pred_validation = _polar_bin_means(
    radius_of_points_validation, angles_of_points_validation, validation_pred, rbins, abins)
average_chorus_real_validation = _polar_bin_means(
    radius_of_points_validation, angles_of_points_validation, VALIDATION_LABELS, rbins, abins)

# Eight angular ticks (every 45 degrees); only every other one is labelled.
_theta_ticks = np.arange(8) * np.pi / 4
_theta_labels = ['MLT 0', "", 'MLT 6', "", 'MLT 12', "", 'MLT 18', ""]

_panels = [
    (ax[0][0], average_chorus_pred_training, "Predicted on Training Set"),
    (ax[0][1], average_chorus_pred_validation, "Predicted on Validation Set"),
    (ax[1][0], average_chorus_real_training, "Labels of Training Set"),
    (ax[1][1], average_chorus_real_validation, "Labels of Validation Set"),
]

for _axis, _grid, _title in _panels:
    # Fix the angular and radial extent before drawing.
    _axis.set_xlim(0, 2 * np.pi)
    _axis.set_ylim(0, 7)

    # Same logarithmic colour range on every panel so they can be compared.
    pc = _axis.pcolormesh(A, R, _grid, norm=matplotlib.colors.LogNorm(vmin=1, vmax=100))
    # Attach the colour bar to the panel it describes.
    fig.colorbar(pc, ax=_axis)

    # Set tick positions explicitly so the number of labels always matches.
    _axis.set_xticks(_theta_ticks)
    _axis.set_xticklabels(_theta_labels)
    _axis.set_title(_title)

plt.tight_layout()
plt.show()



In [None]:
# Create a new figure with two vertically stacked axes.
fig, axs = plt.subplots(nrows=2, ncols=1)

