In [19]:
from sklearnex import patch_sklearn
patch_sklearn()

import os
import sys
import warnings

# caution: path[0] is reserved for script path (or '' in REPL).
sys.path.insert(1, os.path.abspath("./../src"))

import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import sklearn
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import scipy.stats


plt.rcdefaults()
warnings.filterwarnings("ignore")

%matplotlib qt

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [20]:
VERSION = "v4"
FIELD_MODEL = "T89"
MODEL_TYPE = "LOWER_BAND"
pdata_folder = os.path.abspath(r"./../processed_data/chorus_neural_network/")
STAGE_4_folder = os.path.join(pdata_folder, "STAGE_4", VERSION)
CONJUNCTIONS_REFS = np.load(
    file=os.path.join(STAGE_4_folder, f"MODEL_READY_DATA_{VERSION}_{FIELD_MODEL}_{MODEL_TYPE}.npz")
)

X = CONJUNCTIONS_REFS["FEATURES"]
y = CONJUNCTIONS_REFS["LABELS"].flatten()
day = CONJUNCTIONS_REFS["TRAINING_DAY_IDS"].flatten()

MLT_train = CONJUNCTIONS_REFS["TRAINING_MLT"]
MEAN_L = CONJUNCTIONS_REFS["MEAN_L"]
STD_L = CONJUNCTIONS_REFS["STD_L"]


X_valid = CONJUNCTIONS_REFS["VALIDATION_FEATURES"]
y_valid = CONJUNCTIONS_REFS["VALIDATION_LABELS"].flatten()

CONJUNCTIONS_REFS.close()

In [21]:
print(f"Training set shape: {X.shape, y.shape}")
print(f"Validation set shape: {X_valid.shape, y_valid.shape}")

ax0 = sns.displot(y, log_scale=True)
ax0.set(ylabel='N', xlabel='Chorus (pT)', title='Training Set')
plt.tight_layout()

#ax1 = sns.displot(y_valid, log_scale=True)
#ax1.set(ylabel='N', xlabel='Chorus (pT)', title='Validation Set')
#plt.tight_layout()

Training set shape: ((281186, 7), (281186,))
Validation set shape: ((3699, 7), (3699,))


In [22]:
def sigmoid_relevance(yp, k, c):

    return 1 / (1 + np.exp(-1 * k * (yp - c)))


def wercs_oversample(Xp, yp, dayp, relevance, size, relevance_threshold = 0.5):


    prob = relevance / np.sum(relevance)
    sample_indices = np.random.choice(range(len(yp)), size=size, p=prob, replace=True)
    X_new, y_new, day_new, rel_new = Xp[sample_indices, :], yp[sample_indices], dayp[sample_indices], relevance[sample_indices]

    return X_new, y_new, day_new, rel_new


def wercs(Xp, yp, dayp, relevance, size, relevance_threshold=0.5, noise=True):

    X_new, y_new, day_new, rel_new = wercs_oversample(
        Xp=Xp,
        yp=yp,
        dayp=dayp,
        relevance=relevance,
        size=size,
        relevance_threshold=0.5
    )

    return np.vstack([Xp, X_new]), np.hstack([yp, y_new]), np.hstack([dayp, day_new]), np.hstack([relevance, rel_new])


def add_gaussian_noise(X, y, deltaX=0.5, deltaY=0.5):
    """Currently only adds noise to the y values"""
    y_new = y + np.random.normal(loc=0, scale=(y / 10) * deltaY)
    X_new = np.zeros_like(X)
    for feature in range(X.shape[1]):
        X_new[:, feature] = X[:, feature] + np.random.normal(loc=0, scale=np.abs(X[:, feature] / 10) * deltaX)
    return X_new, y_new

In [68]:
deltaY = 0.3
deltaX = 0.3
k = 4

rare_y_threshold = 1.0

print(f"Given Training shape : {X.shape}")

relevance = sigmoid_relevance(np.log10(y), k, rare_y_threshold)
X_res, y_res, day_res, rel_res = wercs(X, y, day, relevance)

print(f"Resampled shape : {X_res.shape}")

rare_y_threshold = 2.5

relevance_res = sigmoid_relevance(np.log10(y_res), k, rare_y_threshold)
X_res, y_res, day_res, rel_res = wercs(X_res, y_res, day_res, relevance_res)

print(f"Resampled shape : {X_res.shape}")

print(f"Number Removed after adding noise: {np.sum(y_res <= 0)}")

y_res[y_res <= 0] = np.nan
where_finite = np.isfinite(y_res)
X_res = X_res[where_finite, :]
y_res = y_res[where_finite]

X_res, y_res = add_gaussian_noise(X_res, y_res, deltaX=deltaX, deltaY=deltaY)

ax0 = sns.displot(y_res, log_scale=True)
ax0.set(ylabel='N', xlabel='Chorus (pT)', title='Training Set')
plt.tight_layout()

print(f"Resampled shape : {X_res.shape}")

Given Training shape : (281186, 7)


TypeError: wercs() missing 1 required positional argument: 'size'

In [None]:
%%time
# Cross fold validation testing for best parameters

relevance = sigmoid_relevance(np.log10(y_res), k, rare_y_threshold)
y_for_strat = np.zeros_like(relevance)
y_for_strat[relevance <= 0.5] = 0
y_for_strat[relevance > 0.5] = 1

kfold = sklearn.model_selection.StratifiedGroupKFold(n_splits=3)
folds = kfold.split(X_res, y_for_strat, groups=day_res)

# Use random search to find the best hyperparameters
opt = BayesSearchCV(
     sklearn.ensemble.RandomForestRegressor(),
     {
         'n_estimators': Integer(500, 600),
         'max_depth': Integer(5, 6),
         'min_samples_split': Integer(2, 11),
         'min_samples_leaf': Integer(2, 6),
         "max_features" : Integer(1, 5)
     },
     n_iter=2,
     cv=folds,
     verbose=True
 )

# callback handler
def on_step(optim_result):
    print(f"OPTIM RESULT: {optim_result}")

# Fit the random search object to the data
opt.fit(X_res, y_res, callback=on_step)

print(f"Best parameters: {opt.best_params_, opt.best_score_}")

with open(r"./best_parameters.txt", "w") as f:
    f.write(f"Best parameters: {opt.best_params_, opt.best_score_}")
    

In [23]:
%%time
deltaY = 0.1
deltaX = 0.1
k = 4

rare_y_threshold = 1.0

print(f"Given Training shape : {X.shape}")

relevance = sigmoid_relevance(np.log10(y), k, rare_y_threshold)
X_res, y_res, day_res, rel_res = wercs(X, y, day, relevance, size=0)

print(f"Resampled shape : {X_res.shape}")

kfold = sklearn.model_selection.GroupShuffleSplit(n_splits=1, test_size=0.2)
train_idx, test_idx = next(kfold.split(X_res, groups=day_res))

X_train = X_res[train_idx, :]
y_train = y_res[train_idx]
X_test = X_res[test_idx, :]
y_test = y_res[test_idx]
rel_train = rel_res[train_idx]

print(f"Resampled Training shape : {X_train.shape}")
print(f"Resampled Testing shape : {X_test.shape}")

ax0 = sns.displot(y_train, log_scale=True)
ax0.set(ylabel='N', xlabel='Chorus (pT)', title='Training Set')
plt.tight_layout()

ax1 = sns.displot(y_test, log_scale=True)
ax1.set(ylabel='N', xlabel='Chorus (pT)', title='Testing Set')
plt.tight_layout()

#X_train, y_train = add_gaussian_noise(X_train, y_train, deltaX=deltaX, deltaY=deltaY)

print(f"Number Removed after adding noise: {np.sum(y_train < 0)}")

y_train[y_train < 0] = np.nan
where_finite = np.isfinite(y_train)
X_train = X_train[where_finite, :]
y_train = y_train[where_finite]
rel_train =rel_train[where_finite]

ax2 = sns.displot(y_train, log_scale=True)
ax2.set(ylabel='N', xlabel='Chorus (pT)', title='Training Set')
plt.tight_layout()

Given Training shape : (281186, 7)
Resampled shape : (281186, 7)
Resampled Training shape : (222982, 7)
Resampled Testing shape : (58204, 7)
Number Removed after adding noise: 0
CPU times: total: 906 ms
Wall time: 911 ms


In [25]:
%%time
#Best parameters: ({'max_depth': 9, 'max_features': 2, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 143}, np.float64(0.2881147380512229))

regr = sklearn.ensemble.RandomForestRegressor(max_depth = 12,
                                              n_estimators = 1000)

regr.fit(X_train, y_train)

CPU times: total: 2min 9s
Wall time: 17.8 s


In [26]:
#y_train_pred = 10 ** regr.predict(X_train)
#y_test_pred = 10 ** regr.predict(X_test)
#y_valid_pred = 10 ** regr.predict(X_valid)


#y_train_pred = (1000 ** regr.predict(X_train)) - 1
#y_test_pred = (1000 ** regr.predict(X_test)) - 1
#y_valid_pred = (1000 ** regr.predict(X_valid)) - 1


#y_train_pred = (regr.predict(X_train) * b) + a
#y_test_pred = (regr.predict(X_test) * b) + a
#y_valid_pred = (regr.predict(X_valid) * b) + a

#y_train_pred = np.exp(regr.predict(X_train))
#y_test_pred = np.exp(regr.predict(X_test))
#y_valid_pred = np.exp(regr.predict(X_valid))

y_train_pred = regr.predict(X_train)
y_test_pred = regr.predict(X_test)
y_valid_pred = regr.predict(X_valid)


In [27]:
print(regr.feature_importances_)

[0.12819694 0.10457094 0.11575402 0.16268606 0.12654793 0.2459389
 0.11630521]


In [28]:
print(sklearn.metrics.r2_score(y_train, y_train_pred))
print(sklearn.metrics.r2_score(np.hstack([y_test, y_valid]), np.hstack([y_test_pred, y_valid_pred])))

for i in range(-1, 3):

    y_t_true = np.hstack([y_test, y_valid])
    y_t_pred = np.hstack([y_test_pred, y_valid_pred])
    
    training_indices = ((10 ** i) < y_train) & (y_train < (10 ** (i+1)))
    print(f"\nTraining RMSE between {10**i:2} and {10**(i+1):2}: {sklearn.metrics.root_mean_squared_error(y_train[training_indices], y_train_pred[training_indices])}")

    test_indices = ((10 ** i) < y_t_true) & (y_t_true < (10 ** (i+1)))
    print(f"Testing RMSE between {10**i:2} and {10**(i+1):2}: {sklearn.metrics.root_mean_squared_error(y_t_true[test_indices], y_t_pred[test_indices])}\n")
    

0.6368409316329046
0.15412835608277453

Training RMSE between 0.1 and  1: 3.5988351224079422
Testing RMSE between 0.1 and  1: 5.140786311220582


Training RMSE between  1 and 10: 3.5415146325172895
Testing RMSE between  1 and 10: 5.195655343288349


Training RMSE between 10 and 100: 15.318816580214179
Testing RMSE between 10 and 100: 20.45365557679673


Training RMSE between 100 and 1000: 75.92796817823823
Testing RMSE between 100 and 1000: 133.08667717188032



In [29]:
log_scale_axis = True
normalize_columns = True
grid_resolution = 150
plot_min = 0.1
plot_max = 1000
p_cbar_min = 0.001
n_cbar_max = 10000

In [30]:
within_bounds = (plot_min < y_train) & (y_train < plot_max) & (plot_min < y_train_pred) & (y_train_pred < plot_max)

fig = plt.figure(figsize=(12,8))
fig.set_facecolor('black')
params = {"ytick.color" : "w",
          "xtick.color" : "w",
          "axes.labelcolor" : "w",
          "axes.edgecolor" : "w",
          "axes.titlecolor" : "w"}
plt.rcParams.update(params)

if log_scale_axis:
    x_bins = np.logspace(np.log10(plot_min), np.log10(plot_max), grid_resolution)
    y_bins = np.logspace(np.log10(plot_min), np.log10(plot_max), grid_resolution)
else:
    x_bins = np.linspace(plot_min, plot_max, grid_resolution)
    y_bins = np.linspace(plot_min, plot_max, grid_resolution)

x_mapping = np.digitize(y_train[within_bounds], x_bins, right=False) - 1
y_mapping = np.digitize(y_train_pred[within_bounds], y_bins, right=False) - 1

image_to_plot = np.zeros(shape=(x_bins.shape[0], y_bins.shape[0]))

for T in range(len(y_train_pred[within_bounds])):

    x_bin = x_mapping[T]
    y_bin = y_mapping[T]
    image_to_plot[x_bin, y_bin] += 1

if normalize_columns:

    for x_bin in range(image_to_plot.shape[0]):

        image_to_plot[x_bin, :] /= np.sum(image_to_plot[x_bin, :])

    ax = fig.add_subplot(1, 1, 1)
    image = ax.imshow(
        image_to_plot.T,
        origin="lower",
        extent=[x_bins[0], x_bins[-1], y_bins[0], y_bins[-1]],
        norm=matplotlib.colors.LogNorm(vmin=p_cbar_min, vmax=1),
        aspect="equal",
        interpolation="none",
    )

    image.cmap.set_under("black")

    cbar = fig.colorbar(image, pad=0.01)
    cbar.set_label("Probability\n", labelpad = 20, loc="center", rotation=270)

else:

    ax = fig.add_subplot(1, 1, 1)
    image = ax.imshow(
        image_to_plot.T,
        origin="lower",
        extent=[x_bins[0], x_bins[-1], y_bins[0], y_bins[-1]],
        norm=matplotlib.colors.LogNorm(vmin=1, vmax=n_cbar_max),
        aspect="equal",
        interpolation="none")

    image.cmap.set_under("black")

    cbar = fig.colorbar(image, pad=0.01)
    cbar.set_label("Number of Points\n", labelpad = 20, loc="center", rotation=270)

ax.set_xlabel("RBSP OBSERVED CHORUS")
ax.set_ylabel("MODEL PREDICTED CHORUS")
ax.set_title("TRAINING SET")

ax.plot(x_bins, y_bins, color="white")
ax.grid()
matplotlib.pyplot.rcdefaults()


In [31]:
add_testing_set = True

within_bounds = (plot_min < y_valid) & (y_valid < plot_max) & (plot_min < y_valid_pred) & (y_valid_pred < plot_max)
within_bounds_t = (plot_min < y_test) & (y_test < plot_max) & (plot_min < y_test_pred) & (y_test_pred < plot_max)

fig = plt.figure(figsize=(12,8))
fig.set_facecolor('black')
params = {"ytick.color" : "w",
          "xtick.color" : "w",
          "axes.labelcolor" : "w",
          "axes.edgecolor" : "w",
          "axes.titlecolor" : "w"}
plt.rcParams.update(params)

if log_scale_axis:
    x_bins = np.logspace(np.log10(plot_min), np.log10(plot_max), grid_resolution)
    y_bins = np.logspace(np.log10(plot_min), np.log10(plot_max), grid_resolution)
else:
    x_bins = np.linspace(plot_min, plot_max, grid_resolution)
    y_bins = np.linspace(plot_min, plot_max, grid_resolution)

x_mapping = np.digitize(y_valid[within_bounds], x_bins, right=False) - 1
y_mapping = np.digitize(y_valid_pred[within_bounds], y_bins, right=False) - 1

image_to_plot = np.zeros(shape=(x_bins.shape[0], y_bins.shape[0]))

for T in range(len(y_valid_pred[within_bounds])):

    x_bin = x_mapping[T]
    y_bin = y_mapping[T]
    image_to_plot[x_bin, y_bin] += 1

if add_testing_set:

    x_mapping = np.digitize(y_test[within_bounds_t], x_bins, right=False) - 1
    y_mapping = np.digitize(y_test_pred[within_bounds_t], y_bins, right=False) - 1

    for T in range(len(y_test_pred[within_bounds_t])):

        x_bin = x_mapping[T]
        y_bin = y_mapping[T]
        image_to_plot[x_bin, y_bin] += 1

if normalize_columns:

    for x_bin in range(image_to_plot.shape[0]):

        image_to_plot[x_bin, :] /= np.sum(image_to_plot[x_bin, :])

    ax = fig.add_subplot(1, 1, 1)
    image = ax.imshow(
        image_to_plot.T,
        origin="lower",
        extent=[x_bins[0], x_bins[-1], y_bins[0], y_bins[-1]],
        norm=matplotlib.colors.LogNorm(vmin=p_cbar_min, vmax=1),
        aspect="equal",
        interpolation="none",
    )

    image.cmap.set_under("black")
    plt.axis('off')

    cbar = fig.colorbar(image, pad=0.01)
    cbar.set_label("Probability\n", labelpad = 20, loc="center", rotation=270)

else:

    ax = fig.add_subplot(1, 1, 1)
    image = ax.imshow(
        image_to_plot.T,
        origin="lower",
        extent=[x_bins[0], x_bins[-1], y_bins[0], y_bins[-1]],
        norm=matplotlib.colors.LogNorm(vmin=1, vmax=n_cbar_max),
        aspect="equal",
        interpolation="none")

    image.cmap.set_under("black")
    plt.axis('off')

    cbar = fig.colorbar(image, pad=0.01)
    cbar.set_label("Number of Points\n", labelpad = 20, loc="center", rotation=270)

newax = fig.add_axes(ax.get_position(), frameon=False)
newax.set_xlim((x_bins[0], x_bins[-1]))
newax.set_ylim((y_bins[0], y_bins[-1]))

if log_scale_axis:
    newax.set_xscale('log')
    newax.set_yscale('log')

newax.set_xlabel("RBSP OBSERVED CHORUS")
newax.set_ylabel("MODEL PREDICTED CHORUS")

newax.set_title("Validation + Testing Set")

newax.plot(x_bins, y_bins, color="white")
newax.grid()
matplotlib.pyplot.rcdefaults()