In [1]:
from sklearnex import patch_sklearn
patch_sklearn()

import os
import sys
import warnings

# caution: path[0] is reserved for script path (or '' in REPL).
sys.path.insert(1, os.path.abspath("./../src"))

import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import sklearn
import scipy.stats


plt.rcdefaults()
warnings.filterwarnings("ignore")

%matplotlib qt

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
VERSION = "v2b"
FIELD_MODEL = "T89"
MODEL_TYPE = "LOWER_BAND"
pdata_folder = os.path.abspath(r"./../processed_data/chorus_neural_network/")
STAGE_4_folder = os.path.join(pdata_folder, "STAGE_4", VERSION)
CONJUNCTIONS_REFS = np.load(
    file=os.path.join(STAGE_4_folder, f"MODEL_READY_DATA_{VERSION}_{FIELD_MODEL}_{MODEL_TYPE}.npz")
)

X = CONJUNCTIONS_REFS["FEATURES"]
y = CONJUNCTIONS_REFS["LABELS"].flatten()
day = CONJUNCTIONS_REFS["TRAINING_DAY_IDS"].flatten()

MLT_train = CONJUNCTIONS_REFS["TRAINING_MLT"]
MEAN_L = CONJUNCTIONS_REFS["MEAN_L"]
STD_L = CONJUNCTIONS_REFS["STD_L"]


X_valid = CONJUNCTIONS_REFS["VALIDATION_FEATURES"]
y_valid = CONJUNCTIONS_REFS["VALIDATION_LABELS"].flatten()

CONJUNCTIONS_REFS.close()

In [3]:
print(f"Training set shape: {X.shape, y.shape}")
print(f"Validation set shape: {X_valid.shape, y_valid.shape}")

ax0 = sns.displot(y, log_scale=True)
ax0.set(ylabel='N', xlabel='Chorus (pT)', title='Training Set')
plt.tight_layout()

ax1 = sns.displot(y_valid, log_scale=True)
ax1.set(ylabel='N', xlabel='Chorus (pT)', title='Validation Set')
plt.tight_layout()

Training set shape: ((1258506, 5), (1258506,))
Validation set shape: ((12667, 5), (12667,))


In [20]:
def sigmoid_relevance(yp, k, c):

    return 1 / (1 + np.exp(-1 * k * (yp - c)))


def wercs_oversample(Xp, yp, dayp, relevance, size, relevance_threshold = 0.5):


    prob = relevance / np.sum(relevance)
    sample_indices = np.random.choice(range(len(yp)), size=size, p=prob, replace=True)
    X_new, y_new, day_new, rel_new = Xp[sample_indices, :], yp[sample_indices], dayp[sample_indices], relevance[sample_indices]

    return X_new, y_new, day_new, rel_new


def wercs(Xp, yp, dayp, relevance, relevance_threshold=0.5, noise=True):

    X_new, y_new, day_new, rel_new = wercs_oversample(
        Xp=Xp,
        yp=yp,
        dayp=dayp,
        relevance=relevance,
        size=int(len(yp)),
        relevance_threshold=0.5
    )

    return X_new, y_new, day_new, rel_new


def add_gaussian_noise(X, y, delta=0.5):
    """Currently only adds noise to the y values"""
    y_new = y + np.random.normal(loc=0, scale=(y / 10) * delta)
    return X, y_new

In [5]:
rare_y_threshold = 2.5

print(f"Given Training shape : {X.shape}")

relevance = sigmoid_relevance(np.log10(y + 1), 2.5, rare_y_threshold)
X_res, y_res, day_res = wercs(X, y, day, relevance)

X_res, y_res = add_gaussian_noise(X_res, y_res, delta=0.5)

print(f"Number Removed after adding noise: {np.sum(y_res <= 0)}")

y_res[y_res <= 0] = np.nan
where_finite = np.isfinite(y_res)
X_res = X_res[where_finite, :]
y_res = y_res[where_finite]

print(f"Resampled shape : {X_res.shape}")

Given Training shape : (1258506, 5)


ValueError: too many values to unpack (expected 3)

In [None]:
%%time
# Cross fold validation testing for best parameters


param_dist = {
    "n_estimators": scipy.stats.randint(100, 2500),
    "max_depth": scipy.stats.randint(5, 21),
    "min_samples_split" : scipy.stats.randint(2, 11),
    "min_samples_leaf" : scipy.stats.randint(2, 6),
    "max_features" : scipy.stats.randint(1, 5)
}

relevance = sigmoid_relevance(np.log10(y_res + 1), 2.5, rare_y_threshold)
y_for_strat = np.zeros_like(relevance)
y_for_strat[relevance <= 0.5] = 0
y_for_strat[relevance > 0.5] = 1

kfold = sklearn.model_selection.StratifiedGroupKFold(n_splits=5)
folds = kfold.split(X_res, y_for_strat, groups=day_res)

regr = sklearn.ensemble.RandomForestRegressor()
# Use random search to find the best hyperparameters
rand_search = sklearn.model_selection.RandomizedSearchCV(regr,
                                                         param_distributions=param_dist,
                                                         n_iter=50,
                                                         cv=list(folds),
                                                         verbose=1)

# Fit the random search object to the data
rand_search.fit(X_res, np.log(y_res + 1), sample_weight = relevance)

print(f"Best parameters: {rand_search.best_params_, rand_search.best_score_}")

with open(r"./best_parameters.txt", "w") as f:
    f.write(f"Best parameters: {rand_search.best_params_, rand_search.best_score_}")
    

In [21]:
%%time
delta = 0.5
k = 2

rare_y_threshold = 1.5

print(f"Given Training shape : {X.shape}")

relevance = sigmoid_relevance(np.log10(y), k, 1.5)
X_res, y_res, day_res, rel_res = wercs(X, y, day, relevance)

print(f"Resampled shape : {X_res.shape}")

kfold = sklearn.model_selection.GroupShuffleSplit(n_splits=1, test_size=0.2)
train_idx, test_idx = next(kfold.split(X_res, groups=day_res))

X_train = X_res[train_idx, :]
y_train = y_res[train_idx]
X_test = X_res[test_idx, :]
y_test = y_res[test_idx]
rel_train = rel_res[train_idx]

print(f"Resampled Training shape : {X_train.shape}")
print(f"Resampled Testing shape : {X_test.shape}")

ax0 = sns.displot(y_train, log_scale=True)
ax0.set(ylabel='N', xlabel='Chorus (pT)', title='Training Set')
plt.tight_layout()

ax1 = sns.displot(y_test, log_scale=True)
ax1.set(ylabel='N', xlabel='Chorus (pT)', title='Testing Set')
plt.tight_layout()

X_train, y_train = add_gaussian_noise(X_train, y_train, delta=delta)

print(f"Number Removed after adding noise: {np.sum(y_train < 0)}")

y_train[y_train < 0] = np.nan
where_finite = np.isfinite(y_train)
X_train = X_train[where_finite, :]
y_train = y_train[where_finite]
rel_train =rel_train[where_finite]

ax2 = sns.displot(y_train, log_scale=True)
ax2.set(ylabel='N', xlabel='Chorus (pT)', title='Training Set')
plt.tight_layout()

Given Training shape : (1258506, 5)
Resampled shape : (1258506, 5)
Resampled Training shape : (1014277, 5)
Resampled Testing shape : (244229, 5)
Number Removed after adding noise: 0
CPU times: total: 2.23 s
Wall time: 2.4 s


In [48]:
weights = sigmoid_relevance(np.log10(y_train), 1, 1.5)

order = np.argsort(y_train)
plt.plot(y_train[order], weights[order])
plt.xscale("log")

In [49]:
%%time
#Best parameters: ({'max_depth': 9, 'max_features': 2, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 143}, np.float64(0.2881147380512229))

regr = sklearn.ensemble.RandomForestRegressor(max_depth = 10, 
                                              max_features = 2, 
                                              min_samples_leaf = 3,  
                                              min_samples_split = 6,
                                              n_estimators = 500)

regr.fit(X_train, np.log(y_train), sample_weight = weights)

CPU times: total: 8min 59s
Wall time: 1min 11s


In [50]:
#y_train_pred = 10 ** regr.predict(X_train) - 1
#y_test_pred = 10 ** regr.predict(X_test) - 1
#y_valid_pred = 10 ** regr.predict(X_valid) - 1


#y_train_pred = (1000 ** regr.predict(X_train)) - 1
#y_test_pred = (1000 ** regr.predict(X_test)) - 1
#y_valid_pred = (1000 ** regr.predict(X_valid)) - 1

#y_train_pred = (regr.predict(X_train) * b) + a
#y_test_pred = (regr.predict(X_test) * b) + a
#y_valid_pred = (regr.predict(X_valid) * b) + a

y_train_pred = np.exp(regr.predict(X_train))
y_test_pred = np.exp(regr.predict(X_test))
y_valid_pred = np.exp(regr.predict(X_valid))

#y_train_pred = regr.predict(X_train)
#y_test_pred = regr.predict(X_test)
#y_valid_pred = regr.predict(X_valid)

In [51]:
log_scale_axis = True
normalize_columns = True
grid_resolution = 100
plot_min = 1.0
plot_max = 1000
p_cbar_min = 0.00001
n_cbar_max = 10000

In [52]:
within_bounds = (plot_min < y_train) & (y_train < plot_max) & (plot_min < y_train_pred) & (y_train_pred < plot_max)

fig = plt.figure(figsize=(12,8))
fig.set_facecolor('black')
params = {"ytick.color" : "w",
          "xtick.color" : "w",
          "axes.labelcolor" : "w",
          "axes.edgecolor" : "w",
          "axes.titlecolor" : "w"}
plt.rcParams.update(params)

if log_scale_axis:
    x_bins = np.logspace(np.log10(plot_min), np.log10(plot_max), grid_resolution)
    y_bins = np.logspace(np.log10(plot_min), np.log10(plot_max), grid_resolution)
else:
    x_bins = np.linspace(plot_min, plot_max, grid_resolution)
    y_bins = np.linspace(plot_min, plot_max, grid_resolution)

x_mapping = np.digitize(y_train[within_bounds], x_bins, right=False) - 1
y_mapping = np.digitize(y_train_pred[within_bounds], y_bins, right=False) - 1

image_to_plot = np.zeros(shape=(x_bins.shape[0], y_bins.shape[0]))

for T in range(len(y_train_pred[within_bounds])):

    x_bin = x_mapping[T]
    y_bin = y_mapping[T]
    image_to_plot[x_bin, y_bin] += 1

if normalize_columns:

    for x_bin in range(image_to_plot.shape[0]):

        image_to_plot[x_bin, :] /= np.sum(np.sum(image_to_plot, axis=1))

    ax = fig.add_subplot(1, 1, 1)
    image = ax.imshow(
        image_to_plot.T,
        origin="lower",
        extent=[x_bins[0], x_bins[-1], y_bins[0], y_bins[-1]],
        norm=matplotlib.colors.LogNorm(vmin=p_cbar_min, vmax=1),
        aspect="equal",
        interpolation="none",
    )

    image.cmap.set_under("black")
    plt.axis('off')

    cbar = fig.colorbar(image, pad=0.01)
    cbar.set_label("Probability\n", labelpad = 20, loc="center", rotation=270)

else:

    ax = fig.add_subplot(1, 1, 1)
    image = ax.imshow(
        image_to_plot.T,
        origin="lower",
        extent=[x_bins[0], x_bins[-1], y_bins[0], y_bins[-1]],
        norm=matplotlib.colors.LogNorm(vmin=1, vmax=n_cbar_max),
        aspect="equal",
        interpolation="none")

    image.cmap.set_under("black")
    plt.axis('off')

    cbar = fig.colorbar(image, pad=0.01)
    cbar.set_label("Number of Points\n", labelpad = 20, loc="center", rotation=270)

newax = fig.add_axes(ax.get_position(), frameon=False)
newax.set_xlim((x_bins[0], x_bins[-1]))
newax.set_ylim((y_bins[0], y_bins[-1]))

if log_scale_axis:
    newax.set_xscale('log')
    newax.set_yscale('log')


newax.set_xlabel("RBSP OBSERVED CHORUS")
newax.set_ylabel("MODEL PREDICTED CHORUS")
newax.set_title("TRAINING SET")

newax.plot(x_bins, y_bins, color="white")
newax.grid()
matplotlib.pyplot.rcdefaults()


In [53]:
add_testing_set = True

within_bounds = (plot_min < y_valid) & (y_valid < plot_max) & (plot_min < y_valid_pred) & (y_valid_pred < plot_max)
within_bounds_t = (plot_min < y_test) & (y_test < plot_max) & (plot_min < y_test_pred) & (y_test_pred < plot_max)

fig = plt.figure(figsize=(12,8))
fig.set_facecolor('black')
params = {"ytick.color" : "w",
          "xtick.color" : "w",
          "axes.labelcolor" : "w",
          "axes.edgecolor" : "w",
          "axes.titlecolor" : "w"}
plt.rcParams.update(params)

if log_scale_axis:
    x_bins = np.logspace(np.log10(plot_min), np.log10(plot_max), grid_resolution)
    y_bins = np.logspace(np.log10(plot_min), np.log10(plot_max), grid_resolution)
else:
    x_bins = np.linspace(plot_min, plot_max, grid_resolution)
    y_bins = np.linspace(plot_min, plot_max, grid_resolution)

x_mapping = np.digitize(y_valid[within_bounds], x_bins, right=False) - 1
y_mapping = np.digitize(y_valid_pred[within_bounds], y_bins, right=False) - 1

image_to_plot = np.zeros(shape=(x_bins.shape[0], y_bins.shape[0]))

for T in range(len(y_valid_pred[within_bounds])):

    x_bin = x_mapping[T]
    y_bin = y_mapping[T]
    image_to_plot[x_bin, y_bin] += 1

if add_testing_set:

    x_mapping = np.digitize(y_test[within_bounds_t], x_bins, right=False) - 1
    y_mapping = np.digitize(y_test_pred[within_bounds_t], y_bins, right=False) - 1

    for T in range(len(y_test_pred[within_bounds_t])):

        x_bin = x_mapping[T]
        y_bin = y_mapping[T]
        image_to_plot[x_bin, y_bin] += 1

if normalize_columns:

    for x_bin in range(image_to_plot.shape[0]):

        image_to_plot[x_bin, :] /= np.sum(np.sum(image_to_plot, axis=1))

    ax = fig.add_subplot(1, 1, 1)
    image = ax.imshow(
        image_to_plot.T,
        origin="lower",
        extent=[x_bins[0], x_bins[-1], y_bins[0], y_bins[-1]],
        norm=matplotlib.colors.LogNorm(vmin=p_cbar_min, vmax=1),
        aspect="equal",
        interpolation="none",
    )

    image.cmap.set_under("black")
    plt.axis('off')

    cbar = fig.colorbar(image, pad=0.01)
    cbar.set_label("Probability\n", labelpad = 20, loc="center", rotation=270)

else:

    ax = fig.add_subplot(1, 1, 1)
    image = ax.imshow(
        image_to_plot.T,
        origin="lower",
        extent=[x_bins[0], x_bins[-1], y_bins[0], y_bins[-1]],
        norm=matplotlib.colors.LogNorm(vmin=1, vmax=n_cbar_max),
        aspect="equal",
        interpolation="none")

    image.cmap.set_under("black")
    plt.axis('off')

    cbar = fig.colorbar(image, pad=0.01)
    cbar.set_label("Number of Points\n", labelpad = 20, loc="center", rotation=270)

newax = fig.add_axes(ax.get_position(), frameon=False)
newax.set_xlim((x_bins[0], x_bins[-1]))
newax.set_ylim((y_bins[0], y_bins[-1]))

if log_scale_axis:
    newax.set_xscale('log')
    newax.set_yscale('log')

newax.set_xlabel("RBSP OBSERVED CHORUS")
newax.set_ylabel("MODEL PREDICTED CHORUS")

newax.set_title("Validation + Testing Set")

newax.plot(x_bins, y_bins, color="white")
newax.grid()
matplotlib.pyplot.rcdefaults()