In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pynndescent

%matplotlib qt

In [2]:
# --- Configuration: selects which processed dataset file is loaded ----------
VERSION = "v2"
FIELD_MODEL = "T89"
MODEL_TYPE = "LOWER_BAND"
pdata_folder = os.path.abspath(r"./../processed_data/chorus_neural_network/")
STAGE_4_folder = os.path.join(pdata_folder, "STAGE_4", VERSION)

# Read every array inside a `with` block so the .npz file handle is released
# even when one of the expected keys is missing (a KeyError would otherwise
# skip the explicit close). Arrays are materialised on access, so they remain
# usable after the archive is closed.
with np.load(
    file=os.path.join(STAGE_4_folder, f"MODEL_READY_DATA_{VERSION}_{FIELD_MODEL}_{MODEL_TYPE}.npz")
) as CONJUNCTIONS_REFS:

    # Training split: feature matrix plus flattened (1-D) labels and day ids.
    TRAINING_FEATURES = CONJUNCTIONS_REFS["FEATURES"]
    TRAINING_LABELS = CONJUNCTIONS_REFS["LABELS"].flatten()
    TRAINING_DAY_IDS = CONJUNCTIONS_REFS["TRAINING_DAY_IDS"].flatten()

    # Auxiliary arrays stored alongside the training data.
    # NOTE(review): presumably MLT values and L normalisation statistics — confirm
    # against the notebook/stage that wrote this file.
    TRAINING_MLT = CONJUNCTIONS_REFS["TRAINING_MLT"]
    MEAN_L = CONJUNCTIONS_REFS["MEAN_L"]
    STD_L = CONJUNCTIONS_REFS["STD_L"]

    # Held-out validation split, same layout as the training split.
    VALIDATION_FEATURES = CONJUNCTIONS_REFS["VALIDATION_FEATURES"]
    VALIDATION_LABELS = CONJUNCTIONS_REFS["VALIDATION_LABELS"].flatten()
    VALIDATION_DAY_IDS = CONJUNCTIONS_REFS["VALIDATION_DAY_IDS"].flatten()

print(f"Training set shape: {TRAINING_FEATURES.shape, TRAINING_LABELS.shape}")
print(f"Validation set shape: {VALIDATION_FEATURES.shape, VALIDATION_LABELS.shape}")

Training set shape: ((684250, 8), (684250,))
Validation set shape: ((21435, 8), (21435,))


In [3]:
num_folds = 5

# Every stochastic step in this cell (fold assignment, oversampling, shuffling)
# draws from this single seeded generator so that Restart & Run All reproduces
# the same folds and the same resampled training set. A legacy RandomState is
# used because scikit-learn's `random_state` arguments accept it directly.
RANDOM_SEED = 42
rng = np.random.RandomState(RANDOM_SEED)


def _label_bin_positions(labels):
    """Return the positions of `labels` that fall in each of the four label bins.

    The bins are [0.1, 1), [1, 10), [10, 100) and [100, inf). Labels below 0.1
    belong to no bin and are therefore absent from every returned array.

    Parameters
    ----------
    labels : 1-D numpy array of label values.

    Returns
    -------
    tuple of four integer index arrays (positions into `labels`), in bin order.
    """
    return (
        np.nonzero((0.1 <= labels) & (labels < 1))[0],
        np.nonzero((1 <= labels) & (labels < 10))[0],
        np.nonzero((10 <= labels) & (labels < 100))[0],
        np.nonzero(100 <= labels)[0],
    )


def _oversample(indices, n_samples):
    """Draw `n_samples` entries from `indices` with replacement, using the seeded `rng`."""
    return sklearn.utils.resample(indices, n_samples=n_samples, random_state=rng)


# Split by day id so that samples from one day never appear on both sides of a fold.
kfold_gen = sklearn.model_selection.GroupKFold(n_splits=num_folds, shuffle=True, random_state=rng)
folds_separating_days = list(
    kfold_gen.split(TRAINING_FEATURES, TRAINING_LABELS, groups=TRAINING_DAY_IDS)
)

folds = []

for fold in range(num_folds):

    train = folds_separating_days[fold][0]

    # Positions (relative to `train`) of each label bin inside this fold's training part.
    train_b_p1_1, train_b_1_10, train_b_10_100, train_g_100 = _label_bin_positions(
        TRAINING_LABELS[train]
    )

    test = folds_separating_days[fold][1]

    print(f"\nRelative Sizes in fold : {fold}")
    print("Training: ")
    print(
        len(train_b_p1_1),
        len(train_b_1_10),
        len(train_b_10_100),
        len(train_g_100),
    )

    largest_size_training = max(
        len(train_b_p1_1),
        len(train_b_1_10),
        len(train_b_10_100),
        len(train_g_100),
    )

    # Oversample every bin up to the size of the largest one so that the
    # fold's training indices are balanced across the four bins.
    train_p1_1_resampled = _oversample(train[train_b_p1_1], largest_size_training)
    train_1_10_resampled = _oversample(train[train_b_1_10], largest_size_training)
    train_10_100_resampled = _oversample(train[train_b_10_100], largest_size_training)
    train_g_100_resampled = _oversample(train[train_g_100], largest_size_training)

    train_for_fold = np.hstack(
        (train_p1_1_resampled, train_1_10_resampled, train_10_100_resampled, train_g_100_resampled)
    )

    # In-place shuffles; the test indices are left unbalanced on purpose of the original design.
    rng.shuffle(train_for_fold)
    rng.shuffle(test)

    folds.append((train_for_fold, test))

# Same balancing, applied to the complete training set (no fold hold-out).
b_p1_1, b_1_10, b_10_100, g_100 = _label_bin_positions(TRAINING_LABELS)

largest_size_training_full = max(len(b_p1_1), len(b_1_10), len(b_10_100), len(g_100))

full_p1_1_resampled = _oversample(b_p1_1, largest_size_training_full)
full_1_10_resampled = _oversample(b_1_10, largest_size_training_full)
full_10_100_resampled = _oversample(b_10_100, largest_size_training_full)
full_g_100_resampled = _oversample(g_100, largest_size_training_full)

RESAMPLED_TRAINING_FEATURES = np.vstack(
    [
        TRAINING_FEATURES[full_p1_1_resampled, :],
        TRAINING_FEATURES[full_1_10_resampled, :],
        TRAINING_FEATURES[full_10_100_resampled, :],
        TRAINING_FEATURES[full_g_100_resampled, :],
    ]
)


RESAMPLED_TRAINING_LABELS = np.hstack(
    [
        TRAINING_LABELS[full_p1_1_resampled],
        TRAINING_LABELS[full_1_10_resampled],
        TRAINING_LABELS[full_10_100_resampled],
        TRAINING_LABELS[full_g_100_resampled],
    ]
)


# Apply one common permutation to features and labels so rows stay aligned.
indices_for_shuffle = np.arange(0, len(RESAMPLED_TRAINING_LABELS))
rng.shuffle(indices_for_shuffle)
shuffled_indices = indices_for_shuffle.flatten()

RESAMPLED_TRAINING_FEATURES = RESAMPLED_TRAINING_FEATURES[shuffled_indices, :]
RESAMPLED_TRAINING_LABELS = RESAMPLED_TRAINING_LABELS[shuffled_indices]

# Per-bin weights: bin size relative to the largest bin, then normalised to sum to 1.
# NOTE(review): these weights are proportional to how common a bin is, so the
# rarest bin (>= 100) gets the smallest weight. If the goal is to counteract the
# class imbalance, inverse-frequency weights would be expected — confirm intent.
weight_for_p1_1 = len(b_p1_1) / largest_size_training_full
weight_for_1_10 = len(b_1_10) / largest_size_training_full
weight_for_10_100 = len(b_10_100) / largest_size_training_full
weight_for_g_100 = len(g_100) / largest_size_training_full

sum_weights = np.sum([weight_for_p1_1, weight_for_1_10, weight_for_10_100, weight_for_g_100])

weight_for_p1_1 /= sum_weights
weight_for_1_10 /= sum_weights
weight_for_10_100 /= sum_weights
weight_for_g_100 /= sum_weights

print("\nWeights for each category")
print(weight_for_p1_1, weight_for_1_10, weight_for_10_100, weight_for_g_100)

# Per-sample weights for the original training set; samples outside every bin keep weight 0.
# The array inherits the dtype of TRAINING_LABELS (assumed floating point — TODO confirm).
weights = np.zeros_like(TRAINING_LABELS)
weights[b_p1_1] = weight_for_p1_1
weights[b_1_10] = weight_for_1_10
weights[b_10_100] = weight_for_10_100
weights[g_100] = weight_for_g_100

# Per-sample weights for the resampled (balanced and shuffled) training set.
weights_resampled = np.zeros_like(RESAMPLED_TRAINING_LABELS)
weights_resampled[(0.1 <= RESAMPLED_TRAINING_LABELS) & (RESAMPLED_TRAINING_LABELS < 1)] = (
    weight_for_p1_1
)
weights_resampled[(1 <= RESAMPLED_TRAINING_LABELS) & (RESAMPLED_TRAINING_LABELS < 10)] = (
    weight_for_1_10
)
weights_resampled[(10 <= RESAMPLED_TRAINING_LABELS) & (RESAMPLED_TRAINING_LABELS < 100)] = (
    weight_for_10_100
)
weights_resampled[(100 <= RESAMPLED_TRAINING_LABELS)] = weight_for_g_100

print("\nSize of resampled Training Features: ")
print(RESAMPLED_TRAINING_FEATURES.shape, RESAMPLED_TRAINING_LABELS.shape)

print("\nFolds")
for fold in folds:
    print(fold)


Relative Sizes in fold : 0
Training: 
30615 437412 79814 959

Relative Sizes in fold : 1
Training: 
31123 435519 80506 1004

Relative Sizes in fold : 2
Training: 
31018 436522 79145 948

Relative Sizes in fold : 3
Training: 
29565 436828 80339 1000

Relative Sizes in fold : 4
Training: 
30507 432811 80300 1037

Weights for each category
0.055838349826012094 0.7961689049065901 0.14618490799321293 0.0018078372741847555

Size of resampled Training Features: 
(2179092, 8) (2179092,)

Folds
(array([630306,  99618, 162356, ..., 240148, 251754, 141974]), array([324929, 111737, 260037, ..., 155944, 140797, 247473]))
(array([105461, 264271, 183962, ...,   9577,   6799,  31357]), array([474409, 244073, 429581, ...,  17979,  26998, 283801]))
(array([ 41230,  17155,  21417, ..., 107782, 313005,  23017]), array([280002,  98227, 512039, ..., 549737, 119475, 460161]))
(array([ 60837,  12847, 237217, ..., 385834, 683542,  61651]), array([268048, 268057, 501432, ..., 625014, 308867, 134281]))
(array([

In [4]:
# Exploratory look at the leading three principal components of the
# class-balanced training features.
pca = sklearn.decomposition.PCA(n_components=3)
transformed_training_features = pca.fit_transform(RESAMPLED_TRAINING_FEATURES)

# Singular values first, then the component loadings (one row per component).
for fitted_attribute in (pca.singular_values_, pca.components_):
    print(fitted_attribute)

[5693.50982843 4839.4878028  3447.70528141]
[[ 0.00177569  0.98861093  0.01122547  0.10243987 -0.0622544   0.08093784
  -0.01283851  0.03786777]
 [ 0.99744499 -0.0118851   0.00207288  0.04019308 -0.0221341   0.0456874
   0.01286489  0.0244886 ]
 [-0.06565463 -0.13969325  0.05540332  0.6114741  -0.55933932  0.49426991
   0.19310677  0.06893274]]


In [5]:
# Project both data sets onto the same low-dimensional PCA basis.
num_principal_components = 4
pca = sklearn.decomposition.PCA(n_components=num_principal_components)

# The PCA (centring + component directions) is learned from the training data only.
transformed_training_features = pca.fit_transform(RESAMPLED_TRAINING_FEATURES)

# Validation data must be projected with the already-fitted PCA. Calling
# `fit_transform` here would refit on the validation set, placing its points in
# a different coordinate system than the training points they are later
# compared against, and would leave `pca` fitted on validation data.
transformed_validation_features = pca.transform(VALIDATION_FEATURES)

In [51]:
%%time

# Build the approximate nearest-neighbour index over the PCA-projected
# training features, using 10 parallel jobs.
knn = pynndescent.NNDescent(
    data=transformed_training_features,
    n_jobs=10,
)
# Finish index preparation up front so the cost is paid here rather than
# during the first query.
knn.prepare()

CPU times: total: 12min 20s
Wall time: 4min 46s


In [81]:
%%time

# Look up the 10 nearest training points for every validation point.
# The result has two entries (neighbour indices, neighbour distances).
validation_prediction = knn.query(
    query_data=transformed_validation_features,
    k=10,
)

CPU times: total: 391 ms
Wall time: 393 ms


In [82]:
print(len(validation_prediction))


def _inverse_distance_weighted_mean(neighbor_labels, neighbor_distances):
    """Average neighbour labels with weights proportional to 1 / distance.

    Parameters
    ----------
    neighbor_labels : array of shape (n_queries, k)
        Label of each neighbour of each query point.
    neighbor_distances : array of shape (n_queries, k)
        Distance from each query point to the corresponding neighbour.

    Returns
    -------
    numpy array of shape (n_queries,) with one weighted-mean label per query.

    A query that coincides exactly with one or more neighbours (distance 0)
    returns the plain mean of those neighbours' labels. Without this case,
    1 / 0 yields an infinite weight and the normalised weights become NaN.
    """
    neighbor_labels = np.asarray(neighbor_labels, dtype=float)
    neighbor_distances = np.asarray(neighbor_distances, dtype=float)

    exact_match = neighbor_distances == 0
    row_has_exact_match = exact_match.any(axis=1, keepdims=True)

    # Division by zero is expected here; those rows are overridden just below.
    with np.errstate(divide="ignore"):
        inverse_distance = 1.0 / neighbor_distances

    neighbor_weights = np.where(row_has_exact_match, exact_match.astype(float), inverse_distance)

    return np.sum(neighbor_weights * neighbor_labels, axis=1) / np.sum(neighbor_weights, axis=1)


# validation_prediction holds (neighbour indices, neighbour distances); the
# indices refer to rows of the resampled training set the index was built on.
validation_neighbor_indices, validation_neighbor_distances = validation_prediction

# Kept as a list (one value per validation sample) as in the original cell.
predictions = list(
    _inverse_distance_weighted_mean(
        RESAMPLED_TRAINING_LABELS[validation_neighbor_indices],
        validation_neighbor_distances,
    )
)


2


In [84]:
# Observed vs. predicted labels on the validation set, on log-log axes.
ax = plt.gca()
ax.scatter(VALIDATION_LABELS, predictions, s=2.0)

# Black y = x reference line: points on it are perfect predictions.
identity_x = np.logspace(-2, 4)
identity_y = np.logspace(-2, 4)
ax.plot(identity_x, identity_y, color="black")

ax.grid()
ax.set_xlabel("RBSP OBSERVED CHORUS")
ax.set_ylabel("MODEL PREDICTED CHORUS")
ax.set_title("VALIDATION SET")

# Limits are set before switching to log scale, in the same order as before.
ax.set_xlim(1e-1, 1e3)
ax.set_ylim(1e-1, 1e3)
ax.set_xscale("log")
ax.set_yscale("log")