In [18]:
import os
import sys

# caution: path[0] is reserved for script path (or '' in REPL).
sys.path.insert(1, os.path.abspath("./../src"))

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from sklearnex import patch_sklearn

patch_sklearn()
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import KBinsDiscretizer
import sklearn
from imblearn.over_sampling import RandomOverSampler

import datetime
import importlib

import scipy.special
import scipy.stats

import dataset_stats as ds
import heteroscedastic_uncertainty_regressor as hr
import plot_tools

# Re-import the project-local modules so that edits made to them on disk are
# picked up without restarting the kernel.
importlib.reload(hr)
importlib.reload(plot_tools)
importlib.reload(ds)

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings (e.g. from
# scikit-learn) are hidden as well — confirm that is intended.
warnings.filterwarnings("ignore")
%matplotlib qt

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [19]:
# Run configuration: selects which saved dataset file is loaded below.
VERSION = "v1"
MODEL_TYPE = "LOWER_BAND"

pdata_folder = os.path.abspath("./../processed_data/chorus_neural_network/")
rbsp_chorus_folder = os.path.join(pdata_folder, "observed_chorus")
output_folder = os.path.join(pdata_folder, "models", VERSION)

# The context manager guarantees the .npz handle is closed even when one of the
# expected keys is missing and the lookup raises.
with np.load(
    file=os.path.join(output_folder, rf"total_dataset_{VERSION}_{MODEL_TYPE}.npz")
) as dataset:
    X_conditional = dataset["X_conditional"]
    X_convolutional = dataset["X_convolutional"]
    # The stored target is square-rooted before any further use.
    y = np.sqrt(dataset["y"])

# Keep only the samples whose (square-rooted) target is non-zero.
y_is_nonzero = y != 0
# Legacy name: despite what it says, this mask has always selected the NON-zero
# targets. Kept as an alias so cells that still reference it keep working.
where_y_is_zero = y_is_nonzero
X_conditional = X_conditional[y_is_nonzero, :]
X_convolutional = X_convolutional[y_is_nonzero, :]
y = y[y_is_nonzero]

# The last conditional column is split off as t_data (it is later passed to
# datetime.fromtimestamp); the remaining columns are the model features.
t_data = X_conditional[:, -1]
X_conditional = X_conditional[:, :-1]

In [20]:
# Build the cross-validation group label for every sample: the number of whole
# days between the sample's timestamp and the reference date.
# NOTE(review): fromtimestamp() without a tz argument converts to the machine's
# local time zone, so day boundaries depend on where this runs — confirm that
# is intended.
ref_date = datetime.datetime(year=2012, month=1, day=1)
g_data = np.zeros_like(y)
for sample_idx, timestamp in enumerate(t_data):
    elapsed = datetime.datetime.fromtimestamp(timestamp) - ref_date
    g_data[sample_idx] = elapsed.days

In [21]:
# Train and predict function
def train_predict(X_train, X_test, y_train):
    """Fit a class-balanced logistic regression and score the test features.

    Parameters
    ----------
    X_train, y_train : training features and binary labels passed to ``fit``.
    X_test : features for which predictions are produced.

    Returns
    -------
    tuple
        ``(y_pred, y_prob, model)``: hard predictions for ``X_test``, the
        second column of ``predict_proba`` (positive class), and the fitted
        estimator.
    """
    classifier = LogisticRegression(
        random_state=42,
        solver="liblinear",
        class_weight="balanced",
        max_iter=1000,
    )
    classifier.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    positive_probability = classifier.predict_proba(X_test)[:, 1]
    hard_labels = classifier.predict(X_test)

    return hard_labels, positive_probability, classifier


# Evaluation function
def evaluate_model(y_test, y_pred, y_prob):
    """Compute, print and return binary-classification metrics.

    Parameters
    ----------
    y_test : true binary labels.
    y_pred : predicted binary labels.
    y_prob : predicted probability of the positive class.

    Returns
    -------
    dict
        Maps "Accuracy", "Precision", "Recall", "F1 Score" and "ROC AUC" to
        their values. "ROC AUC" is NaN when ``y_test`` contains fewer than two
        distinct classes, because the score is undefined in that case.
    """
    # ROC AUC needs both classes in the ground truth. A sparse bin can leave a
    # test fold with a single class, so guard instead of letting one fold abort
    # the whole cross-validation loop.
    if np.unique(y_test).size < 2:
        roc_auc = float("nan")
    else:
        roc_auc = roc_auc_score(y_test, y_prob)

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0 yields the same 0.0 as the default, without relying
        # on the warning path when no positives are predicted or present.
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, zero_division=0),
        "ROC AUC": roc_auc,
    }

    # Print evaluation metrics
    print("Model Evaluation Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    return metrics

In [22]:
# Target bins as (lower, upper]; one binary "is y inside this bin?" classifier
# is trained per bin and per fold.
bounds = {0: (0, 0.1), 1: (0.1, 1.0), 2: (1.0, 10.0), 3: (10.0, 100.0), 4: (100.0, 1000.0)}

# Number of test samples echoed after each fit.
N_PREVIEW_SAMPLES = 5

# random_state makes the shuffled group-to-fold assignment reproducible.
kfold = sklearn.model_selection.GroupKFold(n_splits=5, shuffle=True, random_state=42)

# Per-bin lists with one entry per fold.
trained_models = {group: [] for group in bounds}
scalers = {group: [] for group in bounds}
fold_metrics = {group: [] for group in bounds}

# Only the length of the first split() argument matters for GroupKFold, so y is
# passed in place of the feature matrix.
for fold, (train_idx, test_idx) in enumerate(kfold.split(y, groups=g_data)):

    # Standardize the features ONCE per fold, on the raw training features.
    # Rescaling inside the per-bin loop would fit every later scaler on
    # already-scaled data, making the stored scalers unusable on raw features.
    scaler = RobustScaler()
    X_train = scaler.fit_transform(X_conditional[train_idx, :])
    X_test = scaler.transform(X_conditional[test_idx, :])

    y_train = y[train_idx]
    y_test = y[test_idx]

    for group, (lower, upper) in bounds.items():

        print(f'Making classifier for: {bounds[group]}')

        if (lower, upper) == (0, 0):
            # Special case for an exact-zero bin; no entry in `bounds`
            # currently triggers this branch.
            y_train_bool = (y_train == 0).astype(np.int32)
            y_test_bool = (y_test == 0).astype(np.int32)
        else:
            y_train_bool = ((lower < y_train) & (y_train <= upper)).astype(np.int32)
            y_test_bool = ((lower < y_test) & (y_test <= upper)).astype(np.int32)

        # Every bin of this fold shares the fold's scaler, so
        # scalers[group][fold] always matches trained_models[group][fold].
        scalers[group].append(scaler)

        # Train model and get predictions
        y_pred, y_prob, model = train_predict(X_train, X_test, y_train_bool)

        # Evaluate model and keep the scores instead of discarding them
        metrics = evaluate_model(y_test_bool, y_pred, y_prob)
        fold_metrics[group].append(metrics)

        trained_models[group].append(model)

        # Print sample probabilities (bounded so a tiny test fold cannot raise)
        print("\nSample Probabilities (Positive Class):")
        for i in range(min(N_PREVIEW_SAMPLES, len(y_prob))):
            print(f"Sample {i+1}: {y_prob[i]:.4f}")
            print(f"Actual: {i+1} : {y_test_bool[i]:.4f}")


Making classifier for: (0, 0.1)
Model Evaluation Metrics:
Accuracy: 0.7576
Precision: 0.0029
Recall: 0.7411
F1 Score: 0.0057
ROC AUC: 0.8146

Sample Probabilities (Positive Class):
Sample 1: 0.4759
Actual: 1 : 0.000000
Sample 2: 0.4261
Actual: 2 : 0.000000
Sample 3: 0.4549
Actual: 3 : 0.000000
Sample 4: 0.4910
Actual: 4 : 0.000000
Sample 5: 0.4297
Actual: 5 : 0.000000
Making classifier for: (0.1, 1.0)
Model Evaluation Metrics:
Accuracy: 0.5080
Precision: 0.3442
Recall: 0.5652
F1 Score: 0.4279
ROC AUC: 0.5408

Sample Probabilities (Positive Class):
Sample 1: 0.5551
Actual: 1 : 1.000000
Sample 2: 0.5383
Actual: 2 : 0.000000
Sample 3: 0.5353
Actual: 3 : 0.000000
Sample 4: 0.5332
Actual: 4 : 0.000000
Sample 5: 0.5378
Actual: 5 : 0.000000
Making classifier for: (1.0, 10.0)
Model Evaluation Metrics:
Accuracy: 0.5350
Precision: 0.6458
Recall: 0.5774
F1 Score: 0.6097
ROC AUC: 0.5293

Sample Probabilities (Positive Class):
Sample 1: 0.4788
Actual: 1 : 0.000000
Sample 2: 0.4930
Actual: 2 : 1.000

In [None]:
print(trained_models)

# Standardize the features under a new name so the raw matrix stays available
# and re-running this cell starts from the same input.
# NOTE(review): this scaler is fit on the full dataset, whereas each stored
# model was trained on features scaled by a RobustScaler fit on its own
# training fold — confirm the approximation is acceptable or reuse the
# per-fold scalers.
scaler = RobustScaler()
X_conditional_scaled = scaler.fit_transform(X_conditional)

# Only inspect samples whose target exceeds 10.
y_considered = y > 10
X_considered = X_conditional_scaled[y_considered, :]
y_observed = y[y_considered]

# The original loop printed points 0..1001 before breaking.
MAX_POINTS_SHOWN = 1002

# group_probabilities[row, point] = mean over the fold models of
# P(point belongs to bin group_ids[row]). Bins without any model stay at 0.
group_ids = list(trained_models)
group_probabilities = np.zeros((len(group_ids), X_considered.shape[0]))
if X_considered.shape[0] > 0:
    for row, group in enumerate(group_ids):
        # Score all considered points once per model rather than once per
        # point, and average across models only (axis=0), not across points.
        fold_probabilities = [
            model.predict_proba(X_considered)[:, 1]  # Probability of positive class
            for model in trained_models[group]
        ]
        if fold_probabilities:
            group_probabilities[row] = np.nanmean(fold_probabilities, axis=0)

# Most probable bin per point. argmax is taken over a real array; passing
# dict.values() to np.argmax would always return 0.
best_rows = np.argmax(group_probabilities, axis=0)

for row, observed in zip(best_rows[:MAX_POINTS_SHOWN], y_observed[:MAX_POINTS_SHOWN]):
    print(group_ids[row])
    print(observed)