# Utils

In [1]:
#|default_exp utils

In [2]:
#| export
import numpy as np
import pandas as pd
from fastcore.all import *
from tsai.basics import SlidingWindow
from tsai.utils import load_object
from collections import Counter
from itertools import combinations, chain
import more_itertools as mit
from tqdm import tqdm
import requests
import papermill as pm
import nbformat

In [3]:
#|export

def get_idxs_per_solar_activity_level(data, thresholds):
    # function that splits the data of a variable into 4 different sets, 
    # one for each solar activity level. The data comes as a numpy array with 
    # shape (samples, steps), and the split is done along the samples axis. 
    # The decision is made based on the first column of each sample. The function 
    # returns a list of 4 numpy arrays, one for each solar activity level. 
    # But it does not return the values, it returns the indices of the
    # samples that belong to each solar activity level.
    idxs_per_solar_activity_level = []
    for i in range(len(thresholds) + 1):
        if i == 0:
            idxs = np.where(data[:, 0] <= thresholds[i])[0]
        elif i == len(thresholds):
            idxs = np.where(data[:, 0] > thresholds[i-1])[0]
        else:
            idxs = np.where((data[:, 0] > thresholds[i-1]) & (data[:, 0] <= thresholds[i]))[0]
        idxs_per_solar_activity_level.append(idxs)
    return idxs_per_solar_activity_level

In [4]:
# Test

# Sample data: 10 samples, each with 5 steps
data = np.array([
    [10, 1, 2, 3, 4],
    [15, 1, 2, 3, 4],
    [25, 1, 2, 3, 4],
    [35, 1, 2, 3, 4],
    [45, 1, 2, 3, 4],
    [55, 1, 2, 3, 4],
    [65, 1, 2, 3, 4],
    [75, 1, 2, 3, 4],
    [85, 1, 2, 3, 4],
    [95, 1, 2, 3, 4]
])

# Thresholds to define solar activity levels
thresholds = [20, 40, 60]

# Call the function with the test data
splits = get_idxs_per_solar_activity_level(data, thresholds)

# Print the resulting splits
activity_levels = ['Low', 'Moderate', 'Elevated', 'High']

for i, level in enumerate(activity_levels):
    print(f"{level} Solar Activity Level Indices: {splits[i]}")


Low Solar Activity Level Indices: [0 1]
Moderate Solar Activity Level Indices: [2 3]
Elevated Solar Activity Level Indices: [4 5]
High Solar Activity Level Indices: [6 7 8 9]


In [5]:
#|export
def convert_uuids_to_indices():
    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    uuids = re.findall(r"\b[0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}\b", cuda_visible_devices)

    if uuids:
        indices = [str(i) for i in range(len(uuids))]
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(indices)

In [6]:
# Test
def test_convert_uuids_to_indices():
    # Mock the CUDA_VISIBLE_DEVICES environment variable with UUIDs
    os.environ["CUDA_VISIBLE_DEVICES"] = "GPU-123e4567-e89b-12d3-a456-426614174000,GPU-89e4567e-b89b-12d3-a456-426614174111"
    
    # Call the function to convert UUIDs to indices
    convert_uuids_to_indices()
    
    # Print the modified CUDA_VISIBLE_DEVICES
    print("Modified CUDA_VISIBLE_DEVICES:", os.environ["CUDA_VISIBLE_DEVICES"])

# Run the test
test_convert_uuids_to_indices()

Modified CUDA_VISIBLE_DEVICES: 0,1


In [7]:
#|export
def get_classified_columns (df: pd.DataFrame, thresholds:dict, activity_levels:dict):
    """
    Creates classified columns based on predefined ranges for specified columns in the DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The DataFrame with classification classification of each column.

    """
    columns_to_classify = df.columns.intersection(thresholds.keys())

    if columns_to_classify.empty:
        return df
    else:
        df_cat = pd.DataFrame()
        for column in columns_to_classify:
            # ranges tuples come as strings in the yaml file, so we need to convert them to tuples with eval
            bins = pd.IntervalIndex.from_tuples(thresholds[column])
            df_cat[f'{column}_Cat'] = np.array(activity_levels[column])[pd.cut(df[column], bins=bins).cat.codes]
        return df_cat

In [8]:
# Test 

data = {
    'F10': [50, 100, 160, 200],
    'S10': [30, 70, 170, 220],
    'M10': [60, 100, 150, 170],
    'Y10': [50, 90, 150, 170]
}
df = pd.DataFrame(data)


thresholds = {
    'F10': [(0,75), (76,150), (151,190), (191, df['F10'].max())],
    'S10': [(0,65), (66,150), (151,215), (216, df['S10'].max())],
    'M10': [(0,72), (73,144), (145,167), (168, df['M10'].max())],
    'Y10': [(0,81), (82,148), (149,165), (166, df['Y10'].max())]
}

activity_levels = {
    'F10': ['low', 'moderate', 'elevated', 'high'],
    'S10': ['low', 'moderate', 'elevated', 'high'],
    'M10': ['low', 'moderate', 'elevated', 'high'],
    'Y10': ['low', 'moderate', 'elevated', 'high']
}


# Expected result
expected_data = {
    'F10_Cat': ['low', 'moderate', 'elevated', 'high'],
    'S10_Cat': ['low', 'moderate', 'elevated', 'high'],
    'M10_Cat': ['low', 'moderate', 'elevated', 'high'],
    'Y10_Cat': ['low', 'moderate', 'elevated', 'high']
}
expected_df = pd.DataFrame(expected_data)

result_df = get_classified_columns(df, thresholds=thresholds, activity_levels=activity_levels)

print(result_df.head())

pd.testing.assert_frame_equal(result_df, expected_df)
print("Test passed!")


    F10_Cat   S10_Cat   M10_Cat   Y10_Cat
0       low       low       low       low
1  moderate  moderate  moderate  moderate
2  elevated  elevated  elevated  elevated
3      high      high      high      high
Test passed!


In [9]:
#|export
def euclidean_distance_dict(X:dict, Y:dict):
    return math.sqrt(sum((X.get(d,0) - Y.get(d,0))**2 for d in set(X) | set(Y)))


In [10]:
# Test 

# Test case 1: Basic test with non-overlapping keys
X1 = {'a': 1, 'b': 2, 'c': 3}
Y1 = {'d': 4, 'e': 5, 'f': 6}
expected_distance1 = np.sqrt(1**2 + 2**2 + 3**2 + 4**2 + 5**2 + 6**2)
assert np.isclose(euclidean_distance_dict(X1, Y1), expected_distance1), f"Test case 1 failed"

# Test case 2: Basic test with overlapping keys
X2 = {'a': 1, 'b': 2, 'c': 3}
Y2 = {'a': 1, 'b': 2, 'c': 4}
expected_distance2 = np.sqrt(0**2 + 0**2 + 1**2)
assert np.isclose(euclidean_distance_dict(X2, Y2), expected_distance2), f"Test case 2 failed"

# Test case 3: Basic test with some overlapping and some non-overlapping keys
X3 = {'a': 1, 'b': 2}
Y3 = {'b': 2, 'c': 3}
expected_distance3 = np.sqrt(1**2 + 0**2 + 3**2)
assert np.isclose(euclidean_distance_dict(X3, Y3), expected_distance3), f"Test case 3 failed"

# Test case 4: Test with empty dictionaries
X4 = {}
Y4 = {}
expected_distance4 = 0
assert np.isclose(euclidean_distance_dict(X4, Y4), expected_distance4), f"Test case 4 failed"

# Test case 5: Test with one empty dictionary
X5 = {'a': 1, 'b': 2}
Y5 = {}
expected_distance5 = np.sqrt(1**2 + 2**2)
assert np.isclose(euclidean_distance_dict(X5, Y5), expected_distance5), f"Test case 5 failed"

print("All test cases passed!")

All test cases passed!


In [11]:
#|export

def find_closest_distribution(df_cat, target_distribution, segment_size, val_size):
    """
    Finds the combination of segments in the categorical data that is closest to the target distribution.

    Parameters:
    df_cat (pd.Series): A pandas Series containing the categorical data.
    target_distribution (dict): The target distribution to compare against, given as a dictionary where keys are categories and values are their target proportions.
    segment_size (int): The size of each segment to split the data into.
    val_size (float): The proportion of the validation split.

    Returns:
    best_combination (tuple): The indices of the segments that form the closest combination to the target distribution.
    segments (list): The list of segments created from the data.
    distribution_found (dict): The distribution of categories in the best combination of segments.
    """
    idxs = list(df_cat.index)
    segments = np.array_split(idxs, len(df_cat) // segment_size)

    value_counts = [df_cat[segments[i]].value_counts().to_dict() for i in range(len(segments))]

    num_segments = int(len(segments)*(val_size))
    print(f"Total number of segments:{ len(segments)}, Number of segments for validation: {num_segments} ({num_segments/len(segments)*100:.2f}%)")

    
    best_combination = None
    best_distance = np.inf
    distribution_found = None
    comb = combinations(range(len(value_counts)), num_segments)
    for c in tqdm(comb):
        values = Counter({})
        for i in c:
            values = values + Counter(value_counts[i])
        total = sum(values.values(), 0.0)
        distribution = {k: v / total for k, v in values.items()}
        
        distance = euclidean_distance_dict(distribution, target_distribution)

        if distance < best_distance:
            best_distance = distance
            best_combination = c
            distribution_found = distribution
    print("The closest group of segments to F10.7 categories has an euclidean distance of", best_distance)
    return best_combination, segments, distribution_found

In [12]:
# Test

target_distribution = {'A': 0.25, 'B': 0.25, 'C': 0.25, 'D': 0.25}

data = {
    'category': ['A', 'B', 'A', 'C', 'D', 'B', 'A', 'C', 'D', 'B', 'A', 'C', 'D', 'B', 'A', 'C', 'D', 'B', 'A', 'C']
}
df_cat = pd.Series(data['category'])

# Function parameters
segment_size = 5
val_size = 0.4

best_combination, segments, distribution_found = find_closest_distribution(df_cat, target_distribution, segment_size, val_size)

print("Best combination of segments:", list(best_combination))
print("Distribution found:", distribution_found)

Total number of segments:4, Number of segments for validation: 1 (25.00%)


4it [00:00, 12018.06it/s]

The closest group of segments to F10.7 categories has an euclidean distance of 0.17320508075688773
Best combination of segments: [0]
Distribution found: {'A': 0.4, 'B': 0.2, 'C': 0.2, 'D': 0.2}





In [13]:
#|export
def sliding_window_generator(df, split_start, data_columns, config, comb=None, segments=None):
    consecutive_elements, X, y = None, None, None

    if comb is not None:
        consecutive_elements = [list(group) for group in mit.consecutive_groups(comb)]

        df_to_window = []
        for elements in consecutive_elements:
            best_comb_idxs = [segments[i] for i in elements]
            df_to_window.append(df.iloc[chain.from_iterable(best_comb_idxs)])
    else:
        df_to_window = [df]

    X_window, y_window = None, None 
    for df_window in df_to_window:    
        X_window, y_window = SlidingWindow(
            window_len=config.lookback,
            horizon=config.horizon, 
            stride=1, 
            get_x=data_columns, 
            get_y=data_columns
        )(df_window)
        X = np.concatenate([X, X_window]) if X is not None else X_window
        y = np.concatenate([y, y_window]) if y is not None else y_window
    
    
    splits = L(list(np.arange(split_start, len(X)+split_start)))
    return X, y, splits

In [14]:
# Test

data = {
    'A': range(10),
    'B': range(10)
}
df = pd.DataFrame(data)

# Parameters
split_start = 0
data_columns = ['A', 'B']
comb = [0, 2]
segments = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
config = AttrDict({'lookback': 2, 'horizon': 1})

# Testing the function
X, y, splits = sliding_window_generator(df, split_start, data_columns, config, comb, segments)

# Print outputs for verification
print("X:\n", X)
print("y:\n", y)
print("splits:\n", splits)

X:
 [[[0 1]
  [0 1]]

 [[6 7]
  [6 7]]]
y:
 [[2 2]
 [8 8]]
splits:
 [0, 1]


In [15]:
#| export
def download_dst_data(start_date: str = '01/1957',
                      end_date: str = pd.Timestamp.today(),
                      save_folder: str = "./dst_data"):
    """
    Downloads Dst index data between the specified start and end dates.

    :param start_date: Start date in the format 'MM/YYYY'
    :param end_date: End date in the format 'MM/YYYY'
    :param save_folder: Folder where the data files should be saved
    """

    os.makedirs(save_folder, exist_ok=True)

    # Initialize file path
    file_name = "DST_IAGA2002.txt"
    file_path = os.path.join(save_folder, file_name)


    # Remove existing file if it exists
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Deleted existing file: {file_path}")

    # Convert input dates to datetime objects
    start_dt = pd.to_datetime(start_date, format='%m/%Y')
    end_dt = pd.to_datetime(end_date, format='%m/%Y')

    # HTTP REQUEST COMPONENTS
    current_start = start_dt
    while current_start <= end_dt:
        current_end = min(current_start + pd.DateOffset(years=24), end_dt)

        # Extract year components
        SCent = current_start.year // 100
        STens = (current_start.year % 100) // 10
        SYear = current_start.year % 10
        SMonth = current_start.month

        ECent = current_end.year // 100
        ETens = (current_end.year % 100) // 10
        EYear = current_end.year % 10
        EMonth = current_end.month

        # Construct URL for current chunk
        url = f"https://wdc.kugi.kyoto-u.ac.jp/cgi-bin/dstae-cgi?" \
              f"SCent={SCent}&" \
              f"STens={STens}&" \
              f"SYear={SYear}&" \
              f"SMonth={SMonth:02d}&" \
              f"ECent={ECent}&" \
              f"ETens={ETens}&" \
              f"EYear={EYear}&" \
              f"EMonth={EMonth:02d}&" \
              "Image+Type=GIF&COLOR=COLOR&AE+Sensitivity=0&Dst+Sensitivity=0&Output=DST&Out+format=IAGA2002"

        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Connection": "keep-alive",
            "Referer": "https://wdc.kugi.kyoto-u.ac.jp/dstae/index.html"
        }

        try:
            session = requests.session()
            response = session.get(url, headers=headers)
            response.raise_for_status()  # Raise an error for bad responses

            # Append or write to file
            mode = 'ab' if os.path.exists(file_path) else 'wb'
            with open(file_path, mode) as file:
                file.write(response.content)

            print(f"Downloaded and saved data from {current_start.strftime('%m/%Y')} to {current_end.strftime('%m/%Y')}")

        except requests.exceptions.RequestException as e:
            print(f"Failed to download data: {e}")

        # Move to the next chunk
        current_start = current_end + pd.DateOffset(days=1)

    print(f"All data downloaded and saved to {file_path}")
    return file_path


In [16]:
# Test
import shutil

download_dst_data()
shutil.rmtree("./dst_data")

Downloaded and saved data from 01/1957 to 01/1981
Downloaded and saved data from 01/1981 to 01/2005
Downloaded and saved data from 01/2005 to 09/2024
All data downloaded and saved to ./dst_data/DST_IAGA2002.txt


In [17]:
#| export
def generate_preprocessed_data(config, generate_preproc_pipe=True, generate_exp_pipe=True):
    result = []
    try:
        df = load_object(config.df_save_path)
 
    except FileNotFoundError:
        output = './tmp/data_out.ipynb'
        print(f"{config.df_save_path} not found. Executing the notebook to generate the data...")
        
        pm.execute_notebook(config.data_nb, output)
        os.remove(output)

        print("Data generated successfully.")

    results = [load_object(config.df_save_path)]

    if generate_preproc_pipe:
        results.append(load_object(config.preproc_pipe_save_path))

    if generate_exp_pipe:
        results.append(load_object(config.exp_pipe_save_path))

    return *results,

The next method has been extracted from [tsai library](https://timeseriesai.github.io/tsai/optuna.html) in order to modify it to accept multiobjective studies, without doing a full fork.

In [18]:
#| export

from pathlib import Path
from fastcore.script import *
import joblib
from importlib import import_module
import warnings
warnings.filterwarnings("ignore")

def run_optuna_study(objective, resume=None, study_type=None, multivariate=True, search_space=None, evaluate=None, seed=None, sampler=None, pruner=None, 
                     study_name=None, direction='maximize', n_trials=None, timeout=None, gc_after_trial=False, show_progress_bar=True, 
                     save_study=True, path='optuna', show_plots=True):
    r"""Creates and runs an optuna study.

    Args: 
        objective:          A callable that implements objective function.
        resume:             Path to a previously saved study.
        study_type:         Type of study selected (bayesian, gridsearch, randomsearch). Based on this a sampler will be build if sampler is None. 
                            If a sampler is passed, this has no effect.
        multivariate:       If this is True, the multivariate TPE is used when suggesting parameters. The multivariate TPE is reported to outperform 
                            the independent TPE.
        search_space:       Search space required when running a gridsearch (if you don't pass a sampler).
        evaluate:           Allows you to pass a specific set of hyperparameters that will be evaluated.
        seed:               Fixed seed used by samplers.
        sampler:            A sampler object that implements background algorithm for value suggestion. If None is specified, TPESampler is used during 
                            single-objective optimization and NSGAIISampler during multi-objective optimization. See also samplers.
        pruner:             A pruner object that decides early stopping of unpromising trials. If None is specified, MedianPruner is used as the default. 
                            See also pruners.
        study_name:         Studyâ€™s name. If this argument is set to None, a unique name is generated automatically.
        direction:          A sequence of directions during multi-objective optimization.
        n_trials:           The number of trials. If this argument is set to None, there is no limitation on the number of trials. If timeout is also set to 
                            None, the study continues to create trials until it receives a termination signal such as Ctrl+C or SIGTERM.
        timeout:            Stop study after the given number of second(s). If this argument is set to None, the study is executed without time limitation. 
                            If n_trials is also set to None, the study continues to create trials until it receives a termination signal such as 
                            Ctrl+C or SIGTERM.
        gc_after_trial:     Flag to execute garbage collection at the end of each trial. By default, garbage collection is enabled, just in case. 
                            You can turn it off with this argument if memory is safely managed in your objective function.
        show_progress_bar:  Flag to show progress bars or not. To disable progress bar, set this False.
        save_study:         Save your study when finished/ interrupted.
        path:               Folder where the study will be saved.
        show_plots:         Flag to control whether plots are shown at the end of the study.
    """
    
    try: import optuna
    except ImportError: raise ImportError('You need to install optuna to use run_optuna_study')

    # Sampler
    if sampler is None:
        if study_type is None or "bayes" in study_type.lower(): 
            sampler = optuna.samplers.TPESampler(seed=seed, multivariate=multivariate)
        elif "grid" in study_type.lower():
            assert search_space, f"you need to pass a search_space dict to run a gridsearch"
            sampler = optuna.samplers.GridSampler(search_space)
        elif "random" in study_type.lower(): 
            sampler = optuna.samplers.RandomSampler(seed=seed)
    assert sampler, "you need to either select a study type (bayesian, gridsampler, randomsampler) or pass a sampler"

    # Study
    if resume: 
        try:
            study = joblib.load(resume)
        except: 
            print(f"joblib.load({resume}) couldn't recover any saved study. Check the path.")
            return
        print("Best trial until now:")
        print(" Value: ", study.best_trial.value)
        print(" Params: ")
        for key, value in study.best_trial.params.items():
            print(f"    {key}: {value}")
    else: 
        study = optuna.create_study(sampler=sampler, pruner=pruner, study_name=study_name, directions=direction)
    if evaluate: study.enqueue_trial(evaluate)
    try:
        study.optimize(objective, n_trials=n_trials, timeout=timeout, gc_after_trial=gc_after_trial, show_progress_bar=show_progress_bar)
    except KeyboardInterrupt:
        pass

    # Save
    if save_study:
        full_path = Path(path)/f'{study.study_name}.pkl'
        full_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(study, full_path)
        print(f'\nOptuna study saved to {full_path}')
        print(f"To reload the study run: study = joblib.load('{full_path}')")

    # Plots
    if show_plots and len(study.trials) > 1:
        try: display(optuna.visualization.plot_optimization_history(study))
        except: pass
        try: display(optuna.visualization.plot_param_importances(study))
        except: pass
        try: display(optuna.visualization.plot_slice(study))
        except: pass
        try: display(optuna.visualization.plot_parallel_coordinate(study))
        except: pass

    # Study stats
    try:
        pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
        complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
        print(f"\nStudy statistics    : ")
        print(f"  Study name        : {study.study_name}")
        print(f"  # finished trials : {len(study.trials)}")
        print(f"  # pruned trials   : {len(pruned_trials)}")
        print(f"  # complete trials : {len(complete_trials)}")
        
        print(f"\nBest trial          :")
        trial = study.best_trial
        print(f"  value             : {trial.value}")
        print(f"  best_params = {trial.params}\n")
    except:
        print('\nNo finished trials yet.')
    return study


In [19]:
#| export

def filter_nb (path:str, skip_tags:list):
    """
    Filters out cells with tags in skip_tags from a notebook.

    Args:
    path (str): Path to the notebook file.
    skip_tags (list): List of tags to filter out.

    Returns:
    nb (nbformat.NotebookNode): The filtered notebook.
    """
    nb = nbformat.read(path, as_version=4)

    filtered_cells = [cell for cell in nb.cells if not set(skip_tags) & set(cell.metadata.get('tags', []))]
    nb.cells = filtered_cells
    
    return nb

In [24]:
#| export
def create_vectorized_condition_function(geo_thresholds_dict, sol_thresholds_dict, geo_levels_dict, solact_levels_dict):
    """
    Creates a vectorized function that returns the condition based on geomagnetic and solar values.
    
    Input:
    geo_thresholds_dict: Dictionary of thresholds for the geomagnetic indices (e.g., 'AP', 'DST')
    sol_thresholds_dict: Dictionary of thresholds for the solar indices (e.g., 'F10', 'S10')
    geo_levels_dict: Dictionary of activity levels for geomagnetic indices
    solact_levels_dict: Dictionary of activity levels for solar indices
    
    Output:
    A vectorized function that can be used to compute conditions for arrays of geo_values and sol_values.
    """
    
    def get_combined_condition(geo_index, geo_value, sol_index, sol_value):
        """
        Function that returns the condition based on the geo_value and sol_value.
        Input:
            geo_index: The name of the geomagnetic index (e.g., 'AP', 'DST')
            geo_value: Value of the geomagnetic index (float)
            sol_index: The name of the solar index (e.g., 'F10', 'S10')
            sol_value: Value of the solar index (float)
        Output:
            condition: Condition string that combines geomagnetic and solar conditions (string)
        """
        geo_thresholds = geo_thresholds_dict[geo_index]
        sol_thresholds = sol_thresholds_dict[sol_index]
        geo_levels = geo_levels_dict[geo_index]
        solact_levels = solact_levels_dict[sol_index]

        geo_condition = None
        sol_condition = None

        # Determine geomagnetic condition
        for i, (lower, upper) in enumerate(geo_thresholds):
            if lower < geo_value <= upper:
                geo_condition = geo_levels[i]
                break

        # Determine solar condition
        for j, (lower, upper) in enumerate(sol_thresholds):
            if lower < sol_value <= upper:
                sol_condition = solact_levels[j]
                break

        # Combine conditions
        if geo_condition and sol_condition:
            if geo_index == 'AP':
                return f'{geo_condition}Geo_{sol_condition.capitalize()}'
            else: 
                return f'{geo_condition}_{sol_condition.capitalize()}'
        else:
            return 'Unknown'  # Fallback if no condition is found

    # Return the vectorized version of the get_combined_condition function
    return np.vectorize(get_combined_condition)




In [27]:
# Test
# Geomagnetic thresholds and levels (e.g., 'AP' and 'DST')
geo_thresholds = {
    "AP": [
        (-np.inf, 10),  # Low
        (10, 50),       # Moderate
        (50, np.inf)    # Active
    ],
    "DST": [
        (-30, np.inf),  # G0
        (-50, -30),     # G1
        (-90, -50),     # G2
        (-130, -90),    # G3
        (-350, -130),   # G4
        (-np.inf, -350) # G5
    ]
}

geo_levels = {
    'AP': ["Low", "Moderate", "Active"],
    'DST': ["G0", "G1", "G2", "G3", "G4", "G5"]
}

# Solar thresholds and levels (e.g., 'F10')
sol_thresholds = {
    'F10': [(0,75), (76,150), (151,190), (191, 99999)]
}

solact_levels = {
    'F10': ['low', 'moderate', 'elevated', 'high']
}

# Create the vectorized function with parameters loaded
condition_function = create_vectorized_condition_function(geo_thresholds, sol_thresholds, geo_levels, solact_levels)

# Example data (for geomagnetic index AP and solar index F10)
geo_values = np.array([5, 20, 60])  # AP values
sol_values = np.array([80, 160, 200])  # F10 values

# Now execute the preloaded vectorized function
conditions = condition_function('DST', geo_values, 'F10', sol_values)
print(conditions)

['G0_Moderate' 'G0_Elevated' 'G0_High']


In [28]:
#|eval: false
#|hide
from nbdev import *
nbdev_export()