In [1]:
import sys
import os
# caution: path[0] is reserved for script path (or '' in REPL).
sys.path.insert(1, os.path.abspath('./../src'))

import chorus_machine_learning_helper
import numpy as np
import plot_tools
import data_loader
import matplotlib.colors
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from dateutil import rrule
import datetime 
from cdflib.epochs_astropy import CDFAstropy as cdfepoch
import pandas as pd

import xgboost as xgb

import importlib

# Re-import the project-local modules so that edits made to them on disk are
# picked up when this cell is re-run, without restarting the kernel.
importlib.reload(chorus_machine_learning_helper)
importlib.reload(plot_tools)
importlib.reload(data_loader)


%matplotlib qt



In [None]:

# Run the trained XGBoost chorus model over every year of POES data and cache
# the predictions (plus the L, MLT and time of each sample) in one .npz file.
MODEL_PATH = "./../processed_data_chorus_neural_network/TRAINED_MODELS/Weighted_L2/XG_BOOSTED_REGRESSION_MSE_WEIGHTED_ON_L_AND_AMPLITUDE.model"
OUTPUT_PATH = os.path.abspath("./../processed_data_chorus_neural_network/temp_chorus_dependence_on_solar_cycle_using_machine_learning_v4a_dataset.npz")
DATASET_VERSION = "v4a"

MODEL = xgb.Booster({"nthread": 8, "device": "cuda"})  # init model
MODEL.load_model(MODEL_PATH)  # load model data

# One array is appended to each list per year; they are stacked when saving.
CHORUS_PREDICTED_TOTAL, L_TOTAL, MLT_TOTAL, TIME_TOTAL = [], [], [], []

for year in range(1998, 2024):

    POES = chorus_machine_learning_helper.load_MPE_year(year)
    SUPERMAG = chorus_machine_learning_helper.load_SUPERMAG_SME_year(year)
    OMNI = chorus_machine_learning_helper.load_OMNI_year(year)

    print(f"Number of POES satellites loaded: {len(POES)}")

    FEATURE_REFS = chorus_machine_learning_helper.find_average_SUPERMAG_and_OMNI_values_for_each_POES_data_point(POES, SUPERMAG, OMNI)

    # Flattened copies of the raw time / MLT / L values are taken *before*
    # normalize_features is called.
    # NOTE(review): presumably because normalize_features may rescale
    # FEATURE_REFS in place - confirm against the helper.
    POES_TIMES_OF_FEATURES = FEATURE_REFS["POES_TIMES_OF_FEATURES"].flatten()
    MLT_FEATURES_PREPROCESSING = FEATURE_REFS["MLT_FEATURES"].flatten()
    L_FEATURES_PREPROCESSING = FEATURE_REFS["L_FEATURES"].flatten()

    FEATURES_POST_PROCESSING = chorus_machine_learning_helper.normalize_features(FEATURE_REFS, version=DATASET_VERSION)
    CHORUS_PREDICTED = MODEL.predict(xgb.DMatrix(FEATURES_POST_PROCESSING))

    for yearly_values, accumulator in (
        (CHORUS_PREDICTED, CHORUS_PREDICTED_TOTAL),
        (L_FEATURES_PREPROCESSING, L_TOTAL),
        (MLT_FEATURES_PREPROCESSING, MLT_TOTAL),
        (POES_TIMES_OF_FEATURES, TIME_TOTAL),
    ):
        accumulator.append(yearly_values)


# Stack the per-year arrays end to end and write them to the cache file.
np.savez(
    file=OUTPUT_PATH,
    CHORUS=np.hstack(CHORUS_PREDICTED_TOTAL),
    L=np.hstack(L_TOTAL),
    MLT=np.hstack(MLT_TOTAL),
    UNIX_TIME=np.hstack(TIME_TOTAL),
)

In [2]:
# Reload the cached model predictions written by the prediction cell.
# The archive is opened with a context manager so the underlying file handle is
# released even if one of the expected keys is missing and the lookup raises.
with np.load("./../processed_data_chorus_neural_network/temp_chorus_dependence_on_solar_cycle_using_machine_learning_v4a_dataset.npz") as CHORUS_REFS:
    # Indexing the archive reads each array fully into memory, so the arrays
    # stay usable after the archive is closed.
    TIME_TOTAL = CHORUS_REFS["UNIX_TIME"]
    CHORUS_PREDICTED_TOTAL = CHORUS_REFS["CHORUS"]
    L_TOTAL = CHORUS_REFS["L"]
    MLT_TOTAL = CHORUS_REFS["MLT"]

# Sanity check: all four arrays should have the same length.
print(TIME_TOTAL.shape)
print(CHORUS_PREDICTED_TOTAL.shape)
print(L_TOTAL.shape)
print(MLT_TOTAL.shape)

(103477670,)
(103477670,)
(103477670,)
(103477670,)


In [3]:
# Analysis window and time-bin width used by the rest of the notebook.
start = datetime.datetime(year = 1998, month = 1, day = 1)
end = datetime.datetime(year = 2024, month = 1, day = 1)
dt = 86400 #Seconds in day

# Bin the predictions on a (time, L) grid: one-day steps in time, 0.1 steps in
# L between 3 and 7.
# NOTE(review): `start` and `end` are naive datetimes, so .timestamp()
# interprets them in this machine's local time zone. If UNIX_TIME is UTC-based
# the daily bins are offset from UTC days by the local UTC offset - confirm
# this is intended.
# NOTE(review): the two return values are presumably the per-bin sum of zdata
# and the per-bin sample count - verify against plot_tools.bin_3D_data.
cumulative_chorus, num_points_in_each_epoch_L_bin = plot_tools.bin_3D_data(xdata = TIME_TOTAL,
                                                                            ydata = L_TOTAL,
                                                                            zdata = CHORUS_PREDICTED_TOTAL,
                                                                            xstart = start.timestamp(),
                                                                            xend = end.timestamp(),
                                                                            xstep = dt,
                                                                            ystart = 3,
                                                                            yend = 7,
                                                                            ystep = 0.1)

# Bins with no samples divide by zero. The resulting NaN/inf values are kept
# exactly as before (downstream cells use nansum); only the expected
# RuntimeWarning is silenced so that it does not clutter the cell output.
with np.errstate(divide = "ignore", invalid = "ignore"):
    averaged_model_predictions = cumulative_chorus / num_points_in_each_epoch_L_bin

  averaged_model_predictions = cumulative_chorus / num_points_in_each_epoch_L_bin


In [4]:
# Collapse the (time, L) grid to a single value per time bin by summing the
# per-bin averages along L and scaling by the L bin width.
L_BIN_WIDTH = 0.1  # Integrate with bin of 0.1 L
integrated_model_predictions = np.nansum(averaged_model_predictions, axis=1) * L_BIN_WIDTH

# nansum yields exactly 0 for a time bin whose L bins are all NaN; mark those
# bins as missing instead of zero.
bins_without_data = integrated_model_predictions == 0
integrated_model_predictions[bins_without_data] = np.nan

# One datetime per time bin, stepping `dt` seconds from `start` (end excluded).
window_start_timestamp = start.timestamp()
number_of_time_bins = int((end.timestamp() - window_start_timestamp) / dt)
integrated_prediction_times = np.array(
    [
        datetime.datetime.fromtimestamp(window_start_timestamp + bin_index * dt)
        for bin_index in range(number_of_time_bins)
    ]
)

print(integrated_prediction_times)

[datetime.datetime(1998, 1, 1, 0, 0) datetime.datetime(1998, 1, 2, 0, 0)
 datetime.datetime(1998, 1, 3, 0, 0) ...
 datetime.datetime(2023, 12, 29, 0, 0)
 datetime.datetime(2023, 12, 30, 0, 0)
 datetime.datetime(2023, 12, 31, 0, 0)]


In [5]:
# https://www.sidc.be/SILSO/datafiles

# Column names applied to both sunspot tables after reading.
SUNSPOT_COLUMNS = ["year", "month", "decimal year", "SNvalue", "SNerror", "Nb observations"]


def _month_start_datetimes(sunspot_df):
    """Return one datetime per row of ``sunspot_df``: midnight on the first day
    of that row's (year, month)."""
    return [
        datetime.datetime(year=row_year, month=row_month, day=1, hour=0, minute=0, second=0)
        for row_year, row_month in zip(sunspot_df["year"], sunspot_df["month"])
    ]


# NOTE(review): read_csv treats the first line of each file as a header, which
# is then overwritten by SUNSPOT_COLUMNS. The printed monthly table starts at
# 1749-02 - confirm the files really have a header row and that no data row is
# being consumed.

# Smoothed sunspot series (plotted downstream as "13-Month Averaged").
smoothed_sunspot_number_df = pd.read_csv("./../sunspot_numbers/SN_ms_tot_V2.0.csv")
smoothed_sunspot_number_df.columns = list(SUNSPOT_COLUMNS)
smoothed_spotspot_numbers = smoothed_sunspot_number_df["SNvalue"]
smoothed_sunspot_times = _month_start_datetimes(smoothed_sunspot_number_df)

# Monthly sunspot series.
monthly_sunspot_number_df = pd.read_csv("./../sunspot_numbers/SN_m_tot_V2.0.csv")
monthly_sunspot_number_df.columns = list(SUNSPOT_COLUMNS)
monthly_spotspot_numbers = monthly_sunspot_number_df["SNvalue"]
monthly_sunspot_times = _month_start_datetimes(monthly_sunspot_number_df)


print(monthly_sunspot_number_df)

print(monthly_sunspot_times)

      year  month  decimal year  SNvalue  SNerror  Nb observations
0     1749      2      1749.123    104.3     -1.0               -1
1     1749      3      1749.204    116.7     -1.0               -1
2     1749      4      1749.288     92.8     -1.0               -1
3     1749      5      1749.371    141.7     -1.0               -1
4     1749      6      1749.455    139.2     -1.0               -1
...    ...    ...           ...      ...      ...              ...
3300  2024      2      2024.124    123.0     21.7              806
3301  2024      3      2024.206    103.7     16.6             1071
3302  2024      4      2024.288    137.0     22.1             1094
3303  2024      5      2024.373    172.1     23.1             1215
3304  2024      6      2024.455    164.1     21.4             1186

[3305 rows x 6 columns]
[datetime.datetime(1749, 2, 1, 0, 0), datetime.datetime(1749, 3, 1, 0, 0), datetime.datetime(1749, 4, 1, 0, 0), datetime.datetime(1749, 5, 1, 0, 0), datetime.datetime(1749

In [6]:
# https://www.sidc.be/SILSO/cyclesminmax

min_max_solar_cycle_df = pd.read_csv("./../sunspot_numbers/TableCyclesMiMa.csv")

year_of_minimums = min_max_solar_cycle_df["Min_Year"]
month_of_minimums = min_max_solar_cycle_df["Min_Month"]
num_solar_mins = len(year_of_minimums)

# The last row is dropped for the maxima because the most recent cycle has no
# recorded maximum yet.
year_of_maximums = min_max_solar_cycle_df["Max_Year"][:-1]
month_of_maximums = min_max_solar_cycle_df["Max_Month"][:-1]
num_solar_maxs = len(year_of_maximums)

# Every extremum is placed at midnight on the first day of its month.
dates_of_minimums = [
    datetime.datetime(int(min_year), int(min_month), 1)
    for min_year, min_month in zip(year_of_minimums, month_of_minimums)
]
dates_of_maximums = [
    datetime.datetime(int(max_year), int(max_month), 1)
    for max_year, max_month in zip(year_of_maximums, month_of_maximums)
]

total_dates = [*dates_of_minimums, *dates_of_maximums]
ordered_dates_of_mins_and_maxs = sorted(total_dates)

# Boundaries are built from the five most recent extrema: for each consecutive
# pair, the midpoint between them followed by the later extremum itself.
last_five_extrema = [ordered_dates_of_mins_and_maxs[k] for k in range(-5, 0)]

relevant_minimums_and_maximums = []
for earlier_extremum, later_extremum in zip(last_five_extrema, last_five_extrema[1:]):
    relevant_minimums_and_maximums.append(earlier_extremum + (later_extremum - earlier_extremum) / 2)
    relevant_minimums_and_maximums.append(later_extremum)

# Final boundary: midpoint between the latest extremum and a fixed 2025-07-01.
# NOTE(review): 2025-07-01 looks like a placeholder for the not-yet-observed
# next maximum (hence the "*" in the last label) - confirm.
assumed_next_extremum = datetime.datetime(year=2025, month=7, day=1)
relevant_minimums_and_maximums.append(last_five_extrema[-1] + (assumed_next_extremum - last_five_extrema[-1]) / 2)

print(relevant_minimums_and_maximums)

# One label per boundary above, in the same order.
relevant_minimums_and_maximums_labels = ["Up 23", "Max 23", "Down 23", "Min 24", "Up 24", "Max 24", "Down 24", "Min 25", "* Up 25"]


[datetime.datetime(1999, 3, 18, 0, 0), datetime.datetime(2001, 11, 1, 0, 0), datetime.datetime(2005, 5, 17, 12, 0), datetime.datetime(2008, 12, 1, 0, 0), datetime.datetime(2011, 8, 1, 12, 0), datetime.datetime(2014, 4, 1, 0, 0), datetime.datetime(2017, 1, 30, 0, 0), datetime.datetime(2019, 12, 1, 0, 0), datetime.datetime(2022, 9, 15, 12, 0)]


In [7]:
# Three stacked panels on a shared time axis:
#   0) monthly and smoothed sunspot numbers,
#   1) model predictions integrated over L, with a 30-day rolling mean,
#   2) the (time, L) map of averaged model predictions.
# Every panel is overlaid with the solar-cycle phase boundaries.
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

fig, ax = plt.subplots(3, 1, figsize=(16, 9), sharex=True)


def _mark_cycle_phases(axis):
    """Draw one dashed vertical line on ``axis`` per solar-cycle phase boundary."""
    for phase_time in relevant_minimums_and_maximums:
        axis.axvline(x=phase_time, color='black', linestyle='--', linewidth=2)


# --- Panel 0: sunspot numbers -------------------------------------------------
# The plotted series are monthly values (not daily), so the labels say so.
ax[0].set_title("Monthly Mean Sunspot Number")
ax[0].set_ylabel("Sunspot Number")

ax[0].plot(monthly_sunspot_times, monthly_spotspot_numbers, color="black", label="Monthly Data")
ax[0].plot(smoothed_sunspot_times, smoothed_spotspot_numbers, color="red", label="13-Month Averaged")

_mark_cycle_phases(ax[0])

# Phase labels sit one standard deviation below the series maximum.
phase_label_height = np.nanmax(monthly_spotspot_numbers) - np.std(monthly_spotspot_numbers)
for phase_time, phase_label in zip(relevant_minimums_and_maximums, relevant_minimums_and_maximums_labels):
    ax[0].text(phase_time, phase_label_height, phase_label, rotation=270, verticalalignment='center')

ax[0].set_xlim(start, end)
ax[0].legend()

# --- Panel 1: L-integrated predictions ---------------------------------------
ax[1].plot(integrated_prediction_times, integrated_model_predictions, color="black", label="Original Data")
ax[1].set_title("Integrated Model Predictions (Over all L)")

chorus_prediction_df = pd.Series(integrated_model_predictions, index=integrated_prediction_times)
smoothed_chorus_df = chorus_prediction_df.rolling("30d", center=True).mean()

ax[1].plot(smoothed_chorus_df.index, smoothed_chorus_df, label="30-Day Rolling Average", color="red", linewidth=3)
ax[1].set_ylabel("Chorus Amplitude (pT)")
ax[1].legend()

_mark_cycle_phases(ax[1])

# --- Panel 2: (time, L) map ---------------------------------------------------
# The horizontal extent is given as Matplotlib date numbers so the image lines
# up with the shared date axis without depending on whether the installed
# Matplotlib accepts datetime objects in `extent`.
image = ax[2].imshow(averaged_model_predictions.T,
                     origin="lower",
                     extent=[mdates.date2num(start), mdates.date2num(end), 3, 7],
                     norm=matplotlib.colors.LogNorm(vmin=1, vmax=10),
                     aspect="auto",
                     interpolation="none")

# Values below vmin are drawn in black.
image.cmap.set_under("black")

_mark_cycle_phases(ax[2])

ax[2].set_title("Model Predicted Chorus (Averaged over MLT)")
ax[2].set_ylabel("L")
ax[2].set_xlabel("Time")

# Thin colorbar axis placed just outside the right edge of the bottom panel.
axins = inset_axes(
    ax[2],
    width="1%",  # width: 1% of parent_bbox width
    height="100%",  # height: full height of parent_bbox
    loc="lower left",
    bbox_to_anchor=(1.01, 0, 1, 1),
    bbox_transform=ax[2].transAxes,
    borderpad=0,
)

# `pad` is not passed: it only affects automatically created colorbar axes and
# is ignored when an explicit `cax` is supplied.
cbar = fig.colorbar(image, cax=axins)

cbar.set_label("Chorus Bw (pT)\n", loc="center", labelpad=15, rotation=270)

# Set the locator to control the tick spacing
locator = mdates.YearLocator()  # Set to DayLocator, HourLocator, etc. based on your desired frequency
ax[2].xaxis.set_major_locator(locator)

# Set the formatter to control the tick label format
formatter = mdates.DateFormatter('%Y-%m-%d')  # Customize the format as needed
ax[2].xaxis.set_major_formatter(formatter)

# Rotate only the date labels; without axis="x" the L-axis tick labels would be
# rotated as well.
ax[2].tick_params(axis="x", labelrotation=45)

In [8]:
# Bar Plot of Chorus vs Months
#
# Sum the L-integrated daily predictions over each calendar month of the
# analysis window, keep only months with enough data, and compute the mean and
# sample standard deviation of the monthly sums per calendar month.

# Month boundaries from the month of `start` up to and including `end`; each
# consecutive pair delimits one month.
month_boundaries = np.array(list(rrule.rrule(rrule.MONTHLY,
                                             dtstart = datetime.datetime(year = start.year, month = start.month, day = 1),
                                             until = end)))

MIN_VALID_DAYS_PER_MONTH = 10  # a month needs more than this many days with data

cum_chorus_included_in_study = []
months_included_in_study = []
cum_chorus_per_month_over_all_regions = {i : [] for i in range(12)}

for month_index in range(len(month_boundaries) - 1):

    start_of_month = month_boundaries[month_index]
    end_of_month = month_boundaries[month_index + 1]

    times_between_start_and_end_of_month = (start_of_month <= integrated_prediction_times) & (integrated_prediction_times < end_of_month)
    predictions_in_month = integrated_model_predictions[times_between_start_and_end_of_month]
    cum_chorus_for_dt = np.nansum(predictions_in_month)

    # Count only days that actually have a prediction. The time grid contains
    # every calendar day, so counting the mask itself (as was done before)
    # always gives 28-31 and the data-coverage check could never reject a month.
    num_valid_days = int(np.sum(np.isfinite(predictions_in_month)))

    if (cum_chorus_for_dt > 0) and (num_valid_days > MIN_VALID_DAYS_PER_MONTH):

        months_included_in_study.append(month_index)
        cum_chorus_included_in_study.append(cum_chorus_for_dt)

        cum_chorus_per_month_over_all_regions[start_of_month.month - 1].append(cum_chorus_for_dt)

# Start dates of the months that passed the filter, aligned element-for-element
# with cum_chorus_included_in_study. The explicit integer dtype keeps the
# indexing valid even when no month qualifies.
start_of_months_between_start_and_end = month_boundaries[np.asarray(months_included_in_study, dtype = int)]

monthly_fig, monthly_ax = plt.subplots()
monthly_ax.plot(start_of_months_between_start_and_end, cum_chorus_included_in_study)
monthly_ax.set_xlabel("Time")
monthly_ax.set_ylabel("Cumulative Chorus per Month (pT)")
plt.show()

# Per-calendar-month statistics over the whole study period (index 0 = January).
avg_cum_chorus_per_month_over_all_regions = np.array([np.nanmean(cum_chorus_per_month_over_all_regions[i]) for i in range(12)])
std_cum_chorus_per_month_over_all_regions = np.array([np.nanstd(cum_chorus_per_month_over_all_regions[i], ddof=1) for i in range(12)])



In [9]:
# Split the included months into the four solar-cycle phases and, for each
# phase, accumulate the monthly chorus sums and the number of contributing
# months per calendar month.
num_regions = 4

# regions[r] = (sum of monthly chorus per calendar month,
#               number of months contributing per calendar month)
regions = {r: (np.zeros(shape=(12)), np.zeros(shape=(12))) for r in range(num_regions)}

# Iterate over *all* included months. start_of_months_between_start_and_end has
# already been filtered to line up one-to-one with cum_chorus_included_in_study,
# so slicing off its last element (as was done before) silently dropped the
# final month of the study.
for month_start, monthly_chorus in zip(start_of_months_between_start_and_end, cum_chorus_included_in_study):

    # The boundaries are in chronological order, so the number of boundaries at
    # or before this month identifies the interval it falls in. The phases
    # repeat every `num_regions` intervals: 0 before the first boundary, then
    # 1, 2, 3, 0, 1, ...
    num_boundaries_passed = sum(boundary <= month_start for boundary in relevant_minimums_and_maximums)
    selected_region = num_boundaries_passed % num_regions

    region_sums, region_counts = regions[selected_region]
    calendar_month_index = month_start.month - 1  # 0 = January
    region_sums[calendar_month_index] += monthly_chorus
    region_counts[calendar_month_index] += 1





In [10]:
# One bar chart per solar-cycle phase: for each calendar month, how many
# all-period standard deviations the phase's average monthly chorus lies from
# the all-period average. Each panel is drawn inside the loop instead of
# through copy-pasted ax[0]..ax[3] statements.
fig, ax = plt.subplots(num_regions, 1, figsize=(16, 9), sharex=True, sharey=True)

month_labels = ["Jan.", "Feb.", "Mar.", "Apr.", "May.", "June", "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec."]
region_titles = ["Ascending From Minimum", "Ascending To Maximum", "Descending From Maximum", "Descending To Minimum"]

for r in range(num_regions):
    panel = ax[r]
    region_sums, region_counts = regions[r]

    avg_cum_chorus_per_month_in_region = region_sums / region_counts
    sigma = (avg_cum_chorus_per_month_in_region - avg_cum_chorus_per_month_over_all_regions) / std_cum_chorus_per_month_over_all_regions

    panel.grid(True, linestyle = '--', linewidth = 0.5)
    panel.bar(month_labels,
              sigma,
              color="grey",
              edgecolor='black',
              linewidth=2)
    panel.set_ylabel("std. from mean")

    # The number in parentheses is how many months fell into this phase.
    panel.set_title(f"{region_titles[r]} ({int(np.sum(region_counts))})")

    # Reference lines: thick at zero deviation, thin at +/- one standard deviation.
    panel.axhline(y = 0, color = 'black', linestyle = '--', linewidth = 3)
    for reference_level in (1, -1):
        panel.axhline(y = reference_level, color = 'black', linestyle = '--', linewidth = 1)

    panel.set_ylim(-1.25, 1.25)



(-1.25, 1.25)

In [11]:
# One bar chart per solar-cycle phase: the phase's average monthly cumulative
# chorus per calendar month (bars), with the all-period mean and standard
# deviation overlaid as error-bar markers for comparison. Each panel is drawn
# inside the loop instead of through copy-pasted ax[0]..ax[3] statements.
fig, ax = plt.subplots(num_regions, 1, figsize=(16, 9), sharex=True, sharey=True)

month_labels = ["Jan.", "Feb.", "Mar.", "Apr.", "May.", "June", "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec."]
region_titles = ["Ascending From Minimum", "Ascending To Maximum", "Descending From Maximum", "Descending To Minimum"]

for r in range(num_regions):
    panel = ax[r]
    region_sums, region_counts = regions[r]

    avg_cum_chorus_per_month_in_region = region_sums / region_counts

    panel.grid(True, linestyle = '--', linewidth = 0.5)
    panel.bar(month_labels,
              avg_cum_chorus_per_month_in_region,
              color="grey",
              edgecolor='black',
              linewidth=2)
    panel.set_ylabel("Avg. Cumulative Chorus (pT)")

    panel.errorbar(month_labels,
                   avg_cum_chorus_per_month_over_all_regions,
                   std_cum_chorus_per_month_over_all_regions,
                   fmt='o',
                   color="black",
                   capsize=3)

    # The number in parentheses is how many months fell into this phase.
    panel.set_title(f"{region_titles[r]} ({int(np.sum(region_counts))})")

Text(0.5, 1.0, 'Descending To Minimum (76)')