# Draftsheet

> First experiment and ideas

In [None]:
import sklearn
from tsai.basics import *
my_setup(sklearn)
import matplotlib.dates as mdates

In [None]:
# Fetch the JB2008 solar indices file (SOLFSMY) and display what download_data returned
# (presumably the local path of the downloaded file — confirm against tsai).
solfsmy_url = 'https://sol.spacenvironment.net/jb2008/indices/SOLFSMY.TXT'
data_path = download_data(solfsmy_url)
data_path

In [None]:
# Read the whitespace-separated text file into a pandas DataFrame, skipping
# everything after a '#' (the file header is made of comment lines).
# Column names: YYYY DDD   JulianDay  F10   F81c  S10   S81c  M10   M81c  Y10   Y81c  Ssrc
df_raw = pd.read_csv(
    data_path,
    sep=r'\s+',  # replaces delim_whitespace=True, which is deprecated in recent pandas
    comment='#',
    header=None,
    names=['Year', 'DDD', 'JulianDay', 'F10', 'F81c', 'S10', 'S81c',
           'M10', 'M81c', 'Y10', 'Y81c', 'Ssrc'],
    # Ssrc is a 4-character flag field (one character per index); keep it as text
    # so it is never parsed as a number and its characters can be sliced later.
    dtype={'Ssrc': str},
)
df_raw.head()

F10, S10, M10, and Y10 (and their 81-day centered values, 81c) have different observation and report times. To standardize reporting, all values are reported in sfu units at 12 UT (Universal Time). Observations are made 3 times daily for F10 (the 20 UT value is used), every 5 minutes for S10 (the daily average is used), twice daily for M10 (7 and 16 UT), and every minute for Y10 (X-rays are measured each minute, while Lya is daily).

For model inputs the values should be used as a daily value between 0-24 UT for a given calendar date; F10 and S10 are 1-day lagged, M10 is 2-day, and Y10 is 5-day lagged in JB2008; the 81-day centered values are used with the same respective lag times. Ssrc has 4 fields (1 for each index): 

*  0 = (F10, S10, M10, Y10) spline-filled if value or missing if no value; 
* 1 = (F10, M10, Y10) derived or measured index, (S10) SOHO/SEM; 
* 2 = (S10) TIMED/SEE v11; 
* 3 = (S10) SOHO gap (daily); 
* 4 = (S10) SOHO gap (average); 
* 5 = (F10) F10 mean (2 surrounding values), (S10) SDO/EVE; 
* 6 = (S10) GOES/EUVS fill-in, (M10) M10 mean (2 surrounding values); 
* 7 = (S10) S10 scaled to match M10 change from previous day; 
* 8 = (S10) SDO/EVE corrections and all S10 tweaked from sat 12388 delta B%, (Y10) UARS/SOLSTICE V18; 
* 9 = (S10) replace original v4.0h data for versions 4.0 and higher, (Y10) UARS/SOLSTICE v19; 
* A = (S10) TIMED/SEE solar minimum correction; 
* B = (S10) replace with original v4.0h S10 data for versions 4.0 and higher, (M10) SORCE/SOLSTICE/SIM v9; 
* C = (S10) SDO/EVE correction, (Y10) GOES XRS; 
* D = (S10) validated TIMED/SEE, (Y10) GOES XRS and SET composite LYA; 
* E = (S10) S10 composite, (Y10) SET composite LYA; 
* F = (F10, S10, M10, Y10) mean of bordering values

Acronyms:
* SOHO/SEM: Solar and Heliospheric Observatory / Solar Extreme-ultraviolet Monitor (the SEM instrument onboard the SOHO spacecraft)
* SDO/EVE: Solar Dynamics Observatory/Extreme Ultraviolet Variability Experiment.
* UARS/SOLSTICE: Upper Atmosphere Research Satellite/Solar Stellar Irradiance Comparison Experiment
* SORCE/SOLSTICE/SIM: Solar Radiation and Climate Experiment/SOLSTICE/Spectral Irradiance Monitor
* GOES/XRS: Geostationary Operational Environmental Satellite/X-Ray Sensor
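* "SET composite LYA" refers to a composite of the solar irradiance in the Lyman-alpha (Lyα) wavelength range. NOTE: the original expansion of SET given here was "Solar EUV Experiment Telescope (SET) onboard the Solar Radiation and Climate Experiment (SORCE) spacecraft"; this should be verified, since SET more likely stands for Space Environment Technologies, the provider of the JB2008 indices.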

The website that hosts this data also offers forecasts of these indices (as a paid service), which we can use for comparison; see
https://agupubs.onlinelibrary.wiley.com/doi/pdf/10.1029/2020SW002496. It is interesting
to see what they forecast from the previous data, so that we can try the same thing
with the neural network.

## Data preprocessing

In [None]:
# Count the missing (NaN) entries in every column of the raw frame
df_raw.isnull().sum()

In [None]:
# Convert the JulianDay column to datetimes and use them as the index.
# origin='julian' with unit='D' interprets the numbers as Julian day counts.
df_raw['Date'] = pd.to_datetime(df_raw['JulianDay'], unit='D', origin='julian')
# Re-assign instead of mutating in place so the cell's data lineage stays explicit.
df_raw = df_raw.set_index('Date')
# Preview the new index. The preview has to be the last expression of the cell:
# a bare expression in the middle of a cell is evaluated but never displayed.
df_raw.index[:5]

In [None]:
# List the distinct values found in the Ssrc column
df_raw['Ssrc'].unique()

In [None]:
# Split the Ssrc string into four new columns, one per character of the string.
# The new columns are named Ssrc_F10, Ssrc_S10, Ssrc_M10 and Ssrc_Y10, and each
# one is stored as a categorical.
ssrc_targets = 'F10 S10 M10 Y10'.split()
for char_pos, index_name in enumerate(ssrc_targets):
    df_raw[f'Ssrc_{index_name}'] = df_raw['Ssrc'].str.get(char_pos).astype('category')
df_raw[[f'Ssrc_{index_name}' for index_name in ssrc_targets]].head()


In [None]:
# Show which categories the Ssrc_S10 column contains
df_raw['Ssrc_S10'].cat.categories

In [None]:
# Scatter-plot S10 over time. Each point is coloured by the category code of
# its Ssrc_S10 value, with one major tick per year on the x-axis.
fig, ax = plt.subplots(figsize=(20, 5))
s10_source_codes = df_raw['Ssrc_S10'].cat.codes
ax.scatter(df_raw.index, df_raw['S10'], c=s10_source_codes, cmap='tab10', s=10)
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax.set(xlabel='Year', ylabel='S10')
ax.set_title('S10 and Ssrc_S10')
# TODO: Add a legend

TODO: How can we detect those anomalies between 1988 and 1999? Maybe they are the values equal to zero.

In [None]:
# Count how many S10 values are exactly zero
s10_is_zero = df_raw['S10'] == 0
print(s10_is_zero.sum())
# Treat those zeros as missing values
df_raw.loc[s10_is_zero, 'S10'] = np.nan
# Draw S10 again, still coloured by the Ssrc_S10 category code
fig, ax = plt.subplots(figsize=(20, 5))
ax.scatter(df_raw.index, df_raw['S10'], c=df_raw['Ssrc_S10'].cat.codes, cmap='tab10', s=10)


In [None]:
# Settings shared by the preprocessing steps and by later cells
datetime_col = 'Date'
freq = '1D'
data_columns = ['F10', 'S10', 'M10', 'Y10']
imputation_method = 'ffill'

# Steps of the sklearn preprocessing pipeline, applied in this order
preproc_steps = [
    # shrink dataframe memory usage and set the right dtypes
    ('shrinker', TSShrinkDataFrame()),
    # drop duplicates
    ('drop_duplicates', TSDropDuplicates(use_index=True)),
    # fill missing data in the data columns using `imputation_method`;
    # no constant fill value is given (value=None)
    ('fill_missing', TSFillMissing(columns=data_columns, method=imputation_method, value=None)),
]
preproc_pipe = sklearn.pipeline.Pipeline(preproc_steps, verbose=True)

df = preproc_pipe.fit_transform(df_raw)
df

In [None]:
# In the paper by Licata et al. (2020) (https://agupubs.onlinelibrary.wiley.com/doi/pdf/10.1029/2020SW002496),
# the authors use the period from October 2012 through the end of 2018 for benchmarking.
# We therefore use that same period as our test set. Everything before the test set
# is used for training, and everything after the test set is used for validation.
test_start_datetime = '2012-10-01'
test_end_datetime = '2018-12-31'
# Validation starts right after the test period ends, so the two periods never
# overlap. (A start of '2018-01-01' would put all of 2018 in both sets and would
# cut the test period short at the end of 2017 wherever the data is truncated
# at valid_start_datetime.)
valid_start_datetime = '2019-01-01'

# Row masks for each period. The test interval is half-open on valid_start_datetime,
# so every row of the last test day is kept even if the index timestamps carry a
# time-of-day component.
is_test = (df.index >= test_start_datetime) & (df.index < valid_start_datetime)
is_valid = df.index >= valid_start_datetime

# Plot the variables F10, S10, M10 and Y10, one panel each, drawing the training,
# test and validation periods in different colors.
fig, ax = plt.subplots(4, 1, figsize=(20, 10))

for i, var in enumerate(['F10', 'S10', 'M10', 'Y10']):
    # The full series is drawn first; the test and validation segments are drawn
    # on top of it, so only the training period stays visible in the first color.
    ax[i].plot(df[var], label='train')
    ax[i].plot(df.loc[is_test, var], label='test')
    ax[i].plot(df.loc[is_valid, var], label='valid')
    ax[i].set_title(var)
    ax[i].legend()
    ax[i].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))  # format x-axis ticks


In [None]:
# Splits: since the validation period comes after the test period in this use case,
# we cannot use the default `get_forecasting_splits` from tsai for everything.
# Instead, we build the validation split manually and use the function only for
# the train/test splits.

horizon = 6  # same as paper by Licata et al. (2020)
lookback = 6 * horizon  # same as paper by Stevenson et al. (2021)

# Split indices address sliding-window samples, not dataframe rows.
# NOTE(review): this assumes the sliding window produces
# len(df) - lookback - horizon + 1 samples, with sample i starting at row i —
# confirm against tsai's `prepare_forecasting_data`.
# Rows too close to the end of the data cannot start a full
# (lookback + horizon) window, so their positions are not valid sample indices
# and must be left out of the validation split.
n_samples = len(df) - lookback - horizon + 1
valid_rows = np.flatnonzero(df.index >= valid_start_datetime)
val_idxs = L(valid_rows[valid_rows < n_samples].tolist())

# Train/test splits are computed on the data that precedes the validation period.
splits_ = get_forecasting_splits(df[df.index < valid_start_datetime],
                                 fcst_history=lookback,
                                 fcst_horizon=horizon,
                                 use_index=True,
                                 test_cutoff_datetime=test_start_datetime,
                                 show_plot=False)
# Order expected downstream: (train, valid, test)
splits = (splits_[0], val_idxs, splits_[1])
splits

In [None]:
# Report the position(s) within the splits tuple whose split contains index 9477
target_idx = 9477
matching_positions = [pos for pos, split in enumerate(splits) if target_idx in split]
for pos in matching_positions:
    print(pos)

In [None]:
# With the splits for this particular experiment defined, we can scale the data.
# The scaler receives the training split through `scaler__idxs`.
train_split = splits[0]
scaler_steps = [('scaler', TSStandardScaler(columns=data_columns))]
exp_pipe = sklearn.pipeline.Pipeline(scaler_steps, verbose=True)

# Write the pipeline to disk and read it back before it is fitted
save_object(exp_pipe, 'tmp/exp_pipe.pkl')
exp_pipe = load_object('tmp/exp_pipe.pkl')

# Fit and transform on a frame with a positional index, then restore the
# datetime column as the index of the scaled frame.
df_scaled = (
    exp_pipe
    .fit_transform(df.reset_index(), scaler__idxs=train_split)
    .set_index(datetime_col)
)
df_scaled.head()


### Apply a sliding window

In [None]:
# We'll approach the time series forecasting task as a supervised learning problem.
# Remember that tsai requires that both inputs and outputs have the following shape:
# (samples, features, steps)

# To get those inputs and outputs we use a function called
# `prepare_forecasting_data`, which applies a sliding window along the dataframe.
# The windows are built from the *scaled* dataframe: the scaling pipeline was
# fitted for exactly this purpose, and building them from the unscaled `df`
# would leave `df_scaled` unused and feed raw values to the model.
x_vars = data_columns
y_vars = data_columns
X, y = prepare_forecasting_data(df_scaled, fcst_history=lookback, fcst_horizon=horizon,
                                x_vars=x_vars, y_vars=y_vars)
X.shape, y.shape

In [None]:
# Smallest and largest index contained in the validation split (second entry of `splits`)
val_split = splits[1]
min(val_split), max(val_split)

### Prepare the forecaster

In [None]:
# Architecture hyper-parameters handed to the PatchTST model
arch_config = {
    'n_layers': 3,          # number of encoder layers
    'n_heads': 4,           # number of heads
    'd_model': 16,          # dimension of model
    'd_ff': 128,            # dimension of fully connected network
    'attn_dropout': 0.0,    # dropout applied to the attention weights
    'dropout': 0.3,         # dropout applied to all linear layers in the encoder except q,k&v projections
    'patch_len': 24,        # length of the patch applied to the time series to create patches
    'stride': 2,            # stride used when creating patches
    'padding_patch': True,  # padding_patch
}

# Build the forecaster from the windowed data, the splits and both pipelines
learn = TSForecaster(
    X, y,
    splits=splits,
    batch_size=16,
    path="models",
    pipelines=[preproc_pipe, exp_pipe],
    arch="PatchTST",
    arch_config=arch_config,
    metrics=[mse, mae],
    cbs=[ShowGraph()],
)