## Setup

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
from src.py_src import util
from tqdm import tqdm
import numpy as np

In [2]:
# Populate the process environment from a .env file (python-dotenv).
load_dotenv()

# Root directories that hold one sub-folder of CSV files per year.
events_csv_path = os.getenv("EVENTS_CSV_PATH")
xrays_csv_path = os.getenv("XRAY_V2_CSV_PATH")

# Fail fast and name the culprit: an unset variable would otherwise stay None
# and only surface later as an opaque TypeError inside os.path.join.
_missing_env_vars = [
    name
    for name, value in (("EVENTS_CSV_PATH", events_csv_path),
                        ("XRAY_V2_CSV_PATH", xrays_csv_path))
    if not value
]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

# Years whose CSV files are read (both bounds inclusive).
data_begin_year = 1983
data_end_year = 2025
data_year_range = range(data_begin_year, data_end_year + 1)

# Years whose XRA events are used to build features/targets (both bounds inclusive).
events_begin_year = 1996
events_end_year = 2024
events_year_range = range(events_begin_year, events_end_year + 1)

# Column names the X-ray frames are normalised to before sliding.
# cols_names = ['xs','xl']
cols_names = ['xl']

## Reading CSVs

In [3]:
# Nested containers filled by the loading cells, each inner dict keyed by year:
#   data['events']['all'][year], data['events']['DSD'][year], data['xrays'][year]
data = dict(
    events=dict(all={}, DSD={}),
    xrays={},
)

In [4]:
# Paths of the 1983-1995 events/DSD CSVs, appended as (events, DSD) per year.
# `list.append` returns None, so its result is deliberately not bound to a name.
missing_files = []
for y in range(1983, 1995+1):
    missing_files.append(os.path.join(events_csv_path, str(y), f"{y}_events.csv"))
    missing_files.append(os.path.join(events_csv_path, str(y), f"{y}_DSD.csv"))

In [5]:
# Events/DSD files for 1983-1995 are treated as known-missing: a FileNotFoundError
# for one of these paths is silenced below, any other missing file is printed.
missing_events_dsd = [
    os.path.join(events_csv_path, str(y), f"{y}_{file_type}.csv")
    for y in range(1983, 1995+1)
    for file_type in ('events', 'DSD')
]
missing_files = set(missing_events_dsd)

for y in tqdm(data_year_range):
    events_year_dir = os.path.join(events_csv_path, str(y))
    xrays_year_dir = os.path.join(xrays_csv_path, str(y))

    # All events of the year; the 'date' column is renamed to 'ds'.
    try:
        data['events']['all'][y] = pd.read_csv(os.path.join(events_year_dir, f"{y}_events.csv"),
                                               parse_dates=['date','begin','max','end'],
                                               ).rename(columns={'date':'ds'})
    except FileNotFoundError as e:
        data['events']['all'][y] = pd.DataFrame()
        if e.filename not in missing_files:
            print(e)

    # DSD table of the year, indexed by 'ds'.
    try:
        data['events']['DSD'][y] = pd.read_csv(os.path.join(events_year_dir, f"{y}_DSD.csv"),
                                               parse_dates=['ds'],
                                               index_col='ds')
    except FileNotFoundError as e:
        data['events']['DSD'][y] = pd.DataFrame()
        if e.filename not in missing_files:
            print(e)

    # X-ray series of the year, indexed by 'ds' and conformed to a 1-minute frequency.
    try:
        data['xrays'][y] = pd.read_csv(os.path.join(xrays_year_dir, f"{y}_xrays.csv"),
                                       parse_dates=['ds'],
                                       index_col='ds')
        data['xrays'][y] = data['xrays'][y].asfreq('1min')

    except FileNotFoundError as e:
        # Store the placeholder under data['xrays'][y], like the success path;
        # `data` has no year keys at its top level, so data[y][...] would raise KeyError.
        data['xrays'][y] = pd.DataFrame()
        if e.filename not in missing_files:
            print(e)

 98%|█████████▊| 42/43 [00:25<00:00,  1.45it/s]

[Errno 2] No such file or directory: 'G:\\My Drive\\Solar_Flares\\Data\\events\\formatted_csv\\2025\\2025_events.csv'
[Errno 2] No such file or directory: 'G:\\My Drive\\Solar_Flares\\Data\\events\\formatted_csv\\2025\\2025_DSD.csv'


100%|██████████| 43/43 [00:26<00:00,  1.63it/s]


### Categorizing Different Event Types

In [6]:
# Event type codes used to split each yearly "all events" table.
event_types = ("BSL", "DSF", "EPL", "FIL", "FLA", "FOR", "GLE", "LPS", "PCA", "RBR", "RNS", "RSP", "XRA")

# Build data['events'][<type>][<year>]: the rows of that type, with their position
# in the yearly table preserved in an 'original_index' column.
for t in event_types:
    data['events'][t] = {}
    for y in data_year_range:
        df_all = data['events']['all'][y]
        try:
            data['events'][t][y] = (
                df_all[df_all['type'] == t]
                .reset_index(drop=False)
                .rename(columns={'index': 'original_index'})
            )
        except KeyError:
            # No 'type' column (e.g. the empty placeholder frame of a missing year).
            data['events'][t][y] = pd.DataFrame()

In [7]:
# xra_events[<year>] holds the cleaned XRA table of that year;
# xra_events['all'] is their concatenation with a fresh integer index.
xra_events = {'all': pd.DataFrame()}
all_dfs_list = []
xra_columns = ['ds', 'type', 'class', 'class_numeric', 'flux', 'begin', 'max', 'end', 'q']

for y in events_year_range:
    xra_year = data['events']['XRA'][y].copy()

    # First whitespace-separated token of 'particulars' carries the class string.
    class_token = xra_year['particulars'].str.split(expand=True)[0]
    xra_year['flux'] = class_token.apply(util.parse_flare_class_expanded)
    xra_year['class'] = class_token.str[0].str.upper()

    # Letters absent from util.flare_class_map are mapped to 0.
    xra_year['class_numeric'] = xra_year['class'].apply(lambda c: util.flare_class_map.get(c, 0))

    # Keep the working columns only and discard events without a begin time.
    xra_year = xra_year[xra_columns].dropna(subset=['begin'])

    xra_events[y] = xra_year
    all_dfs_list.append(xra_year)

xra_events['all'] = pd.concat(all_dfs_list, ignore_index=True)

### DFs To Slide

In [8]:
def get_cols(year: int) -> list[str]:
    """Return the X-ray column name(s) to select from the frame of ``year``.

    Years before 2020 use ``'xl'``; 2020 and later use ``'xrsb_flux'``.
    """
    # Two-channel variant, kept for reference:
    #   ['xs','xl'] before 2020, ['xrsa_flux','xrsb_flux'] otherwise
    if year < 2020:
        return ['xl']
    return ['xrsb_flux']

In [9]:
# Collect, per year, the X-ray column(s) under a common name and the XRA event
# fields needed later, then concatenate both in chronological order.
xrays_to_slide_list = []
xra_events_to_slide_list = []

for y in tqdm(events_year_range):
    cols = get_cols(y)
    df_xrays = data['xrays'][y][cols]
    # Compare list to list. A membership test (`cols not in cols_names`) would ask
    # whether the list itself is an element of `cols_names`, which is always false.
    if cols != cols_names:
        df_xrays = df_xrays.rename(columns=dict(zip(cols, cols_names)))

    xrays_to_slide_list.append(df_xrays)

    df_events = xra_events[y][['class_numeric', 'begin', 'flux']]
    xra_events_to_slide_list.append(df_events)

xrays_to_slide = pd.concat(xrays_to_slide_list).sort_index()
xra_events_to_slide = pd.concat(xra_events_to_slide_list).sort_values('begin')

100%|██████████| 29/29 [00:00<00:00, 318.06it/s]


### Ground Truth

In [10]:
def create_ground_truth(events: pd.DataFrame, indexes: pd.Index) -> pd.DataFrame:
    """Label every timestamp in ``indexes`` with the class of the event covering it.

    Args:
        events: Event table with datetime columns ``begin`` and ``end`` and a
            ``class_numeric`` column. The caller's frame is not modified.
        indexes: Index of the returned frame; events are written with
            label-based slices ``begin:end`` on it.

    Returns:
        A frame indexed by ``indexes`` with one column, ``current_class``: 0 where
        no event is active, otherwise the ``class_numeric`` of the active event.
        Where events overlap, the highest class wins.

    A missing ``end`` is imputed as ``begin`` plus the mean duration of the event's
    class. If that class has no known duration, the mean over all events is used;
    if no duration is known at all, the event covers only its ``begin`` timestamp.
    """
    events = events.copy()

    # Duration in minutes; NaN where `end` is missing.
    events['duration'] = (events['end'] - events['begin']).dt.total_seconds() / 60
    mean_duration_by_class = events.groupby('class_numeric')['duration'].mean()

    # Minutes to add per event: the class mean, or the overall mean when the class
    # mean is itself NaN (no event of that class has a known end).
    fallback_minutes = (
        events['class_numeric']
        .map(mean_duration_by_class)
        .fillna(events['duration'].mean())
    )
    imputed_end = events['begin'] + pd.to_timedelta(fallback_minutes, unit='m')

    # Never leave NaT as a slice bound: the last resort is the begin time itself.
    events['end_imputed'] = events['end'].fillna(imputed_end).fillna(events['begin'])

    # Ascending order so that higher classes are written last and overwrite lower ones.
    events_sorted = events.sort_values(by='class_numeric', ascending=True)

    ground_truth = pd.DataFrame(index=indexes)
    ground_truth['current_class'] = 0

    for event in tqdm(events_sorted.itertuples(), total=len(events_sorted)):
        ground_truth.loc[event.begin:event.end_imputed, 'current_class'] = event.class_numeric

    return ground_truth

In [11]:
df_ground_truth = create_ground_truth(xra_events['all'], xrays_to_slide.index)

# Share of timestamps carrying each class label, most frequent first.
percentages = df_ground_truth['current_class'].value_counts(normalize=True)
for label, share_pct in (percentages * 100).items():
    print(f"{label} -> {share_pct:.2f}%")

100%|██████████| 50756/50756 [00:02<00:00, 23651.37it/s]


0 -> 93.21%
3 -> 4.08%
2 -> 1.94%
4 -> 0.71%
5 -> 0.06%
1 -> 0.01%


## Showing DFs

In [12]:
# Inspect the 2020 X-ray frame as loaded (columns xrsa_flux / xrsb_flux, before renaming).
data['xrays'][2020]

Unnamed: 0_level_0,xrsa_flux,xrsb_flux
ds,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01 00:00:00,2.344518e-09,4.515841e-09
2020-01-01 00:01:00,2.840341e-09,5.645528e-09
2020-01-01 00:02:00,4.735201e-09,4.346297e-09
2020-01-01 00:03:00,5.261884e-09,4.360483e-09
2020-01-01 00:04:00,6.190385e-09,4.019309e-09
...,...,...
2020-12-31 23:55:00,,
2020-12-31 23:56:00,,
2020-12-31 23:57:00,,
2020-12-31 23:58:00,,


In [13]:
# Inspect the concatenated 1-minute X-ray series used for the sliding features.
xrays_to_slide

Unnamed: 0_level_0,xl
ds,Unnamed: 1_level_1
1996-01-01 00:00:00,1.380000e-08
1996-01-01 00:01:00,1.460000e-08
1996-01-01 00:02:00,1.380000e-08
1996-01-01 00:03:00,1.370000e-08
1996-01-01 00:04:00,1.300000e-08
...,...
2024-12-31 23:55:00,6.078074e-06
2024-12-31 23:56:00,6.095521e-06
2024-12-31 23:57:00,6.143794e-06
2024-12-31 23:58:00,6.191976e-06


In [14]:
# Inspect the concatenated XRA events (class_numeric, begin, flux), sorted by begin.
xra_events_to_slide

Unnamed: 0,class_numeric,begin,flux
0,2,1996-07-31 01:32:00,1.300000e-07
1,2,1996-07-31 02:22:00,1.200000e-07
2,2,1996-07-31 07:00:00,1.200000e-07
3,2,1996-07-31 08:29:00,2.000000e-07
4,2,1996-07-31 11:14:00,1.400000e-07
...,...,...,...
3751,3,2024-12-30 20:21:00,5.500000e-06
3752,3,2024-12-30 21:27:00,6.500000e-06
3753,3,2024-12-30 21:48:00,6.100000e-06
3754,3,2024-12-30 22:19:00,7.600000e-06


In [15]:
# Share of XRA events per class letter, followed by the total number of events.
percentages = xra_events['all']['class'].value_counts(normalize=True)
for label, share_pct in (percentages * 100).items():
    print(f"{label} -> {share_pct:.2f}%")
print(len(xra_events['all']))

C -> 57.49%
B -> 34.64%
M -> 7.16%
X -> 0.49%
A -> 0.22%
50756


## Sliding-Window DataFrames

In [16]:
# Rolling-window sizes (pandas offset strings) for the statistics features...
metrics_windows = ['1h', '6h', '12h', '24h', '7D', '14D', '30D']
# ...and for the smoothed first/second differences.
deriv_windows = ['5min', '15min', '30min', '1h', '3h', '6h', '12h', '24h']

df_features = pd.DataFrame(index=xrays_to_slide.index)

# The feature loop reads these columns from `xrays_to_slide`, whose columns were
# renamed to `cols_names`; derive the list from it so the two cannot drift apart.
# (`cols` is re-bound here because the yearly loop above reuses the name.)
cols = list(cols_names)

In [17]:
# Rolling statistics, smoothed differences and ratio features for each flux column.
for col in cols:
    flux = xrays_to_slide[col]
    # Kept as a local series so that `xrays_to_slide` is never modified by this cell.
    # NOTE(review): the 1e-9 offset presumably keeps log10 finite at zero flux.
    flux_log = np.log10(flux + 1e-9)

    for w in tqdm(metrics_windows):
        rolling_window = flux.rolling(window=w)
        df_features[f'{col}_mean_{w}'] = rolling_window.mean()
        df_features[f'{col}_std_{w}'] = rolling_window.std()
        df_features[f'{col}_max_{w}'] = rolling_window.max()

        df_features[f'{col}_log_mean_{w}'] = flux_log.rolling(window=w).mean()
        # "integ" is the plain rolling sum of the flux samples.
        df_features[f'{col}_integ_{w}'] = rolling_window.sum()

    # First and second differences do not depend on the window: compute them once.
    col_diff = flux.diff()
    diff_2 = col_diff.diff()
    for w in tqdm(deriv_windows):
        df_features[f'{col}_deriv_{w}'] = col_diff.rolling(w).mean()
        df_features[f'{col}_accel_{w}'] = diff_2.rolling(w).mean()

    # Ratios of short-term to long-term levels; 1e-9 is added to each denominator.
    df_features[f'{col}_ratio_max1h_mean24h'] = df_features[f'{col}_max_1h'] / (df_features[f'{col}_mean_24h'] + 1e-9)
    df_features[f'{col}_ratio_max6h_mean24h'] = df_features[f'{col}_max_6h'] / (df_features[f'{col}_mean_24h'] + 1e-9)
    df_features[f'{col}_ratio_mean24h_mean7d'] = df_features[f'{col}_mean_24h'] / (df_features[f'{col}_mean_7D'] + 1e-9)

100%|██████████| 7/7 [00:13<00:00,  1.99s/it]
100%|██████████| 8/8 [00:07<00:00,  1.04it/s]


In [18]:
# Per-minute series holding the class of the event beginning at that minute (0 if none).
# Events sharing a begin time are collapsed to their highest class.
events_series = pd.Series(0, index=xrays_to_slide.index)
event_occurrences = (
    xra_events_to_slide
    .set_index('begin')['class_numeric']
    .groupby(level=0)
    .max()
)
events_series.update(event_occurrences)

history_windows = ['6h', '24h', '3D', '7D']

# 0/1 indicator series per class letter (numeric codes 3, 4 and 5).
class_flags = {
    'C': (events_series == 3).astype(int),
    'M': (events_series == 4).astype(int),
    'X': (events_series == 5).astype(int),
}

for w in tqdm(history_windows, desc="Event History Counts"):
    # Number of C/M/X events that began within the trailing window.
    for letter, flag in class_flags.items():
        df_features[f'count_{letter}_{w}'] = flag.rolling(window=w).sum()

    # Sum of the numeric classes of all events that began within the window.
    df_features[f'sum_class_score_{w}'] = events_series.rolling(window=w).sum()

Event History Counts: 100%|██████████| 4/4 [00:07<00:00,  1.79s/it]


In [19]:
# Per-minute event class and flux (0 where no event begins); events sharing a
# begin time are collapsed to their maximum.
target_events_grouped = xra_events_to_slide.set_index('begin')[['class_numeric', 'flux']]
target_events_grouped = target_events_grouped.groupby(level=0).max().reindex(xrays_to_slide.index).fillna(0)

# A rolling max over the time-reversed series looks forward instead of backward;
# reversing the result again restores chronological order.
reversed_class_numeric = target_events_grouped['class_numeric'].iloc[::-1]
future_class_numeric_max = reversed_class_numeric.rolling(window='24h', min_periods=1).max().iloc[::-1]

reversed_flux = target_events_grouped['flux'].iloc[::-1]
future_flux_max = reversed_flux.rolling(window='24h', min_periods=1).max().iloc[::-1]

df_target = pd.DataFrame(index=xrays_to_slide.index)

# Shifting the labels back by one minute makes each row take the forward-looking
# value of the following minute. The shift leaves some labels of the target index
# without a value (at least the last one), so align to the index first, then
# fill and cast. Casting before the alignment would let NaN reappear on assignment
# and silently turn the class column back into float.
df_target['target_class_in_24h'] = (
    future_class_numeric_max.shift(-1, freq='1min')
    .reindex(df_target.index)
    .fillna(0)
    .astype(int)
)
df_target['target_flux_in_24h'] = (
    future_flux_max.shift(-1, freq='1min')
    .reindex(df_target.index)
    .fillna(0.0)
)

In [20]:
RESAMPLE_FREQ = '10min'
RESAMPLE_METHOD = 'last'


def _resample_and_fill(frame: pd.DataFrame) -> pd.DataFrame:
    """Downsample ``frame`` to RESAMPLE_FREQ, forward-fill, then drop rows still holding NaN."""
    downsampled = frame.resample(RESAMPLE_FREQ).agg(RESAMPLE_METHOD)
    return downsampled.ffill().dropna()


df_features = _resample_and_fill(df_features)
df_target = _resample_and_fill(df_target)

# Features and targets side by side; rows missing on either side are dropped.
df_slided = pd.concat([df_features, df_target], axis=1).dropna()

In [21]:
# Inspect the final 10-minute table of features and targets.
df_slided

Unnamed: 0_level_0,xl_mean_1h,xl_std_1h,xl_max_1h,xl_log_mean_1h,xl_integ_1h,xl_mean_6h,xl_std_6h,xl_max_6h,xl_log_mean_6h,xl_integ_6h,...,count_C_3D,count_M_3D,count_X_3D,sum_class_score_3D,count_C_7D,count_M_7D,count_X_7D,sum_class_score_7D,target_class_in_24h,target_flux_in_24h
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996-01-01 00:00:00,1.382000e-08,4.077036e-10,1.460000e-08,-7.829300,1.382000e-07,1.382000e-08,4.077036e-10,1.460000e-08,-7.829300,1.382000e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996-01-01 00:10:00,1.386000e-08,3.662039e-10,1.460000e-08,-7.828107,2.772000e-07,1.386000e-08,3.662039e-10,1.460000e-08,-7.828107,2.772000e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996-01-01 00:20:00,1.465000e-08,1.199353e-09,1.690000e-08,-7.806686,4.395000e-07,1.465000e-08,1.199353e-09,1.690000e-08,-7.806686,4.395000e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996-01-01 00:30:00,1.500750e-08,1.278498e-09,1.760000e-08,-7.797015,6.003000e-07,1.500750e-08,1.278498e-09,1.760000e-08,-7.797015,6.003000e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996-01-01 00:40:00,1.493600e-08,1.207757e-09,1.760000e-08,-7.798834,7.468000e-07,1.493600e-08,1.207757e-09,1.760000e-08,-7.798834,7.468000e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31 23:10:00,1.741547e-05,8.155662e-06,2.906757e-05,-4.821113,1.044928e-03,8.042177e-06,7.965353e-06,2.906757e-05,-5.260968,2.895184e-03,...,8.0,35.0,3.0,179.0,34.0,44.0,3.0,293.0,0.0,0.0
2024-12-31 23:20:00,1.386945e-05,8.019732e-06,2.906757e-05,-4.937769,8.321670e-04,8.102606e-06,7.935948e-06,2.906757e-05,-5.254595,2.916938e-03,...,8.0,35.0,3.0,179.0,34.0,44.0,3.0,293.0,0.0,0.0
2024-12-31 23:30:00,1.089013e-05,7.042411e-06,2.906757e-05,-5.039415,6.534077e-04,8.168346e-06,7.906699e-06,2.906757e-05,-5.248270,2.940605e-03,...,8.0,35.0,3.0,179.0,34.0,44.0,3.0,293.0,0.0,0.0
2024-12-31 23:40:00,8.751141e-06,5.947402e-06,2.906757e-05,-5.119485,5.250685e-04,8.258113e-06,7.865352e-06,2.906757e-05,-5.239397,2.972921e-03,...,8.0,35.0,3.0,179.0,34.0,44.0,3.0,293.0,0.0,0.0


## Exporting CSVs

In [22]:
# Output directory for the exported table, taken from the environment.
slided_dfs_path = os.getenv("SLIDED_DFS_CSV_PATH")

# Earlier CSV export, kept for reference:
# # df_slided.to_csv(os.path.join(slided_dfs_path, f"data_slided_V3"))
slided_parquet_path = os.path.join(slided_dfs_path, "data_slided_V4.parquet")
df_slided.to_parquet(slided_parquet_path)