## Setup

In [44]:
import pandas as pd
import os
from dotenv import load_dotenv
from src import util
from tqdm import tqdm

In [45]:
# Run dotenv's loader first so the path variables below can be read from the environment.
load_dotenv()

# Root folders of the events and X-ray CSV trees; either is None when its variable is unset.
events_csv_path, xrays_csv_path = (
    os.getenv(env_name) for env_name in ("EVENTS_CSV_PATH", "XRAY_V2_CSV_PATH")
)

# Inclusive year bounds for everything the loading cell iterates over.
data_begin_year, data_end_year = 1983, 2025
data_year_range = range(data_begin_year, data_end_year + 1)

# Inclusive year bounds for the event-based processing cells.
events_begin_year, events_end_year = 1996, 2024
events_year_range = range(events_begin_year, events_end_year + 1)

## Reading CSVs

In [46]:
# Registry of loaded tables, filled by the loading cell:
# data['events'][<table>][<year>] and data['xrays'][<year>].
data = dict(
    events=dict(all={}, DSD={}),
    xrays={},
)

In [47]:
# Event/DSD CSV paths for the years that precede the first year of event data
# (data_begin_year .. events_begin_year - 1, i.e. 1983-1995 with the current
# configuration). Order per year: "<year>_events.csv" then "<year>_DSD.csv".
# The previous version assigned the result of list.append (always None) to an
# unused name and nested os.path.join calls; a single comprehension builds the
# same list.
missing_files = [
    os.path.join(events_csv_path, str(y), f"{y}_{file_type}.csv")
    for y in range(data_begin_year, events_begin_year)
    for file_type in ('events', 'DSD')
]

In [48]:
# Event/DSD files for the years before event data starts
# (data_begin_year .. events_begin_year - 1). A FileNotFoundError for one of
# these paths is expected and is not reported.
missing_events_dsd = [
    os.path.join(events_csv_path, str(y), f"{y}_{file_type}.csv")
    for y in range(data_begin_year, events_begin_year)
    for file_type in ('events', 'DSD')
]
missing_files = set(missing_events_dsd)

# Load the three per-year tables into `data`. A missing file leaves an empty
# DataFrame in its slot so that every year key exists afterwards; the error is
# printed unless the path is in `missing_files`.
for y in data_year_range:
    events_year_dir = os.path.join(events_csv_path, str(y))
    xrays_year_dir = os.path.join(xrays_csv_path, str(y))

    # Full event list of the year; the 'date' column is renamed to 'ds'.
    try:
        data['events']['all'][y] = pd.read_csv(os.path.join(events_year_dir, f"{y}_events.csv"),
                                               parse_dates=['date', 'begin', 'max', 'end'],
                                               ).rename(columns={'date': 'ds'})
    except FileNotFoundError as e:
        data['events']['all'][y] = pd.DataFrame()
        if e.filename not in missing_files:
            print(e)

    # DSD table of the year, indexed by its 'ds' timestamp column.
    try:
        data['events']['DSD'][y] = pd.read_csv(os.path.join(events_year_dir, f"{y}_DSD.csv"),
                                               parse_dates=['ds'],
                                               index_col='ds')
    except FileNotFoundError as e:
        data['events']['DSD'][y] = pd.DataFrame()
        if e.filename not in missing_files:
            print(e)

    # X-ray table of the year, indexed by 'ds' and conformed to a 1-minute frequency.
    try:
        data['xrays'][y] = pd.read_csv(os.path.join(xrays_year_dir, f"{y}_xrays.csv"),
                                       parse_dates=['ds'],
                                       index_col='ds')
        data['xrays'][y] = data['xrays'][y].asfreq('1min')
    except FileNotFoundError as e:
        # Fixed: the keys were swapped (data[y]['xrays']), which raised KeyError
        # inside this handler instead of recording an empty frame for the year.
        data['xrays'][y] = pd.DataFrame()
        if e.filename not in missing_files:
            print(e)

[Errno 2] No such file or directory: 'G:\\My Drive\\Solar_Flares\\Data\\events\\formatted_csv\\2025\\2025_events.csv'
[Errno 2] No such file or directory: 'G:\\My Drive\\Solar_Flares\\Data\\events\\formatted_csv\\2025\\2025_DSD.csv'


## Preparing Data

### Categorizing the Different Event Types

In [49]:
event_types = ("BSL", "DSF", "EPL", "FIL", "FLA", "FOR", "GLE", "LPS", "PCA", "RBR", "RNS", "RSP", "XRA")

# Split each year's full event table into one table per event type, stored as
# data['events'][<type>][<year>]. The row position in the full table is kept
# in an 'original_index' column.
for t in event_types:
    data['events'][t] = {}
    for y in data_year_range:
        df_all = data['events']['all'][y]

        if 'type' not in df_all.columns:
            # No 'type' column to filter on (e.g. the empty placeholder frame
            # stored for a year whose events file was not found).
            data['events'][t][y] = pd.DataFrame()
            continue

        data['events'][t][y] = (
            df_all[df_all['type'] == t]
            .reset_index(drop=False)
            .rename(columns={'index': 'original_index'})
        )

In [50]:
# Per-year XRA event tables (keyed by year) plus their concatenation under 'all'.
# The 'all' slot starts as an empty placeholder and is filled once after the loop:
# concatenating inside the loop re-copied the accumulated rows on every year and
# seeded pd.concat with an empty DataFrame.
xra_events = {'all': pd.DataFrame()}

for y in events_year_range:
    df = data['events']['XRA'][y]

    # 'particulars' is split on whitespace into the class string and the peak flux.
    # NOTE: these first assignments add columns to the frame cached in `data`
    # in place, exactly as before.
    # NOTE(review): assumes every 'particulars' value splits into exactly two tokens - confirm.
    df[['class_expanded', 'peak_flux']] = df['particulars'].str.split(expand=True)
    df['peak_flux'] = pd.to_numeric(df['peak_flux'])
    df['class'] = df['class_expanded'].str[0]  # first character of the class string
    df = df.drop(columns=['particulars'])

    # Numeric class via util.flare_class_map; letters missing from the map become 0.
    df['class_numeric'] = df['class'].apply(lambda c: util.flare_class_map.get(c, 0))

    df = df[['ds', 'type', 'class_numeric', 'class', 'class_expanded', 'peak_flux', 'begin', 'max', 'end', 'q']]

    # Events without a begin time are discarded.
    df = df.dropna(subset=['begin'])

    xra_events[y] = df

# Single concatenation of all yearly tables, in year order.
if len(events_year_range) > 0:
    xra_events['all'] = pd.concat([xra_events[y] for y in events_year_range])

### DFs To Slide

In [51]:
def get_cols(year: int) -> list[str]:
    """Return the pair of flux column names used by the X-ray table of ``year``.

    Years before 2020 use ``['xs', 'xl']``; 2020 and later use
    ``['xrsa_flux', 'xrsb_flux']``.
    """
    if year < 2020:
        return ['xs', 'xl']
    return ['xrsa_flux', 'xrsb_flux']

In [52]:
# Stack the yearly X-ray and XRA-event tables into two continuous frames.
# Yearly pieces are collected first and concatenated once: concatenating inside
# the loop re-copied all accumulated rows every year and seeded pd.concat with
# an empty DataFrame.
xrays_parts = []
xra_events_parts = []
for y in events_year_range:
    cols = get_cols(y)
    df_xrays = data['xrays'][y][cols]
    if cols != ['xs', 'xl']:
        # Bring the 2020+ column names in line with the earlier 'xs'/'xl' naming.
        df_xrays = df_xrays.rename(columns=dict(zip(cols, ['xs', 'xl'])))
    xrays_parts.append(df_xrays)

    df_events = xra_events[y][['class_numeric', 'begin']]
    xra_events_parts.append(df_events)

# An empty year range keeps the previous result: two empty DataFrames.
xrays_to_slide = pd.concat(xrays_parts) if xrays_parts else pd.DataFrame()
xra_events_to_slide = pd.concat(xra_events_parts) if xra_events_parts else pd.DataFrame()

### Ground Truth

In [53]:
def create_ground_truth(events: pd.DataFrame, indexes: pd.Index) -> pd.DataFrame:
    """Label every timestamp in ``indexes`` with the class of the event covering it.

    Parameters
    ----------
    events : pd.DataFrame
        Needs the columns 'begin' and 'end' (datetimes; 'end' may be missing)
        and 'class_numeric'. The frame passed in is not modified.
    indexes : pd.Index
        Index of the returned frame. It is sliced with ``begin:end`` timestamp
        ranges, so a sorted DatetimeIndex is expected.

    Returns
    -------
    pd.DataFrame
        One column, 'current_class', indexed by ``indexes``: 0 where no event
        is active, otherwise the highest 'class_numeric' among the events whose
        [begin, end] interval contains the timestamp.

    Notes
    -----
    A missing 'end' is imputed as ``begin`` + the mean duration of the event's
    class. If that mean is undefined (no event of the class has an end), the
    overall mean duration is used, and a zero duration as a last resort.
    Previously such events produced a NaT end and an invalid slice bound.
    """
    # Work on a copy with unique row labels so the label-aligned fill below is unambiguous.
    events = events.reset_index(drop=True)

    # Duration in minutes; NaN where 'end' is missing.
    events['duration'] = (events['end'] - events['begin']).dt.total_seconds() / 60
    mean_duration_by_class = events.groupby('class_numeric')['duration'].mean()

    # Per-event fallback duration: class mean -> overall mean -> 0 minutes.
    fallback_minutes = (
        events['class_numeric']
        .map(mean_duration_by_class)
        .fillna(events['duration'].mean())
        .fillna(0.0)
    )
    events['end_imputed'] = events['end'].fillna(
        events['begin'] + pd.to_timedelta(fallback_minutes, unit='m')
    )

    # Ascending class order: higher classes are written last and therefore win overlaps.
    events_sorted = events.sort_values(by='class_numeric', ascending=True)

    ground_truth = pd.DataFrame(index=indexes)
    ground_truth['current_class'] = 0

    for event in tqdm(events_sorted.itertuples(), total=len(events_sorted)):
        ground_truth.loc[event.begin:event.end_imputed, 'current_class'] = event.class_numeric

    return ground_truth

In [55]:
# Label every timestamp of the stacked X-ray index with its active event class.
df_ground_truth = create_ground_truth(
    events=xra_events['all'],
    indexes=xrays_to_slide.index,
)

# Share of timestamps carrying each class label, most frequent first.
percentages = df_ground_truth['current_class'].value_counts(normalize=True)
for value_label, proportion in zip(percentages.index, percentages):
    print(f"{value_label} -> {proportion * 100:.2f}%")

100%|██████████| 50756/50756 [00:02<00:00, 21558.60it/s]


0 -> 93.21%
3 -> 4.08%
2 -> 1.94%
4 -> 0.71%
5 -> 0.06%
1 -> 0.01%


## Showing DFs

In [11]:
# Preview the 2020 X-ray table, which uses the xrsa_flux / xrsb_flux column names.
data['xrays'][2020]

Unnamed: 0_level_0,xrsa_flux,xrsb_flux
ds,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01 00:00:00,2.344518e-09,4.515841e-09
2020-01-01 00:01:00,2.840341e-09,5.645528e-09
2020-01-01 00:02:00,4.735201e-09,4.346297e-09
2020-01-01 00:03:00,5.261884e-09,4.360483e-09
2020-01-01 00:04:00,6.190385e-09,4.019309e-09
...,...,...
2020-12-31 23:55:00,,
2020-12-31 23:56:00,,
2020-12-31 23:57:00,,
2020-12-31 23:58:00,,


In [12]:
# Preview the stacked X-ray frame with the unified xs / xl column names.
xrays_to_slide

Unnamed: 0_level_0,xs,xl
ds,Unnamed: 1_level_1,Unnamed: 2_level_1
1996-01-01 00:00:00,9.250000e-10,1.380000e-08
1996-01-01 00:01:00,8.970000e-10,1.460000e-08
1996-01-01 00:02:00,9.270000e-10,1.380000e-08
1996-01-01 00:03:00,9.200000e-10,1.370000e-08
1996-01-01 00:04:00,9.330000e-10,1.300000e-08
...,...,...
2024-12-31 23:55:00,2.279210e-07,6.078074e-06
2024-12-31 23:56:00,2.283925e-07,6.095521e-06
2024-12-31 23:57:00,2.347545e-07,6.143794e-06
2024-12-31 23:58:00,2.435809e-07,6.191976e-06


In [None]:
# Preview the stacked XRA events ('class_numeric' and 'begin' columns).
xra_events_to_slide

In [19]:
# Relative frequency of each class letter among all XRA events, followed by the event count.
percentages = xra_events['all']['class'].value_counts(normalize=True)
for value_label, proportion in zip(percentages.index, percentages):
    print(f"{value_label} -> {proportion * 100:.2f}%")
print(xra_events['all'].shape[0])

C -> 57.49%
B -> 34.64%
M -> 7.16%
X -> 0.49%
A -> 0.22%
50756


## Sliding-Window DataFrames

In [14]:
# Time-offset windows for the rolling mean/std/max features.
metrics_windows = [
    '1h', '6h', '12h', '24h',
    '7D', '14D', '30D',
]
# Time-offset windows for the rolling mean of the first difference.
deriv_windows = [
    '5min', '15min', '30min',
    '1h', '3h', '6h',
]

# Flux columns to derive features from, and the (initially column-less)
# feature frame sharing the X-ray index.
cols = ['xs', 'xl']
df_features = pd.DataFrame(index=xrays_to_slide.index)

In [15]:
# For each flux column: rolling mean/std/max over every metrics window, then the
# rolling mean of the one-step difference over every derivative window.
# Column names follow '<col>_<stat>_<window>' and '<col>_deriv_<window>'.
for col in cols:
    for w in metrics_windows:
        rolling_window = xrays_to_slide[col].rolling(window=w)
        for stat in ('mean', 'std', 'max'):
            df_features[f'{col}_{stat}_{w}'] = getattr(rolling_window, stat)()

    # One-step difference of the column; its rolling mean is stored as the '_deriv_' feature.
    col_diff = xrays_to_slide[col].diff()
    for w in deriv_windows:
        df_features[f'{col}_deriv_{w}'] = col_diff.rolling(w).mean()

In [16]:
# Highest event class beginning at each timestamp, laid out on the X-ray index
# (0 where no event begins at that timestamp).
target_events = xra_events_to_slide.set_index('begin')[['class_numeric']]
target_events = target_events.groupby(level=0).max().reindex(xrays_to_slide.index).fillna(0)

# The series is reversed before the 24h rolling max and flipped back afterwards,
# so the window is evaluated over the reversed (descending) time order.
# NOTE(review): presumably this yields a forward-looking 24h maximum; it relies on
# time-based rolling accepting a descending index - confirm on the pandas version in use.
reversed_targets = target_events['class_numeric'].iloc[::-1]
future_max_reversed = reversed_targets.rolling(window='24h', min_periods=1).max()
future_max = future_max_reversed.iloc[::-1]

df_target = pd.DataFrame(index=xrays_to_slide.index)
# shift(..., freq='1min') moves the index labels back by one minute, leaving the values as they are.
# NOTE(review): the column assignment aligns on index labels, so any timestamp of
# df_target that is absent from the shifted index receives NaN *after* fillna/astype
# have run; that would turn the column into floats and let a later dropna() discard
# those rows. Confirm whether this is intended.
df_target['target_class_in_24h'] = (future_max.shift(-1, freq='1min').fillna(0)).astype(int)

In [17]:
# Resampling must be done at the time of use.
# Features and target are joined side by side; rows with any missing value are dropped.
df_slided = (
    pd.concat([df_features, df_target], axis='columns')
    .dropna()
)

In [20]:
# Preview the combined feature/target frame (the cell that builds df_slided must have run first).
df_slided

NameError: name 'df_slided' is not defined

## Exporting CSVs

In [19]:
# Output folder for the exported frame, read from the environment.
slided_dfs_path = os.getenv("SLIDED_DFS_CSV_PATH")

# Earlier CSV export, kept for reference:
# df_slided.to_csv(os.path.join(slided_dfs_path, f"data_slided_V2"))
df_slided.to_parquet(
    path=os.path.join(slided_dfs_path, "data_slided_V2.parquet"),
)

In [20]:
# Print the index range, column list and dtypes of the exported frame.
df_slided.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 14780343 entries, 1996-01-01 00:01:00 to 2024-12-31 23:58:00
Data columns (total 55 columns):
 #   Column               Dtype  
---  ------               -----  
 0   xs_mean_1h           float64
 1   xs_std_1h            float64
 2   xs_max_1h            float64
 3   xs_mean_6h           float64
 4   xs_std_6h            float64
 5   xs_max_6h            float64
 6   xs_mean_12h          float64
 7   xs_std_12h           float64
 8   xs_max_12h           float64
 9   xs_mean_24h          float64
 10  xs_std_24h           float64
 11  xs_max_24h           float64
 12  xs_mean_7D           float64
 13  xs_std_7D            float64
 14  xs_max_7D            float64
 15  xs_mean_14D          float64
 16  xs_std_14D           float64
 17  xs_max_14D           float64
 18  xs_mean_30D          float64
 19  xs_std_30D           float64
 20  xs_max_30D           float64
 21  xs_deriv_5min        float64
 22  xs_deriv_15min       float64
 23