## Setup

In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
from src import util

In [None]:
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Root directories of the yearly CSV trees (None when the variable is unset).
events_csv_path = os.environ.get("EVENTS_CSV_PATH")
xrays_csv_path = os.environ.get("XRAY_V2_CSV_PATH")

# Inclusive year span over which files are read.
data_begin_year, data_end_year = 1983, 2025
data_year_range = range(data_begin_year, data_end_year + 1)

# Inclusive year span over which event frames are processed further below.
events_begin_year, events_end_year = 1996, 2024
events_year_range = range(events_begin_year, events_end_year + 1)

## Reading CSVs

In [None]:
# Container for every loaded frame: data['events'][<kind>][year] and
# data['xrays'][year]. The inner dicts start empty and are filled per year.
data = dict(
    events=dict(all={}, DSD={}),
    xrays={},
)

In [None]:
# Paths of the yearly events/DSD CSVs for 1983-1995, in (events, DSD) order
# per year. The name suggests these files are known to be absent -- confirm
# against the data directory.
# The previous loop bound the None returned by list.append() to an unused
# name and nested os.path.join calls; a single flat comprehension builds the
# same list.
missing_files = [
    os.path.join(events_csv_path, str(y), f"{y}_{file_type}.csv")
    for y in range(1983, 1995 + 1)
    for file_type in ('events', 'DSD')
]

In [None]:
# Events/DSD paths for 1983-1995: a FileNotFoundError on one of these is
# silenced, any other missing file is printed.
missing_events_dsd = [
    os.path.join(events_csv_path, str(y), f"{y}_{file_type}.csv")
    for y in range(1983, 1995+1)
    for file_type in ('events', 'DSD')
]
missing_files = set(missing_events_dsd)


def _read_csv_or_empty(path, postprocess=None, **read_csv_kwargs):
    """Read *path* with ``pd.read_csv``, falling back to an empty DataFrame.

    Args:
        path: CSV file to read.
        postprocess: Optional callable applied to the frame only when the
            read succeeded (the empty fallback is returned untouched).
        **read_csv_kwargs: Forwarded to ``pd.read_csv``.

    Returns:
        The loaded (and optionally post-processed) frame, or an empty
        ``pd.DataFrame`` when the file does not exist. The error is printed
        unless the path is listed in ``missing_files``.
    """
    try:
        df = pd.read_csv(path, **read_csv_kwargs)
    except FileNotFoundError as e:
        if e.filename not in missing_files:
            print(e)
        return pd.DataFrame()
    return df if postprocess is None else postprocess(df)


for y in data_year_range:
    events_year_dir = os.path.join(events_csv_path, str(y))
    xrays_year_dir = os.path.join(xrays_csv_path, str(y))

    data['events']['all'][y] = _read_csv_or_empty(
        os.path.join(events_year_dir, f"{y}_events.csv"),
        postprocess=lambda df: df.rename(columns={'date': 'ds'}),
        parse_dates=['date', 'begin', 'max', 'end'],
    )

    data['events']['DSD'][y] = _read_csv_or_empty(
        os.path.join(events_year_dir, f"{y}_DSD.csv"),
        parse_dates=['ds'],
        index_col='ds',
    )

    # The fallback is stored under data['xrays'][y]. The previous code wrote
    # to data[y]['xrays'], which raised KeyError on the first missing x-ray
    # file because `data` has no year keys.
    data['xrays'][y] = _read_csv_or_empty(
        os.path.join(xrays_year_dir, f"{y}_xrays.csv"),
        postprocess=lambda df: df.asfreq('1min'),  # regular 1-minute grid
        parse_dates=['ds'],
        index_col='ds',
    )

## Preparing Data

### Categorizing Different Event Types

In [None]:
event_types = (
    "BSL", "DSF", "EPL", "FIL", "FLA", "FOR", "GLE",
    "LPS", "PCA", "RBR", "RNS", "RSP", "XRA",
)

# For each event type, store one frame per year holding only the rows whose
# 'type' column equals that type. The pre-filter row index is kept in an
# 'original_index' column.
for t in event_types:
    by_year = data['events'][t] = {}
    for y in data_year_range:
        year_events = data['events']['all'][y]
        try:
            selected = year_events[year_events['type'] == t]
        except KeyError:
            # Frame has no 'type' column (e.g. an empty frame): keep an empty entry.
            by_year[y] = pd.DataFrame()
            continue
        by_year[y] = (
            selected
            .reset_index(drop=False)
            .rename(columns={'index': 'original_index'})
        )

In [None]:
# xra_events[year] holds the cleaned XRA events of that year and
# xra_events['all'] their concatenation in year order.
xra_events = {'all': pd.DataFrame()}

# Collect the yearly frames and concatenate once after the loop. Calling
# pd.concat on every iteration re-copies all previously accumulated rows.
yearly_frames = []

for y in events_year_range:
    df = data['events']['XRA'][y]

    # Split 'particulars' on whitespace into the expanded class string and
    # the peak flux. This also adds the two columns to the frame stored in `data`.
    df[['class_expanded', 'peak_flux']] = df['particulars'].str.split(expand=True)
    df['peak_flux'] = pd.to_numeric(df['peak_flux'])
    df['class'] = df['class_expanded'].str[0]  # first character of the class string
    df = df.drop(columns=['particulars'])

    # Numeric code for the class letter; letters absent from the map become 0.
    df['class_numeric'] = df['class'].apply(lambda c: util.flare_class_map.get(c, 0))

    df = df[['ds', 'type', 'class_numeric', 'class', 'class_expanded', 'peak_flux', 'begin', 'max', 'end', 'q']]

    # Events without a begin time are discarded.
    df = df.dropna(subset=['begin'])

    xra_events[y] = df
    yearly_frames.append(df)

# pd.concat rejects an empty list, so keep the empty placeholder in that case.
if yearly_frames:
    xra_events['all'] = pd.concat(yearly_frames)

### DFs To Slide

In [None]:
def get_cols(year: int) -> list[str]:
    """Return the two X-ray column names to use for *year*.

    Years before 2020 use ``['xs', 'xl']``; 2020 and later use
    ``['xrsa_flux', 'xrsb_flux']``.
    """
    if year >= 2020:
        return ['xrsa_flux', 'xrsb_flux']
    return ['xs', 'xl']

In [None]:
# Build two multi-year frames: the X-ray columns (normalised to the names
# 'xs'/'xl') and the XRA events reduced to ('class_numeric', 'begin').
# Yearly pieces are collected and concatenated once. Repeated pd.concat
# inside the loop re-copies the accumulated data on every iteration.
xrays_frames = []
events_frames = []
for y in events_year_range:
    cols = get_cols(y)
    df_xrays = data['xrays'][y][cols]
    if cols != ['xs', 'xl']:
        # Map the year's column names onto the common 'xs'/'xl' names.
        df_xrays = df_xrays.rename(columns=dict(zip(cols, ['xs', 'xl'])))
    xrays_frames.append(df_xrays)

    events_frames.append(xra_events[y][['class_numeric', 'begin']])

# pd.concat rejects an empty list; fall back to the empty frame the
# original code started from.
xrays_to_slide = pd.concat(xrays_frames) if xrays_frames else pd.DataFrame()
xra_events_to_slide = pd.concat(events_frames) if events_frames else pd.DataFrame()

## Showing DFs

In [None]:
# Display the X-ray frame loaded for 2020.
data['xrays'][2020]

In [None]:
# Display the multi-year X-ray frame with columns 'xs' and 'xl'.
xrays_to_slide

In [None]:
# Display the cleaned XRA events for 2020.
xra_events[2020]

In [None]:
# Display the multi-year XRA events frame ('class_numeric', 'begin').
xra_events_to_slide

## Slided DataFrames

In [None]:
# X-ray columns for which features are computed.
cols = ['xs', 'xl']

# Time-based window sizes for the rolling mean/std/max features.
metrics_windows = [
    '1h', '6h', '12h', '24h',
    '7D', '14D', '30D',
]

# Time-based window sizes for the smoothed first-difference features.
deriv_windows = [
    '5min', '15min', '30min',
    '1h', '3h', '6h',
]

# Empty feature frame sharing the X-ray frame's index; columns are added later.
df_features = pd.DataFrame(index=xrays_to_slide.index)

In [None]:
# For every X-ray column add, per metrics window, the rolling mean, std and
# max, then per derivative window the rolling mean of the first difference.
for col in cols:
    series = xrays_to_slide[col]

    for w in metrics_windows:
        roller = series.rolling(window=w)
        for stat in ('mean', 'std', 'max'):
            df_features[f'{col}_{stat}_{w}'] = getattr(roller, stat)()

    deltas = series.diff()
    for w in deriv_windows:
        df_features[f'{col}_deriv_{w}'] = deltas.rolling(w).mean()

In [None]:
# Place events on the X-ray time grid: the highest class_numeric per distinct
# 'begin' timestamp, and 0 wherever no event begins at that timestamp.
target_events = xra_events_to_slide.set_index('begin')[['class_numeric']]
target_events = target_events.groupby(level=0).max().reindex(xrays_to_slide.index).fillna(0)

# A forward-looking 24h maximum, obtained by running a trailing rolling max
# over the time-reversed series and reversing the result back.
reversed_targets = target_events['class_numeric'].iloc[::-1]
future_max_reversed = reversed_targets.rolling(window='24h', min_periods=1).max()
future_max = future_max_reversed.iloc[::-1]

df_target = pd.DataFrame(index=xrays_to_slide.index)
# shift(..., freq=...) moves the index labels and leaves the values alone, so
# it produces no NaN by itself. The gap only appears at the last timestamp
# once the series is aligned to df_target's index. Reindex first so that
# fillna(0) really fills it; previously the fill was a no-op and the final
# row ended up NaN.
df_target['target_class_in_24h'] = (
    future_max.shift(-1, freq='1min').reindex(df_target.index).fillna(0)
)

# Class label for the numeric target; values absent from the map become 'No Flare'.
df_target['target_class_label'] = df_target['target_class_in_24h'].apply(lambda x: util.reverse_flare_class_map.get(x, 'No Flare'))

In [None]:
# Resampling should be done at the time of use.
# Features and targets side by side; rows containing any NaN are discarded.
df_slided = pd.concat(
    [df_features, df_target],
    axis='columns',
).dropna()

In [None]:
# Display the combined features/targets frame.
df_slided

## Exporting CSVs

In [None]:
# Write the combined frame to the export directory as CSV and as Parquet.
slided_dfs_path = os.getenv("SLIDED_DFS_CSV_PATH")
if slided_dfs_path is None:
    # Fail with a clear message instead of the TypeError os.path.join would raise.
    raise RuntimeError("SLIDED_DFS_CSV_PATH is not set; cannot export the slided dataset")

# The CSV was previously written without an extension (through an f-string
# with no placeholders); it now matches the Parquet file name.
df_slided.to_csv(os.path.join(slided_dfs_path, "data_slided_V2.csv"))
df_slided.to_parquet(os.path.join(slided_dfs_path, "data_slided_V2.parquet"))