## Setup

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
from src.py_src import util
from tqdm import tqdm
from src.py_src.models import SolarFlarePredictionModel

In [2]:
# Load variables from the local .env file into the process environment
# before the data locations are read from it.
load_dotenv()

events_csv_path = os.environ.get("EVENTS_CSV_PATH")
xrays_csv_path = os.environ.get("XRAY_V2_CSV_PATH")

# Inclusive span of years iterated over when reading the per-year CSV files.
data_begin_year, data_end_year = 1983, 2025
data_year_range = range(data_begin_year, data_end_year + 1)

# Inclusive span of years used when building the XRA event frames.
events_begin_year, events_end_year = 1996, 2024
events_year_range = range(events_begin_year, events_end_year + 1)

# Canonical column name(s) the per-year X-ray flux columns are renamed to.
cols_names = ['xl']

## Reading CSVs

In [3]:
# Container for everything read from disk. Each innermost dict is filled
# with one DataFrame per year by the loading loop.
data = dict(
    events=dict(all={}, DSD={}),
    xrays={},
)

In [4]:
# Paths of the per-year events/DSD CSVs for 1983-1995, in the order
# "<year>_events.csv" then "<year>_DSD.csv" for each year.
# list.append() returns None, so its result is no longer bound to a name.
missing_files = [
    os.path.join(events_csv_path, str(y), f"{y}_{file_type}.csv")
    for y in range(1983, 1995 + 1)
    for file_type in ('events', 'DSD')
]

In [5]:
# Events/DSD files for 1983-1995 are treated as expected gaps: a
# FileNotFoundError for one of these paths is not printed below.
missing_events_dsd = [
    os.path.join(events_csv_path, str(y), f"{y}_{file_type}.csv")
    for y in range(1983, 1995+1)
    for file_type in ('events', 'DSD')
]
missing_files = set(missing_events_dsd)

# Read the three per-year CSVs (events, DSD, X-rays). A missing file yields an
# empty DataFrame for that year, so later cells can index every year.
for y in tqdm(data_year_range):
    events_year_dir = os.path.join(events_csv_path, str(y))
    xrays_year_dir = os.path.join(xrays_csv_path, str(y))

    try:
        data['events']['all'][y] = pd.read_csv(os.path.join(events_year_dir, f"{y}_events.csv"),
                                               parse_dates=['date','begin','max','end'],
                                               ).rename(columns={'date':'ds'})
    except FileNotFoundError as e:
        data['events']['all'][y] = pd.DataFrame()
        if e.filename not in missing_files:
            print(e)

    try:
        data['events']['DSD'][y] = pd.read_csv(os.path.join(events_year_dir, f"{y}_DSD.csv"),
                                               parse_dates=['ds'],
                                               index_col='ds')
    except FileNotFoundError as e:
        data['events']['DSD'][y] = pd.DataFrame()
        if e.filename not in missing_files:
            print(e)

    try:
        data['xrays'][y] = pd.read_csv(os.path.join(xrays_year_dir, f"{y}_xrays.csv"),
                                       parse_dates=['ds'],
                                       index_col='ds')
        # The fix is this single line: make the parsed index timezone-aware (UTC).
        # NOTE(review): presumably needed so the index is comparable with the
        # event timestamps -- confirm against the events CSV.
        data['xrays'][y].index = data['xrays'][y].index.tz_localize('UTC')
        # Put the series on a regular 1-minute grid; gaps become NaN rows.
        data['xrays'][y] = data['xrays'][y].asfreq('1min')

    except FileNotFoundError as e:
        # Store under data['xrays'][y]. `data` has no year keys at its top
        # level, so the previous data[y]['xrays'] raised KeyError here.
        data['xrays'][y] = pd.DataFrame()
        if e.filename not in missing_files:
            print(e)

 98%|█████████▊| 42/43 [00:28<00:01,  1.24s/it]

[Errno 2] No such file or directory: 'G:\\My Drive\\Solar_Flares\\Data\\events\\formatted_csv\\2025\\2025_events.csv'
[Errno 2] No such file or directory: 'G:\\My Drive\\Solar_Flares\\Data\\events\\formatted_csv\\2025\\2025_DSD.csv'


100%|██████████| 43/43 [00:31<00:00,  1.38it/s]


### Categorizing Different Event Types

In [12]:
event_types = ("BSL", "DSF", "EPL", "FIL", "FLA", "FOR", "GLE", "LPS", "PCA", "RBR", "RNS", "RSP", "XRA")

# Split each year's events frame into one frame per event type. The original
# row position is kept in an 'original_index' column.
for t in event_types:
    data['events'][t] = {}
    for y in data_year_range:
        df_all = data['events']['all'][y]
        if 'type' not in df_all.columns:
            # Years without an events file hold an empty frame with no 'type' column.
            data['events'][t][y] = pd.DataFrame()
            continue
        data['events'][t][y] = (
            df_all[df_all['type'] == t]
            .reset_index(drop=False)
            .rename(columns={'index': 'original_index'})
        )

In [13]:
xra_events = {'all': pd.DataFrame()}
all_dfs_list = []

# Columns kept, in this order, for every per-year XRA frame.
xra_output_columns = ['ds', 'type', 'class', 'class_numeric', 'flux', 'begin', 'max', 'end', 'q']

for y in events_year_range:
    df = data['events']['XRA'][y].copy()

    # The first whitespace-separated token of 'particulars' is the expanded
    # flare class; flux and the class letter are both derived from it.
    class_expanded = df['particulars'].str.split(expand=True)[0]
    df['flux'] = class_expanded.apply(util.parse_flare_class_expanded)
    df['class'] = class_expanded.str[0].str.upper()

    # Class letters missing from util.flare_class_map map to 0.
    df['class_numeric'] = df['class'].apply(lambda c: util.flare_class_map.get(c, 0))

    # Keep the selected columns and drop events that have no begin time.
    df = df[xra_output_columns].dropna(subset=['begin'])

    xra_events[y] = df
    all_dfs_list.append(df)

# One frame with every year's XRA events, re-indexed from 0.
xra_events['all'] = pd.concat(all_dfs_list, ignore_index=True)

### DFs To Slide

In [14]:
def get_cols(year: int) -> list[str]:
    """Return the X-ray flux column name(s) to select for ``year``.

    Files before 2020 use ``'xl'``; files from 2020 onward use ``'xrsb_flux'``.
    """
    # Two-channel variant, kept for reference:
    # ['xs','xl'] if year < 2020 else ['xrsa_flux','xrsb_flux']
    if year >= 2020:
        return ['xrsb_flux']
    return ['xl']

In [15]:
xrays_to_slide_list = []
xra_events_to_slide_list = []

# Collect, per year, the selected X-ray column(s) under the canonical
# `cols_names` and the event columns needed downstream.
for y in tqdm(events_year_range):
    cols = get_cols(y)
    df_xrays = data['xrays'][y][cols]
    # Compare the two lists directly. The previous `cols not in cols_names`
    # asked whether the list was an element of a list of strings, which is
    # never the case, so the rename ran on every year.
    if cols != cols_names:
        df_xrays = df_xrays.rename(columns=dict(zip(cols, cols_names)))

    xrays_to_slide_list.append(df_xrays)

    df_events = xra_events[y][['class_numeric', 'begin', 'flux']]
    xra_events_to_slide_list.append(df_events)

# Stack the years and order chronologically.
xrays_to_slide = pd.concat(xrays_to_slide_list).sort_index()
xra_events_to_slide = pd.concat(xra_events_to_slide_list).sort_values('begin')

100%|██████████| 29/29 [00:00<00:00, 123.36it/s]


### Ground Truth

In [10]:
def create_ground_truth(events: pd.DataFrame, indexes: pd.Index) -> pd.DataFrame:
    """Label every timestamp in ``indexes`` with the flare class active at that time.

    Parameters
    ----------
    events:
        Frame with at least the columns ``'begin'``, ``'end'`` (datetime-like,
        ``'end'`` may be missing) and ``'class_numeric'``. It is not modified.
    indexes:
        Index of the returned frame. Events are written with label-based
        ``.loc`` slices, so it should be a sorted datetime index.
        NOTE(review): event timestamps and ``indexes`` must agree on
        timezone-awareness for the slicing to work -- confirm against the loader.

    Returns
    -------
    pd.DataFrame
        Frame indexed by ``indexes`` with one column, ``'current_class'``.
        It is 0 outside any event; otherwise it holds the highest
        ``class_numeric`` among the events covering that timestamp.

    Notes
    -----
    A missing ``'end'`` is imputed as ``begin`` plus the mean duration of the
    event's class. If that class has no observed duration, the mean duration
    over all events is used. Events that still have no usable interval are
    skipped, so a NaT bound is never used in a slice.
    """
    events = events.copy()

    # Observed duration in minutes; NaN where 'end' is missing.
    events['duration'] = (events['end'] - events['begin']).dt.total_seconds() / 60
    mean_duration_by_class = events.groupby('class_numeric')['duration'].mean()

    # Per-event imputation duration: class mean first, overall mean as fallback.
    imputation_minutes = (
        events['class_numeric']
        .map(mean_duration_by_class)
        .fillna(events['duration'].mean())
    )
    events['end_imputed'] = events['end'].fillna(
        events['begin'] + pd.to_timedelta(imputation_minutes, unit='m')
    )

    # Ascending class order makes stronger classes overwrite weaker ones
    # wherever events overlap.
    events_sorted = (
        events
        .dropna(subset=['begin', 'end_imputed'])
        .sort_values(by='class_numeric', ascending=True)
    )

    ground_truth = pd.DataFrame(index=indexes)
    ground_truth['current_class'] = 0

    for event in tqdm(events_sorted.itertuples(), total=len(events_sorted)):
        # Label slices are inclusive on both ends.
        ground_truth.loc[event.begin:event.end_imputed, 'current_class'] = event.class_numeric

    return ground_truth

In [11]:
# Label every X-ray timestamp with the class of the event covering it.
df_ground_truth = create_ground_truth(
    events=xra_events['all'],
    indexes=xrays_to_slide.index,
)

# Share of timestamps per class label, most frequent first.
percentages = df_ground_truth['current_class'].value_counts(normalize=True)
for value_label, proportion in percentages.items():
    print(f"{value_label} -> {proportion * 100:.2f}%")

100%|██████████| 50756/50756 [00:09<00:00, 5110.36it/s]


0 -> 93.21%
3 -> 4.08%
2 -> 1.94%
4 -> 0.71%
5 -> 0.06%
1 -> 0.01%


## Showing DFs

In [12]:
# Display the X-ray frame loaded for 2020, with its columns as read from the CSV.
data['xrays'][2020]

Unnamed: 0_level_0,xrsa_flux,xrsb_flux
ds,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01 00:00:00,2.344518e-09,4.515841e-09
2020-01-01 00:01:00,2.840341e-09,5.645528e-09
2020-01-01 00:02:00,4.735201e-09,4.346297e-09
2020-01-01 00:03:00,5.261884e-09,4.360483e-09
2020-01-01 00:04:00,6.190385e-09,4.019309e-09
...,...,...
2020-12-31 23:55:00,,
2020-12-31 23:56:00,,
2020-12-31 23:57:00,,
2020-12-31 23:58:00,,


In [13]:
# Display the concatenated X-ray frame for all event years, with flux columns renamed to `cols_names`.
xrays_to_slide

Unnamed: 0_level_0,xl
ds,Unnamed: 1_level_1
1996-01-01 00:00:00,1.380000e-08
1996-01-01 00:01:00,1.460000e-08
1996-01-01 00:02:00,1.380000e-08
1996-01-01 00:03:00,1.370000e-08
1996-01-01 00:04:00,1.300000e-08
...,...
2024-12-31 23:55:00,6.078074e-06
2024-12-31 23:56:00,6.095521e-06
2024-12-31 23:57:00,6.143794e-06
2024-12-31 23:58:00,6.191976e-06


In [14]:
# Display the concatenated XRA events (class_numeric, begin, flux), sorted by begin time.
xra_events_to_slide

Unnamed: 0,class_numeric,begin,flux
0,2,1996-07-31 01:32:00,1.300000e-07
1,2,1996-07-31 02:22:00,1.200000e-07
2,2,1996-07-31 07:00:00,1.200000e-07
3,2,1996-07-31 08:29:00,2.000000e-07
4,2,1996-07-31 11:14:00,1.400000e-07
...,...,...,...
3751,3,2024-12-30 20:21:00,5.500000e-06
3752,3,2024-12-30 21:27:00,6.500000e-06
3753,3,2024-12-30 21:48:00,6.100000e-06
3754,3,2024-12-30 22:19:00,7.600000e-06


In [15]:
# Share of XRA events per class letter, followed by the total event count.
all_xra_events = xra_events['all']
percentages = all_xra_events['class'].value_counts(normalize=True)
for value_label, proportion in percentages.items():
    print(f"{value_label} -> {proportion * 100:.2f}%")
print(len(all_xra_events))

C -> 57.49%
B -> 34.64%
M -> 7.16%
X -> 0.49%
A -> 0.22%
50756


## Slided DataFrames

In [None]:
# Build the model inputs and targets from the X-ray series and the XRA events.
df_features = SolarFlarePredictionModel.generate_features(xrays_to_slide)
df_target = SolarFlarePredictionModel.generate_target(xrays_to_slide, xra_events_to_slide)

# Put features and targets side by side on their index and keep only the
# rows where every column is populated.
df_slided = pd.concat((df_features, df_target), axis='columns').dropna(how='any')

In [21]:
# Drop the rows that fall within the last 72 hours before the final timestamp.
# NOTE(review): presumably 72h is the longest look-ahead window of the target -- confirm.
# NOTE(review): this cell rebinds df_slided, so every re-run trims another 72h.
max_window_duration = pd.Timedelta('72h')
cutoff_time = df_slided.index[-1] - max_window_duration
df_slided = df_slided.loc[df_slided.index <= cutoff_time]

In [1]:
# Display df_slided (features joined with targets, after the cutoff filter).
# NOTE(review): the saved output is a NameError from execution count In [1], i.e. this cell
# was run on a fresh kernel before df_slided existed -- re-run the notebook top to bottom.
df_slided

NameError: name 'df_slided' is not defined

## Exporting Parquet

In [23]:
# Output directory comes from the environment; the frame is written as Parquet.
slided_dfs_path = os.environ.get("SLIDED_DFS_CSV_PATH")
slided_parquet_file = os.path.join(slided_dfs_path, "data_slided_V4.parquet")

df_slided.to_parquet(slided_parquet_file)