## Setup

In [2]:
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import re
from src import util
import xarray as xr

In [26]:
# Pull settings from a local .env file (python-dotenv) before reading them below.
load_dotenv()

# Output root for the exported CSVs and input root holding one folder per year.
csv_path = os.getenv('XRAY_V2_CSV_PATH')
data_path = os.getenv('XRAY_V2_PATH')

# Inclusive range of years handled by the cells below.
begin_year, end_year = 1983, 2020

# Month labels used as keys of data[year]; position i holds the label of month i + 1.
months = "jan feb mar apr may jun jul aug sep oct nov dec".split()

# year -> {month label -> DataFrame}; filled in by the reading cell.
data = {}

## Read files

In [27]:
def find_data_row(file_path) -> int|None :
    with open(file_path, 'r', errors='ignore') as f:
        for i, line in enumerate(f):
            if line.strip() == 'data:':
                return i + 1

    return None

In [28]:
# For every year/month in range, locate the matching raw CSV inside
# <data_path>/<year>/ and load it into data[year][month label].
# Missing folders, files or 'data:' markers are reported and skipped.
for y in range(begin_year, end_year + 1):
    data[y] = {}
    year_dir = os.path.join(data_path,f"{y}")

    if not os.path.isdir(year_dir):
        print(f"ERRO: Diretório não encontrado, pulando ano {y}")
        continue

    # os.listdir() returns entries in arbitrary order. Sorting (plain string order)
    # makes the "first match wins" choice below reproducible when more than one
    # file matches the same month.
    files_in_dir = sorted(os.listdir(year_dir))

    # File names carry an extra "_3s" tag for years up to 1985.
    resolution_tag = "_3s" if y <= 1985 else ""

    for m in range(1, 13):
        m_str = f"{m:02d}"

        # g<digits>_xrs_1m[_3s]_<YYYYMM>01_<YYYYMM><DD>.csv
        pattern_str = f"^g\\d+_xrs_1m{resolution_tag}_{y}{m_str}01_{y}{m_str}\\d{{2}}\\.csv$"
        pattern = re.compile(pattern_str)

        found_file_name = next(
            (file_name for file_name in files_in_dir if pattern.match(file_name)),
            None,
        )

        if found_file_name is None:
            print(f"ERRO: Arquivo não encontrado, pulando mes {m} de {y}")
            continue

        full_path = os.path.join(year_dir, found_file_name)

        # Number of leading lines to skip so that parsing starts right after 'data:'.
        data_row = find_data_row(full_path)
        if data_row is None:
            print(f"ERRO: Marcador 'data:' não encontrado em {full_path}. Pulando arquivo.")
            continue

        df = pd.read_csv(full_path, skiprows=data_row, index_col="time_tag")

        data[y][months[m-1]] = df
        print(f"{months[m-1]} de {y} registrado!")

ERRO: Arquivo não encontrado, pulando mes 1 de 1983
ERRO: Arquivo não encontrado, pulando mes 2 de 1983
ERRO: Arquivo não encontrado, pulando mes 3 de 1983
ERRO: Arquivo não encontrado, pulando mes 4 de 1983
ERRO: Arquivo não encontrado, pulando mes 5 de 1983


KeyboardInterrupt: 

## Prepare DataFrames

In [13]:
# Normalise every monthly frame (datetime index named "ds", 1-minute frequency,
# float xs/xl columns) and build one concatenated frame per year.
for y in data.keys():
    monthly_frames = []

    # Iterate over the known month labels instead of data[y] itself: after a first
    # run data[y] also contains "whole_year", which must not be treated as a month.
    for m in months:
        if m not in data[y]:
            continue

        df = data[y][m]
        df.index = pd.to_datetime(df.index)
        df.index.name = "ds"
        # Reindex to a regular 1-minute grid; timestamps absent from the file become NaN rows.
        df = df.asfreq("min")

        if y > 2009:
            # From 2010 on the files use A_*/B_* column names; keep only the averages.
            # errors="ignore" keeps the cell re-runnable once the columns are already gone.
            df = df.drop(
                columns=["A_QUAL_FLAG","A_NUM_PTS","B_QUAL_FLAG","B_NUM_PTS"],
                errors="ignore",
            )
            df = df.rename(columns={"A_AVG":"xs", "B_AVG":"xl"})

        df["xs"] = df["xs"].astype(float)
        df["xl"] = df["xl"].astype(float)

        data[y][m] = df
        print(f"{m} de {y} prepared!")

        monthly_frames.append(df)

    # Concatenate once per year instead of growing a frame inside the loop.
    # Years without any loaded month keep an empty DataFrame, as before.
    df_whole_year = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

    data[y]["whole_year"] = df_whole_year
    print(f"{y} prepared!")

jun de 1983 prepared!
jul de 1983 prepared!
aug de 1983 prepared!
sep de 1983 prepared!
oct de 1983 prepared!
nov de 1983 prepared!
dec de 1983 prepared!
1983 prepared!
jan de 1984 prepared!
feb de 1984 prepared!
mar de 1984 prepared!
apr de 1984 prepared!
may de 1984 prepared!
jun de 1984 prepared!
jul de 1984 prepared!
aug de 1984 prepared!
sep de 1984 prepared!
oct de 1984 prepared!
nov de 1984 prepared!
dec de 1984 prepared!
1984 prepared!
jan de 1985 prepared!
feb de 1985 prepared!
mar de 1985 prepared!
apr de 1985 prepared!
may de 1985 prepared!
jun de 1985 prepared!
jul de 1985 prepared!
aug de 1985 prepared!
sep de 1985 prepared!
oct de 1985 prepared!
nov de 1985 prepared!
dec de 1985 prepared!
1985 prepared!
jan de 1986 prepared!
feb de 1986 prepared!
mar de 1986 prepared!
apr de 1986 prepared!
may de 1986 prepared!
jun de 1986 prepared!
jul de 1986 prepared!
aug de 1986 prepared!
sep de 1986 prepared!
oct de 1986 prepared!
nov de 1986 prepared!
dec de 1986 prepared!
1986 prep

## Handling Missing Values

In [14]:
# Values in xs/xl that are replaced with NaN below.
missing_values = [0,-99999,99999]

# Replace the listed values with NaN in each yearly frame and report the NaN share.
for y in data.keys():
    df = data[y]["whole_year"]

    # Years with no loaded month hold an empty frame without xs/xl columns:
    # selecting them would raise KeyError and the percentage would divide by zero.
    if df.empty:
        print(f"{y} Missing Values --> no data loaded, skipping")
        continue

    # df is the object stored in data[y]["whole_year"], so this updates it in place.
    df[['xs','xl']] = df[['xs','xl']].replace(missing_values, np.nan)
    missing_xs_count = df['xs'].isna().sum()
    missing_xl_count = df['xl'].isna().sum()

    rows_count = len(df)
    print(f"{y} Missing Values --> XS:{round((missing_xs_count/rows_count)*100,2)}% | XL:{round((missing_xl_count/rows_count)*100,2)}%")

1983 Missing Values --> XS:5.99% | XL:5.99%
1984 Missing Values --> XS:7.39% | XL:7.39%
1985 Missing Values --> XS:4.36% | XL:4.36%
1986 Missing Values --> XS:3.59% | XL:3.59%
1987 Missing Values --> XS:4.42% | XL:4.42%
1988 Missing Values --> XS:1.68% | XL:1.68%
1989 Missing Values --> XS:1.73% | XL:1.73%
1990 Missing Values --> XS:1.8% | XL:1.8%
1991 Missing Values --> XS:1.78% | XL:1.78%
1992 Missing Values --> XS:1.5% | XL:1.5%
1993 Missing Values --> XS:1.01% | XL:1.01%
1994 Missing Values --> XS:1.68% | XL:1.7%
1995 Missing Values --> XS:3.23% | XL:3.23%
1996 Missing Values --> XS:1.75% | XL:1.75%
1997 Missing Values --> XS:2.05% | XL:2.05%
1998 Missing Values --> XS:1.94% | XL:1.94%
1999 Missing Values --> XS:1.72% | XL:1.72%
2000 Missing Values --> XS:1.36% | XL:1.36%
2001 Missing Values --> XS:1.4% | XL:1.4%
2002 Missing Values --> XS:1.32% | XL:1.32%
2003 Missing Values --> XS:4.48% | XL:4.48%
2004 Missing Values --> XS:1.4% | XL:1.4%
2005 Missing Values --> XS:1.34% | XL:1.3

## Exporting CSVs

In [17]:
# Call the project helper with the output root and the inclusive year range.
# NOTE(review): presumably creates one sub-folder per year under csv_path, since the
# export loop writes into <csv_path>/<year>/ — confirm against src/util.
util.create_dirs(csv_path, begin_year, end_year)

In [18]:
# Write each year's concatenated frame to <csv_path>/<year>/<year>_xrays.csv.
for y in range(begin_year, end_year+1):
    df = data[y]["whole_year"]

    year_dir = os.path.join(csv_path,str(y))
    out_file = os.path.join(year_dir,f"{y}_xrays.csv")
    df.to_csv(out_file)

## NetCDF Files

In [3]:
# NOTE(review): absolute, machine-specific path — the file must exist at exactly this location.
avg1m_file_path = r"G:\My Drive\Solar_Flares\Data\xray_V2\raw\sci_xrsf-l2-avg1m_g18_d20230605_v2-2-0.nc"

# Open the NetCDF file and turn its two flux variables into DataFrame columns.
avg1m_obj = xr.open_dataset(avg1m_file_path)
flux_columns = ['xrsa_flux','xrsb_flux']
avg1m_flux = avg1m_obj[flux_columns]
df_avg1m = avg1m_flux.to_dataframe()
df_avg1m

Unnamed: 0_level_0,xrsa_flux,xrsb_flux
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-06-05 00:00:00,8.952141e-08,0.000003
2023-06-05 00:01:00,1.044956e-07,0.000003
2023-06-05 00:02:00,1.310519e-07,0.000003
2023-06-05 00:03:00,1.658852e-07,0.000003
2023-06-05 00:04:00,2.027730e-07,0.000003
...,...,...
2023-06-05 23:55:00,9.997442e-09,0.000001
2023-06-05 23:56:00,1.524219e-08,0.000001
2023-06-05 23:57:00,2.725268e-08,0.000002
2023-06-05 23:58:00,2.307480e-08,0.000002


In [None]:
# Bare last expression of the cell: displays the opened dataset's repr for inspection.
avg1m_obj