## Setup

In [None]:
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import re
from src import util
import xarray as xr

In [None]:
# Load variables from a .env file, then read the input (raw) and output (CSV)
# root directories from the environment. Either path is None when its
# variable is not set.
load_dotenv()
raw_data_path = os.environ.get('XRAY_V2_RAW_PATH')
csv_data_path = os.environ.get('XRAY_V2_CSV_PATH')

# data[year][key] -> DataFrame, where key is one of the month abbreviations
# below or "whole_year" (filled in by the later cells).
data = dict()
months = "jan feb mar apr may jun jul aug sep oct nov dec".split()

## Read files

In [None]:
def find_data_row(file_path) -> int|None :
    with open(file_path, 'r', errors='ignore') as f:
        for i, line in enumerate(f):
            if line.strip() == 'data:':
                return i + 1

    return None

In [None]:
# Years whose raw data is stored as one CSV file per month.
csv_raw_range = range(1983, 2019+1)

# Fill data[year][month_abbreviation] with the raw monthly DataFrame.
# Missing year directories, missing monthly files and files without a
# 'data:' marker are reported and skipped; the year keeps an entry in
# `data` either way.
for y in csv_raw_range:
    data[y] = {}
    year_dir = os.path.join(raw_data_path, f"{y}")

    if not os.path.isdir(year_dir):
        print(f"ERRO: Diretório não encontrado, pulando ano {y}")
        continue

    # os.listdir returns names in arbitrary order. Sorting makes the choice
    # deterministic when more than one file (e.g. from different
    # satellites) matches the same month.
    files_in_dir = sorted(os.listdir(year_dir))
    for m in range(1, 13):
        m_str = f"{m:02d}"

        # File names up to 1985 carry an extra "_3s" token after "1m".
        cadence = "1m_3s" if y <= 1985 else "1m"
        pattern = re.compile(
            rf"^g\d+_xrs_{cadence}_{y}{m_str}01_{y}{m_str}\d{{2}}\.csv$"
        )

        found_file_name = next(
            (name for name in files_in_dir if pattern.match(name)),
            None,
        )

        if found_file_name is None:
            print(f"ERRO: Arquivo não encontrado, pulando mes {m} de {y}")
            continue

        full_path = os.path.join(year_dir, found_file_name)
        data_row = find_data_row(full_path)
        if data_row is None:
            print(f"ERRO: Marcador 'data:' não encontrado em {full_path}. Pulando arquivo.")
            continue

        # Skip everything up to and including the 'data:' marker so that the
        # next line is parsed as the CSV header.
        df = pd.read_csv(full_path, skiprows=data_row, index_col="time_tag")

        data[y][months[m-1]] = df
        print(f"{months[m-1]} de {y} registrado!")

## Prepare DataFrames

In [None]:
# Normalise every monthly frame (datetime index named "ds", regular
# one-minute frequency, float "xs"/"xl" columns) and build one frame per
# year under data[year]["whole_year"].
for y in data:
    # Only the CSV-era years are handled here. Years loaded by the NetCDF
    # cell hold nothing but "whole_year" and must not be overwritten if the
    # cells are executed out of order.
    if y not in csv_raw_range:
        continue

    monthly_frames = []
    for m in data[y]:
        # "whole_year" is the aggregate written by a previous run of this
        # cell, not a month; treating it as one would duplicate the data.
        if m == "whole_year":
            continue

        df = data[y][m]
        df.index = pd.to_datetime(df.index)
        df.index.name = "ds"
        # Reindex to a regular one-minute grid; absent minutes become NaN rows.
        df = df.asfreq("min")

        if y > 2009:
            # Files after 2009 use A_*/B_* column names. errors="ignore"
            # keeps the step harmless on frames that were already prepared.
            df = df.drop(
                columns=["A_QUAL_FLAG","A_NUM_PTS","B_QUAL_FLAG","B_NUM_PTS"],
                errors="ignore",
            )
            df = df.rename(columns={"A_AVG":"xs", "B_AVG":"xl"})

        df["xs"] = df["xs"].astype(float)
        df["xl"] = df["xl"].astype(float)

        data[y][m] = df
        print(f"{m} de {y} prepared!")

        monthly_frames.append(df)

    # Concatenate once instead of growing a frame month by month.
    # pd.concat rejects an empty list, so a year without any month gets an
    # empty frame.
    if monthly_frames:
        data[y]["whole_year"] = pd.concat(monthly_frames)
    else:
        data[y]["whole_year"] = pd.DataFrame()
    print(f"{y} prepared!")

## Handling Missing Values

In [None]:
# Sentinel values that are replaced by NaN in the flux columns.
missing_values = [0,-99999,99999]

# Replace the sentinels in place in every yearly frame and report the share
# of missing samples per column.
for y in data:
    df = data[y].get("whole_year")

    # A year whose directory or files were missing ends up with an empty
    # frame (no 'xs'/'xl' columns); skip it instead of failing with a
    # KeyError or dividing by zero rows.
    if df is None or df.empty or not {'xs', 'xl'}.issubset(df.columns):
        print(f"AVISO: Sem dados xs/xl para {y}, pulando")
        continue

    df[['xs','xl']] = df[['xs','xl']].replace(missing_values, np.nan)
    missing_xs_count = df['xs'].isna().sum()
    missing_xl_count = df['xl'].isna().sum()

    rows_count = len(df)
    print(f"{y} Missing Values --> XS:{round((missing_xs_count/rows_count)*100,2)}% | XL:{round((missing_xl_count/rows_count)*100,2)}%")

## Exporting CSVs

In [None]:
# Prepare the output location before exporting. The export loop writes each
# file into <csv_data_path>/<year>/.
# NOTE(review): presumably creates one sub-directory per year of
# csv_raw_range under csv_data_path — confirm in src/util.
util.create_dirs(csv_data_path, csv_raw_range)

In [None]:
# Write each CSV-era yearly frame to <csv_data_path>/<year>/<year>_xrays.csv.
for year in csv_raw_range:
    target_file = os.path.join(csv_data_path, str(year), f"{year}_xrays.csv")
    data[year]["whole_year"].to_csv(target_file)

## NetCDF Files

In [None]:
def _store_year(year, frames):
    """Store the concatenation of one year's daily frames in `data`.

    Nothing is stored when `frames` is empty, because pd.concat raises a
    ValueError on an empty list.
    """
    if not frames:
        print(f"AVISO: Nenhum arquivo lido para {year}, ano ignorado")
        return
    data[year] = {"whole_year": pd.concat(frames)}


# Read one NetCDF file per day, collect the daily frames and flush them into
# data[year]["whole_year"] every time the calendar year changes.
current_y = 2020
df_days_list = []
for date in pd.date_range(start='2020-01-01', end="2025-09-21"):
    # The satellite token in the file name switches after 2022-09-01.
    if date <= pd.Timestamp('2022-09-01'):
        satellite = "g17"
    else:
        satellite = "g18"
    y = date.year
    m = f"{date.month:02d}"
    d = f"{date.day:02d}"

    year_dir = os.path.join(raw_data_path, str(y))
    file_name = f"sci_xrsf-l2-avg1m_{satellite}_d{y}{m}{d}_v2-2-0.nc"
    full_path = os.path.join(year_dir, file_name)

    # Year rollover: store what was collected for the previous year.
    if y != current_y:
        _store_year(current_y, df_days_list)

        df_days_list = []
        current_y = y

    try:
        # The context manager closes the file once the values have been
        # copied into the DataFrame.
        with xr.open_dataset(full_path) as xr_obj:
            df = xr_obj[['time','xrsa_flux','xrsb_flux']].to_dataframe()
    except FileNotFoundError:
        print(f"AVISO: Arquivo não encontrado, pulando: {full_path}")
        continue

    # to_dataframe() places dimension coordinates in the index, so 'time'
    # has to be turned into a column before it can be renamed to 'ds'.
    if 'time' not in df.columns:
        df = df.reset_index()
    df = df.rename(columns={'time':'ds'}).set_index('ds')
    # 'Unnamed: 0' is normally absent here; drop it only if present.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    print(f"SUCCES READING {y}-{m}-{d}")
    if not df.empty:
        df_days_list.append(df)

# Debug output: frames still pending for the last year.
print(df_days_list)
_store_year(current_y, df_days_list)

### Exporting CSVs

In [None]:
# Years covered by the NetCDF export (2020 through 2025 inclusive).
nc_range = range(2020, 2025+1)
# The export loop writes each file into <csv_data_path>/<year>/.
# NOTE(review): presumably creates one sub-directory per year of nc_range
# under csv_data_path — confirm in src/util.
util.create_dirs(csv_data_path, nc_range)

In [None]:
# Write each NetCDF-era yearly frame to <csv_data_path>/<year>/<year>_xrays.csv.
for y in nc_range:
    # A year is only stored in `data` when at least one daily file was read,
    # so it may be absent here; skip it instead of raising a KeyError.
    year_data = data.get(y)
    if not year_data or "whole_year" not in year_data:
        print(f"AVISO: Sem dados para {y}, CSV não exportado")
        continue

    df = year_data["whole_year"]

    year_dir = os.path.join(csv_data_path,str(y))
    df.to_csv(os.path.join(year_dir,f"{y}_xrays.csv"))