## Setup

In [1]:
import os
import re

from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from webdriver_manager.firefox import GeckoDriverManager
import time
import src.util as util

## Scrape Functions

In [8]:
def download_xray_data_selenium(start_year_: int, end_year_: int, save_path_: str) -> None:
    """Download GOES 1-minute X-ray CSV files from the NCEI archive with Firefox.

    The crawl walks the directory listing year -> month -> satellite -> ``csv/``
    and clicks every link whose text starts with ``g<NN>_xrs_1m``, where ``<NN>``
    is the number found in the chosen satellite directory name. A file is skipped
    when a file with the same name already exists in the save directory.

    Args:
        start_year_: First year to download (inclusive).
        end_year_: Last year to download (inclusive).
        save_path_: Directory where Firefox stores the downloads; created if missing.

    Returns:
        None. Any exception raised during the crawl is caught and printed, and
        the browser is always closed before returning.
    """
    save_path_abs = os.path.abspath(save_path_)
    os.makedirs(save_path_abs, exist_ok=True)
    print(f"Arquivos serão salvos em: '{save_path_abs}'")

    # Firefox preferences: download straight into save_path_abs without prompting.
    options = Options()
    options.set_preference("browser.download.folderList", 2)  # 2 = use browser.download.dir
    options.set_preference("browser.download.dir", save_path_abs)
    options.set_preference("browser.download.manager.showWhenStarting", False)
    options.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv,application/octet-stream")
    driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=options)

    base_url = "https://www.ncei.noaa.gov/data/goes-space-environment-monitor/access/avg/"
    # Every level of the archive renders its entries in the same listing table.
    listing_rows_xpath = "//div[@id='main_text']//table/tbody/tr"
    listing_links_xpath = listing_rows_xpath + "/td/a"

    try:
        # --- Level 1: years ---
        print(f"Acessando a página principal: {base_url}")
        driver.get(base_url)
        time.sleep(2)

        # Collect the hrefs up front: the link elements go stale once we navigate away.
        year_links = driver.find_elements(By.XPATH, listing_links_xpath)
        year_urls_to_visit = []
        for link in year_links:
            try:
                year_text = link.text.strip().replace('/', '')
                if year_text.isdigit() and start_year_ <= int(year_text) <= end_year_:
                    year_urls_to_visit.append((int(year_text), link.get_attribute('href')))
            except (ValueError, IndexError):
                continue

        for current_year, year_url in year_urls_to_visit:
            print(f"\n[ANO] Acessando ano: {current_year}")
            driver.get(year_url)
            time.sleep(1)

            # --- Level 2: months ---
            month_links = driver.find_elements(By.XPATH, listing_links_xpath)
            month_urls_to_visit = []

            # 1983 is restricted to June-December and 2020 to January-March
            # (presumably the span of usable data -- confirm against the archive).
            if current_year == 1983:
                valid_months = [f"{m:02d}/" for m in range(6, 13)]
            elif current_year == 2020:
                valid_months = [f"{m:02d}/" for m in range(1, 4)]
            else:
                valid_months = [f"{m:02d}/" for m in range(1, 13)]

            for link in month_links:
                month_text = link.text.strip()
                if month_text in valid_months:
                    month_urls_to_visit.append((month_text, link.get_attribute('href')))

            for month_name, month_url in month_urls_to_visit:
                print(f"  [MÊS] Acessando mês: {month_name}")
                driver.get(month_url)
                time.sleep(1)

                # --- Level 3: satellite ---
                all_rows = driver.find_elements(By.XPATH, listing_rows_xpath)
                if len(all_rows) < 2:
                    continue

                # Default choice: the link in the second-to-last row of the listing.
                satellite_link = all_rows[-2].find_element(By.TAG_NAME, 'a')

                s = satellite_link.text.strip()
                # Entries for which the row above must be used instead: the log
                # file, goes13, and specific satellite/year combinations.
                use_previous_row = (
                    s == "log_410_check_goes_files_exist_log.txt"
                    or s == "goes13/"
                    or (s == "goes14/" and current_year == 2009)
                    or (s == "goes15/" and current_year == 2010)
                    or (s == "goes11/" and current_year in [2000, 2001])
                    or (s == "goes12/" and current_year in [2007, 2008, 2009])
                )
                if use_previous_row:
                    # Without a third-to-last row there is no fallback; skip this
                    # month instead of letting an IndexError abort the whole crawl.
                    if len(all_rows) < 3:
                        print("    [SATÉLITE] Nenhum satélite alternativo encontrado. Pulando mês.")
                        continue
                    satellite_link = all_rows[-3].find_element(By.TAG_NAME, 'a')

                satellite_name = satellite_link.text.strip()
                satellite_url = satellite_link.get_attribute('href')
                print(f"    [SATÉLITE] Usando o mais recente: {satellite_name}")
                driver.get(satellite_url)
                time.sleep(1)

                # --- Level 4: csv directory ---
                driver.find_element(By.LINK_TEXT, 'csv/').click()
                time.sleep(1)

                # --- Level 5: files ---
                # The satellite number (e.g. "15" from "goes15/") builds the file prefix.
                digits = re.search(r'\d+', satellite_name)
                if not digits:
                    continue

                file_prefix = f"g{digits.group()}_xrs_1m"
                print(f"      [ARQUIVO] Procurando por arquivos que começam com: '{file_prefix}'")

                file_links = driver.find_elements(By.XPATH, f"//a[starts-with(text(), '{file_prefix}')]")
                for file_link in file_links:
                    file_name = file_link.text
                    file_path = os.path.join(save_path_abs, file_name)

                    if not os.path.exists(file_path):
                        print(f"        -> Baixando {file_name}...")
                        file_link.click()
                        # Announce the start right after the click, before waiting,
                        # so the log does not report "started" after completion.
                        print(f"        -> Download iniciado.")

                        util.wait_download(file_path_=file_path, file_name_=file_name)
                    else:
                        print(f"        -> Arquivo '{file_name}' já existe. Pulando.")

    except Exception as e:
        print(f"Ocorreu um erro inesperado: {e}")
    finally:
        print("\nProcesso concluído. Fechando o navegador.")
        driver.quit()

    return None

In [None]:
def check_files(path_: str, range_: range) -> None:
    """Print a warning for every month that has no monthly X-ray CSV on disk.

    For each year in ``range_`` the directory ``path_/<year>`` is listed and each
    month 1-12 is checked for a file named
    ``g<digits>_xrs_1m_<YYYYMM>01_<YYYYMM><DD>.csv``. For years up to and including
    1985 the expected name carries an extra ``_3s`` token after ``1m``.

    Args:
        path_: Root directory containing one sub-directory per year.
        range_: Years to check.

    Returns:
        None. Missing months and missing year directories are reported on stdout.
    """
    for y in range_:
        year_dir = os.path.join(path_, str(y))
        # A missing year directory is reported rather than raised, so the
        # remaining years are still checked.
        if not os.path.isdir(year_dir):
            print(f"AVISO: Diretório faltante! {year_dir}")
            continue

        files_in_dir = os.listdir(year_dir)
        # Expected file names for years <= 1985 include "_3s" after "1m".
        infix = "_3s" if y <= 1985 else ""

        for m in range(1, 13):
            m_str = f"{m:02d}"
            pattern = re.compile(
                f"^g\\d+_xrs_1m{infix}_{y}{m_str}01_{y}{m_str}\\d{{2}}\\.csv$"
            )
            if not any(pattern.match(file) for file in files_in_dir):
                print(f"AVISO: Arquivo faltante! {m} de {y}")

    return None

## Main

In [9]:
# Load variables from a .env file into the process environment.
load_dotenv()

start_year = 1983
end_year = 2019
save_path = os.getenv("XRAY_V2_PATH")
# Fail fast with a clear message: os.getenv returns None when the variable is
# unset, which would otherwise surface as an opaque TypeError further down.
if not save_path:
    raise RuntimeError(
        "Variável de ambiente XRAY_V2_PATH não definida; configure-a no arquivo .env."
    )

download_xray_data_selenium(start_year_=start_year, end_year_=end_year, save_path_=save_path)

Arquivos serão salvos em: 'G:\My Drive\Solar_Flares\Data\V2'
Acessando a página principal: https://www.ncei.noaa.gov/data/goes-space-environment-monitor/access/avg/

[ANO] Acessando ano: 2017
  [MÊS] Acessando mês: 01/
    [SATÉLITE] Usando o mais recente: goes15/
      [ARQUIVO] Procurando por arquivos que começam com: 'g15_xrs_1m'
        -> Baixando g15_xrs_1m_20170101_20170131.csv...
        -> Aguardando conclusão do download...
        -> Download de 'g15_xrs_1m_20170101_20170131.csv' concluído com sucesso!
        -> Download iniciado.
  [MÊS] Acessando mês: 02/
    [SATÉLITE] Usando o mais recente: goes15/
      [ARQUIVO] Procurando por arquivos que começam com: 'g15_xrs_1m'
        -> Baixando g15_xrs_1m_20170201_20170228.csv...
        -> Aguardando conclusão do download...
        -> Download de 'g15_xrs_1m_20170201_20170228.csv' concluído com sucesso!
        -> Download iniciado.
  [MÊS] Acessando mês: 03/
    [SATÉLITE] Usando o mais recente: goes15/
      [ARQUIVO] Procu