# In [1]:
import os
import time
import datetime
import pandas as pd

import openmeteo_requests
import requests_cache
from retry_requests import retry

# Global counter of successful archive API calls made by this process.
# It starts at 0 on every run and is never persisted, so "today" effectively
# means "during this execution". It is incremented after each successful
# request and compared against the daily call budget before each plant.
calls_today = 0

def get_historical_data(lat, lon, start_date, cutoff_time, HOURLY_VARS, client, max_retries=5):
    """
    Retrieves hourly historical data from start_date up to cutoff_time using the archive endpoint.

    Args:
        lat: Latitude, sent as the ``latitude`` request parameter.
        lon: Longitude, sent as the ``longitude`` request parameter.
        start_date: First day to request; passed through unchanged as ``start_date``.
        cutoff_time: Datetime whose calendar date is used as the request's ``end_date``.
            Rows with a timestamp at or after it are dropped. The timestamps built
            here are UTC-aware, so this must be timezone-aware as well.
        HOURLY_VARS: Names of the hourly variables to request. Column ``i`` of the
            response is stored under ``HOURLY_VARS[i]``.
        client: Object exposing ``weather_api(url, params=...)`` that returns a
            sequence of responses; only the first response is used.
        max_retries: How many times a rate-limited request is retried after the
            first attempt.

    Returns:
        A DataFrame with a ``date`` column plus one column per requested variable,
        restricted to rows strictly before ``cutoff_time``. An empty DataFrame is
        returned when the request fails or the retries are exhausted.

    Side effects:
        Increments the module-level ``calls_today`` after every successful request,
        prints progress/error messages, and sleeps while waiting out a rate limit.
    """
    global calls_today
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": cutoff_time.strftime("%Y-%m-%d"),
        "hourly": HOURLY_VARS,
    }
    total_attempts = max_retries + 1
    for attempt in range(1, total_attempts + 1):
        try:
            response = client.weather_api("https://archive-api.open-meteo.com/v1/archive", params=params)[0]
            calls_today += 1
            hourly = response.Hourly()
            times = pd.date_range(
                start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                freq=pd.Timedelta(seconds=hourly.Interval()),
                inclusive="left",
            )
            data = {"date": times}
            for i, var in enumerate(HOURLY_VARS):
                var_obj = hourly.Variables(i)
                data[var] = var_obj.ValuesAsNumpy() if var_obj else [float("nan")] * len(times)
            frame = pd.DataFrame(data)
            # Only a fully built *and* filtered frame is ever returned, so a failure
            # while filtering cannot leak unfiltered rows to the caller.
            return frame[frame["date"] < cutoff_time]
        except Exception as e:
            # Best-effort by design: any failure is reported and yields an empty frame.
            message = str(e)
            rate_limited = (
                "Hourly API request limit exceeded" in message
                or "Minutely API request limit exceeded" in message
            )
            if not rate_limited:
                print(f"❌ Unrecoverable error (historical): {message}")
                break
            if attempt == total_attempts:
                # No attempt is left, so waiting for the next window would only
                # block the caller (for up to an hour) without any benefit.
                print(f"❌ Request limit still exceeded after {max_retries} retries (historical) — giving up.")
                break
            now_retry = datetime.datetime.now(datetime.timezone.utc)
            if "Minutely" in message:
                next_minute = now_retry.replace(second=0, microsecond=0) + datetime.timedelta(minutes=1)
                # Wait until the next minute boundary plus a 5-second margin.
                wait_secs = (next_minute - now_retry).total_seconds() + 5
                print(f"⚠️ Minutely request limit reached (historical) — waiting {int(wait_secs)} seconds until {next_minute} UTC")
            else:
                next_hour = now_retry.replace(minute=0, second=0, microsecond=0) + datetime.timedelta(hours=1)
                # Wait until the next hour boundary plus a 5-second margin.
                wait_secs = (next_hour - now_retry).total_seconds() + 5
                print(f"⚠️ Hourly request limit reached (historical) — waiting {int(wait_secs/60)} minutes until {next_hour} UTC")
            time.sleep(wait_secs)
    return pd.DataFrame()

def process_plant(row, client, cutoff_time, HOURLY_VARS, output_folder):
    """
    Processes a plant: retrieves historical data and saves the result to a Parquet file.

    Args:
        row: Mapping-like record providing the keys ``CENTRAL`` (plant name),
            ``Latitud``, ``Longitud`` and ``FirstAppearance`` (first date to request).
        client: API client forwarded to ``get_historical_data``.
        cutoff_time: Upper time bound forwarded to ``get_historical_data``.
        HOURLY_VARS: Hourly variable names forwarded to ``get_historical_data``.
        output_folder: Directory in which the Parquet file is written.

    Returns:
        The DataFrame returned by ``get_historical_data``. When it is empty nothing
        is written to disk, so a failed download cannot overwrite an existing file
        with empty content or be reported as a successful save.
    """
    name = row["CENTRAL"]
    lat, lon = row["Latitud"], row["Longitud"]
    # Convert the first appearance date; adjust the format if needed.
    # NOTE(review): dayfirst=True only affects ambiguous day/month strings — confirm
    # the CSV's actual date format and consider passing an explicit format=.
    start_date = pd.to_datetime(row["FirstAppearance"], dayfirst=True).strftime("%Y-%m-%d")
    print(f"▶️ Processing {name}: from {start_date} until cutoff {cutoff_time.isoformat()}")

    hist_df = get_historical_data(lat, lon, start_date, cutoff_time, HOURLY_VARS, client)
    if hist_df.empty:
        print(f"⚠️ No historical data retrieved for {name} — nothing saved.")
        return hist_df

    # File name is the lower-cased plant name with spaces replaced by underscores.
    filename = f"{name.lower().replace(' ', '_')}.parquet"
    filepath = os.path.join(output_folder, filename)
    hist_df.to_parquet(filepath, index=False)
    print(f"✅ Saved {filename} for {name}.")
    return hist_df

def main():
    """
    Downloads historical Open-Meteo data for the configured plants.

    Reads the plant lookup CSV, keeps the rows whose ``CENTRAL`` name matches
    ``PLANTS_OF_INTEREST`` (case-insensitively) and calls ``process_plant`` for
    each of them until the module-level ``calls_today`` counter reaches
    ``MAX_DAILY_CALLS``. Creates the output folder if needed and prints a summary.
    """
    # Configuration
    CENTRAL_INFO_CSV = "../data/lookup/central_info.csv"
    OUTPUT_FOLDER = "../data/raw/open_meteo_data"
    MAX_DAILY_CALLS = 10000

    HOURLY_VARS = [
        "temperature_2m", "shortwave_radiation", "diffuse_radiation", "global_tilted_irradiance",
        "shortwave_radiation_instant", "diffuse_radiation_instant", "global_tilted_irradiance_instant",
        "direct_radiation", "direct_normal_irradiance", "terrestrial_radiation",
        "direct_radiation_instant", "direct_normal_irradiance_instant", "terrestrial_radiation_instant",
        "relative_humidity_2m", "dew_point_2m", "apparent_temperature", "precipitation", "rain",
        "pressure_msl", "surface_pressure", "et0_fao_evapotranspiration",
        "vapour_pressure_deficit", "cloud_cover", "cloud_cover_low", "cloud_cover_mid",
        "cloud_cover_high", "wind_speed_10m", "wind_direction_10m", "wind_gusts_10m",
    ]

    # Only the uncommented names are downloaded; uncomment entries to include them.
    PLANTS_OF_INTEREST = [
        "parque eolico agua clara",
        # "parque eolico de matafongo", "parque eolico guanillo",
        # "parque eolico larimar", "parque eolico larimar ii", "parque eolico los guzmancitos",
        # "parque eolico los guzmancitos 2", "parque fotovoltaico bayasol", "parque fotovoltaico calabaza",
        # "parque fotovoltaico cumayasa 1", "parque fotovoltaico cumayasa 2", "parque fotovoltaico la victoria",
        # "parque fotovoltaico los negros", "parque fotovoltaico maranatha fase i", "parque fotovoltaico mata de palma",
        # "parque fotovoltaico matrisol", "parque fotovoltaico mirasol", "parque fotovoltaico montecristi solar 1",
        # "parque fotovoltaico sajoma", "parque fotovoltaico santanasol", "parque fotovoltaico washington capital 2",
        # "parque fotovoltaico washington capital 3"
    ]

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    # Timezone-aware "now" in UTC (datetime.utcnow() is deprecated and returns a naive value).
    now = datetime.datetime.now(datetime.timezone.utc)
    # Set cutoff_time to now minus 26 hours (only take historical data before that)
    cutoff_time = now - datetime.timedelta(hours=26)

    # Configure the Open-Meteo client: cached session (entries never expire) with retries.
    cache = requests_cache.CachedSession('.cache', expire_after=-1)
    session = retry(cache, retries=3, backoff_factor=0.2)
    client = openmeteo_requests.Client(session=session)

    # Load and filter central information from CSV (case-insensitive name match)
    df = pd.read_csv(CENTRAL_INFO_CSV)
    wanted = {p.lower() for p in PLANTS_OF_INTEREST}
    central_lower = df["CENTRAL"].str.lower()
    df_filtered = df[central_lower.isin(wanted)].copy()

    # Report requested plants that are absent from the CSV instead of skipping them silently.
    missing = sorted(wanted - set(central_lower.dropna()))
    if missing:
        print(f"⚠️ Plants not found in {CENTRAL_INFO_CSV}: {', '.join(missing)}")

    # Process each plant
    for _, row in df_filtered.iterrows():
        if calls_today >= MAX_DAILY_CALLS:
            print("✅ Daily API call limit reached — stopping execution.")
            break
        process_plant(row, client, cutoff_time, HOURLY_VARS, OUTPUT_FOLDER)
    else:
        # Reached only when the loop was not cut short by the daily limit.
        print("✅ All plants have been processed.")

    print(f"API calls made today: {calls_today}")

# Execute main in the Notebook
# The call is unconditional (no `if __name__ == "__main__":` guard), so importing
# this file as a module would also start the download.
main()

# Notebook output of the run above:
# ▶️ Processing parque eolico agua clara: from 2019-02-22 until cutoff 2025-03-21T14:41:25.513640+00:00
# ✅ Saved parque_eolico_agua_clara.parquet for parque eolico agua clara.
# ✅ All plants have been processed.
# API calls made today: 1
#
# Trailing fragment of the cell output (presumably the source line echoed by a warning):
#   start_date = pd.to_datetime(row["FirstAppearance"], dayfirst=True).strftime("%Y-%m-%d")
