In [7]:
import os
import time
import datetime
import pandas as pd

import openmeteo_requests
import requests_cache
from retry_requests import retry

# Global counter of successful Open-Meteo API calls made by this process.
# It lives in memory only, so it starts again at 0 on every run.
calls_today = 0


def get_historical_data(lat, lon, start_date, cutoff_time, HOURLY_VARS, client, max_retries=5):
    """
    Retrieve hourly historical data from the Open-Meteo archive endpoint.

    Data is requested from ``start_date`` up to the calendar day of
    ``cutoff_time`` and then trimmed so that only rows strictly before
    ``cutoff_time`` are kept.

    Args:
        lat: Latitude sent to the API.
        lon: Longitude sent to the API.
        start_date: First day to request, as a ``YYYY-MM-DD`` string.
        cutoff_time: Timezone-aware datetime; rows at or after it are dropped.
        HOURLY_VARS: Names of the hourly variables to request. The i-th
            variable of the response is stored under the i-th name.
        client: Object exposing ``weather_api(url, params=...)`` that returns
            a sequence of responses (only the first one is used).
        max_retries: How many times to retry after a rate-limit error, i.e.
            at most ``max_retries + 1`` requests are attempted.

    Returns:
        A DataFrame with a ``date`` column plus one column per requested
        variable. An empty DataFrame is returned when the request fails with
        a non rate-limit error or when every attempt hits the rate limit.
    """
    global calls_today
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": cutoff_time.strftime("%Y-%m-%d"),
        "hourly": HOURLY_VARS
    }

    for attempt in range(max_retries + 1):
        try:
            response = client.weather_api("https://archive-api.open-meteo.com/v1/archive", params=params)[0]
            calls_today += 1

            hourly = response.Hourly()
            # Build a datetime index from the API's time range
            times = pd.date_range(
                start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                freq=pd.Timedelta(seconds=hourly.Interval()),
                inclusive="left"
            )

            # One column per requested variable, in request order
            data = {"date": times}
            for i, var in enumerate(HOURLY_VARS):
                var_obj = hourly.Variables(i)
                if var_obj is not None:
                    data[var] = var_obj.ValuesAsNumpy()
                else:
                    # If the variable isn't available, fill with NaN
                    data[var] = [float("nan")] * len(times)

            hist_df = pd.DataFrame(data)
            # Keep only data strictly before cutoff_time
            return hist_df[hist_df["date"] < cutoff_time]
        except Exception as e:
            message = str(e)
            rate_limited = (
                "Hourly API request limit exceeded" in message
                or "Minutely API request limit exceeded" in message
            )
            if not rate_limited:
                print(f"❌ Unrecoverable error (historical): {message}")
                break
            if attempt >= max_retries:
                # Give up right away: sleeping here would only delay the
                # failure (by up to an hour) without another attempt following.
                print(f"❌ Request limit still exceeded after {max_retries} retries (historical) — giving up.")
                break

            # Wait until the start of the next minute/hour (plus a 5 s margin),
            # then try again.
            now_retry = datetime.datetime.now(datetime.timezone.utc)
            if "Minutely" in message:
                next_minute = now_retry.replace(second=0, microsecond=0) + datetime.timedelta(minutes=1)
                wait_secs = (next_minute - now_retry).total_seconds() + 5
                print(f"⚠️ Minutely request limit reached (historical) — waiting {int(wait_secs)} seconds until {next_minute} UTC")
            else:
                next_hour = now_retry.replace(minute=0, second=0, microsecond=0) + datetime.timedelta(hours=1)
                wait_secs = (next_hour - now_retry).total_seconds() + 5
                print(f"⚠️ Hourly request limit reached (historical) — waiting {int(wait_secs/60)} minutes until {next_hour} UTC")
            time.sleep(wait_secs)

    return pd.DataFrame()

def process_plant(row, client, cutoff_time, HOURLY_VARS, output_folder):
    """
    Download the historical data for one plant and store it as a Parquet file.

    Args:
        row: Mapping-like record with the keys ``CENTRAL`` (plant name),
            ``Latitud``, ``Longitud`` and ``FirstAppearance`` (parsed
            day-first and used as the first day to download).
        client: Open-Meteo client passed on to ``get_historical_data``.
        cutoff_time: Datetime up to which data is requested.
        HOURLY_VARS: Names of the hourly variables to request.
        output_folder: Directory in which the Parquet file is written.

    Returns:
        The DataFrame returned by ``get_historical_data``. When it is empty
        nothing is written to disk.
    """
    name = row["CENTRAL"]
    lat, lon = row["Latitud"], row["Longitud"]
    # Convert the first appearance date; adjust the format if needed.
    start_date = pd.to_datetime(row["FirstAppearance"], dayfirst=True).strftime("%Y-%m-%d")
    print(f"▶️ Processing {name}: from {start_date} until cutoff {cutoff_time.isoformat()}")

    hist_df = get_historical_data(lat, lon, start_date, cutoff_time, HOURLY_VARS, client)

    if hist_df.empty:
        # A failed download yields an empty frame. Writing it would replace
        # any file saved by an earlier successful run and report a false
        # success, so skip the save instead.
        print(f"⚠️ No historical data retrieved for {name} — skipping save.")
        return hist_df

    filename = f"{name.lower().replace(' ', '_')}.parquet"
    filepath = os.path.join(output_folder, filename)
    hist_df.to_parquet(filepath, index=False)
    print(f"✅ Saved {filename} for {name}.")
    return hist_df

def main():
    """
    Download historical Open-Meteo data for the selected plants.

    Reads the plant lookup CSV, keeps the rows whose ``CENTRAL`` name is
    listed in ``PLANTS_OF_INTEREST`` (case-insensitive), and processes them
    one by one until all are done or the call counter reaches
    ``MAX_DAILY_CALLS``.
    """
    # Configuration
    CENTRAL_INFO_CSV = "../data/lookup/central_info.csv"
    OUTPUT_FOLDER = "../data/raw/open_meteo_data"
    MAX_DAILY_CALLS = 10000

    # --------------------------
    # Hourly variables to request.
    # WARNING: Some of these may not be available historically
    # --------------------------
    HOURLY_VARS = [
        "temperature_2m", "wind_speed_10m", "relative_humidity_2m", "wind_gusts_10m", "vapour_pressure_deficit", "cloud_cover", "cloud_cover_low",
        "cloud_cover_mid", "cloud_cover_high", "surface_pressure", "pressure_msl",
        "apparent_temperature", "rain", "shortwave_radiation",
        "diffuse_radiation", "global_tilted_irradiance", "shortwave_radiation_instant",
        "diffuse_radiation_instant", "global_tilted_irradiance_instant", "direct_radiation",
        "direct_normal_irradiance", "terrestrial_radiation", "direct_radiation_instant",
        "direct_normal_irradiance_instant", "terrestrial_radiation_instant"
    ]

    # Only the uncommented plants are downloaded; the others are kept here
    # so they can be re-enabled easily.
    PLANTS_OF_INTEREST = [
        "parque solar girasol",
        # "parque eolico agua clara", "parque eolico de matafongo", "parque eolico guanillo",
        # "parque eolico larimar", "parque eolico larimar ii", "parque eolico los guzmancitos",
        # "parque eolico los guzmancitos 2", "parque fotovoltaico bayasol", "parque fotovoltaico calabaza",
        # "parque fotovoltaico cumayasa 1", "parque fotovoltaico cumayasa 2", "parque fotovoltaico la victoria",
        # "parque fotovoltaico los negros", "parque fotovoltaico maranatha fase i", "parque fotovoltaico mata de palma",
        # "parque fotovoltaico matrisol", "parque fotovoltaico mirasol", "parque fotovoltaico montecristi solar 1",
        # "parque fotovoltaico sajoma", "parque fotovoltaico santanasol", "parque fotovoltaico washington capital 2",
        # "parque fotovoltaico washington capital 3"
    ]

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    # Timezone-aware "now"; the download stops 26 hours before it.
    now = datetime.datetime.now(datetime.timezone.utc)
    cutoff_time = now - datetime.timedelta(hours=26)

    # Configure the Open-Meteo client (cached session wrapped with retries)
    cache = requests_cache.CachedSession('.cache', expire_after=-1)
    session = retry(cache, retries=3, backoff_factor=0.2)
    client = openmeteo_requests.Client(session=session)

    # Load and filter central information from CSV
    df = pd.read_csv(CENTRAL_INFO_CSV)
    wanted = {plant.lower() for plant in PLANTS_OF_INTEREST}
    df_filtered = df[df["CENTRAL"].str.lower().isin(wanted)].copy()

    # Process each plant
    for _, row in df_filtered.iterrows():
        if calls_today >= MAX_DAILY_CALLS:
            print("✅ Daily API call limit reached — stopping execution.")
            break
        process_plant(row, client, cutoff_time, HOURLY_VARS, OUTPUT_FOLDER)
    else:
        # Only reached when the loop was not cut short by the call limit.
        print("✅ All plants have been processed.")

    print(f"API calls made today: {calls_today}")


if __name__ == "__main__":
    main()


▶️ Processing parque solar girasol: from 2021-09-07 until cutoff 2025-03-22T15:17:15.007425+00:00
✅ Saved parque_solar_girasol.parquet for parque solar girasol.
✅ All plants have been processed.
API calls made today: 1


In [8]:
import pandas as pd
# Load the Parquet file holding the hourly data of the "parque solar girasol" plant.
df = pd.read_parquet("../data/raw/open_meteo_data/parque_solar_girasol.parquet")

In [9]:

# Remove the limit on displayed columns so every column is shown.
pd.set_option('display.max_columns', None)

# Display the DataFrame without its last 12 rows.
# NOTE(review): presumably meant to hide trailing hours with missing values — confirm that 12 is the right count.

df[:-12]

Unnamed: 0,date,temperature_2m,wind_speed_10m,relative_humidity_2m,wind_gusts_10m,vapour_pressure_deficit,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,surface_pressure,pressure_msl,apparent_temperature,rain,shortwave_radiation,diffuse_radiation,global_tilted_irradiance,shortwave_radiation_instant,diffuse_radiation_instant,global_tilted_irradiance_instant,direct_radiation,direct_normal_irradiance,terrestrial_radiation,direct_radiation_instant,direct_normal_irradiance_instant,terrestrial_radiation_instant
0,2021-09-07 00:00:00+00:00,27.088999,9.957108,83.185791,23.400000,0.602050,25.0,10.0,1.0,2.0,1003.130005,1013.099976,31.729336,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,2021-09-07 01:00:00+00:00,27.239000,8.913181,80.982498,20.519999,0.686970,17.0,13.0,12.0,2.0,1003.530884,1013.500000,31.846409,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
2,2021-09-07 02:00:00+00:00,27.139000,11.720751,77.852386,24.480000,0.795430,24.0,18.0,18.0,2.0,1004.220764,1014.200012,30.887413,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
3,2021-09-07 03:00:00+00:00,26.739000,15.077082,77.558205,32.039997,0.787343,17.0,11.0,12.0,1.0,1004.801575,1014.799988,29.737156,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
4,2021-09-07 04:00:00+00:00,26.338999,16.119801,79.650108,34.919998,0.697344,12.0,9.0,6.0,0.0,1005.085449,1015.099976,29.214397,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31007,2025-03-21 23:00:00+00:00,25.239000,6.552099,79.743988,20.160000,0.650461,13.0,11.0,4.0,0.0,1006.038635,1016.099976,28.923424,0.0,49.0,21.0,48.999992,0.0,0.0,0.0,28.0,205.069275,128.390945,0.0,0.0,0.0
31008,2025-03-22 00:00:00+00:00,24.889000,8.242743,81.674309,15.480000,0.576348,7.0,6.0,1.0,0.0,1006.719971,1016.799988,28.354549,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
31009,2025-03-22 01:00:00+00:00,,,,,,,,,,,,,,,,,,,,,,0.000000,,,0.0
31010,2025-03-22 02:00:00+00:00,,,,,,,,,,,,,,,,,,,,,,0.000000,,,0.0


26