In [None]:
import requests
import pandas as pd
from datetime import datetime
import time
import os

# Coordinates and Parameters
# Location of interest; sent as the "latitude"/"longitude" query parameters
# of every request built in fetch_hourly_data_chunk.
LATITUDE = -7.0520702239386175
LONGITUDE = 110.43532807750137
# Sent as the "timezone" query parameter of every request.
TIMEZONE = "Asia/Jakarta"
# Endpoint queried by fetch_hourly_data_chunk.
API_URL = "https://archive-api.open-meteo.com/v1/archive"

# Weather Condition Mapping (Optional helper, main logic uses weather_code)
def map_weather_code(code):
    """Translate a WMO weather code into the project's condition label.

    Args:
        code: Numeric weather code, or None when no code is available.

    Returns:
        One of 'Clear', 'Partially cloudy', 'Overcast', 'Rain',
        'Rain, Overcast' or 'Rain, Partially cloudy'. None and any code
        that is not listed below yield 'Unknown'.
    """
    if code is None:
        return 'Unknown'

    # (codes, label) pairs, scanned top to bottom; first group containing
    # the code wins.
    condition_table = (
        ((0,), 'Clear'),
        ((1, 2), 'Partially cloudy'),
        ((3, 45, 48), 'Overcast'),
        ((51, 53, 55), 'Rain'),
        ((61, 63, 65), 'Rain, Overcast'),
        ((80, 81, 82), 'Rain, Partially cloudy'),
        ((95, 96, 99), 'Rain'),
    )
    for codes, label in condition_table:
        if code in codes:
            return label
    return 'Unknown'

def fetch_hourly_data_chunk(start_date, end_date, timeout=60, max_retries=3,
                            backoff_seconds=10.0):
    """Fetch hourly and daily data for a specific date range.

    Sends one GET request to API_URL for the configured location and
    timezone. A request that is answered with HTTP 429 (Too Many Requests)
    is retried after an exponentially growing pause instead of failing
    immediately.

    Args:
        start_date: First day of the range, as a "YYYY-MM-DD" string.
        end_date: Last day of the range, as a "YYYY-MM-DD" string.
        timeout: Seconds to wait for the server before giving up on a
            single request; without it a stalled connection would block
            forever.
        max_retries: How many times a 429 response is retried before the
            error is raised to the caller.
        backoff_seconds: Pause before the first retry; doubled for each
            further retry.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or a non-success HTTP status (including a 429 that
            persists after all retries).
    """
    params = {
        "latitude": LATITUDE,
        "longitude": LONGITUDE,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": ["temperature_2m", "weather_code", "relative_humidity_2m",
                   "pressure_msl", "wind_speed_10m", "rain", "precipitation",
                   "apparent_temperature", "surface_pressure"],
        "daily": ["temperature_2m_max", "temperature_2m_min", "weather_code",
                  "temperature_2m_mean", "relative_humidity_2m_mean",
                  "pressure_msl_mean", "wind_speed_10m_mean"],
        "timezone": TIMEZONE
    }

    attempt = 0
    while True:
        response = requests.get(API_URL, params=params, timeout=timeout)
        # Only rate limiting is worth retrying here; every other status is
        # handed to raise_for_status() below unchanged.
        if response.status_code != 429 or attempt >= max_retries:
            break
        wait = backoff_seconds * (2 ** attempt)
        print(f"Rate limited (HTTP 429); retrying in {wait:.0f}s...")
        time.sleep(wait)
        attempt += 1

    response.raise_for_status()
    return response.json()

def _build_year_frame(hourly_data, daily_data):
    """Join one response's hourly rows with that day's daily aggregates.

    Raises KeyError when a requested variable is absent from either section.
    """
    # Output column name -> key in the API's "hourly" section.
    hourly_fields = {
        "timestamp": "time",
        "temp": "temperature_2m",
        "humidity": "relative_humidity_2m",
        "windspeed": "wind_speed_10m",
        "sealevelpressure": "pressure_msl",
        "weather_code": "weather_code",
        "rain": "rain",
        "precipitation": "precipitation",
        "apparent_temperature": "apparent_temperature",
        "surface_pressure": "surface_pressure",
    }
    # Output column name -> key in the API's "daily" section.
    daily_fields = {
        "date_only": "time",
        "temp_max_daily": "temperature_2m_max",
        "temp_min_daily": "temperature_2m_min",
        "weather_code_daily": "weather_code",
        "temp_mean_daily": "temperature_2m_mean",
        "humidity_avg_daily": "relative_humidity_2m_mean",
        "pressure_avg_daily": "pressure_msl_mean",
        "windspeed_avg_daily": "wind_speed_10m_mean",
    }

    df_hourly = pd.DataFrame(
        {column: hourly_data[key] for column, key in hourly_fields.items()}
    )
    df_hourly["timestamp"] = pd.to_datetime(df_hourly["timestamp"])
    # Helper key: the calendar date each hourly row belongs to.
    df_hourly["date_only"] = df_hourly["timestamp"].dt.date

    df_daily = pd.DataFrame(
        {column: daily_data[key] for column, key in daily_fields.items()}
    )
    df_daily["date_only"] = pd.to_datetime(df_daily["date_only"]).dt.date

    # Left join keeps every hourly row even if a day has no daily record.
    df_year = pd.merge(df_hourly, df_daily, on="date_only", how="left")
    return df_year.drop(columns=["date_only"])


def fetch_historical_hourly_data():
    """Fetch hourly and daily data from 2000 to today in yearly chunks.

    "Today" is pinned to 2025-12-05. Each calendar year is requested
    separately through fetch_hourly_data_chunk, the hourly rows are joined
    with the daily aggregates of the same date, calendar fields, a
    ``conditions`` label and a running ``id`` are added, and the result is
    written as CSV into ``../data``.

    Years that cannot be fetched or whose response is incomplete are
    skipped; they are listed in a warning at the end so a partial file is
    not mistaken for a complete one.

    Returns:
        None. The result is the CSV file plus progress output on stdout.
    """
    # today = datetime.now()
    today = datetime(2025, 12, 5) # Set to 2025-12-05 as requested
    start_year = 2000
    end_year = today.year

    all_data = []
    missing_years = []

    for year in range(start_year, end_year + 1):
        start_date = f"{year}-01-01"
        if year == end_year:
            end_date = today.strftime("%Y-%m-%d")
        else:
            end_date = f"{year}-12-31"

        print(f"Fetching data for {year} (from {start_date} to {end_date})...")

        try:
            data = fetch_hourly_data_chunk(start_date, end_date)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for {year}: {e}")
            missing_years.append(year)
            continue
        finally:
            # Pause after every request, failed ones included, so an error
            # (e.g. HTTP 429) is not followed by an immediate new request.
            time.sleep(0.5)

        hourly_data = data.get("hourly", {})
        daily_data = data.get("daily", {})

        if not hourly_data:
            print(f"No hourly data found for {year}.")
            missing_years.append(year)
            continue
        if not daily_data:
            print(f"No daily data found for {year}.")
            missing_years.append(year)
            continue

        try:
            df_year = _build_year_frame(hourly_data, daily_data)
        except KeyError as e:
            # A requested variable is absent from the response; skip this
            # year rather than lose the years already collected.
            print(f"Incomplete data for {year}: missing {e}")
            missing_years.append(year)
            continue

        all_data.append(df_year)

    if not all_data:
        print("No data fetched.")
        return

    df = pd.concat(all_data, ignore_index=True)

    df["hour"] = df["timestamp"].dt.hour
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["year"] = df["timestamp"].dt.year

    df["conditions"] = df["weather_code"].apply(map_weather_code)
    df["id"] = range(len(df))

    output_columns = ["id", "timestamp", "hour", "day", "month", "year",
                      "temp", "humidity", "windspeed", "sealevelpressure",
                      "rain", "precipitation", "apparent_temperature", "surface_pressure",
                      "weather_code", "conditions",
                      "temp_max_daily", "temp_min_daily", "weather_code_daily", "temp_mean_daily",
                      "humidity_avg_daily", "pressure_avg_daily", "windspeed_avg_daily"]

    final_df = df[output_columns]

    # Ensure output directory exists
    output_dir = "../data"
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): the file name says 2000_2024 although the requested range
    # runs up to `today`; left unchanged in case other code reads this path.
    output_file = os.path.join(output_dir, "historical_data_2000_2024_hourly_daily.csv")
    final_df.to_csv(output_file, index=False)
    print(f"Data successfully saved to {output_file}")
    print(f"Total records: {len(final_df)}")
    if missing_years:
        print(f"WARNING: no data saved for years: {missing_years}")
    print(final_df.head())

# Run the full download when this cell/script executes: issues one API
# request per year and writes the combined CSV under ../data.
fetch_historical_hourly_data()

Fetching data for 2000 (from 2000-01-01 to 2000-12-31)...
Fetching data for 2001 (from 2001-01-01 to 2001-12-31)...
Fetching data for 2002 (from 2002-01-01 to 2002-12-31)...
Fetching data for 2003 (from 2003-01-01 to 2003-12-31)...
Fetching data for 2004 (from 2004-01-01 to 2004-12-31)...
Fetching data for 2005 (from 2005-01-01 to 2005-12-31)...
Fetching data for 2006 (from 2006-01-01 to 2006-12-31)...
Fetching data for 2007 (from 2007-01-01 to 2007-12-31)...
Fetching data for 2008 (from 2008-01-01 to 2008-12-31)...
Fetching data for 2009 (from 2009-01-01 to 2009-12-31)...
Fetching data for 2010 (from 2010-01-01 to 2010-12-31)...
Fetching data for 2011 (from 2011-01-01 to 2011-12-31)...
Fetching data for 2012 (from 2012-01-01 to 2012-12-31)...
Fetching data for 2013 (from 2013-01-01 to 2013-12-31)...
Fetching data for 2014 (from 2014-01-01 to 2014-12-31)...
Fetching data for 2015 (from 2015-01-01 to 2015-12-31)...
Error fetching data for 2015: 429 Client Error: Too Many Requests for ur