In [1]:
# Install the Open-Meteo client if it is not already available (uncomment to run):
# %pip install openmeteo-requests

In [2]:
# Install the supporting packages if they are not already available (uncomment to run):
# %pip install requests-cache retry-requests numpy pandas

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# imports needed for open-meteo.com data
from retry_requests import retry
import openmeteo_requests
import requests_cache
from requests.adapters import Retry
import time
import random
import os

In [4]:
# NOTE: this cell is live code (it is not commented out) and issues real API
# requests every time it runs; skip it if the data has already been collected.

# Setup cached session with retry functionality.
# Responses are cached on disk under '.cache' and expire after 3600 seconds.
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
# Wrap the cached session with the `retry` helper imported from retry_requests.
# The helper must not be rebound to another object here: doing so shadows it
# and leaves the client with a plain cached session that never retries.
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Sample grid of (latitude, longitude) pairs: five latitudes by eight
# longitudes, one degree apart, listed latitude by latitude with longitudes
# running from -79.76 to -72.76 inside each latitude.
coordinates = [
    (grid_lat, grid_lon)
    for grid_lat in (40.49, 41.49, 42.49, 43.49, 44.49)
    for grid_lon in (-79.76, -78.76, -77.76, -76.76, -75.76, -74.76, -73.76, -72.76)
]

# Function to fetch weather data for a single location
def fetch_weather_data(lat, lon, start_date="2000-01-01", end_date="2024-12-01", client=None):
    """
    Fetch daily weather variables for one coordinate from the Open-Meteo archive API.

    Parameters
    ----------
    lat, lon : float
        Sent as the request's "latitude" / "longitude" and repeated on every
        returned row.
    start_date, end_date : str
        Sent as the request's "start_date" / "end_date". The collector in this
        notebook passes '%Y-%m-%d' strings. The defaults reproduce the range
        that was previously hard-coded, so two-argument calls behave as before.
    client : object, optional
        Anything exposing ``weather_api(url, params=...)``. When omitted, the
        module-level ``openmeteo`` client is used.

    Returns
    -------
    pandas.DataFrame
        Columns: "date", one column per requested daily variable, then
        "latitude" and "longitude".
    """
    url = "https://archive-api.open-meteo.com/v1/archive"

    # Single source of truth for the requested variables, so the request and
    # the positional extraction below cannot drift apart.
    daily_variables = [
        "daylight_duration", "sunshine_duration", "rain_sum",
        "snowfall_sum", "precipitation_hours", "wind_speed_10m_max", "wind_gusts_10m_max"
    ]
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "daily": daily_variables,
        "timezone": "America/New_York"
    }

    api_client = openmeteo if client is None else client
    # Only the first response is used; one location is requested per call.
    response = api_client.weather_api(url, params=params)[0]

    daily = response.Daily()

    # Variables are read by position; this assumes the response keeps the
    # order in which they were requested (the same assumption the hard-coded
    # indices 0..6 made before).
    values_by_name = {
        name: daily.Variables(position).ValuesAsNumpy()
        for position, name in enumerate(daily_variables)
    }
    row_count = len(values_by_name[daily_variables[0]])

    daily_data = {
        "date": pd.date_range(
            start=pd.to_datetime(daily.Time(), unit="s", utc=True),
            end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=daily.Interval()),
            inclusive="left"
        )
    }
    daily_data.update(values_by_name)
    daily_data["latitude"] = [lat] * row_count
    daily_data["longitude"] = [lon] * row_count
    return pd.DataFrame(data=daily_data)

def get_date_coverage(csv_path, lat, lon):
    """
    Get the date range already collected for a specific coordinate.

    Parameters
    ----------
    csv_path : str
        CSV file with at least 'date', 'latitude' and 'longitude' columns.
    lat, lon : float
        Coordinate to look up. Matching allows a 1e-6 degree tolerance so a
        value that changed in its last digits on the way through the CSV still
        matches.

    Returns
    -------
    dict or None
        ``{'min_date': ..., 'max_date': ...}`` as parsed by ``pd.to_datetime``,
        or ``None`` when the file is missing, has no content, or holds no rows
        for this coordinate.
    """
    if not os.path.exists(csv_path):
        return None

    # Only three columns are needed, so skip parsing the weather variables.
    try:
        df = pd.read_csv(csv_path, usecols=['date', 'latitude', 'longitude'])
    except pd.errors.EmptyDataError:
        # The file exists but has no content yet: nothing is covered.
        return None

    # Tolerant comparison instead of exact float equality.
    is_match = (
        np.isclose(df['latitude'], lat, rtol=0.0, atol=1e-6)
        & np.isclose(df['longitude'], lon, rtol=0.0, atol=1e-6)
    )

    if not is_match.any():
        return None

    # Parse into a new Series rather than assigning into a filtered slice,
    # which would trigger pandas' SettingWithCopyWarning.
    dates = pd.to_datetime(df.loc[is_match, 'date'])

    # Return the minimum and maximum dates
    return {
        'min_date': dates.min(),
        'max_date': dates.max()
    }

def collect_comprehensive_weather_data(
    coordinates,
    new_start_date,
    new_end_date,
    output_path='./data/new-york-weather.csv',
    max_retries=3
):
    """
    Collect weather data, ensuring complete coverage across all coordinates.

    For each coordinate, the dates already stored in ``output_path`` are looked
    up with ``get_date_coverage``. Only the parts of the requested range that
    fall outside the stored min/max dates are fetched with
    ``fetch_weather_data`` and appended to the CSV.

    Parameters
    ----------
    coordinates : iterable of (lat, lon)
        Locations to collect.
    new_start_date, new_end_date : str or datetime-like
        Requested bounds; parsed with ``pd.to_datetime``.
    output_path : str
        CSV file that rows are appended to. Its directory is created if needed
        and also receives 'failed_coordinates.csv'.
    max_retries : int
        Attempts allowed per (coordinate, date range) before it is recorded as
        failed.

    Returns
    -------
    list of tuple
        ``(lat, lon, start_date, end_date)`` for every range that could not be
        fetched.

    Raises
    ------
    SystemExit
        When an API error message reports that the minutely or hourly request
        limit was exceeded, or when 5000 successful requests were made in this
        call.
    """
    def _as_naive_day(value):
        # Dates read back from the CSV may be tz-aware, while the requested
        # bounds are tz-naive. Comparing the two raises TypeError, so strip
        # the timezone and time-of-day from both.
        stamp = pd.Timestamp(value)
        if stamp.tzinfo is not None:
            stamp = stamp.tz_localize(None)
        return stamp.normalize()

    one_day = pd.Timedelta(days=1)

    # Ensure data directory exists. A bare filename has no directory part,
    # and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Keep the failure log next to the output file. For the default
    # output_path this is the same './data/failed_coordinates.csv' as before.
    failed_path = os.path.join(output_dir, 'failed_coordinates.csv')

    # Convert date inputs to datetime
    new_start = _as_naive_day(pd.to_datetime(new_start_date))
    new_end = _as_naive_day(pd.to_datetime(new_end_date))

    # Tracking variables
    failed_coordinates = []
    request_count = 0
    hourly_request_count = 0

    for i, (lat, lon) in enumerate(coordinates, 1):
        # Check existing date coverage
        existing_coverage = get_date_coverage(output_path, lat, lon)

        # Determine date ranges to fetch
        fetch_ranges = []

        if existing_coverage is None:
            # No existing data for this coordinate, fetch the entire new range
            fetch_ranges.append((new_start, new_end))
        else:
            # Check and fill date gaps
            existing_min = _as_naive_day(existing_coverage['min_date'])
            existing_max = _as_naive_day(existing_coverage['max_date'])

            # Each range stops one day short of the stored data so the
            # boundary day is not appended a second time. It still reaches
            # right up to the stored data, which keeps coverage contiguous
            # (coverage is tracked only by its min and max dates).

            # Ranges to fetch before existing data
            if new_start < existing_min:
                fetch_ranges.append((new_start, existing_min - one_day))

            # Ranges to fetch after existing data
            if new_end > existing_max:
                fetch_ranges.append((existing_max + one_day, new_end))

        # Fetch data for each range
        for start, end in fetch_ranges:
            # Reset per range: if these were shared across ranges, a success
            # on the first range would cause the second one to be skipped.
            retries = 0
            success = False

            while retries < max_retries and not success:
                try:
                    print(f"Fetching weather data for coordinates: {lat}, {lon} (Location {i}/{len(coordinates)}, Date Range: {start.date()} to {end.date()}, Attempt {retries + 1})")

                    # Fetch data for the current location and date range
                    df = fetch_weather_data(lat, lon, start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d'))

                    # Append to CSV; the header is written only when the file is created
                    df.to_csv(output_path, mode='a', header=not os.path.exists(output_path), index=False)

                    request_count += 1
                    hourly_request_count += 1
                    success = True

                    # Add delay every 3 requests
                    if request_count % 3 == 0:
                        delay = random.uniform(2.5, 4.5)
                        print(f"Pausing for {delay:.2f} seconds to manage request rate...")
                        time.sleep(delay)

                    # Check and exit on hourly limit.
                    # SystemExit is not an Exception subclass, so the handler
                    # below does not swallow it.
                    if hourly_request_count >= 5000:
                        print("Hourly API request limit reached. Exiting script.")
                        raise SystemExit("Hourly API request limit exceeded")

                except Exception as e:
                    error_message = str(e)

                    # Specific handling for API limit errors
                    if 'Minutely API request limit exceeded' in error_message:
                        print("API minutely rate limit reached. Exiting script.")
                        raise SystemExit("Minutely API request limit exceeded")
                    elif 'Hourly API request limit exceeded' in error_message:
                        print("API hourly rate limit reached. Exiting script.")
                        raise SystemExit("Hourly API request limit exceeded")

                    # General error handling
                    retries += 1
                    print(f"Error: {error_message}")

                    if retries >= max_retries:
                        failed_coordinates.append((lat, lon, start.date(), end.date()))
                        break

                    print(f"Waiting 10 seconds before retrying (Attempt {retries}/{max_retries})...")
                    time.sleep(10)

    # Print and save failed coordinates if any
    if failed_coordinates:
        print("\nFailed to fetch data for the following coordinates:")
        for coord in failed_coordinates:
            print(coord)

        # Save failed coordinates to a separate file
        failed_coords_df = pd.DataFrame(failed_coordinates, columns=['Latitude', 'Longitude', 'Start_Date', 'End_Date'])
        failed_coords_df.to_csv(failed_path, mode='a', header=not os.path.exists(failed_path), index=False)
        print(f"Saved {len(failed_coordinates)} failed coordinates to '{failed_path}'")

    print("Comprehensive weather data collection completed.")
    return failed_coordinates

# Example usage
if __name__ == "__main__":
    # Example of collecting data for a specific date range
    try:
        collect_comprehensive_weather_data(
            coordinates,
            new_start_date='1990-01-01',
            new_end_date='1999-01-01',
            output_path='./data/new-york-weather.csv'
        )
    except SystemExit as e:
        print(f"Script exited: {e}")
        # `sys` is not imported in this notebook's import cell, so calling
        # sys.exit(1) here would raise NameError. Raising SystemExit directly
        # has the same effect without needing the import.
        raise SystemExit(1)


Fetching weather data for coordinates: 40.49, -79.76 (Location 1/40, Attempt 1)
Fetching weather data for coordinates: 40.49, -78.76 (Location 2/40, Attempt 1)
Fetching weather data for coordinates: 40.49, -77.76 (Location 3/40, Attempt 1)
API minutely rate limit reached. Pausing for 65 seconds...
Fetching weather data for coordinates: 40.49, -77.76 (Location 3/40, Attempt 2)
Pausing for 2.73 seconds to manage request rate...
Fetching weather data for coordinates: 40.49, -76.76 (Location 4/40, Attempt 1)
Fetching weather data for coordinates: 40.49, -75.76 (Location 5/40, Attempt 1)
Fetching weather data for coordinates: 40.49, -74.76 (Location 6/40, Attempt 1)
API minutely rate limit reached. Pausing for 65 seconds...
Fetching weather data for coordinates: 40.49, -74.76 (Location 6/40, Attempt 2)
Pausing for 2.54 seconds to manage request rate...
Fetching weather data for coordinates: 40.49, -73.76 (Location 7/40, Attempt 1)
Fetching weather data for coordinates: 40.49, -72.76 (Locati

KeyboardInterrupt: 

In [None]:
# Load the collected observations from disk, then display (rows, columns).
weather = pd.read_csv(filepath_or_buffer='./data/new-york-weather.csv')
weather.shape

In [None]:
# 'Unnamed: 0' only appears when a CSV was saved together with its index.
# The collector in this notebook writes with index=False, so the column is
# normally absent. errors='ignore' makes the drop a no-op in that case, and
# rebinding instead of inplace=True keeps the cell safe to re-run.
weather = weather.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Number of missing values in each column.
weather.isna().sum()