In [1]:
import requests
import pandas as pd
import time
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [2]:
# Configuration
API_URL = "https://api.open-meteo.com/v1/forecast"
needed_hours = 3000
HOURLY_VARIABLES = [
    "temperature_2m",
    "relative_humidity_2m",
    "precipitation",
    "wind_speed_10m",
    "weather_code"
]
LOCATIONS = [
    (48.8566, 2.3522),
    (45.7640, 4.8357),
    (43.2965, 5.3698),
    (44.8378, -0.5792)
]

# Retry configuration for temporary HTTP errors
session = requests.Session()
retries = Retry(
    total=3,  # Maximum number of attempts
    backoff_factor=1,  # Exponential delay (1s, 2s, 4s)
    status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
    allowed_methods=["GET"]
)
session.mount("https://", HTTPAdapter(max_retries=retries))

In [3]:
# Function to fetch weather data
def fetch_weather_data(latitude: float, longitude: float, needed_hours: int) -> pd.DataFrame:
    all_data = []
    remaining_hours = needed_hours
    end_date = datetime.now()
    
    while remaining_hours > 0:
        # Calculate the time range (maximum 90 days per API call)
        chunk_days = min(90, (remaining_hours // 24) + 1)
        start_date = end_date - timedelta(days=chunk_days)
        
        params = {
            "latitude": latitude,
            "longitude": longitude,
            "start_date": start_date.strftime('%Y-%m-%d'),
            "end_date": end_date.strftime('%Y-%m-%d'),
            "hourly": ",".join(HOURLY_VARIABLES),
            "timezone": "Europe/Paris"
        }
        
        try:
            response = session.get(API_URL, params=params, timeout=15)
            response.raise_for_status()  # Raise an exception for HTTP 4xx/5xx errors
            
            data = response.json()
            if "hourly" not in data:
                print(f"No hourly data available for {latitude, longitude}")
                break
                
            chunk_df = pd.DataFrame(data["hourly"])
            
            # Add metadata
            chunk_df["latitude"] = latitude
            chunk_df["longitude"] = longitude
            
            all_data.append(chunk_df)
            fetched_hours = len(chunk_df)
            remaining_hours -= fetched_hours
            
            print(f"{fetched_hours} hourly records added for {latitude, longitude} (remaining: {remaining_hours})")
            
            # Update for the next iteration
            end_date = start_date - timedelta(days=1)
            time.sleep(1)  # Respect API rate limits
            
            if fetched_hours == 0:
                break
                
        except requests.exceptions.HTTPError as http_err:
            if response.status_code == 429:
                print(f"Rate limit reached for {latitude, longitude}. Waiting before retrying...")
                time.sleep(10)  # Longer wait time for status 429
                continue
            elif response.status_code == 404:
                print(f"Error 404: Invalid URL or parameters for {latitude, longitude}")
                break
            else:
                print(f"HTTP error for {latitude, longitude}: {str(http_err)}")
                break
        except requests.exceptions.RequestException as e:
            print(f"Connection error for {latitude, longitude}: {str(e)}")
            break

    final_df = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()
    return final_df.iloc[:3000]  # Return exactly the requested number of rows

In [4]:
def collect_all_locations(locations: list[tuple[float, float]], needed_hours: int) -> pd.DataFrame:
    all_dfs = []
    
    for (lat, lon) in locations:
        print(f"\n=== Fetching data for location ({lat}, {lon}) ===")
        
        # Fetch data for this location
        df_location = fetch_weather_data(latitude=lat, longitude=lon, needed_hours=needed_hours)
        
        if not df_location.empty:
            all_dfs.append(df_location)
        
        # Pause between requests to avoid overloading the API
        time.sleep(2)
    
    # Combine all DataFrames
    final_df2 = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
    
    # Summary statistics
    if not final_df2.empty:
        counts = final_df2["latitude"].value_counts()
        print("\n=== Summary ===")
        print(f"Total records retrieved: {len(final_df2)}")
        print("Breakdown by location:")
        print(counts.to_string())
    
    return final_df2

In [5]:
df_meteo = collect_all_locations(LOCATIONS, needed_hours)


=== Fetching data for location (48.8566, 2.3522) ===
2184 hourly records added for (48.8566, 2.3522) (remaining: 816)
864 hourly records added for (48.8566, 2.3522) (remaining: -48)


  final_df = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()



=== Fetching data for location (45.764, 4.8357) ===
2184 hourly records added for (45.764, 4.8357) (remaining: 816)
864 hourly records added for (45.764, 4.8357) (remaining: -48)


  final_df = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()



=== Fetching data for location (43.2965, 5.3698) ===
2184 hourly records added for (43.2965, 5.3698) (remaining: 816)
864 hourly records added for (43.2965, 5.3698) (remaining: -48)


  final_df = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()



=== Fetching data for location (44.8378, -0.5792) ===
2184 hourly records added for (44.8378, -0.5792) (remaining: 816)
864 hourly records added for (44.8378, -0.5792) (remaining: -48)


  final_df = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()



=== Summary ===
Total records retrieved: 12000
Breakdown by location:
latitude
48.8566    3000
45.7640    3000
43.2965    3000
44.8378    3000


In [6]:
df_meteo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   time                  12000 non-null  object 
 1   temperature_2m        7392 non-null   float64
 2   relative_humidity_2m  7392 non-null   float64
 3   precipitation         7392 non-null   float64
 4   wind_speed_10m        7392 non-null   float64
 5   weather_code          7392 non-null   float64
 6   latitude              12000 non-null  float64
 7   longitude             12000 non-null  float64
dtypes: float64(7), object(1)
memory usage: 750.1+ KB


In [7]:
# Saving dataframe into csv format
df_meteo.to_csv("weather_data_extracted.csv", index=False)