This notebook calls the OpenAQ API and collects the latest measurements for each location ID.

In [0]:
pip install python-dotenv

In [0]:
# import time
# import requests
# import json
# import random
# from pyspark.sql import SparkSession

# import os
# from dotenv import load_dotenv
# api_key = os.getenv('OPENAQ_API_KEY')

# # Set your API details
# API_KEY = api_key
# BASE_URL = "https://api.openaq.org/v3"


# # Load the static Parquet file
# distinct_sensors_path = "/Volumes/tabular/dataexpert/freshoats_capstone/la_sensors_delta_table"
# distinct_sensors_df = spark.read.format('delta').load(distinct_sensors_path)

# # Retrieve the list of location IDs from your Spark DataFrame
# location_ids = distinct_sensors_df.select("location_id").rdd.flatMap(lambda x: x).collect()

# # List to accumulate the latest sensor data for each location
# latest_sensor_data = []

# max_attempts = 5

# for location_id in location_ids:
#     url = f"{BASE_URL}/locations/{location_id}/latest"
#     headers = {"X-API-Key": API_KEY}
    
#     attempt = 0
#     retry_time = 0.5
#     success = False
    
#     while attempt < max_attempts:
#         try:
#             response = requests.get(url, headers=headers)
#             response.raise_for_status()
#             data = response.json()

#             # Convert the nested "results" field to a JSON string to avoid schema inference issues
#             if "results" in data:
#                 data["results"] = json.dumps(data["results"])

#             # Optionally add location_id if it isn't already in the response
#             if "location_id" not in data:
#                 data["location_id"] = location_id

#             latest_sensor_data.append(data)
#             print(f"Successfully fetched data for location {location_id}")
#             success = True
#             break

#         except requests.exceptions.HTTPError as http_err:
#             if response.status_code == 429:
#                 # Rate limiting: pause using the Retry-After header if available
#                 retry_after = response.headers.get("Retry-After")
#                 sleep_time = float(retry_after) if retry_after else retry_time
#                 print(f"Rate limited (429) for location {location_id}. Sleeping for {sleep_time} seconds before retrying...")
#                 time.sleep(sleep_time)
#                 retry_time *= 2
#                 continue
#             elif response.status_code == 401:
#                 print(f"Unauthorized (401) for location {location_id}. Check your API key and header format.")
#                 break
#             else:
#                 attempt += 1
#                 print(f"HTTP error for location {location_id} on attempt {attempt}: {http_err}. Retrying in {retry_time} seconds...")
#                 time.sleep(retry_time)
#                 retry_time *= 2

#         except Exception as e:
#             attempt += 1
#             print(f"Error for location {location_id} on attempt {attempt}: {e}. Retrying in {retry_time} seconds...")
#             time.sleep(retry_time)
#             retry_time *= 2

#     if not success:
#         print(f"Failed to fetch data for location {location_id} after {max_attempts} attempts.")

#     # Pause between calls for a random duration between 1 and 2 seconds
#     pause = random.uniform(1, 2)
#     print(f"Pausing for {pause:.2f} seconds before the next call...")
#     time.sleep(pause)

# # Initialize the Spark session
# spark = SparkSession.builder.getOrCreate()

# # Convert the list of dictionaries to a Spark DataFrame.
# # With "results" now being a string, Spark can properly infer the schema.
# latest_sensor_df = spark.createDataFrame(latest_sensor_data)

# # Optionally inspect the DataFrame
# # latest_sensor_df.printSchema()
# # latest_sensor_df.show(10)

# # Save the raw data (as JSON, preserving nested structures) into the raw table
# raw_table_name = "sensor_measurements_raw"

# latest_sensor_df.write.mode("append").format("delta").saveAsTable(raw_table_name)

# print(f"Raw sensor data successfully saved to the table: {raw_table_name}")

Adding rate limiters so I can request 5 days of data without exceeding the API limits

In [0]:
pip install tqdm

In [0]:
import time
import requests
import json
import random
from pyspark.sql import SparkSession
from collections import deque
from datetime import datetime, timedelta
import logging
from tqdm import tqdm

# Set up logging: configure the root logger at INFO level and create the
# module-level logger that the functions in this file write their progress,
# warning and error messages to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class RateLimiter:
    """Client-side throttle enforcing per-minute and per-hour call budgets.

    Call :meth:`wait_if_needed` immediately before each request: it blocks
    until the request fits inside both sliding windows and then records it.

    Timestamps come from a monotonic clock instead of ``datetime.now()`` so
    that wall-clock changes (DST shifts, NTP corrections) can never produce
    negative ages or spurious hour-long sleeps.

    Args:
        per_minute: Maximum number of calls allowed in any 60-second window.
        per_hour: Maximum number of calls allowed in any 3600-second window.
        clock: Zero-argument callable returning the current time in seconds.
            Defaults to ``time.monotonic``; injectable for testing.
        sleep: Callable taking a duration in seconds. Defaults to
            ``time.sleep``; injectable for testing.

    Raises:
        ValueError: If either limit is smaller than 1 (no call could ever be
            allowed, and the window arithmetic would be undefined).
    """

    _MINUTE_SECONDS = 60.0
    _HOUR_SECONDS = 3600.0

    # Looked up by name so the class does not depend on a module-level variable.
    _log = logging.getLogger(__name__)

    def __init__(self, per_minute=60, per_hour=2000, *, clock=time.monotonic, sleep=time.sleep):
        if per_minute < 1 or per_hour < 1:
            raise ValueError("per_minute and per_hour must both be at least 1")

        # Each deque holds the clock readings (in seconds) of recent calls,
        # oldest first. ``maxlen`` makes a full deque drop its oldest entry on
        # append, which is exactly the entry we just waited out.
        self.per_minute_calls = deque(maxlen=per_minute)
        self.per_hour_calls = deque(maxlen=per_hour)
        self.per_minute_limit = per_minute
        self.per_hour_limit = per_hour
        self._clock = clock
        self._sleep = sleep

    def wait_if_needed(self):
        """Block until one more call fits in both windows, then record it."""
        self._wait_for_window(
            self.per_minute_calls,
            self.per_minute_limit,
            self._MINUTE_SECONDS,
            "Rate limit approaching",
        )
        # The hourly check re-reads the clock, so time already spent waiting
        # for the minute window is not slept a second time.
        self._wait_for_window(
            self.per_hour_calls,
            self.per_hour_limit,
            self._HOUR_SECONDS,
            "Hourly limit approaching",
        )

        # Record this call using the time *after* any waiting.
        now = self._clock()
        self.per_minute_calls.append(now)
        self.per_hour_calls.append(now)

    def _wait_for_window(self, calls, limit, window, label):
        """Expire old entries in *calls* and sleep if the window is still full."""
        now = self._clock()

        # Drop timestamps that have aged out of the window.
        while calls and now - calls[0] > window:
            calls.popleft()

        if len(calls) < limit:
            return

        # Window is full: wait until its oldest entry is exactly `window` old.
        delay = window - (now - calls[0])
        if delay > 0:
            self._log.info("%s, waiting %.2f seconds...", label, delay)
            self._sleep(delay)

def _retry_after_seconds(header_value, fallback):
    """Return the Retry-After delay in seconds, or *fallback* if unusable.

    The header may be absent, or may carry an HTTP-date instead of a number of
    seconds; in both cases the caller's own backoff delay is used instead of
    raising from inside the error handler.
    """
    try:
        seconds = float(header_value)
    except (TypeError, ValueError):
        return fallback
    return seconds if seconds >= 0 else fallback


def fetch_location_data(location_id, api_key, base_url, max_attempts=5,
                        timeout=30, max_rate_limit_waits=10):
    """Fetch the ``/latest`` payload for one location, with retries.

    Args:
        location_id: Identifier interpolated into the request URL.
        api_key: Value sent in the ``X-API-Key`` header.
        base_url: API root, without a trailing slash.
        max_attempts: Number of failed attempts (non-429 HTTP errors and any
            other exception) tolerated before giving up.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the whole run.
        max_rate_limit_waits: Number of 429 responses tolerated. 429s do not
            consume ``max_attempts``, so they need their own bound to avoid
            looping forever.

    Returns:
        The decoded JSON response with ``"results"`` re-serialized as a JSON
        string and ``"location_id"`` added if absent, or ``None`` when the
        request was unauthorized (401) or all retries were exhausted.
    """
    url = f"{base_url}/locations/{location_id}/latest"
    headers = {"X-API-Key": api_key}

    attempt = 0
    rate_limit_waits = 0
    retry_time = 0.5  # exponential backoff delay, doubled after every failure

    while attempt < max_attempts:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            data = response.json()

            # Convert the nested "results" field to a JSON string
            if "results" in data:
                data["results"] = json.dumps(data["results"])

            # Add location_id if not present
            if "location_id" not in data:
                data["location_id"] = location_id

            logger.info(f"Successfully fetched data for location {location_id}")
            return data

        except requests.exceptions.HTTPError as http_err:
            # Read the response from the exception itself: the local
            # `response` may be unbound or left over from an earlier attempt.
            failed = http_err.response
            status = failed.status_code if failed is not None else None

            if status == 429:
                rate_limit_waits += 1
                if rate_limit_waits > max_rate_limit_waits:
                    logger.error(
                        f"Still rate limited (429) for location {location_id} "
                        f"after {max_rate_limit_waits} waits. Giving up."
                    )
                    return None
                sleep_time = _retry_after_seconds(failed.headers.get("Retry-After"), retry_time)
                logger.warning(f"Rate limited (429) for location {location_id}. Sleeping for {sleep_time} seconds...")
                time.sleep(sleep_time)
                retry_time *= 2
                continue

            if status == 401:
                # Retrying cannot fix a bad credential.
                logger.error(f"Unauthorized (401) for location {location_id}. Check API key.")
                return None

            attempt += 1
            logger.warning(f"HTTP error for location {location_id} on attempt {attempt}: {http_err}")

        except Exception as e:
            # Best effort: network errors, timeouts and malformed payloads are
            # all treated as a failed attempt rather than aborting the run.
            attempt += 1
            logger.error(f"Error for location {location_id} on attempt {attempt}: {e}")

        # Back off before the next attempt, but do not sleep once we are done.
        if attempt < max_attempts:
            time.sleep(retry_time)
        retry_time *= 2

    logger.error(f"Failed to fetch data for location {location_id} after {max_attempts} attempts.")
    return None

def main():
    """Collect the latest data for every location and append it to Delta.

    Reads location IDs from the ``temp_locations_for_api`` table, fetches each
    one through the rate limiter, and appends the responses to the
    ``sensor_measurements_raw`` Delta table, flushing periodically so a late
    failure does not lose everything collected so far.

    Raises:
        RuntimeError: If ``OPENAQ_API_KEY`` is not available, since every
            request would otherwise be rejected.
    """
    import os
    from dotenv import load_dotenv

    base_url = "https://api.openaq.org/v3"
    raw_table_name = "sensor_measurements_raw"
    batch_size = 55  # Slightly under the per-minute limit
    flush_threshold = 500  # Adjust threshold as needed

    # API Configuration: load_dotenv() must actually be called for a .env
    # file to populate the environment before the key is read.
    load_dotenv()
    api_key = os.getenv('OPENAQ_API_KEY')
    if not api_key:
        raise RuntimeError(
            "OPENAQ_API_KEY is not set; define it in the environment or in a .env file."
        )

    # Initialize Spark session
    spark = SparkSession.builder.getOrCreate()

    def save_records(records, message):
        """Append *records* to the raw Delta table and log *message*."""
        frame = spark.createDataFrame(records)
        frame.write.mode("append").format("delta").saveAsTable(raw_table_name)
        logger.info(message)

    # Load imported df of locations, acquired from the FIRMS import
    distinct_sensors_df = spark.table("temp_locations_for_api")

    # Get location IDs (collected directly, without an RDD round-trip);
    # null IDs are skipped because they cannot form a valid request URL.
    location_ids = [
        row["location_id"]
        for row in distinct_sensors_df.select("location_id").collect()
        if row["location_id"] is not None
    ]

    # Initialize rate limiter
    rate_limiter = RateLimiter(per_minute=60, per_hour=2000)

    # Process locations in batches
    location_batches = [
        location_ids[start:start + batch_size]
        for start in range(0, len(location_ids), batch_size)
    ]

    latest_sensor_data = []

    # Process each batch with progress bar
    for batch in tqdm(location_batches, desc="Processing location batches"):
        for location_id in batch:
            rate_limiter.wait_if_needed()
            data = fetch_location_data(location_id, api_key, base_url)
            if data:
                latest_sensor_data.append(data)

        # Save accumulated data periodically, checked once per batch
        if len(latest_sensor_data) >= flush_threshold:
            save_records(latest_sensor_data, "Saved batch to Delta table")
            latest_sensor_data = []

    # Save any remaining data
    if latest_sensor_data:
        save_records(latest_sensor_data, "Saved final batch to Delta table")

    logger.info("Data collection complete")

# Run the collection only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()