# In [None]:  (Jupyter cell marker left by the notebook export; commented out so the file parses as Python)
"""
Rangely Station Air Quality Scraper

This script scrapes air quality and meteorological data from the Rangely Station monitoring site
on the Colowhite River Air Quality website. It retrieves data on meteorology, ozone (O3), 
particulate matter (PM2.5), and nitrogen dioxide (NO2), then prints the data and saves it 
to a Parquet file with a timestamped filename.

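Dependencies:
- pandas
- pyarrow (Parquet engine used by pandas when saving)
- selenium
- webdriver-manager
- datetime (standard library, no installation needed)
- time (standard library, no installation needed)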

Usage:
    python rangely_scraper.py

Ensure that all dependencies are installed. The script runs in headless mode using Chrome WebDriver.
"""

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
import time


def _scrape_section_table(wait, heading_text):
    """
    Reads the two-column table that follows a section heading on the page.

    Args:
        wait (WebDriverWait): Explicit wait used to block until the heading is present in the DOM.
        heading_text (str): Text that the section's <h5> heading must contain.

    Returns:
        dict: Maps the stripped text of each row's first cell to the stripped text of its
            second cell. Rows that do not have exactly two cells, or whose first cell is
            empty, are skipped.

    Raises:
        Exception: Whatever Selenium raises when the heading does not appear before the wait
            expires or when no table follows it. The caller decides how to handle it.
    """
    heading = wait.until(
        EC.presence_of_element_located(
            (By.XPATH, f"//h5[contains(text(), '{heading_text}')]")
        )
    )
    table = heading.find_element(By.XPATH, "following-sibling::table")

    readings = {}
    for row in table.find_elements(By.TAG_NAME, "tr"):
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) != 2:
            # Header rows or rows with merged cells carry no label/value pair.
            continue
        label = cells[0].text.strip()
        if label:  # Only keep rows that actually have a label
            readings[label] = cells[1].text.strip()
    return readings


def scrape_rangely_station():
    """
    Scrapes air quality and meteorological data from the Rangely Station monitoring site.

    This function performs the following steps:
    1. Configures Selenium WebDriver with headless Chrome options.
    2. Navigates to the Rangely Station monitoring page.
    3. Extracts data for meteorology, ozone (O3), particulate matter (PM2.5), and nitrogen dioxide (NO2).
    4. Compiles the extracted data into a dictionary with a timestamp.
    5. Returns the data dictionary and the current datetime.

    Each section is scraped independently: if one section cannot be read, the error is
    printed, that section stays an empty dictionary, and the remaining sections are still
    attempted. Every section waits up to 60 seconds for its heading to appear.

    Returns:
        tuple:
            - dict: A dictionary containing the timestamp and nested dictionaries for each data category.
            - datetime: The current datetime when the data was scraped.

    Raises:
        Exception: Errors raised while installing/starting the WebDriver or while loading
            the page are not caught here and propagate to the caller.

    Example:
        data, timestamp = scrape_rangely_station()
    """
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run Chrome in headless mode
    chrome_options.add_argument('--no-sandbox')  # Bypass OS security model
    chrome_options.add_argument('--disable-dev-shm-usage')  # Overcome limited resource problems

    # Bound before the try block so the cleanup below can tell whether a browser was started.
    driver = None
    try:
        # Initialize the WebDriver using ChromeDriverManager
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Set window size to ensure all elements are visible
        driver.set_window_size(1920, 1080)

        # Navigate to the Rangely Station monitoring page
        url = 'https://www.colowhiteriverairquality.net/monitoring-sites/rangely-station.html'
        driver.get(url)

        # Explicit wait shared by all sections (up to 60 seconds per lookup)
        wait = WebDriverWait(driver, 60)

        # Initialize data dictionary with the current timestamp
        current_datetime = datetime.now()
        data = {
            'timestamp': current_datetime.strftime('%Y-%m-%d %H:%M:%S'),
            'meteorology': {},
            'ozone': {},
            'pm25': {},
            'no2': {}
        }

        # Print page title for debugging purposes
        print(f"Page title: {driver.title}")

        # (key in the result dictionary, text contained in the section's <h5> heading)
        sections = (
            ('meteorology', 'Current Meteorology'),
            ('ozone', 'Ozone'),
            ('pm25', 'Particulate Matter'),
            ('no2', 'Nitrogen Dioxide'),
        )
        for category, heading_text in sections:
            try:
                data[category] = _scrape_section_table(wait, heading_text)
            except Exception as e:
                # Best effort: a missing or changed section must not discard the others.
                print(f"An error occurred while scraping {category} data: {e}")

        return data, current_datetime

    finally:
        # Ensure the WebDriver is closed properly
        if driver is not None:
            driver.quit()


def print_data(data):
    """
    Writes the scraped readings to standard output, one section after another.

    The report starts with a banner and the timestamp, followed by the meteorology,
    ozone (O3), particulate matter (PM2.5), and nitrogen dioxide (NO2) sections.
    Meteorology rows are printed as "label: value"; the pollutant rows are printed
    as "label<TAB>value". Sections are separated by a blank line.

    Args:
        data (dict): The data dictionary containing timestamp and various air quality measurements.
            A falsy value (e.g. None or an empty dict) prints a short notice instead.

    Example:
        print_data(data)
    """
    if not data:
        print("No data to display.")
        return

    print("\n=== Rangely Station Air Quality Data ===")
    print(f"Timestamp: {data['timestamp']}\n")

    # (section heading, key in the data dictionary, text placed between label and value)
    layout = (
        ("Meteorological Data:", 'meteorology', ": "),
        ("Ozone (O3):", 'ozone', "\t"),
        ("Particulate Matter (PM2.5):", 'pm25', "\t"),
        ("Nitrogen Dioxide (NO2):", 'no2', "\t"),
    )
    for position, (heading, category, separator) in enumerate(layout):
        if position:
            # Blank line between sections, but none after the last one.
            print()
        print(heading)
        for label, reading in data[category].items():
            print(f"{label}{separator}{reading}")


def _flatten_readings(data):
    """
    Flattens the nested scrape result into a single row without losing any value.

    Labels that are unique across all categories keep their plain name as the column
    name. A label that occurs in more than one category, or that equals the reserved
    'timestamp' column, is stored as '<category>_<label>' for every occurrence, so that
    one category's reading can no longer silently overwrite another's.

    Args:
        data (dict): The data dictionary containing 'timestamp' and the 'meteorology',
            'ozone', 'pm25', and 'no2' dictionaries.

    Returns:
        dict: One flat mapping of column name to value, starting with 'timestamp'.
    """
    categories = ('meteorology', 'ozone', 'pm25', 'no2')
    row = {'timestamp': data['timestamp']}

    # First pass: find every label that would collide in a flat dictionary.
    seen_labels = {'timestamp'}
    clashing_labels = set()
    for category in categories:
        for label in data[category]:
            if label in seen_labels:
                clashing_labels.add(label)
            seen_labels.add(label)

    # Second pass: copy the readings, qualifying only the labels that collide.
    for category in categories:
        for label, reading in data[category].items():
            column = f"{category}_{label}" if label in clashing_labels else label
            row[column] = reading
    return row


def save_data_to_parquet(data, timestamp):
    """
    Saves the scraped data to a Parquet file with a timestamped filename.

    This function combines all data into a single dictionary, converts it to a one-row
    pandas DataFrame, and saves it as a gzip-compressed Parquet file using the 'pyarrow'
    engine. The file is written to the current working directory as
    'rangely_<YYYY-MM-DD_HHMMSS>.parquet'.

    Row labels that appear in more than one category (or that equal 'timestamp') are
    written as '<category>_<label>' columns so that no reading is overwritten; all other
    labels are used as column names unchanged.

    Args:
        data (dict): The data dictionary containing timestamp and various air quality measurements.
        timestamp (datetime): The datetime object representing when the data was scraped.

    Example:
        save_data_to_parquet(data, current_datetime)
    """
    # Combine all data into a single dictionary without dropping colliding labels
    combined_data = _flatten_readings(data)

    # Create a DataFrame from the combined data
    df = pd.DataFrame([combined_data])

    # Format the datetime for filename (e.g., '2024-10-23_153000')
    formatted_timestamp = timestamp.strftime('%Y-%m-%d_%H%M%S')

    # Create the filename with the formatted timestamp
    filename = f"rangely_{formatted_timestamp}.parquet"

    # Save the DataFrame to the Parquet file with the dynamic filename.
    # The timestamp-related keyword arguments are forwarded to the pyarrow writer.
    df.to_parquet(
        filename,
        engine='pyarrow',
        compression='gzip',
        coerce_timestamps='ms',
        allow_truncated_timestamps=True
    )

    print(f"\nAll data has been saved to '{filename}'")


if __name__ == "__main__":
    # Pipeline: one scrape, echoed to the console, then persisted to disk.
    # The scrape returns both the readings and the moment they were taken.
    data, current_datetime = scrape_rangely_station()

    # Human-readable dump of everything that was collected
    print_data(data)

    # The scrape time determines the name of the Parquet file
    save_data_to_parquet(data, timestamp=current_datetime)