In [20]:
import requests
from bs4 import BeautifulSoup
import datetime

def _convert_to_24h(timestr_12h, ampm):
    """
    Convert a 12-hour "H:MM" string plus an 'am'/'pm' marker to 24-hour "HH:MM".

    Midnight is written with hour 00 (never "24"), and the minutes are always
    preserved: "12:30 am" -> "00:30", "12:15 pm" -> "12:15", "1:05 pm" -> "13:05".

    Raises:
        ValueError: if 'timestr_12h' is not two integers separated by one colon.
    """
    hour_text, minute_text = timestr_12h.split(":")
    # 12 am -> 0 and 12 pm -> 12; every other hour is left unchanged by the modulo.
    hour = int(hour_text) % 12
    if ampm.lower() == "pm":
        hour += 12
    return f"{hour:02d}:{int(minute_text):02d}"


def _format_data_row(row, n_columns=13):
    """
    Normalize one line of the station table, or return None if it is not a data row.

    A data row is "YYYY Mon DD HH:MM [am|pm] <n_columns values ...>". The result is
    "YYYY Mon DD HH:MM" (24-hour time) followed by the first 'n_columns' values,
    each preceded by two spaces. Blank lines, lines starting with "Date" or
    "A-Basin", and lines with too few value columns yield None.
    """
    row = row.strip()
    if not row or row.startswith(("Date", "A-Basin")):
        return None

    parts = row.split()
    # The am/pm marker is optional; when it is absent the time token is taken
    # to be 24-hour already and the value columns start one token earlier.
    has_ampm = len(parts) > 4 and parts[4].lower() in ("am", "pm")
    offset = 5 if has_ampm else 4

    values = parts[offset:offset + n_columns]
    if len(values) < n_columns:
        # Also covers lines too short to even hold the date/time tokens.
        return None

    year_str, month_str, day_str, time_str = parts[:4]
    time_24 = _convert_to_24h(time_str, parts[4]) if has_ampm else time_str

    return "  ".join([f"{year_str} {month_str} {day_str} {time_24}", *values])


def scrape_basin_data(end_date_str="2025-03-01+24", hours_range=72, timeout=30):
    """
    Scrape the A-Basin SA-Pali station table ending at 'end_date_str' and covering
    the 'hours_range' hours before it. Returns lines in a format compatible with
    the read_basin_data() function.

    Times given as 12-hour + am/pm are converted to 24-hour "HH:MM" with midnight
    stored as hour 00 (not "24:00"), so times stay within 00:00..23:59 for each
    calendar day. Minutes are preserved (e.g. "12:30 am" -> "00:30").

    Args:
        end_date_str: Value for the 'date' query parameter, inserted verbatim
            (the literal '+' is intentionally not URL-encoded).
        hours_range: Value for the 'range' query parameter.
        timeout: Seconds passed to requests.get() so a stalled server cannot
            block the call indefinitely.

    Returns:
        list[str]: one "YYYY Mon DD HH:MM  v1  v2 ... v13" string per data row,
        in the order the rows appear on the page.

    Raises:
        RuntimeError: if the HTTP status code is not 200.
        ValueError: if the page contains no <pre> block.
        Exceptions raised by requests.get() (e.g. on timeout) propagate unchanged.
    """
    # Build the URL with literal plus signs and parentheses.
    url = (
        "https://stations.avalanche.state.co.us/tabular.php"
        "?title=A-Basin+SA-Pali+(A-BasinSA)+11920+ft"
        "&st=CAABP"
        f"&date={end_date_str}"
        "&unit=e"
        "&area=caic"
        f"&range={hours_range}"
    )

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Request failed with status {response.status_code}")

    # The table is plain text inside the page's <pre> block.
    soup = BeautifulSoup(response.text, "html.parser")
    pre_tag = soup.find("pre")
    if pre_tag is None:
        raise ValueError("Could not find <pre> block with data.")

    formatted = (_format_data_row(row) for row in pre_tag.text.strip().split("\n"))
    return [line for line in formatted if line is not None]


def build_caic_url(end_date_str="2025-03-01+24", hours_range=72):
    """
    Return the tabular-data URL for the A-Basin SA-Pali station as a plain string.

    The query string is assembled by hand, so plus signs and parentheses appear
    literally instead of being percent-encoded. 'end_date_str' becomes the
    'date' parameter and 'hours_range' becomes the 'range' parameter.
    """
    endpoint = "https://stations.avalanche.state.co.us/tabular.php"
    query_pairs = [
        ("title", "A-Basin+SA-Pali+(A-BasinSA)+11920+ft"),
        ("st", "CAABP"),
        ("date", end_date_str),
        ("unit", "e"),
        ("area", "caic"),
        ("range", hours_range),
    ]
    # Values are inserted verbatim on purpose: no URL-encoding is applied.
    query = "&".join(f"{key}={value}" for key, value in query_pairs)
    return f"{endpoint}?{query}"


if __name__ == "__main__":
    # Window to fetch. The "+24" suffix presumably selects hour 24 of the given
    # date (i.e. the midnight that ends Jan 29) -- confirm against the site.
    end_date_str = "2025-01-29+24"
    hours_range = 24 * 21  # 21 days = 504 hours

    # Show the URL first so it can be opened in a browser for comparison.
    url_to_test = build_caic_url(end_date_str, hours_range)
    print("URL being fetched:\n", url_to_test)

    # Download the table and normalize it into output lines.
    data_lines = scrape_basin_data(end_date_str, hours_range)
    print(f"Scraped {len(data_lines)} data rows for end_date={end_date_str}, range={hours_range} hours.")

    # Persist one row per line.
    out_filename = "basin_collect.txt"
    with open(out_filename, "w") as f:
        for line in data_lines:
            print(line, file=f)

    print(f"Wrote {len(data_lines)} lines to {out_filename}.")


URL being fetched:
 https://stations.avalanche.state.co.us/tabular.php?title=A-Basin+SA-Pali+(A-BasinSA)+11920+ft&st=CAABP&date=2025-01-29+24&unit=e&area=caic&range=504
Scraped 505 data rows for end_date=2025-01-29+24, range=504 hours.
Wrote 505 lines to basin_collect.txt.


In [18]:
# Example URL for plotting a chart over a long range (here 500 hours):
# https://stations.avalanche.state.co.us/hplot.php?title=A-Basin+SA-Pali+%28A-BasinSA%29+11920+ft&st=CAABP&date=2025-02-05+24&unit=e&range=500