# In [41]:
import requests
from bs4 import BeautifulSoup
import datetime

# Registry of the stations this script knows how to scrape, keyed by a short
# station name. Each entry carries:
#   - station_title / station_code: the values placed in the request URL.
#   - expected_numeric_cols: how many numeric tokens each raw data row holds.
#     Senator Beck (CSAS) reports 14 because its rows include an extra SnoHt
#     column; the scraper drops it so every output row has 13 numeric values.
station_options = {
    "Basin": dict(
        station_title="A-Basin+SA-Pali+(A-BasinSA)+11920+ft",
        station_code="CAABP",
        expected_numeric_cols=13,
    ),
    "Beck": dict(
        station_title="Senator+Beck+%28CSAS%29+12186+ft",
        station_code="CASBK",
        expected_numeric_cols=14,  # Raw rows include SnoHt, which is dropped later.
    ),
}

def scrape_station_data(end_date_str="2025-03-01+24", hours_range=72,
                        station_title="A-Basin+SA-Pali+(A-BasinSA)+11920+ft",
                        station_code="CAABP",
                        expected_numeric_cols=13,
                        *, raw_text=None, timeout=60):
    """
    Scrape the station data for the specified time range and station.

    The final output lines will be in the form:
      YYYY Mon DD HH:MM  Temp MxTp MnTp Dew(F) RH Spd Dir Gst SWIN SWOUT LWIN LWOUT NET
    (0-based columns: 0=Year, 1=Month, 2=Day, 3=HH:MM, then 4-16 the numeric data.)

    For stations like Senator Beck (CSAS) that return an extra column (SnoHt),
    the code removes that column so that the output always contains 13 numeric fields.

    Args:
        end_date_str: Value of the ``date`` query parameter (already URL-encoded).
        hours_range: Value of the ``range`` query parameter.
        station_title: URL-encoded station title; its decoded form is also used
            to recognise (and skip) the title line in the returned text.
        station_code: Value of the ``st`` query parameter.
        expected_numeric_cols: Number of numeric tokens to read from each row.
            When this is 14, the token at index 8 (SnoHt) is dropped.
        raw_text: Optional text of the data block (what the page's ``<pre>``
            element contains). When given, no HTTP request is made and this
            text is parsed instead, e.g. to re-process a saved download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of strings, one per data row: ``"YYYY Mon DD HH:MM"`` followed
        by each numeric value, every value preceded by two spaces.

    Raises:
        RuntimeError: The server answered with a status other than 200.
        ValueError: The page has no ``<pre>`` block.
        Exceptions raised by ``requests`` itself (connection failure, timeout)
        propagate unchanged.
    """
    from urllib.parse import unquote_plus

    def convert_to_24h(timestr_12h, ampm):
        """Convert a 12-hour time with am/pm into 24-hour format."""
        hour_12, minute = timestr_12h.split(":")
        # 12 am maps to hour 0 and 12 pm stays 12; minutes are always preserved.
        hour_24 = int(hour_12) % 12
        if ampm.lower() == "pm":
            hour_24 += 12
        return f"{hour_24:02d}:{int(minute):02d}"

    if raw_text is None:
        url = (
            "https://stations.avalanche.state.co.us/tabular.php"
            f"?title={station_title}"
            f"&st={station_code}"
            f"&date={end_date_str}"
            "&unit=e"
            "&area=caic"
            f"&range={hours_range}"
        )

        # A timeout keeps a stalled server from blocking the caller forever.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            raise RuntimeError(f"Request failed with status {response.status_code}")

        soup = BeautifulSoup(response.text, "html.parser")
        pre_tag = soup.find("pre")
        if pre_tag is None:
            raise ValueError("Could not find <pre> block with data.")
        raw_text = pre_tag.text

    # The title is passed URL-encoded ("+" for spaces, "%28" for "("), so it
    # must be decoded before comparing it with the title line in the text.
    header_prefixes = ("Date", unquote_plus(station_title))

    data_lines = []
    for row in raw_text.strip().split("\n"):
        row = row.strip()
        # Skip header or blank lines
        if not row or row.startswith(header_prefixes):
            continue

        parts = row.split()
        # Year, Month, Day and Time are mandatory.
        if len(parts) < 4:
            continue

        # An optional am/pm token after the time shifts the numeric columns by one.
        has_ampm = len(parts) > 4 and parts[4].lower() in ("am", "pm")
        offset = 5 if has_ampm else 4

        # Extract numeric columns; short rows are ignored.
        numeric_cols = parts[offset:offset + expected_numeric_cols]
        if len(numeric_cols) < expected_numeric_cols:
            continue

        year_str, month_str, day_str, time_str = parts[:4]
        time_24 = convert_to_24h(time_str, parts[4]) if has_ampm else time_str

        # For stations that return 14 numeric columns (e.g., Senator Beck (CSAS)),
        # remove the SnoHt column (which is at index 8, zero-indexed).
        if expected_numeric_cols == 14:
            del numeric_cols[8]

        # Build the final output line: Year, Month, Day, HH:MM, then the numeric values.
        timestamp = f"{year_str} {month_str} {day_str} {time_24}"
        data_lines.append("  ".join([timestamp, *numeric_cols]))

    return data_lines

def build_caic_url(end_date_str="2025-03-01+24", hours_range=72,
                   station_title="A-Basin+SA-Pali+(A-BasinSA)+11920+ft",
                   station_code="CAABP"):
    """
    Return the tabular-data URL for the given station and time window.

    The arguments are inserted into the query string verbatim, so they must
    already be URL-encoded by the caller.
    """
    query_fields = [
        f"title={station_title}",
        f"st={station_code}",
        f"date={end_date_str}",
        "unit=e",
        "area=caic",
        f"range={hours_range}",
    ]
    query_string = "&".join(query_fields)
    return f"https://stations.avalanche.state.co.us/tabular.php?{query_string}"

def build_caic_plot_url(end_date_str="2025-03-01+24", hours_range=72,
                        station_title="A-Basin+SA-Pali+(A-BasinSA)+11920+ft",
                        station_code="CAABP"):
    """
    Return the plot-page URL for the given station and time window.

    The arguments are inserted into the query string verbatim, so they must
    already be URL-encoded by the caller.
    """
    query_fields = [
        f"title={station_title}",
        f"st={station_code}",
        f"date={end_date_str}",
        "unit=e",
        "area=caic",
        f"range={hours_range}",
    ]
    query_string = "&".join(query_fields)
    return f"https://stations.avalanche.state.co.us/hplot.php?{query_string}"

if __name__ == "__main__":
    # Query window shared by every station: the end timestamp and how many
    # hours back from it to request.
    end_date_str = "2025-03-29+24"  # e.g., midnight leading into Mar 30
    hours_range = 24 * 50          # e.g., 50 days of data

    # Scrape each configured station in turn and write its rows to its own file.
    for station_key, station_info in station_options.items():
        print("Selected station:", station_key)

        # The URL builders and the scraper all take the same two identifiers.
        station_kwargs = {
            "station_title": station_info["station_title"],
            "station_code": station_info["station_code"],
        }
        caic_url = build_caic_url(end_date_str, hours_range, **station_kwargs)
        caic_plot_url = build_caic_plot_url(end_date_str, hours_range, **station_kwargs)

        print("URL being fetched:\n", caic_url)
        print("Plot URL being fetched:\n", caic_plot_url)

        # The per-station column count tells the scraper how many numeric
        # tokens to read from each row.
        data_lines = scrape_station_data(
            end_date_str,
            hours_range,
            expected_numeric_cols=station_info["expected_numeric_cols"],
            **station_kwargs,
        )

        print(f"Scraped {len(data_lines)} data rows for station {station_key}.")

        # One output file per station, named after the lower-cased key
        # (e.g., basin_collect.txt or beck_collect.txt).
        out_filename = f"{station_key.lower()}_collect.txt"
        with open(out_filename, "w") as f:
            f.writelines(f"{line}\n" for line in data_lines)
        print(f"Wrote {len(data_lines)} lines to {out_filename}.\n")


# Output of the run above:
#
# Selected station: Basin
# URL being fetched:
#  https://stations.avalanche.state.co.us/tabular.php?title=A-Basin+SA-Pali+(A-BasinSA)+11920+ft&st=CAABP&date=2025-03-29+24&unit=e&area=caic&range=1200
# Plot URL being fetched:
#  https://stations.avalanche.state.co.us/hplot.php?title=A-Basin+SA-Pali+(A-BasinSA)+11920+ft&st=CAABP&date=2025-03-29+24&unit=e&area=caic&range=1200
# Scraped 1144 data rows for station Basin.
# Wrote 1144 lines to basin_collect.txt.
#
# Selected station: Beck
# URL being fetched:
#  https://stations.avalanche.state.co.us/tabular.php?title=Senator+Beck+%28CSAS%29+12186+ft&st=CASBK&date=2025-03-29+24&unit=e&area=caic&range=1200
# Plot URL being fetched:
#  https://stations.avalanche.state.co.us/hplot.php?title=Senator+Beck+%28CSAS%29+12186+ft&st=CASBK&date=2025-03-29+24&unit=e&area=caic&range=1200
# Scraped 1056 data rows for station Beck.
# Wrote 1056 lines to beck_collect.txt.

