In [43]:
import requests
from bs4 import BeautifulSoup
import datetime

# Registry of the stations this script can scrape, keyed by a short name.
# Every entry carries:
#   - station_title / station_code: the values placed in the request URL.
#   - expected_numeric_cols: how many numeric tokens the station returns per row.
#   - remove_index: zero-based position of the numeric token to drop so that
#     every station ends up with 13 numeric values (None = drop nothing).
# "Basin" already returns 13 tokens. "Beck" (Senator Beck (CSAS)) returns 14
# and drops token 8; "Boss" (Boss Basin (USGS)) returns 14 and drops token 7.
station_options = {
    key: {
        "station_title": title,
        "station_code": code,
        "expected_numeric_cols": n_cols,
        "remove_index": drop_at,
    }
    for key, title, code, n_cols, drop_at in (
        ("Basin", "A-Basin+SA-Pali+(A-BasinSA)+11920+ft", "CAABP", 13, None),
        ("Beck", "Senator+Beck+%28CSAS%29+12186+ft", "CASBK", 14, 8),
        ("Boss", "Boss+Basin+%28USGS%29+11259+ft", "USBBN", 14, 7),
    )
}

def scrape_station_data(end_date_str="2025-03-01+24", hours_range=72,
                        station_title="A-Basin+SA-Pali+(A-BasinSA)+11920+ft",
                        station_code="CAABP",
                        expected_numeric_cols=13,
                        remove_index=None):
    """
    Scrape the station data for the specified time range and station.

    The final output lines will be in the form:
      YYYY Mon DD HH:MM  Temp MxTp MnTp Dew(F) RH Spd Dir Gst SWIN SWOUT LWIN LWOUT NET
    (Columns: 0=Year, 1=Month, 2=Day, 3=HH:MM, then tokens 4-16 are the numeric values.)

    For stations that return extra tokens, the token at 'remove_index' is
    removed so that the final output always contains 13 numeric tokens.

    Args:
        end_date_str: End of the query window, already URL-encoded.
        hours_range: Number of hours of data to request.
        station_title: URL-encoded station title (also used to skip the
            title header line inside the data block).
        station_code: Station code placed in the request URL.
        expected_numeric_cols: Number of numeric tokens to keep per row
            (before any removal). Rows with fewer are skipped.
        remove_index: Zero-based index, within the kept numeric tokens, of
            the token to drop; None keeps them all.

    Returns:
        A list of formatted strings, one per data row.

    Raises:
        RuntimeError: If the HTTP response status is not 200.
        ValueError: If the page contains no <pre> block.
        Exceptions raised by ``requests`` (e.g. on timeout) propagate unchanged.
    """
    # Local import: only needed to decode the URL-encoded title below.
    from urllib.parse import unquote_plus

    # Reuse the shared URL builder so the fetched URL always matches the one
    # reported to the user.
    url = build_caic_url(
        end_date_str,
        hours_range,
        station_title=station_title,
        station_code=station_code,
    )

    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        raise RuntimeError(f"Request failed with status {response.status_code}")

    soup = BeautifulSoup(response.text, "html.parser")
    pre_tag = soup.find("pre")
    if not pre_tag:
        raise ValueError("Could not find <pre> block with data.")

    lines = pre_tag.text.strip().split("\n")

    def convert_to_24h(timestr_12h, ampm):
        """Convert a 12-hour time with am/pm into 24-hour format."""
        hour_text, minute_text = timestr_12h.split(":")
        hour = int(hour_text)
        minute = int(minute_text)
        if hour == 12:
            # 12:MM am is 00:MM; 12:MM pm becomes 12:MM once 12 is added below.
            hour = 0
        if ampm.lower() == "pm":
            hour += 12
        return f"{hour:02d}:{minute:02d}"

    # Prefixes that mark non-data lines: the column header and the station
    # title. The title is matched both as passed with '+' replaced and fully
    # URL-decoded, so percent-encoded titles (e.g. "%28CSAS%29") also match.
    skip_prefixes = (
        "Date",
        station_title.replace("+", " "),
        unquote_plus(station_title),
    )

    data_lines = []
    for row in lines:
        row = row.strip()
        # Skip header or blank lines.
        if not row or row.startswith(skip_prefixes):
            continue

        parts = row.split()
        # Year, Month, Day, Time are mandatory; the am/pm token is optional,
        # so the smallest valid row has 4 + expected_numeric_cols tokens.
        if len(parts) < 4 + expected_numeric_cols:
            continue

        # Handle potential 12-hour time format.
        has_ampm = len(parts) > 4 and parts[4].lower() in ("am", "pm")
        offset = 5 if has_ampm else 4

        # Extract numeric columns; rows that come up short are skipped.
        numeric_parts = parts[offset:]
        if len(numeric_parts) < expected_numeric_cols:
            continue
        numeric_cols = numeric_parts[:expected_numeric_cols]

        # If a removal index is specified, delete that token.
        if remove_index is not None:
            del numeric_cols[remove_index]

        year_str, month_str, day_str, time_str = parts[:4]
        time_24 = convert_to_24h(time_str, parts[4]) if has_ampm else time_str

        # Build the final output line: date/time tokens, then each numeric
        # token separated by two spaces.
        stamp = f"{year_str} {month_str} {day_str} {time_24}"
        data_lines.append("  ".join([stamp, *numeric_cols]))

    return data_lines

def build_caic_url(end_date_str="2025-03-01+24", hours_range=72,
                   station_title="A-Basin+SA-Pali+(A-BasinSA)+11920+ft",
                   station_code="CAABP"):
    """Return the tabular-data URL for the given station and time window.

    The arguments are inserted into the query string as given (no encoding
    is applied here), with fixed ``unit=e`` and ``area=caic`` parameters.
    """
    query = "&".join([
        f"title={station_title}",
        f"st={station_code}",
        f"date={end_date_str}",
        "unit=e",
        "area=caic",
        f"range={hours_range}",
    ])
    return "https://stations.avalanche.state.co.us/tabular.php?" + query

def build_caic_plot_url(end_date_str="2025-03-01+24", hours_range=72,
                        station_title="A-Basin+SA-Pali+(A-BasinSA)+11920+ft",
                        station_code="CAABP"):
    """Return the plot-page URL for the given station and time window.

    The arguments are inserted into the query string as given (no encoding
    is applied here), with fixed ``unit=e`` and ``area=caic`` parameters.
    """
    # Ordered (name, value) pairs; the order fixes the query-string layout.
    params = (
        ("title", station_title),
        ("st", station_code),
        ("date", end_date_str),
        ("unit", "e"),
        ("area", "caic"),
        ("range", hours_range),
    )
    query = "&".join(f"{name}={value}" for name, value in params)
    return f"https://stations.avalanche.state.co.us/hplot.php?{query}"

if __name__ == "__main__":
    # Query window shared by every station.
    end_date_str = "2025-01-29+24"  # e.g., midnight leading into the next day
    hours_range = 24 * 10           # adjust as needed

    # Visit each configured station: report its URLs, scrape it, save the rows.
    for station_key in station_options:
        station_info = station_options[station_key]
        print("Selected station:", station_key)

        # The title/code pair is passed to both URL builders and the scraper.
        url_kwargs = {
            "station_title": station_info["station_title"],
            "station_code": station_info["station_code"],
        }
        caic_url = build_caic_url(end_date_str, hours_range, **url_kwargs)
        caic_plot_url = build_caic_plot_url(end_date_str, hours_range, **url_kwargs)

        print("URL being fetched:\n", caic_url)
        print("Plot URL being fetched:\n", caic_plot_url)

        # Scrape with this station's column count and token-removal index.
        data_lines = scrape_station_data(
            end_date_str,
            hours_range,
            expected_numeric_cols=station_info["expected_numeric_cols"],
            remove_index=station_info["remove_index"],
            **url_kwargs
        )
        row_count = len(data_lines)
        print(f"Scraped {row_count} data rows for station {station_key}.")

        # One output file per station, named after the lower-cased key
        # (e.g., basin_collect.txt, beck_collect.txt, boss_collect.txt).
        out_filename = station_key.lower() + "_collect.txt"
        with open(out_filename, "w") as f:
            f.writelines(line + "\n" for line in data_lines)
        print(f"Wrote {row_count} lines to {out_filename}.\n")


Selected station: Basin
URL being fetched:
 https://stations.avalanche.state.co.us/tabular.php?title=A-Basin+SA-Pali+(A-BasinSA)+11920+ft&st=CAABP&date=2025-01-29+24&unit=e&area=caic&range=240
Plot URL being fetched:
 https://stations.avalanche.state.co.us/hplot.php?title=A-Basin+SA-Pali+(A-BasinSA)+11920+ft&st=CAABP&date=2025-01-29+24&unit=e&area=caic&range=240
Scraped 241 data rows for station Basin.
Wrote 241 lines to basin_collect.txt.

Selected station: Beck
URL being fetched:
 https://stations.avalanche.state.co.us/tabular.php?title=Senator+Beck+%28CSAS%29+12186+ft&st=CASBK&date=2025-01-29+24&unit=e&area=caic&range=240
Plot URL being fetched:
 https://stations.avalanche.state.co.us/hplot.php?title=Senator+Beck+%28CSAS%29+12186+ft&st=CASBK&date=2025-01-29+24&unit=e&area=caic&range=240
Scraped 241 data rows for station Beck.
Wrote 241 lines to beck_collect.txt.

Selected station: Boss
URL being fetched:
 https://stations.avalanche.state.co.us/tabular.php?title=Boss+Basin+%28USGS%29